Name : Bhargav R Pandya

S_ID : 202218055

In [None]:
!pip install pyspark

Defaulting to user installation because normal site-packages is not writeable


You should consider upgrading via the 'c:\program files\python37\python.exe -m pip install --upgrade pip' command.


In [None]:
# Import libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import regexp_extract
from pyspark.ml.feature import StandardScaler

In [None]:
# Get the Spark session for this notebook (getOrCreate reuses an existing one),
# then load the car listings CSV, treating the first row as the header and
# letting Spark infer the column types.
spark = (
    SparkSession.builder
    .appName('Pyspark ML')
    .getOrCreate()
)
carDf = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('Car details v3.csv')
)

In [None]:
# Preview the first 20 rows; long cell values are truncated for display.
carDf.show(n=20, truncate=True)

+--------------------+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------+--------------------+-----+
|                name|year|selling_price|km_driven|  fuel|seller_type|transmission|       owner|   mileage| engine| max_power|              torque|seats|
+--------------------+----+-------------+---------+------+-----------+------------+------------+----------+-------+----------+--------------------+-----+
|Maruti Swift Dzir...|2014|       450000|   145500|Diesel| Individual|      Manual| First Owner| 23.4 kmpl|1248 CC|    74 bhp|      190Nm@ 2000rpm|    5|
|Skoda Rapid 1.5 T...|2014|       370000|   120000|Diesel| Individual|      Manual|Second Owner|21.14 kmpl|1498 CC|103.52 bhp| 250Nm@ 1500-2500rpm|    5|
|Honda City 2017-2...|2006|       158000|   140000|Petrol| Individual|      Manual| Third Owner| 17.7 kmpl|1497 CC|    78 bhp|12.7@ 2,700(kgm@ ...|    5|
|Hyundai i20 Sport...|2010|       225000|   127000|Diesel| Individual|      

In [None]:
# Print the inferred schema (column name, type, nullability) to check which
# columns were read as numbers and which were left as strings.
carDf.printSchema()

root
 |-- name: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- selling_price: integer (nullable = true)
 |-- km_driven: integer (nullable = true)
 |-- fuel: string (nullable = true)
 |-- seller_type: string (nullable = true)
 |-- transmission: string (nullable = true)
 |-- owner: string (nullable = true)
 |-- mileage: string (nullable = true)
 |-- engine: string (nullable = true)
 |-- max_power: string (nullable = true)
 |-- torque: string (nullable = true)
 |-- seats: integer (nullable = true)



In [None]:
# Keep only the columns used for modelling ('torque' and 'seats' are left out)
# and discard every row that contains a null in any of the kept columns.
selected_columns = [
    'name', 'year', 'selling_price', 'km_driven', 'fuel',
    'seller_type', 'transmission', 'owner', 'mileage',
    'engine', 'max_power',
]
df = carDf.select(selected_columns).dropna()

In [None]:
# Encode each categorical column as a numeric label index stored in a new
# '<column>_index' column; the original string columns are kept alongside.
catCols = ['name', 'fuel', 'seller_type', 'transmission', 'owner']
for cat_col in catCols:
    indexer = StringIndexer(inputCol=cat_col, outputCol=cat_col + '_index')
    df = indexer.fit(df).transform(df)

# 'mileage', 'engine' and 'max_power' hold a number followed by a unit
# (e.g. "23.4 kmpl", "1248 CC", "74 bhp"): keep the first integer/decimal
# number found in each value and store it as a float in the same column.
number_pattern = r"(\d+(\.\d+)?)"
for unit_col in ('mileage', 'engine', 'max_power'):
    first_number = regexp_extract(unit_col, number_pattern, 1)
    df = df.withColumn(unit_col, first_number.cast('float'))

In [None]:
# Columns combined into the single 'features' vector: the indexed categorical
# columns plus the numeric ones.
feature_columns = [
    'name_index', 'year', 'km_driven', 'fuel_index',
    'seller_type_index', 'transmission_index', 'owner_index',
    'mileage', 'engine', 'max_power',
]

# handleInvalid='skip' filters out rows with invalid (null/NaN) feature
# values instead of raising an error.
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
    handleInvalid='skip',
)

df = assembler.transform(df)

In [None]:
# Hold out roughly 20% of the rows for testing and train on the remaining 80%;
# the fixed seed makes the split repeatable.
split_weights = [0.8, 0.2]
train_data, test_data = df.randomSplit(split_weights, seed=40)

In [None]:
# Baseline: plain linear regression on the unscaled feature vector.
lr = LinearRegression(labelCol='selling_price', featuresCol='features')
lr_model = lr.fit(train_data)

# Score the held-out rows; transform() adds a 'prediction' column.
predictions = lr_model.transform(test_data)

# One evaluator, re-pointed at each metric in turn (RMSE, MAE, then R2).
evaluator = RegressionEvaluator(labelCol="selling_price", predictionCol="prediction")
rmse = evaluator.setMetricName("rmse").evaluate(predictions)
mae = evaluator.setMetricName("mae").evaluate(predictions)
r2 = evaluator.setMetricName("r2").evaluate(predictions)

# Report the evaluation metrics
for metric_label, metric_value in (("RMSE: ", rmse), ("MAE: ", mae), ("R2 score: ", r2)):
    print(metric_label, metric_value)

RMSE:  488249.94437891594
MAE:  280993.02795579494
R2 score:  0.6486724473232394


In [None]:
# Standardize the features (zero mean, unit standard deviation).
# The scaler is fitted on the training rows only, so statistics of the test
# rows never leak into the model. The train/test rows are the existing
# train_data / test_data split rather than a fresh split with another seed,
# so these metrics are measured on the same test rows as the unscaled
# baseline and can be compared with it directly.
stand_scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)
stand_scalerModel = stand_scaler.fit(train_data)

# Only the scaled feature vector and the label are needed from here on.
# The label is referenced by its real name ('selling_price') so the cell does
# not depend on Spark's case-insensitive column resolution.
model_columns = ['scaledFeatures', 'selling_price']
scaled_data = stand_scalerModel.transform(df).select(model_columns)  # full dataset, kept for inspection
train = stand_scalerModel.transform(train_data).select(model_columns)
test = stand_scalerModel.transform(test_data).select(model_columns)

# Train the Linear Regression model with elastic-net regularization
lr = LinearRegression(featuresCol='scaledFeatures', labelCol='selling_price',
                      maxIter=100000, regParam=0.0078, elasticNetParam=0.8)
model = lr.fit(train)

# Make predictions on the test dataset
predictions = model.transform(test)

# Evaluate the model using RMSE, MAE and R2 score; a single evaluator is
# re-pointed at each metric instead of being rebuilt three times.
evaluator = RegressionEvaluator(labelCol="selling_price", predictionCol="prediction")
rmse = evaluator.setMetricName("rmse").evaluate(predictions)
mae = evaluator.setMetricName("mae").evaluate(predictions)
r2 = evaluator.setMetricName("r2").evaluate(predictions)

# Print the evaluation
print("RMSE: ", rmse)
print("MAE: ", mae)
print("R2 score: ", r2)

RMSE:  444780.995125806
MAE:  270217.6369276774
R2 score:  0.6924231215954223
