In [43]:
#To import all required modules
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
import os
import pandas as pd
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import *
from pyspark.ml.regression import LinearRegression



In [4]:
spark = SparkSession.builder.appName("LinearRegression").getOrCreate()

In [35]:
# Loading the dataset
import_dataFrame = spark.read.load("dataset/import.csv", format="csv", header=True, inferSchema=True, delimiter=",")
# Check the type of import_dataFrame
print("The type of import_dataFrame is", type(import_dataFrame))
#To show the first 3 rows
pd.DataFrame(import_dataFrame.take(3), columns=import_dataFrame.columns).transpose()

The type of import_dataFrame is <class 'pyspark.sql.dataframe.DataFrame'>


Unnamed: 0,0,1,2
symboling,3,3,1
normalized_losses,?,?,?
make,alfa-romero,alfa-romero,alfa-romero
fuel_type,gas,gas,gas
aspiration,std,std,std
num_of_doors,two,two,two
body_style,convertible,convertible,hatchback
drive_wheels,rwd,rwd,rwd
engine_location,front,front,front
wheel_base,88.6,88.6,94.5


In [36]:
#To get numeric features only
import_numeric_dataFrame = import_dataFrame.select("wheel_base","length", "width", "height", "curb_weight", "engine_size", "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price")
import_numeric_dataFrame.show(5)


+----------+------+-----+------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+
|wheel_base|length|width|height|curb_weight|engine_size|bore|stroke|compression_ratio|horsepower|peak_rpm|city_mpg|highway_mpg|price|
+----------+------+-----+------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+-----+
|      88.6| 168.8| 64.1|  48.8|       2548|        130|3.47|  2.68|              9.0|       111|    5000|      21|         27|13495|
|      88.6| 168.8| 64.1|  48.8|       2548|        130|3.47|  2.68|              9.0|       111|    5000|      21|         27|16500|
|      94.5| 171.2| 65.5|  52.4|       2823|        152|2.68|  3.47|              9.0|       154|    5000|      19|         26|16500|
|      99.8| 176.6| 66.2|  54.3|       2337|        109|3.19|  3.40|             10.0|       102|    5500|      24|         30|13950|
|      99.4| 176.6| 66.4|  54.3|       2824|        136|3.19| 

In [37]:
#To Change numeric features data type into integer
for clmn in import_numeric_dataFrame.columns:
    print(clmn)
    import_numeric_dataFrame = import_numeric_dataFrame.withColumn(clmn, import_numeric_dataFrame[clmn].cast(IntegerType()))
    import_numeric_dataFrame = import_numeric_dataFrame.filter(import_numeric_dataFrame[clmn].isNotNull())
   

wheel_base
length
width
height
curb_weight
engine_size
bore
stroke
compression_ratio
horsepower
peak_rpm
city_mpg
highway_mpg
price


In [38]:
corrdf = import_numeric_dataFrame.toPandas()

corr = corrdf.corr()
print(corr['price'].sort_values(ascending=False),"\n")

price                1.000000
engine_size          0.888942
curb_weight          0.835729
horsepower           0.811027
width                0.748977
length               0.695721
wheel_base           0.588325
bore                 0.271272
height               0.128472
compression_ratio    0.073514
stroke              -0.027514
peak_rpm            -0.104333
city_mpg            -0.702685
highway_mpg         -0.715590
Name: price, dtype: float64 



In [39]:
#To chnage price into label
import_numeric_dataFrame2 = import_numeric_dataFrame.select("engine_size","curb_weight","horsepower", "width", "length", "wheel_base", "price")
import_numeric_dataFrame2 = import_numeric_dataFrame2.withColumnRenamed("price", "label")
import_numeric_dataFrame2.show(5)

+-----------+-----------+----------+-----+------+----------+-----+
|engine_size|curb_weight|horsepower|width|length|wheel_base|label|
+-----------+-----------+----------+-----+------+----------+-----+
|        130|       2548|       111|   64|   168|        88|13495|
|        130|       2548|       111|   64|   168|        88|16500|
|        152|       2823|       154|   65|   171|        94|16500|
|        109|       2337|       102|   66|   176|        99|13950|
|        136|       2824|       115|   66|   176|        99|17450|
+-----------+-----------+----------+-----+------+----------+-----+
only showing top 5 rows



In [40]:

#T generate the vector assembler for numeric features, I choose the first five columns
import_dataFrame_vector = VectorAssembler(inputCols=import_numeric_dataFrame2.columns[:6], outputCol="features")
import_numeric_dataFrame2 = import_dataFrame_vector.transform(import_numeric_dataFrame2)
import_numeric_dataFrame2.show(10)

+-----------+-----------+----------+-----+------+----------+-----+--------------------+
|engine_size|curb_weight|horsepower|width|length|wheel_base|label|            features|
+-----------+-----------+----------+-----+------+----------+-----+--------------------+
|        130|       2548|       111|   64|   168|        88|13495|[130.0,2548.0,111...|
|        130|       2548|       111|   64|   168|        88|16500|[130.0,2548.0,111...|
|        152|       2823|       154|   65|   171|        94|16500|[152.0,2823.0,154...|
|        109|       2337|       102|   66|   176|        99|13950|[109.0,2337.0,102...|
|        136|       2824|       115|   66|   176|        99|17450|[136.0,2824.0,115...|
|        136|       2507|       110|   66|   177|        99|15250|[136.0,2507.0,110...|
|        136|       2844|       110|   71|   192|       105|17710|[136.0,2844.0,110...|
|        136|       2954|       110|   71|   192|       105|18920|[136.0,2954.0,110...|
|        131|       3086|       

In [41]:
# To get the label and features vector
model_data = import_numeric_dataFrame2.select("label", "features")

model_data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|13495|[130.0,2548.0,111...|
|16500|[130.0,2548.0,111...|
|16500|[152.0,2823.0,154...|
|13950|[109.0,2337.0,102...|
|17450|[136.0,2824.0,115...|
|15250|[136.0,2507.0,110...|
|17710|[136.0,2844.0,110...|
|18920|[136.0,2954.0,110...|
|23875|[131.0,3086.0,140...|
|16430|[108.0,2395.0,101...|
|16925|[108.0,2395.0,101...|
|20970|[164.0,2710.0,121...|
|21105|[164.0,2765.0,121...|
|24565|[164.0,3055.0,121...|
|30760|[209.0,3230.0,182...|
|41315|[209.0,3380.0,182...|
|36880|[209.0,3505.0,182...|
| 5151|[61.0,1488.0,48.0...|
| 6295|[90.0,1874.0,70.0...|
| 6575|[90.0,1909.0,70.0...|
+-----+--------------------+
only showing top 20 rows



In [42]:
lr_model = LinearRegression(maxIter=15, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr_model.fit(model_data)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

Coefficients: [98.06590466999978,1.9003922906121002,45.396046650274066,555.6443493129233,-40.452859787319746,72.76392861060901]
Intercept: -45321.923161503844
numIterations: 16
objectiveHistory: [0.5, 0.41177694363371325, 0.18798395829106637, 0.10436499176095039, 0.09970650891295046, 0.08938810308287624, 0.08847023781306598, 0.08885369728352192, 0.0867261472448236, 0.08671520603036441, 0.08670894720463505, 0.08670698197684588, 0.08669140027135974, 0.08668603506121145, 0.086683410405547, 0.08668228478313694]
+-------------------+
|          residuals|
+-------------------+
| 1018.8111902528544|
| 4023.8111902528544|
|-1479.1459399818777|
| 2754.6733464504796|
| 2091.2542683788342|
|  761.1117175415638|
| -27.33290581318579|
|  973.6239422194849|
|  4806.220283700452|
|  6233.673386319882|
|  6728.673386319882|
|  3775.438220251599|
| 3805.9166442679343|
|  5983.873501378519|
|  4664.180294704696|
|  14540.28854094923|
|  7853.270395558982|
|  5446.075562492995|
|  476.7232940007525|
|  