In [42]:
#To import all required modules
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
import os
import pandas as pd
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import *
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer




In [3]:
spark = SparkSession.builder.appName("LogisticRegression").getOrCreate()

In [4]:
# Loading the dataset
import_dataFrame = spark.read.load("dataset/import.csv", format="csv", header=True, inferSchema=True, delimiter=",")
# Check the type of import_dataFrame
print("The type of import_dataFrame is", type(import_dataFrame))
#To show the first 3 rows
pd.DataFrame(import_dataFrame.take(3), columns=import_dataFrame.columns).transpose()

The type of import_dataFrame is <class 'pyspark.sql.dataframe.DataFrame'>


Unnamed: 0,0,1,2
symboling,3,3,1
normalized_losses,?,?,?
make,alfa-romero,alfa-romero,alfa-romero
fuel_type,gas,gas,gas
aspiration,std,std,std
num_of_doors,two,two,two
body_style,convertible,convertible,hatchback
drive_wheels,rwd,rwd,rwd
engine_location,front,front,front
wheel_base,88.6,88.6,94.5


In [64]:
#To get numeric features only
import_numeric_dataFrame = import_dataFrame.select("wheel_base","length", "width", "height", "curb_weight", "engine_size", "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "num_of_cylinders")
import_numeric_dataFrame.show(5)


+----------+------+-----+------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+----------------+
|wheel_base|length|width|height|curb_weight|engine_size|bore|stroke|compression_ratio|horsepower|peak_rpm|city_mpg|highway_mpg|num_of_cylinders|
+----------+------+-----+------+-----------+-----------+----+------+-----------------+----------+--------+--------+-----------+----------------+
|      88.6| 168.8| 64.1|  48.8|       2548|        130|3.47|  2.68|              9.0|       111|    5000|      21|         27|            four|
|      88.6| 168.8| 64.1|  48.8|       2548|        130|3.47|  2.68|              9.0|       111|    5000|      21|         27|            four|
|      94.5| 171.2| 65.5|  52.4|       2823|        152|2.68|  3.47|              9.0|       154|    5000|      19|         26|             six|
|      99.8| 176.6| 66.2|  54.3|       2337|        109|3.19|  3.40|             10.0|       102|    5500|      24|         30|   

In [65]:
#To Change numeric features data type into integer
for clmn in import_numeric_dataFrame.columns[:13]:
    print(clmn)
    import_numeric_dataFrame = import_numeric_dataFrame.withColumn(clmn, import_numeric_dataFrame[clmn].cast(IntegerType()))
    import_numeric_dataFrame = import_numeric_dataFrame.filter(import_numeric_dataFrame[clmn].isNotNull())


wheel_base
length
width
height
curb_weight
engine_size
bore
stroke
compression_ratio
horsepower
peak_rpm
city_mpg
highway_mpg


In [66]:
#To select columns we  are going to use in our model
import_numeric_dataFrame2 = import_numeric_dataFrame.select("engine_size","curb_weight","width","length","horsepower", "city_mpg", "highway_mpg", "num_of_cylinders")

import_numeric_dataFrame2.show(5)

+-----------+-----------+-----+------+----------+--------+-----------+----------------+
|engine_size|curb_weight|width|length|horsepower|city_mpg|highway_mpg|num_of_cylinders|
+-----------+-----------+-----+------+----------+--------+-----------+----------------+
|        130|       2548|   64|   168|       111|      21|         27|            four|
|        130|       2548|   64|   168|       111|      21|         27|            four|
|        152|       2823|   65|   171|       154|      19|         26|             six|
|        109|       2337|   66|   176|       102|      24|         30|            four|
|        136|       2824|   66|   176|       115|      18|         22|            five|
+-----------+-----------+-----+------+----------+--------+-----------+----------------+
only showing top 5 rows



In [68]:
indexer = StringIndexer(inputCol="num_of_cylinders", outputCol="label") 
import_numeric_dataFrame3 = indexer.fit(import_numeric_dataFrame2).transform(import_numeric_dataFrame2)
#T generate the vector assembler for numeric features, I choose the first five columns
import_dataFrame_vector = VectorAssembler(inputCols=import_numeric_dataFrame3.columns[:7], outputCol="features")
import_numeric_dataFrame3 = import_dataFrame_vector.transform(import_numeric_dataFrame3)
import_numeric_dataFrame3.show(10)

+-----------+-----------+-----+------+----------+--------+-----------+----------------+-----+--------------------+
|engine_size|curb_weight|width|length|horsepower|city_mpg|highway_mpg|num_of_cylinders|label|            features|
+-----------+-----------+-----+------+----------+--------+-----------+----------------+-----+--------------------+
|        130|       2548|   64|   168|       111|      21|         27|            four|  0.0|[130.0,2548.0,64....|
|        130|       2548|   64|   168|       111|      21|         27|            four|  0.0|[130.0,2548.0,64....|
|        152|       2823|   65|   171|       154|      19|         26|             six|  1.0|[152.0,2823.0,65....|
|        109|       2337|   66|   176|       102|      24|         30|            four|  0.0|[109.0,2337.0,66....|
|        136|       2824|   66|   176|       115|      18|         22|            five|  2.0|[136.0,2824.0,66....|
|        136|       2507|   66|   177|       110|      19|         25|          

In [70]:
# To get the label and features vector
model_data = import_numeric_dataFrame3.select("label", "features")

model_data.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|[130.0,2548.0,64....|
|  0.0|[130.0,2548.0,64....|
|  1.0|[152.0,2823.0,65....|
|  0.0|[109.0,2337.0,66....|
|  2.0|[136.0,2824.0,66....|
|  2.0|[136.0,2507.0,66....|
|  2.0|[136.0,2844.0,71....|
|  2.0|[136.0,2954.0,71....|
|  2.0|[131.0,3086.0,71....|
|  2.0|[131.0,3053.0,67....|
|  0.0|[108.0,2395.0,64....|
|  0.0|[108.0,2395.0,64....|
|  1.0|[164.0,2710.0,64....|
|  1.0|[164.0,2765.0,64....|
|  1.0|[164.0,3055.0,66....|
|  1.0|[209.0,3230.0,66....|
|  1.0|[209.0,3380.0,67....|
|  1.0|[209.0,3505.0,70....|
|  4.0|[61.0,1488.0,60.0...|
|  0.0|[90.0,1874.0,63.0...|
+-----+--------------------+
only showing top 20 rows



In [71]:
LogisticRegression_model = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, family="multinomial")

# Fit the model
LogisticRegression_model = LogisticRegression_model.fit(model_data)

In [72]:
trainingSummary = LogisticRegression_model.summary
# for multiclass, we can inspect metrics on a per-label basis

accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Accuracy: 0.7889447236180904
FPR: 0.7889447236180904
TPR: 0.7889447236180904
F-measure: 0.6958669753260686
Precision: 0.6224337769248252
Recall: 0.7889447236180904
