#1.Build spark session

In [94]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [95]:
from pyspark.sql.functions import when

In [96]:
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

#2.Import dataset

In [97]:
df_train = spark.read.option("inferSchema", "true").csv("Car_Classification_Train.csv", header=True)

In [98]:
df_train.show(5)

+---------+----------+-------+---------+------------+-----------+-------+----+
|    Brand|Model Year|Mileage|Fuel Type|Transmission|Owner Count|  Price|Sold|
+---------+----------+-------+---------+------------+-----------+-------+----+
|    Honda|      2019|  99704|   Petrol|   Automatic|          4|37884.0| Yes|
|   Toyota|      2015|  11801|   Hybrid|   Automatic|          3|29737.0| Yes|
|   Toyota|      2015|  74042|   Petrol|   Automatic|          3|34273.0| Yes|
|   Toyota|      2007|  25056|   Petrol|   Automatic|          2|37016.0|  No|
|Chevrolet|      2019|  12543|   Hybrid|      Manual|          1|22256.0| Yes|
+---------+----------+-------+---------+------------+-----------+-------+----+
only showing top 5 rows



In [99]:
df_test = spark.read.option("inferSchema", "true").csv("Car_Classification_Test.csv", header=True)

In [100]:
df_test.show(5)

+---------+----------+-------+---------+------------+-----------+-------+----+
|    Brand|Model Year|Mileage|Fuel Type|Transmission|Owner Count|  Price|Sold|
+---------+----------+-------+---------+------------+-----------+-------+----+
|      BMW|      2018| 156836|     null|   Automatic|          3|20368.0| Yes|
|     null|      2018| 105235|   Diesel|   Automatic|          3|   null| Yes|
|      BMW|      2002| 149572|   Petrol|      Manual|          4| 7018.0| Yes|
|   Toyota|      2007| 124180| Electric|   Automatic|          1|   null| Yes|
|Chevrolet|      2009|  23888|   Diesel|   Automatic|          2|14694.0| Yes|
+---------+----------+-------+---------+------------+-----------+-------+----+
only showing top 5 rows



In [101]:
df_train.summary().show()

+-------+------+-----------------+------------------+---------+------------+------------------+------------------+----+
|summary| Brand|       Model Year|           Mileage|Fuel Type|Transmission|       Owner Count|             Price|Sold|
+-------+------+-----------------+------------------+---------+------------+------------------+------------------+----+
|  count|   146|              160|               160|      145|         142|               160|               144| 160|
|   mean|  null|         2010.875|       99960.04375|     null|        null|               2.5|25445.520833333332|null|
| stddev|  null|7.181168595008502|55884.347033407095|     null|        null|1.1159224968016033|13945.539691466813|null|
|    min|   BMW|             2000|              8373|   Diesel|   Automatic|                 1|            3077.0|  No|
|    25%|  null|             2004|             49425|     null|        null|                 2|           12916.0|null|
|    50%|  null|             2011|      

#3.Feature Selection

In [102]:
# kita pilih fitur/kolom yang bakal kita pake buat prediksi klasifikasinya
# select fitur yang kita mau pake + label prediksi ("Sold")
df_train = df_train.select("Mileage", "Fuel Type", "Transmission", "Sold")
df_test = df_test.select("Mileage", "Fuel Type", "Transmission", "Sold")

# 4. Handle Missing Data

In [103]:
df_train = df_train.na.drop()
df_test = df_test.na.drop()

# 5. Transform Data

In [104]:
df_train = df_train.withColumn("Fuel Type", when(df_train["Fuel Type"] == "Diesel", 1)\
                               .when(df_train["Fuel Type"] == "Petrol", 2)\
                               .when(df_train["Fuel Type"] == "Hybrid", 3)\
                               .when(df_train["Fuel Type"] == "Electric", 4))

df_test = df_test.withColumn("Transmission", when(df_test["Transmission"] == "Automatic", 1)\
                               .when(df_test["Transmission"] == "Manual", 2))

In [105]:
df_test = df_test.withColumn("Fuel Type", when(df_test["Fuel Type"] == "Diesel", 1)\
                               .when(df_test["Fuel Type"] == "Petrol", 2)\
                               .when(df_test["Fuel Type"] == "Hybrid", 3)\
                               .when(df_test["Fuel Type"] == "Electric", 4))

df_train = df_train.withColumn("Transmission", when(df_train["Transmission"] == "Automatic", 1)\
                               .when(df_train["Transmission"] == "Manual", 2))

df_test = df_test.withColumn("Sold", when(df_test["Sold"] == "Yes", 1)\
                               .when(df_test["Sold"] == "No", 2))

In [106]:
df_train.show(5)

+-------+---------+------------+----+
|Mileage|Fuel Type|Transmission|Sold|
+-------+---------+------------+----+
|  99704|        2|           1| Yes|
|  11801|        3|           1| Yes|
|  74042|        2|           1| Yes|
|  25056|        2|           1|  No|
|  12543|        3|           2| Yes|
+-------+---------+------------+----+
only showing top 5 rows



# 6. Normalization

In [107]:
# NOTE: jangan normalisasi/scale kolom yang kita mau cari ("Sold")
cols = df_train.columns
cols.remove("Sold")
print(cols)

['Mileage', 'Fuel Type', 'Transmission']


In [108]:
# VectorAssembler(InputCols, outputCol)
df_train = VectorAssembler(inputCols = cols, outputCol = "features").transform(df_train)
df_test = VectorAssembler(inputCols = cols, outputCol = "features").transform(df_test)

In [109]:
df_train.show(5)

+-------+---------+------------+----+-----------------+
|Mileage|Fuel Type|Transmission|Sold|         features|
+-------+---------+------------+----+-----------------+
|  99704|        2|           1| Yes|[99704.0,2.0,1.0]|
|  11801|        3|           1| Yes|[11801.0,3.0,1.0]|
|  74042|        2|           1| Yes|[74042.0,2.0,1.0]|
|  25056|        2|           1|  No|[25056.0,2.0,1.0]|
|  12543|        3|           2| Yes|[12543.0,3.0,2.0]|
+-------+---------+------------+----+-----------------+
only showing top 5 rows



In [110]:
scaler = StandardScaler(inputCol="features", outputCol="scaled_features").fit(df_train)

Exception ignored in: <function JavaWrapper.__del__ at 0x7f1938021440>
Traceback (most recent call last):
  File "/home/cloudera/Downloads/]/lib/python3.7/site-packages/pyspark/ml/wrapper.py", line 40, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'BinaryClassificationEvaluator' object has no attribute '_java_obj'
Exception ignored in: <function JavaWrapper.__del__ at 0x7f1938021440>
Traceback (most recent call last):
  File "/home/cloudera/Downloads/]/lib/python3.7/site-packages/pyspark/ml/wrapper.py", line 40, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'LogisticRegression' object has no attribute '_java_obj'


In [111]:
df_train = scaler.transform(df_train)
df_test = scaler.transform(df_test)

In [112]:
df_train.select("features", "scaled_features").show(10, False)

+------------------+-----------------------------------------------------------+
|features          |scaled_features                                            |
+------------------+-----------------------------------------------------------+
|[99704.0,2.0,1.0] |[1.7285921935589388,1.8513922928083015,1.9971315421898408] |
|[11801.0,3.0,1.0] |[0.20459677120465614,2.777088439212452,1.9971315421898408] |
|[74042.0,2.0,1.0] |[1.2836839364066732,1.8513922928083015,1.9971315421898408] |
|[25056.0,2.0,1.0] |[0.4344018896113774,1.8513922928083015,1.9971315421898408] |
|[12543.0,3.0,2.0] |[0.21746100340818592,2.777088439212452,3.9942630843796816] |
|[107634.0,4.0,1.0]|[1.8660765080791424,3.702784585616603,1.9971315421898408]  |
|[139415.0,4.0,1.0]|[2.4170713378101123,3.702784585616603,1.9971315421898408]  |
|[21896.0,1.0,1.0] |[0.37961621068529366,0.9256961464041508,1.9971315421898408]|
|[172280.0,2.0,2.0]|[2.986859735881549,1.8513922928083015,3.9942630843796816]  |
|[25358.0,3.0,1.0] |[0.43963

# 7. Model Training

In [113]:
model = LogisticRegression(featuresCol="scaled_features", labelCol="Sold", maxIter = 10).fit(df_train)

IllegalArgumentException: 'requirement failed: Column Sold must be of type numeric but was actually of type string.'

# 8. Model Testing

In [None]:
prediction = model.transform(df_test)

In [None]:
prediction.select("Sold", "scaled_features", "prediction").show(15, False)

# 9. Model Evaluation

In [None]:
# BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol = "Sold")

In [None]:
print("Accuracy: {}%", format(evaluator.evaluate(prediction) * 100))

Kalo accuracy kurang dari minimal soal:
    1. ubah feature yang kalian pilih
    2. ubah transform datanya (misal Petrol = 1, Petrol 2)
    3. ubah urutan transform datanya(misal Petrol = 1, electric = 2, electric = 1, Petrol = 2)