## PySpark - Classification

In [1]:
# Let's import PySpark
from pyspark.sql import SparkSession

In [2]:
# Start the Spark session for this notebook (getOrCreate reuses an existing one)
spark = (
    SparkSession.builder
    .appName('classification')
    .getOrCreate()
)

Occupancy detection dataset

https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+

In [3]:
# First pass: read the file with its header row and no explicit schema
df = spark.read.csv(
    '../data/OccupancyDetection/data.txt',
    header=True,
)

In [4]:
# Preview the first rows of the schema-less load
df.show()

+---+-------------------+-----------+----------------+-----+----------------+-------------------+---------+
|Ndx|               date|Temperature|        Humidity|Light|             CO2|      HumidityRatio|Occupancy|
+---+-------------------+-----------+----------------+-----+----------------+-------------------+---------+
|  1|2015-02-04 17:51:00|      23.18|          27.272|  426|          721.25|0.00479298817650529|        1|
|  2|2015-02-04 17:51:59|      23.15|         27.2675|429.5|             714|0.00478344094931065|        1|
|  3|2015-02-04 17:53:00|      23.15|          27.245|  426|           713.5|0.00477946352442199|        1|
|  4|2015-02-04 17:54:00|      23.15|            27.2|  426|          708.25|0.00477150882608175|        1|
|  5|2015-02-04 17:55:00|       23.1|            27.2|  426|           704.5|0.00475699293331518|        1|
|  6|2015-02-04 17:55:59|       23.1|            27.2|  419|             701|0.00475699293331518|        1|
|  7|2015-02-04 17:57:00|   

In [5]:
# Inspect the column types: with no schema supplied, every column is read as string
df.printSchema()

root
 |-- Ndx: string (nullable = true)
 |-- date: string (nullable = true)
 |-- Temperature: string (nullable = true)
 |-- Humidity: string (nullable = true)
 |-- Light: string (nullable = true)
 |-- CO2: string (nullable = true)
 |-- HumidityRatio: string (nullable = true)
 |-- Occupancy: string (nullable = true)



In [6]:
#Convert the data to the types we want
from pyspark.sql.types import (StructField, 
                               StringType, 
                               IntegerType,
                               DateType,
                               DoubleType,
                               StructType)

In [7]:
# Explicit schema for the occupancy file: the row index and date stay as
# strings, the sensor readings are doubles and the Occupancy target is an
# integer. Every field is declared nullable.
data_schema = [
    StructField(column, dtype, True)
    for column, dtype in (
        ('Ndx', StringType()),
        ('date', StringType()),
        ('Temperature', DoubleType()),
        ('Humidity', DoubleType()),
        ('Light', DoubleType()),
        ('CO2', DoubleType()),
        ('HumidityRatio', DoubleType()),
        ('Occupancy', IntegerType()),
    )
]
final_struct = StructType(fields=data_schema)

Reload the data, this time passing the schema so each column gets the correct data type

In [8]:
# Second pass: same file, now parsed with the explicit schema
df = spark.read.csv(
    '../data/OccupancyDetection/data.txt',
    header=True,
    schema=final_struct,
)

In [9]:
# Preview the rows again after loading with the explicit schema
df.show()

+---+-------------------+-----------+----------------+-----+----------------+-------------------+---------+
|Ndx|               date|Temperature|        Humidity|Light|             CO2|      HumidityRatio|Occupancy|
+---+-------------------+-----------+----------------+-----+----------------+-------------------+---------+
|  1|2015-02-04 17:51:00|      23.18|          27.272|426.0|          721.25|0.00479298817650529|        1|
|  2|2015-02-04 17:51:59|      23.15|         27.2675|429.5|           714.0|0.00478344094931065|        1|
|  3|2015-02-04 17:53:00|      23.15|          27.245|426.0|           713.5|0.00477946352442199|        1|
|  4|2015-02-04 17:54:00|      23.15|            27.2|426.0|          708.25|0.00477150882608175|        1|
|  5|2015-02-04 17:55:00|       23.1|            27.2|426.0|           704.5|0.00475699293331518|        1|
|  6|2015-02-04 17:55:59|       23.1|            27.2|419.0|           701.0|0.00475699293331518|        1|
|  7|2015-02-04 17:57:00|   

In [10]:
# Confirm the column types now follow final_struct
df.printSchema()

root
 |-- Ndx: string (nullable = true)
 |-- date: string (nullable = true)
 |-- Temperature: double (nullable = true)
 |-- Humidity: double (nullable = true)
 |-- Light: double (nullable = true)
 |-- CO2: double (nullable = true)
 |-- HumidityRatio: double (nullable = true)
 |-- Occupancy: integer (nullable = true)



In [11]:
# Convert the string `date` column into a timestamp column
from pyspark.sql.functions import to_timestamp

# The seconds field must be lowercase 'ss'. Uppercase 'S' is the sub-second
# field in Spark datetime patterns, so 'HH:mm:SS' would mis-parse values such
# as '17:51:59' instead of reading 59 as seconds.
df = df.withColumn('New_datetime', to_timestamp(df['date'], format='yyyy-MM-dd HH:mm:ss'))

# Drop the original string date column
df = df.drop(df['date'])
# Rename the parsed column back to `date`
df = df.withColumnRenamed('New_datetime', 'date')
df.printSchema()

root
 |-- Ndx: string (nullable = true)
 |-- Temperature: double (nullable = true)
 |-- Humidity: double (nullable = true)
 |-- Light: double (nullable = true)
 |-- CO2: double (nullable = true)
 |-- HumidityRatio: double (nullable = true)
 |-- Occupancy: integer (nullable = true)
 |-- date: timestamp (nullable = true)



In [12]:
#Import vector and VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [13]:
# Build the assembler that packs the five numeric sensor columns into a
# single vector column named 'features'
assembler = VectorAssembler(
    inputCols=['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio'],
    outputCol='features',
)

In [14]:
# Apply the assembler to df to add the 'features' vector column
output = assembler.transform(df)

In [15]:
# Keep only the feature vector and the target, renaming Occupancy to 'label'
df_final = (
    output
    .select(['features', 'Occupancy'])
    .withColumnRenamed('Occupancy', 'label')
)
df_final.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[23.18,27.272,426...|    1|
|[23.15,27.2675,42...|    1|
|[23.15,27.245,426...|    1|
|[23.15,27.2,426.0...|    1|
|[23.1,27.2,426.0,...|    1|
|[23.1,27.2,419.0,...|    1|
|[23.1,27.2,419.0,...|    1|
|[23.1,27.2,419.0,...|    1|
|[23.1,27.2,419.0,...|    1|
|[23.075,27.175,41...|    1|
|[23.075,27.15,419...|    1|
|[23.1,27.1,419.0,...|    1|
|[23.1,27.16666666...|    1|
|[23.05,27.15,419....|    1|
|[23.0,27.125,419....|    1|
|[23.0,27.125,418....|    1|
|[23.0,27.2,0.0,68...|    0|
|[22.945,27.29,0.0...|    0|
|[22.945,27.39,0.0...|    0|
|[22.89,27.39,0.0,...|    0|
+--------------------+-----+
only showing top 20 rows



In [16]:
# Train/test split (70% / 30%).
# A fixed seed keeps the split, and therefore the accuracy figures reported
# for each model, repeatable across kernel restarts on the same input data.
train_data, test_data = df_final.randomSplit([0.7, 0.3], seed=42)

### Logistic regression

In [17]:
from pyspark.ml.classification import LogisticRegression

# Fit a logistic regression with default parameters on the training split
lr = LogisticRegression()
lr_model = lr.fit(train_data)

# Score both splits so train and test accuracy can be compared
train_result = lr_model.transform(train_data)
test_result = lr_model.transform(test_data)

In [18]:
#import multi class classification evaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [19]:
# Evaluator that scores a predictions DataFrame using the accuracy metric
me = MulticlassClassificationEvaluator(metricName='accuracy')

In [20]:
# Report logistic regression accuracy on both splits, formatted to 3 significant digits
print(f"Test result = {me.evaluate(test_result):0.3}")
print(f"Train result = {me.evaluate(train_result):0.3}")

Test result = 0.983
Train result = 0.988


### Support Vector classifier

In [21]:
from pyspark.ml.classification import LinearSVC

# Use SVC-specific names: reusing `lr` / `lr_model` here would mislabel this
# model and overwrite the logistic regression estimator and fitted model.
svc = LinearSVC()
svc_model = svc.fit(train_data)

# Score both splits; the next cell evaluates test_result and train_result
test_result = svc_model.transform(test_data)
train_result = svc_model.transform(train_data)


In [22]:
# Evaluate the linear SVC predictions: accuracy on the test and train splits
me = MulticlassClassificationEvaluator(metricName='accuracy')
print(f"Test result = {me.evaluate(test_result):0.3}")
print(f"Train result = {me.evaluate(train_result):0.3}")

Test result = 0.985
Train result = 0.989


### Decision tree classifier


In [23]:
from pyspark.ml.classification import DecisionTreeClassifier

# Fit a single decision tree with default parameters
dtc = DecisionTreeClassifier()
dtc_model = dtc.fit(train_data)

# Score both splits
train_result = dtc_model.transform(train_data)
test_result = dtc_model.transform(test_data)

# Report accuracy on the held-out and training data
me = MulticlassClassificationEvaluator(metricName='accuracy')
print(f"Test result = {me.evaluate(test_result):0.3}")
print(f"Train result = {me.evaluate(train_result):0.3}")

Test result = 0.987
Train result = 0.992


In [24]:
# Random forest classifier (100 trees)
from pyspark.ml.classification import RandomForestClassifier

# Use forest-specific names: reusing `dtc` / `dtc_model` here would overwrite
# the decision tree estimator and its fitted model.
rfc = RandomForestClassifier(numTrees=100)
rfc_model = rfc.fit(train_data)

# Score both splits
test_result = rfc_model.transform(test_data)
train_result = rfc_model.transform(train_data)

# Evaluate the model: accuracy on the test and train splits
me = MulticlassClassificationEvaluator(metricName='accuracy')
print(f"Test result = {me.evaluate(test_result):0.3}")
print(f"Train result = {me.evaluate(train_result):0.3}")

Test result = 0.987
Train result = 0.991


In [25]:
# Gradient-boosted trees classifier (default parameters)
from pyspark.ml.classification import GBTClassifier

# Use GBT-specific names: reusing `dtc` / `dtc_model` here would mislabel this
# model as a decision tree and overwrite whatever those names held before.
gbt = GBTClassifier()
gbt_model = gbt.fit(train_data)

# Score both splits
test_result = gbt_model.transform(test_data)
train_result = gbt_model.transform(train_data)

# Evaluate the model: accuracy on the test and train splits
me = MulticlassClassificationEvaluator(metricName='accuracy')
print(f"Test result = {me.evaluate(test_result):0.3}")
print(f"Train result = {me.evaluate(train_result):0.3}")

Test result = 0.991
Train result = 0.996


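Gradient-boosted trees achieved the highest test accuracy (0.991), although all five models score within about one percentage point of each other (0.983–0.991 on the test split).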