In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

!wget -q https://archive.apache.org/dist/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz

!tar xf spark-3.2.1-bin-hadoop3.2.tgz

!pip install -q findspark
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 36 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 49.3 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=d99a1351aba922c5ea4baba0228f1accd841f9c7c996ebd8c95df224146293db
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [3]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"

from pyspark.sql.functions import *
from pyspark.sql import functions as f
from pyspark.sql import SparkSession

from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import scipy

spark = (SparkSession
        .builder
        .appName('lab 3 - Desicion tree')
        .getOrCreate()
)

# IRIS DATASET

In [4]:
df = spark.read.csv('Iris.csv', header = True, inferSchema = True)

In [5]:
df.show(5)

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|  4|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|  5|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
+---+-------------+------------+-------------+------------+-----------+
only showing top 5 rows



### PREPROCESSING DATA

We are combining features into one single vector and renaming 'Species' to 'label'

In [6]:
df = (StringIndexer(inputCol="Species",outputCol="label")
          .fit(df)
          .transform(VectorAssembler(inputCols=["SepalLengthCm", "SepalWidthCm","PetalLengthCm","PetalWidthCm"],outputCol='features')
                         .transform(df))
          .drop('Id','SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm','Species')
          .distinct())

The data after being pre-processed

In [7]:
df.show()

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[7.7,2.6,6.9,2.3]|  2.0|
|[4.9,2.4,3.3,1.0]|  1.0|
|[7.7,3.8,6.7,2.2]|  2.0|
|[5.5,2.3,4.0,1.3]|  1.0|
|[5.4,3.7,1.5,0.2]|  0.0|
|[5.7,2.8,4.5,1.3]|  1.0|
|[6.0,2.2,5.0,1.5]|  2.0|
|[5.4,3.9,1.3,0.4]|  0.0|
|[6.7,3.3,5.7,2.5]|  2.0|
|[5.8,4.0,1.2,0.2]|  0.0|
|[5.1,3.7,1.5,0.4]|  0.0|
|[4.8,3.4,1.9,0.2]|  0.0|
|[5.5,2.5,4.0,1.3]|  1.0|
|[6.6,3.0,4.4,1.4]|  1.0|
|[6.4,3.2,5.3,2.3]|  2.0|
|[6.5,3.0,5.8,2.2]|  2.0|
|[5.4,3.4,1.5,0.4]|  0.0|
|[5.6,2.8,4.9,2.0]|  2.0|
|[5.2,4.1,1.5,0.1]|  0.0|
|[5.9,3.0,4.2,1.5]|  1.0|
+-----------------+-----+
only showing top 20 rows



### TRAINING MODEL

We are creating a train function to reduce code repeating

In [8]:
def train(df, classifier):
  (train, test) = df.randomSplit([.7,.3])

  model = classifier.fit(train)

  pred = model.transform(test)

  eval_accuracy = (MulticlassClassificationEvaluator
        (labelCol="label", predictionCol="prediction", metricName="accuracy"))
  
  eval_precision = (MulticlassClassificationEvaluator
        (labelCol="label", predictionCol="prediction", metricName="weightedPrecision"))
  
  eval_recall = (MulticlassClassificationEvaluator
        (labelCol="label", predictionCol="prediction", metricName="weightedRecall"))
  
  eval_f1 = (MulticlassClassificationEvaluator
        (labelCol="label", predictionCol="prediction", metricName="f1"))

  accuracy = eval_accuracy.evaluate(pred)

  precision =  eval_precision.evaluate(pred)

  recall =  eval_recall.evaluate(pred)

  f1 =  eval_f1.evaluate(pred)

  print(f"""
  Accuracy  = {accuracy}
  Error     = {1-accuracy}
  Precision = {precision}
  Recall    = {recall}
  F1        = {f1}""")

  return model, pred

In [12]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

_ , pred = train(df,rf)

pred.select("prediction", "label", "features").show()


  Accuracy  = 0.9791666666666666
  Error     = 0.02083333333333337
  Precision = 0.9805555555555555
  Recall    = 0.9791666666666667
  F1        = 0.979244330537434
+----------+-----+-----------------+
|prediction|label|         features|
+----------+-----+-----------------+
|       0.0|  0.0|[4.6,3.2,1.4,0.2]|
|       0.0|  0.0|[4.6,3.4,1.4,0.3]|
|       0.0|  0.0|[4.8,3.4,1.6,0.2]|
|       0.0|  0.0|[4.8,3.4,1.9,0.2]|
|       0.0|  0.0|[4.9,3.0,1.4,0.2]|
|       0.0|  0.0|[5.0,3.0,1.6,0.2]|
|       0.0|  0.0|[5.0,3.4,1.5,0.2]|
|       0.0|  0.0|[5.0,3.6,1.4,0.2]|
|       1.0|  1.0|[5.1,2.5,3.0,1.1]|
|       0.0|  0.0|[5.1,3.5,1.4,0.3]|
|       0.0|  0.0|[5.2,3.4,1.4,0.2]|
|       0.0|  0.0|[5.2,3.5,1.5,0.2]|
|       0.0|  0.0|[5.4,3.4,1.5,0.4]|
|       0.0|  0.0|[5.4,3.7,1.5,0.2]|
|       1.0|  1.0|[5.5,2.3,4.0,1.3]|
|       1.0|  1.0|[5.5,2.5,4.0,1.3]|
|       0.0|  0.0|[5.5,4.2,1.4,0.2]|
|       2.0|  2.0|[5.6,2.8,4.9,2.0]|
|       1.0|  1.0|[5.6,2.9,3.6,1.3]|
|       1.0|  1.0|[5

# BANK DATASET

In [None]:
df = spark.read.option("delimiter", ";").csv('bank.csv', header = True, inferSchema = True)

In [None]:
df.show(10)

+---+-------------+-------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---+
|age|          job|marital|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+-------------+-------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---+
| 30|   unemployed|married|  primary|     no|   1787|     no|  no|cellular| 19|  oct|      79|       1|   -1|       0| unknown| no|
| 33|     services|married|secondary|     no|   4789|    yes| yes|cellular| 11|  may|     220|       1|  339|       4| failure| no|
| 35|   management| single| tertiary|     no|   1350|    yes|  no|cellular| 16|  apr|     185|       1|  330|       1| failure| no|
| 30|   management|married| tertiary|     no|   1476|    yes| yes| unknown|  3|  jun|     199|       4|   -1|       0| unknown| no|
| 59|  blue-collar|married|secondary|     no|      0|    yes|  no| unknown| 

### PREPROCESSING
We are doing as follow:
* Dropping 'day' and 'month' as it does not make much senses in this scenario
* Changing all nominal features into numeric type
* Combining all features into one vector
* Dropping everything, leaving 'label' and 'features' alone in the dataset

In [None]:
df = df.drop('day','month')

df = StringIndexer(inputCols=['job','marital','education','default','housing','loan','contact','poutcome','y'],
                   outputCols=['job_','marital_','education_','default_','housing_','loan_','contact_','poutcome_','label']).fit(df).transform(df)

df = df.drop('job','marital','education','default','housing','loan','contact','poutcome','y')

df = VectorAssembler(inputCols=['campaign','balance','duration','pdays','previous','age','job_','marital_','education_','default_','housing_','loan_','contact_','poutcome_'],outputCol='features').transform(df)

df = df.drop('campaign','balance','duration','pdays','previous','age','job_','marital_','education_','default_','housing_','loan_','contact_','poutcome_')

The data after being pre-processed

In [None]:
df.show(5,truncate=False)

+-----+-----------------------------------------------------------------+
|label|features                                                         |
+-----+-----------------------------------------------------------------+
|0.0  |(14,[0,1,2,3,5,6,8,10],[1.0,1787.0,79.0,-1.0,30.0,8.0,2.0,1.0])  |
|0.0  |[1.0,4789.0,220.0,339.0,4.0,33.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0]|
|0.0  |[1.0,1350.0,185.0,330.0,1.0,35.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0]|
|0.0  |(14,[0,1,2,3,5,8,11,12],[4.0,1476.0,199.0,-1.0,30.0,1.0,1.0,1.0])|
|0.0  |(14,[0,2,3,5,6,12],[1.0,226.0,-1.0,59.0,1.0,1.0])                |
+-----+-----------------------------------------------------------------+
only showing top 5 rows



In [None]:
model, _ = train(df,rf)


  Accuracy  = 0.8766140602582496
  Error     = 0.12338593974175038
  Precision = 0.8475885663834014
  Recall    = 0.8766140602582496
  F1        = 0.8342757476017666


As the accuracy are likely able to be improved, we will attempt to tweak a bit further

---

### Removing Redundant features and using Cross-Validation

---

Sorting the importance of features by indices, judging from the model above

In [None]:
ImportanceIndices = sorted(range(14),key=model.featureImportances.__getitem__)
ImportanceValues = sorted(model.featureImportances)

print(ImportanceIndices)
print(ImportanceValues)

[11, 9, 8, 0, 7, 12, 10, 6, 1, 5, 4, 3, 13, 2]
[0.0014649061338201752, 0.008240646134561528, 0.011759270081439238, 0.012445646481029195, 0.014984466067493669, 0.021697027225567385, 0.03899294309758794, 0.0415219628246062, 0.05401590239463329, 0.05708783136310182, 0.059666559870715806, 0.06852779066806518, 0.18474537036686034, 0.42484967729051826]


We remove at most 4% least importance features, keeping 96% of the total

In [None]:
colsToRemove = []
dv = 0

for i, v in zip(ImportanceIndices, ImportanceValues):
  dv += v
  if dv > 0.04: break
  colsToRemove.append(i)

print(colsToRemove)

df = VectorSlicer(inputCol='features', outputCol="features_", indices=[i for i in range(14) if i not in colsToRemove]).transform(df)

df.show(1)

rf = RandomForestClassifier(labelCol="label", featuresCol="features_")

[11, 9, 8, 0]
+-----+--------------------+--------------------+
|label|            features|           features_|
+-----+--------------------+--------------------+
|  0.0|(14,[0,1,2,3,5,6,...|(10,[0,1,2,4,5,7]...|
+-----+--------------------+--------------------+
only showing top 1 row



Then, we use cross validation technique to tune the model hyperparameters such as numTrees and maxDepth. Due to the nature of this dataset, we prioritize "recall" for the model evaluation. The validator will return the model with the best recall after running through 10 folds.

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator
import numpy as np

rf = RandomForestClassifier(labelCol="label", featuresCol="features")
pipeline = Pipeline(stages=[rf])

paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [int(x) for x in np.linspace(start = 10, stop = 50, num = 3)]) \
    .addGrid(rf.maxDepth, [int(x) for x in np.linspace(start = 5, stop = 25, num = 3)]) \
    .build()

crossval = CrossValidator(estimator=pipeline,
                      estimatorParamMaps=paramGrid,
                      evaluator=MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall"),
                      numFolds=10)

train(df, crossval)


  Accuracy  = 0.8920327624720774
  Error     = 0.10796723752792259
  Precision = 0.8738300275767517
  Recall    = 0.8920327624720774
  F1        = 0.8778286768070956


(CrossValidatorModel_b0baab70487e,
 DataFrame[label: double, features: vector, features_: vector, rawPrediction: vector, probability: vector, prediction: double])

The accuracy after all did get 1.2% better!