# PySpark Exercises 3 
---
Özgün Yargı

## Install Dependencies

In [5]:
!pip install pyspark



## Import Libraries

In [6]:
from tqdm import tqdm

from pyspark.sql import SparkSession

# Preprocessing tools
import pyspark.sql.functions as F
from pyspark.sql.functions import isnan
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import MinMaxScaler

# Model tools
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier

# Evaluation tools
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## Get the Data

In [7]:
DATA = "leaf.csv"

In [8]:
spark = SparkSession.builder.appName('Leaf').getOrCreate()

df = spark.read.csv(DATA,header=False)

new_names = ["label", "Specimen Number", "Eccentricity", "Aspect Ratio", "Elongation", "Solidity", "Stochastic Convexity", "Isoperimetic Factor", "Maximal Indentation Depth", "Lobedness", \
             "Average Intensity", "Average Contrast", "Smoothness", "Third Moment", "Uniformity", "Entropy"]

df = df.toDF(*new_names)
df.show()

+-----+---------------+------------+------------+----------+--------+--------------------+-------------------+-------------------------+---------+-----------------+----------------+----------+------------+----------+-------+
|label|Specimen Number|Eccentricity|Aspect Ratio|Elongation|Solidity|Stochastic Convexity|Isoperimetic Factor|Maximal Indentation Depth|Lobedness|Average Intensity|Average Contrast|Smoothness|Third Moment|Uniformity|Entropy|
+-----+---------------+------------+------------+----------+--------+--------------------+-------------------+-------------------------+---------+-----------------+----------------+----------+------------+----------+-------+
|    1|              1|     0.72694|      1.4742|   0.32396| 0.98535|                   1|            0.83592|                0.0046566|0.0039465|          0.04779|         0.12795|  0.016108|   0.0052323|0.00027477| 1.1756|
|    1|              2|     0.74173|      1.5257|   0.36116| 0.98152|             0.99825|          

## Preprocessing
---
TODO:
* Check if there are object type column (Done)
* Check if there are nan values (Done)
* Vectorize the Features (Done)
* Split the data (Done)
* Normalize the data (Done)
* Construct a ML Model 
* Train the model
* Find score on validation set.

### Check Column Types

In [9]:
[df.select(column) for column in new_names]

[DataFrame[label: string],
 DataFrame[Specimen Number: string],
 DataFrame[Eccentricity: string],
 DataFrame[Aspect Ratio: string],
 DataFrame[Elongation: string],
 DataFrame[Solidity: string],
 DataFrame[Stochastic Convexity: string],
 DataFrame[Isoperimetic Factor: string],
 DataFrame[Maximal Indentation Depth: string],
 DataFrame[Lobedness: string],
 DataFrame[Average Intensity: string],
 DataFrame[Average Contrast: string],
 DataFrame[Smoothness: string],
 DataFrame[Third Moment: string],
 DataFrame[Uniformity: string],
 DataFrame[Entropy: string]]

In [10]:
for name in new_names:
  df = df.withColumn(name, df[name].cast("float"))

[df.select(column) for column in new_names]

[DataFrame[label: float],
 DataFrame[Specimen Number: float],
 DataFrame[Eccentricity: float],
 DataFrame[Aspect Ratio: float],
 DataFrame[Elongation: float],
 DataFrame[Solidity: float],
 DataFrame[Stochastic Convexity: float],
 DataFrame[Isoperimetic Factor: float],
 DataFrame[Maximal Indentation Depth: float],
 DataFrame[Lobedness: float],
 DataFrame[Average Intensity: float],
 DataFrame[Average Contrast: float],
 DataFrame[Smoothness: float],
 DataFrame[Third Moment: float],
 DataFrame[Uniformity: float],
 DataFrame[Entropy: float]]

### Check Nan Values

In [11]:
(df.count(), len(df.columns)) # Number of rows and columns

(340, 16)

In [12]:
df.select([F.count(isnan(new_names[indx])) for indx in range(1, len(new_names))]).show() # This shows that we have no nan values

+-----------------------------+--------------------------+--------------------------+------------------------+----------------------+----------------------------------+---------------------------------+---------------------------------------+-----------------------+-------------------------------+------------------------------+------------------------+--------------------------+------------------------+---------------------+
|count(isnan(Specimen Number))|count(isnan(Eccentricity))|count(isnan(Aspect Ratio))|count(isnan(Elongation))|count(isnan(Solidity))|count(isnan(Stochastic Convexity))|count(isnan(Isoperimetic Factor))|count(isnan(Maximal Indentation Depth))|count(isnan(Lobedness))|count(isnan(Average Intensity))|count(isnan(Average Contrast))|count(isnan(Smoothness))|count(isnan(Third Moment))|count(isnan(Uniformity))|count(isnan(Entropy))|
+-----------------------------+--------------------------+--------------------------+------------------------+----------------------+---------

### Vectorize Columns

In [13]:
vec = VectorAssembler(inputCols=new_names[1:],
                      outputCol='features')

df_vec = vec.transform(df)
df_vec = df_vec.select("features", "label")
df_vec.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,0.7269399762...|  1.0|
|[2.0,0.7417299747...|  1.0|
|[3.0,0.7672200202...|  1.0|
|[4.0,0.7379699945...|  1.0|
|[5.0,0.8230100274...|  1.0|
+--------------------+-----+
only showing top 5 rows



### Train Test Split

In [14]:
df_train, df_test = df_vec.randomSplit([0.8, 0.2], seed=42)

### Normalize Data

In [15]:
max_dict = {}
min_dict = {}

for name_indx in range(1, len(new_names)):
  max_dict[new_names[name_indx]] = "max"
  min_dict[new_names[name_indx]] = "min"

(df.agg(max_dict).collect(), df.agg(min_dict).collect())

([Row(max(Uniformity)=0.002935800002887845, max(Entropy)=2.7084999084472656, max(Isoperimetic Factor)=0.8581600189208984, max(Maximal Indentation Depth)=0.19898000359535217, max(Specimen Number)=16.0, max(Average Contrast)=0.28080999851226807, max(Average Intensity)=0.19066999852657318, max(Elongation)=0.9483399987220764, max(Aspect Ratio)=19.038000106811523, max(Smoothness)=0.07308900356292725, max(Eccentricity)=0.9987099766731262, max(Stochastic Convexity)=1.0, max(Lobedness)=7.206200122833252, max(Third Moment)=0.0297860000282526, max(Solidity)=0.9938799738883972)],
 [Row(min(Uniformity)=6.9241000346664805e-06, min(Entropy)=0.16940000653266907, min(Isoperimetic Factor)=0.07837600260972977, min(Maximal Indentation Depth)=0.002836500061675906, min(Specimen Number)=1.0, min(Average Contrast)=0.03341500088572502, min(Average Intensity)=0.005021899938583374, min(Elongation)=0.10761000216007233, min(Aspect Ratio)=1.006600022315979, min(Smoothness)=0.0011152999941259623, min(Eccentricity)=

In [16]:
scaler = MinMaxScaler(inputCol="features", outputCol=f"features_norm")
model = scaler.fit(df_train)
model.transform(df_train)
model.transform(df_test)

DataFrame[features: vector, label: float, features_norm: vector]

In [17]:
df_train.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1.0,0.3909299969...| 36.0|
|[1.0,0.3943200111...|  4.0|
|[1.0,0.4494099915...| 10.0|
|[1.0,0.4634799957...|  6.0|
|[1.0,0.4782100021...|  9.0|
+--------------------+-----+
only showing top 5 rows



## Create Model

### Logistic Regression

In [18]:
parameters = {"maxIter": [10,100, 500],
              "regParam": [0.0, 0.1, 0.4],
              "elasticNetParam": [0.0, 0.1, 0.4]}

result_dict = {}
value_dict = {}
pbar = tqdm(range(1,28))

for maxIter in parameters["maxIter"]:
  for regParam in parameters["regParam"]:
    for elasticNetParam in parameters["elasticNetParam"]:
      pbar.set_description(f"maxIter:{maxIter}_regParam:{regParam}_elasticNetParam:{elasticNetParam}")
      pbar.update(1)
      lr = LogisticRegression(maxIter=maxIter, regParam=regParam, elasticNetParam=elasticNetParam)
      lrModel = lr.fit(df_train) # Fit the model
      acc = lrModel.evaluate(df_test).accuracy # Evaluate
      value_dict[f"maxIter:{maxIter}_regParam:{regParam}_elasticNetParam:{elasticNetParam}"] = acc
  
sorted_dict = dict(sorted(value_dict.items(), key=lambda item: item[1], reverse=True))
result_dict["Logistic_Regression"] = sorted_dict

maxIter:500_regParam:0.4_elasticNetParam:0.4: 100%|██████████| 27/27 [01:24<00:00,  2.96s/it]

#### Evaluation

In [19]:
print("Logistic Regression\n"+"-"*len("Logistic Regression"))

for key in result_dict["Logistic_Regression"]:
  acc = result_dict["Logistic_Regression"][key]
  print(f"* {key}: {acc}")

Logistic Regression
-------------------
* maxIter:10_regParam:0.0_elasticNetParam:0.0: 0.8076923076923077
* maxIter:10_regParam:0.0_elasticNetParam:0.1: 0.8076923076923077
* maxIter:10_regParam:0.0_elasticNetParam:0.4: 0.8076923076923077
* maxIter:100_regParam:0.0_elasticNetParam:0.0: 0.75
* maxIter:100_regParam:0.0_elasticNetParam:0.1: 0.75
* maxIter:100_regParam:0.0_elasticNetParam:0.4: 0.75
* maxIter:500_regParam:0.0_elasticNetParam:0.0: 0.75
* maxIter:500_regParam:0.0_elasticNetParam:0.1: 0.75
* maxIter:500_regParam:0.0_elasticNetParam:0.4: 0.75
* maxIter:100_regParam:0.1_elasticNetParam:0.0: 0.46153846153846156
* maxIter:500_regParam:0.1_elasticNetParam:0.0: 0.46153846153846156
* maxIter:10_regParam:0.1_elasticNetParam:0.0: 0.4423076923076923
* maxIter:10_regParam:0.1_elasticNetParam:0.1: 0.28846153846153844
* maxIter:100_regParam:0.1_elasticNetParam:0.1: 0.2692307692307692
* maxIter:500_regParam:0.1_elasticNetParam:0.1: 0.2692307692307692
* maxIter:10_regParam:0.4_elasticNetParam

### Random Forest

In [20]:
parameters = {"numTrees": [10,100, 500],
              "maxDepth": [5, 10, 20],
              "impurity": ["gini", "entropy"]}

value_dict = {}
sorted_dict = {}
pbar = tqdm(range(1,19))

for numTrees in parameters["numTrees"]:
  for maxDepth in parameters["maxDepth"]:
    for impurity in parameters["impurity"]:
      pbar.set_description(f"numTrees:{numTrees}_maxDepth:{maxDepth}_impurity:{impurity}")
      pbar.update(1)
      rf = RandomForestClassifier(numTrees=numTrees, maxDepth=maxDepth, impurity=impurity)
      rfModel = rf.fit(df_train) # Fit the model
      acc = rfModel.evaluate(df_test).accuracy # Evaluate
      value_dict[f"numTrees:{numTrees}_maxDepth:{maxDepth}_impurity:{impurity}"] = acc
  
sorted_dict = dict(sorted(value_dict.items(), key=lambda item: item[1], reverse=True))
result_dict["Random_Forest"] = sorted_dict


  0%|          | 0/18 [00:00<?, ?it/s][A
maxIter:500_regParam:0.4_elasticNetParam:0.4: 100%|██████████| 27/27 [01:27<00:00,  3.25s/it]

numTrees:10_maxDepth:5_impurity:entropy:   6%|▌         | 1/18 [00:03<00:53,  3.16s/it][A
numTrees:10_maxDepth:5_impurity:entropy:  11%|█         | 2/18 [00:03<00:25,  1.58s/it][A
numTrees:10_maxDepth:10_impurity:gini:  11%|█         | 2/18 [00:04<00:25,  1.58s/it]  [A
numTrees:10_maxDepth:10_impurity:gini:  17%|█▋        | 3/18 [00:04<00:25,  1.67s/it][A
numTrees:10_maxDepth:10_impurity:entropy:  17%|█▋        | 3/18 [00:08<00:25,  1.67s/it][A
numTrees:10_maxDepth:10_impurity:entropy:  22%|██▏       | 4/18 [00:08<00:31,  2.24s/it][A
numTrees:10_maxDepth:20_impurity:gini:  22%|██▏       | 4/18 [00:10<00:31,  2.24s/it]   [A
numTrees:10_maxDepth:20_impurity:gini:  28%|██▊       | 5/18 [00:10<00:28,  2.21s/it][A
numTrees:10_maxDepth:20_impurity:entropy:  28%|██▊       | 5/18 [00:13<00:28,  2.21s/it][A
numTrees:10_maxDepth:20_impurity:entropy:  

#### Evaluation

In [21]:
print("Random Forest\n"+"-"*len("Random Forest"))

for key in result_dict["Random_Forest"]:
  acc = result_dict["Random_Forest"][key]
  print(f"* {key}: {acc}")

Random Forest
-------------
* numTrees:100_maxDepth:10_impurity:entropy: 0.7307692307692307
* numTrees:100_maxDepth:20_impurity:entropy: 0.7307692307692307
* numTrees:100_maxDepth:20_impurity:gini: 0.7115384615384616
* numTrees:10_maxDepth:10_impurity:entropy: 0.6923076923076923
* numTrees:10_maxDepth:20_impurity:entropy: 0.6923076923076923
* numTrees:100_maxDepth:5_impurity:entropy: 0.6923076923076923
* numTrees:100_maxDepth:10_impurity:gini: 0.6923076923076923
* numTrees:500_maxDepth:5_impurity:entropy: 0.6923076923076923
* numTrees:500_maxDepth:10_impurity:gini: 0.6923076923076923
* numTrees:500_maxDepth:10_impurity:entropy: 0.6923076923076923
* numTrees:500_maxDepth:20_impurity:gini: 0.6923076923076923
* numTrees:500_maxDepth:20_impurity:entropy: 0.6923076923076923
* numTrees:10_maxDepth:10_impurity:gini: 0.6730769230769231
* numTrees:10_maxDepth:5_impurity:entropy: 0.6538461538461539
* numTrees:10_maxDepth:20_impurity:gini: 0.6538461538461539
* numTrees:500_maxDepth:5_impurity:gin

### Decision Tree

In [22]:
def acc_calculator (ys, preds):

  truth = 0

  for indx, y in enumerate(ys):
    if y == preds[indx]:
      truth += 1
  
  return truth/len(ys)

In [23]:
parameters = {"maxBins": [10,20, 40],
              "maxDepth": [5, 10, 20],
              "impurity": ["gini", "entropy"]}

value_dict = {}
sorted_dict = {}
pbar = tqdm(range(1,19))

for maxBins in parameters["maxBins"]:
  for maxDepth in parameters["maxDepth"]:
    for impurity in parameters["impurity"]:
      pbar.set_description(f"maxBins:{maxBins}_maxDepth:{maxDepth}_impurity:{impurity}")
      pbar.update(1)
      dt = RandomForestClassifier(maxBins=maxBins, maxDepth=maxDepth, impurity=impurity)
      dtModel = dt.fit(df_train) # Fit the model
      preds = dtModel.transform(df_test) # Evaluate
      acc = acc_calculator(preds.select("label").collect(), preds.select("prediction").collect())
      value_dict[f"maxBins:{maxBins}_maxDepth:{maxDepth}_impurity:{impurity}"] = acc
  
sorted_dict = dict(sorted(value_dict.items(), key=lambda item: item[1], reverse=True))
result_dict["Desicion_Tree"] = sorted_dict

numTrees:500_maxDepth:20_impurity:entropy: 100%|██████████| 18/18 [03:25<00:00, 11.42s/it]
maxBins:40_maxDepth:20_impurity:entropy: 100%|██████████| 18/18 [00:36<00:00,  2.58s/it]

#### Evaluate

In [24]:
print("Decision Tree\n"+"-"*len("Decision Tree"))

for key in result_dict["Desicion_Tree"]:
  acc = result_dict["Desicion_Tree"][key]
  print(f"* {key}: {acc}")

Decision Tree
-------------
* maxBins:40_maxDepth:10_impurity:entropy: 0.7884615384615384
* maxBins:40_maxDepth:20_impurity:entropy: 0.7884615384615384
* maxBins:10_maxDepth:5_impurity:entropy: 0.6923076923076923
* maxBins:20_maxDepth:5_impurity:entropy: 0.6923076923076923
* maxBins:20_maxDepth:10_impurity:gini: 0.6923076923076923
* maxBins:40_maxDepth:10_impurity:gini: 0.6923076923076923
* maxBins:40_maxDepth:20_impurity:gini: 0.6923076923076923
* maxBins:10_maxDepth:10_impurity:entropy: 0.6730769230769231
* maxBins:10_maxDepth:20_impurity:entropy: 0.6730769230769231
* maxBins:20_maxDepth:10_impurity:entropy: 0.6730769230769231
* maxBins:20_maxDepth:20_impurity:gini: 0.6730769230769231
* maxBins:20_maxDepth:20_impurity:entropy: 0.6730769230769231
* maxBins:10_maxDepth:20_impurity:gini: 0.6538461538461539
* maxBins:40_maxDepth:5_impurity:entropy: 0.6538461538461539
* maxBins:10_maxDepth:10_impurity:gini: 0.6346153846153846
* maxBins:20_maxDepth:5_impurity:gini: 0.6153846153846154
* max

## Results

In [26]:
for classifier in result_dict:
  print(classifier + "\n" + "-"*len(classifier))
  for key in result_dict[classifier]:
    print("* " + key + " => " + format(result_dict[classifier][key], ".3f"))
  print("\n")

Logistic_Regression
-------------------
* maxIter:10_regParam:0.0_elasticNetParam:0.0 => 0.808
* maxIter:10_regParam:0.0_elasticNetParam:0.1 => 0.808
* maxIter:10_regParam:0.0_elasticNetParam:0.4 => 0.808
* maxIter:100_regParam:0.0_elasticNetParam:0.0 => 0.750
* maxIter:100_regParam:0.0_elasticNetParam:0.1 => 0.750
* maxIter:100_regParam:0.0_elasticNetParam:0.4 => 0.750
* maxIter:500_regParam:0.0_elasticNetParam:0.0 => 0.750
* maxIter:500_regParam:0.0_elasticNetParam:0.1 => 0.750
* maxIter:500_regParam:0.0_elasticNetParam:0.4 => 0.750
* maxIter:100_regParam:0.1_elasticNetParam:0.0 => 0.462
* maxIter:500_regParam:0.1_elasticNetParam:0.0 => 0.462
* maxIter:10_regParam:0.1_elasticNetParam:0.0 => 0.442
* maxIter:10_regParam:0.1_elasticNetParam:0.1 => 0.288
* maxIter:100_regParam:0.1_elasticNetParam:0.1 => 0.269
* maxIter:500_regParam:0.1_elasticNetParam:0.1 => 0.269
* maxIter:10_regParam:0.4_elasticNetParam:0.0 => 0.231
* maxIter:100_regParam:0.4_elasticNetParam:0.0 => 0.231
* maxIter:500_