Steps in this notebook:
    
   * Create a Spark session
    
   * Import Train&Test features, in Spark DataFrames
    
   * Make Pipeline with stages:
       - Encode labels
       - Multiclass logistic regression 
    
   * Classification metrics evaluation on Test
    
   * Small gridsearch for hyperparameters for accuracy improvement (just to implement it)
    
   * Best model evaluation metrics

In [1]:
# Sanity-check cell: running it is what starts the Spark application and
# makes the session available as `spark` (see the output of this cell).
print('Welcome to my EMR Notebook!')

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
0,application_1633082026362_0001,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Welcome to my EMR Notebook!

# Install dependencies

In [2]:
# Install the pinned packages through the Spark context, one after the
# other and in this order.
for requirement in ('pandas==1.2.5', 'pyarrow==2'):
    sc.install_pypi_package(requirement)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting pandas==1.2.5
  Downloading https://files.pythonhosted.org/packages/e6/0a/90da8840e044c329a0271fb0244ff40a68a2615bc360c296a3dc5e326ab6/pandas-1.2.5-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (9.9MB)
Collecting python-dateutil>=2.7.3 (from pandas==1.2.5)
  Downloading https://files.pythonhosted.org/packages/36/7a/87837f39d0296e723bb9b62bbb257d0355c7f6128853c78955f57342a56d/python_dateutil-2.8.2-py2.py3-none-any.whl (247kB)
Installing collected packages: python-dateutil, pandas
Successfully installed pandas-1.2.5 python-dateutil-2.8.2

Collecting pyarrow==2
  Downloading https://files.pythonhosted.org/packages/c8/58/d07e7ee8b0cffe509f9e5a3742e09636a4a58b2113d193166615b934846f/pyarrow-2.0.0-cp37-cp37m-manylinux1_x86_64.whl (16.9MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-2.0.0

# Imports

In [3]:
import pandas as pd

import numpy as np

from pyspark.sql import SparkSession

from pyspark.sql.functions import  udf, col

from pyspark.sql.types import FloatType

from pyspark.ml.linalg import  VectorUDT, Vectors

from pyspark.ml.feature import StringIndexer, IndexToString

from pyspark.ml import Pipeline

from pyspark.ml.classification import LogisticRegression

from pyspark.mllib.evaluation import MulticlassMetrics

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Constants
# S3 locations of the Parquet feature tables read by this notebook.

# Training set, loaded into `train`.
TRAIN_PATH = 's3a://fruits-images-proceded/Training_featured-reducted.parquet'

# Test set, loaded into `test`.
TEST_PATH = 's3a://fruits-images-proceded/Test_featured-reducted.parquet'

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Enable pyArrow

In [5]:
# Turn on Arrow for Spark <-> pandas data exchange.
# NOTE(review): none of the cells visible in this notebook convert to/from
# pandas or use pandas UDFs, so this setting looks unused here - confirm
# before relying on it or removing it.
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Load features from storage

In [6]:
# Read the training feature table (Parquet) and display its columns.
train = spark.read.parquet(TRAIN_PATH)
train.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- path: string (nullable = true)
 |-- feat_array: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- label: string (nullable = true)

In [7]:
# Number of rows in the training DataFrame (shown as the cell result).
train.count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

67692

In [8]:
# Number of training rows per label.
# show() without an argument prints only the first 20 groups.
train.groupBy('label').count().show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------+-----+
|            label|count|
+-----------------+-----+
|           Orange|  479|
|     Cantaloupe 1|  492|
|     Pear Forelle|  702|
|       Clementine|  490|
|              Fig|  702|
|         Beetroot|  450|
| Strawberry Wedge|  738|
|    Pepper Orange|  702|
|      Onion White|  438|
|Tomato Cherry Red|  492|
|  Grapefruit Pink|  490|
|    Grape White 4|  471|
|     Potato White|  450|
|      Cauliflower|  702|
|       Grape Blue|  984|
|       Nut Forest|  654|
|    Passion Fruit|  490|
|     Cantaloupe 2|  492|
|        Blueberry|  462|
|    Grape White 3|  492|
+-----------------+-----+
only showing top 20 rows

# Logistic regression

In [9]:
@udf(returnType=VectorUDT())
def list_to_vector_udf(values):
    """Convert an array of numbers into a dense ML vector of floats."""
    return Vectors.dense([float(value) for value in values])


# Add the vector column that the classifier uses as its features.
train = train.withColumn('Vect_features',
                         list_to_vector_udf(train['feat_array']))
train.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- path: string (nullable = true)
 |-- feat_array: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)

## Modeling pipeline

In [10]:
# Stage 1: encode the string 'label' column as a numeric 'class' column.
label_stringIdx = StringIndexer(inputCol='label', outputCol='class')

# Stage 2: logistic regression on the vector features, predicting 'class'.
# No hyperparameter is set here, so the estimator's defaults apply.
lr = LogisticRegression(featuresCol='Vect_features', labelCol='class')

# Chain both stages, in order, into a single pipeline.
steps = [label_stringIdx, lr]
pipeline = Pipeline(stages=steps)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Training

In [11]:
# Fit the pipeline on train.
# The fitted stages are read later through lrPipe.stages:
# index 0 is the label indexer, index -1 the logistic regression model.
lrPipe = pipeline.fit(train)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
# Show the parameters learnt by the logistic regression, which is the
# last stage of the fitted pipeline.
lr_model = lrPipe.stages[-1]
print(f'Coefficients: \n{lr_model.coefficientMatrix}')
print(f'Intercept: {lr_model.interceptVector}')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Coefficients: 
DenseMatrix([[ 2.37075953e-02,  7.34178832e-03,  1.57384011e-02, ...,
              -1.41565478e-01, -8.72841168e-03,  1.11659890e-01],
             [ 1.75950736e-02,  1.73825319e-03, -8.07781900e-04, ...,
              -9.56816848e-02, -8.93335285e-02,  6.85835942e-02],
             [ 3.54862328e-02, -1.21826948e-02,  2.36510760e-02, ...,
              -2.37726157e-01,  1.37638128e-01,  6.61403368e-01],
             ...,
             [-2.36559540e-03,  3.94458288e-03,  4.48131930e-03, ...,
              -1.06266760e-01,  3.57937632e-02,  6.27939960e-02],
             [-5.88971092e-03,  2.25750883e-03, -1.73746849e-02, ...,
               1.67257937e-02,  5.82905597e-02, -1.17605250e-01],
             [-2.24923836e-02,  1.53140735e-02, -4.50896993e-04, ...,
               7.49961420e-02, -2.01823739e-02,  8.68365159e-03]])
Intercept: [0.6553418474169954,0.5615526746005938,0.3105000737886927,0.3518837899468423,0.37186790600271097,0.3763460384299922,0.3629007239904011,0.36

## Training summary

In [13]:
# Summary object of the fitted logistic regression (last pipeline stage).
# NOTE: the metrics read from it in the following cells describe the fit
# on the training data; held-out performance is measured further down.
trainingSummary = lrPipe.stages[-1].summary

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [14]:
# Value of the training objective at each iteration, one per line
# under a header line.
objectiveHistory = trainingSummary.objectiveHistory
print('objectiveHistory:', *objectiveHistory, sep='\n')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

objectiveHistory:
4.855559138168317
1.9474869828383614
0.005001597870903989
0.0003326160400632317
0.0002571536051853591
0.000142931015945058
9.1736651740788e-05
5.537598699688579e-05
3.35075385559252e-05
2.0083079032775134e-05
1.2234786965994183e-05
7.567504046748844e-06
4.737957613271799e-06
2.9208863021715567e-06
1.6970897308023605e-06
9.718018866239258e-07
5.619177918211658e-07
3.024420168721413e-07
1.584405770507747e-07
7.433022136234104e-08
3.840740303082677e-08
1.8815143482008578e-08
9.582553930929622e-09

## Metrics per label

In [15]:
# Training precision for each class index, in index order.
print('Precision by label:')
for class_idx, value in enumerate(trainingSummary.precisionByLabel):
    print(f'label {class_idx:d}: {value}')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Precision by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
label 3: 1.0
label 4: 1.0
label 5: 1.0
label 6: 1.0
label 7: 1.0
label 8: 1.0
label 9: 1.0
label 10: 1.0
label 11: 1.0
label 12: 1.0
label 13: 1.0
label 14: 1.0
label 15: 1.0
label 16: 1.0
label 17: 1.0
label 18: 1.0
label 19: 1.0
label 20: 1.0
label 21: 1.0
label 22: 1.0
label 23: 1.0
label 24: 1.0
label 25: 1.0
label 26: 1.0
label 27: 1.0
label 28: 1.0
label 29: 1.0
label 30: 1.0
label 31: 1.0
label 32: 1.0
label 33: 1.0
label 34: 1.0
label 35: 1.0
label 36: 1.0
label 37: 1.0
label 38: 1.0
label 39: 1.0
label 40: 1.0
label 41: 1.0
label 42: 1.0
label 43: 1.0
label 44: 1.0
label 45: 1.0
label 46: 1.0
label 47: 1.0
label 48: 1.0
label 49: 1.0
label 50: 1.0
label 51: 1.0
label 52: 1.0
label 53: 1.0
label 54: 1.0
label 55: 1.0
label 56: 1.0
label 57: 1.0
label 58: 1.0
label 59: 1.0
label 60: 1.0
label 61: 1.0
label 62: 1.0
label 63: 1.0
label 64: 1.0
label 65: 1.0
label 66: 1.0
label 67: 1.0
label 68: 1.0
label 69: 1.0
label 70: 

In [16]:
# Training recall for each class index, in index order.
print('Recall by label:')
for class_idx, value in enumerate(trainingSummary.recallByLabel):
    print(f'label {class_idx:d}: {value}')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Recall by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
label 3: 1.0
label 4: 1.0
label 5: 1.0
label 6: 1.0
label 7: 1.0
label 8: 1.0
label 9: 1.0
label 10: 1.0
label 11: 1.0
label 12: 1.0
label 13: 1.0
label 14: 1.0
label 15: 1.0
label 16: 1.0
label 17: 1.0
label 18: 1.0
label 19: 1.0
label 20: 1.0
label 21: 1.0
label 22: 1.0
label 23: 1.0
label 24: 1.0
label 25: 1.0
label 26: 1.0
label 27: 1.0
label 28: 1.0
label 29: 1.0
label 30: 1.0
label 31: 1.0
label 32: 1.0
label 33: 1.0
label 34: 1.0
label 35: 1.0
label 36: 1.0
label 37: 1.0
label 38: 1.0
label 39: 1.0
label 40: 1.0
label 41: 1.0
label 42: 1.0
label 43: 1.0
label 44: 1.0
label 45: 1.0
label 46: 1.0
label 47: 1.0
label 48: 1.0
label 49: 1.0
label 50: 1.0
label 51: 1.0
label 52: 1.0
label 53: 1.0
label 54: 1.0
label 55: 1.0
label 56: 1.0
label 57: 1.0
label 58: 1.0
label 59: 1.0
label 60: 1.0
label 61: 1.0
label 62: 1.0
label 63: 1.0
label 64: 1.0
label 65: 1.0
label 66: 1.0
label 67: 1.0
label 68: 1.0
label 69: 1.0
label 70: 1.0

In [17]:
# Training F-measure for each class index, in index order.
print('F-measure by label:')
for class_idx, value in enumerate(trainingSummary.fMeasureByLabel()):
    print(f'label {class_idx:d}: {value}')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

F-measure by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
label 3: 1.0
label 4: 1.0
label 5: 1.0
label 6: 1.0
label 7: 1.0
label 8: 1.0
label 9: 1.0
label 10: 1.0
label 11: 1.0
label 12: 1.0
label 13: 1.0
label 14: 1.0
label 15: 1.0
label 16: 1.0
label 17: 1.0
label 18: 1.0
label 19: 1.0
label 20: 1.0
label 21: 1.0
label 22: 1.0
label 23: 1.0
label 24: 1.0
label 25: 1.0
label 26: 1.0
label 27: 1.0
label 28: 1.0
label 29: 1.0
label 30: 1.0
label 31: 1.0
label 32: 1.0
label 33: 1.0
label 34: 1.0
label 35: 1.0
label 36: 1.0
label 37: 1.0
label 38: 1.0
label 39: 1.0
label 40: 1.0
label 41: 1.0
label 42: 1.0
label 43: 1.0
label 44: 1.0
label 45: 1.0
label 46: 1.0
label 47: 1.0
label 48: 1.0
label 49: 1.0
label 50: 1.0
label 51: 1.0
label 52: 1.0
label 53: 1.0
label 54: 1.0
label 55: 1.0
label 56: 1.0
label 57: 1.0
label 58: 1.0
label 59: 1.0
label 60: 1.0
label 61: 1.0
label 62: 1.0
label 63: 1.0
label 64: 1.0
label 65: 1.0
label 66: 1.0
label 67: 1.0
label 68: 1.0
label 69: 1.0
label 70: 

In [18]:
# Aggregate metrics of the fit on the training data.
accuracy = trainingSummary.accuracy
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall

# One metric per line.
print(f'Accuracy: {accuracy}',
      f'F-measure: {fMeasure}',
      f'Precision: {precision}',
      f'Recall: {recall}',
      sep='\n')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Accuracy: 1.0
F-measure: 1.0000000000000004
Precision: 1.0000000000000004
Recall: 1.0000000000000004

# Validation

In [19]:
# Read the test feature table (Parquet) and display its columns.
test = spark.read.parquet(TEST_PATH)
test.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- path: string (nullable = true)
 |-- feat_array: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- label: string (nullable = true)

In [20]:
# Number of rows in the test DataFrame (shown as the cell result).
test.count()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

22688

In [21]:
# Number of test rows per label; show(13) prints only the first 13 groups.
test.groupBy('label').count().show(13)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------+-----+
|            label|count|
+-----------------+-----+
|     Pear Forelle|  234|
|     Cantaloupe 1|  164|
|       Clementine|  166|
|           Orange|  160|
| Strawberry Wedge|  246|
|      Onion White|  146|
|    Pepper Orange|  234|
|              Fig|  234|
|  Grapefruit Pink|  166|
|    Grape White 4|  158|
|Tomato Cherry Red|  164|
|     Potato White|  150|
|         Beetroot|  150|
+-----------------+-----+
only showing top 13 rows

In [22]:
# Apply the same array -> vector conversion as on train, so that test has
# the 'Vect_features' column the fitted logistic regression reads.
test = test.withColumn('Vect_features', list_to_vector_udf(test.feat_array))
test.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- path: string (nullable = true)
 |-- feat_array: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)

## Predictions

In [23]:
# Maps a predicted class index back to its string label, using the labels
# held by the fitted indexer (first pipeline stage).
converter = IndexToString(inputCol='prediction',
                          outputCol='pred_label',
                          labels=lrPipe.stages[0].labels)

# Score the test set with the fitted pipeline, then add the readable label.
test_predictions = converter.transform(lrPipe.transform(test))

# Preview true vs. predicted values for a few rows.
preview_columns = ['label', 'class', 'prediction', 'pred_label', 'probability']
test_predictions.select(*preview_columns).show(10)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------+-----+----------+----------+--------------------+
|     label|class|prediction|pred_label|         probability|
+----------+-----+----------+----------+--------------------+
|Watermelon| 95.0|      95.0|Watermelon|[8.98958900046451...|
|Watermelon| 95.0|      95.0|Watermelon|[7.03153566558319...|
|Watermelon| 95.0|      95.0|Watermelon|[3.99050338439641...|
|Watermelon| 95.0|      95.0|Watermelon|[3.56994543639699...|
|Watermelon| 95.0|      95.0|Watermelon|[3.34264848288672...|
|Watermelon| 95.0|      95.0|Watermelon|[1.30859688361812...|
|Watermelon| 95.0|      95.0|Watermelon|[9.33531278039963...|
|Watermelon| 95.0|      95.0|Watermelon|[5.30600427278219...|
|Watermelon| 95.0|      95.0|Watermelon|[1.43123574663425...|
|Watermelon| 95.0|      95.0|Watermelon|[1.14767811578168...|
+----------+-----+----------+----------+--------------------+
only showing top 10 rows

In [24]:
# List every column of the scored DataFrame: the input columns plus those
# added by scoring ('class', 'rawPrediction', 'probability', 'prediction')
# and by the converter ('pred_label').
test_predictions.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- path: string (nullable = true)
 |-- feat_array: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)
 |-- class: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)
 |-- pred_label: string (nullable = true)

## Evaluation on validation set

In [25]:
# Evaluator reading the columns produced by the fitted pipeline
# ('class' is the indexed true label).
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='class',
                                              probabilityCol='probability'
                                             )

metrique=['f1',
          'fMeasureByLabel',
          'weightedFMeasure',
          'accuracy',
          'precisionByLabel',
          'weightedPrecision',
          'recallByLabel',
          'weightedRecall',
          'logLoss',
          'hammingLoss']

# Each evaluate() call is a separate Spark job. Keep only the three columns
# the evaluator reads and cache them, so the scoring lineage (vector UDF +
# pipeline transform) is not recomputed once per metric.
eval_predictions = (test_predictions
                    .select('class', 'prediction', 'probability')
                    .cache())

for met in metrique:
    score = evaluator.setMetricName(met).evaluate(eval_predictions)
    if met.endswith('ByLabel'):
        # The *ByLabel metrics are computed for ONE class only: the one set
        # in the evaluator's `metricLabel` param. Name that class in the
        # output so the value is not mistaken for a score over all classes.
        shown_name = '%s (class %s only)' % (met, evaluator.getMetricLabel())
    else:
        shown_name = met
    print('Validation '+shown_name+' score :', score)

# Release the cached columns once all metrics are printed.
eval_predictions.unpersist()


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Validation f1 score : 0.9933592031893852
Validation fMeasureByLabel score : 1.0
Validation weightedFMeasure score : 0.9933592031893852
Validation accuracy score : 0.9934767277856136
Validation precisionByLabel score : 1.0
Validation weightedPrecision score : 0.9939146596956664
Validation recallByLabel score : 1.0
Validation weightedRecall score : 0.9934767277856131
Validation logLoss score : 0.02286049629938294
Validation hammingLoss score : 0.00652327221438646

# End Spark session

In [26]:
# Stop the Spark session: cells that use `spark` can no longer run after
# this one, so keep it as the very last cell.
spark.stop()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…