This Notebook steps :
    
   * Create a Spark session
    
   * Import Train&Test features, in Spark DataFrames
    
   * Make Pipeline with stages:
       - Encode labels
       - Multiclass logistic regression 
    
   * Classification metrics evaluation on Test
    
   * Small gridsearch for hyperparameters for accuracy improvement (just to implement it)
    
   * Best model evaluation metrics

In [1]:
import pandas as pd

import numpy as np

from pyspark.sql import SparkSession

from pyspark.sql.functions import  udf, col

from pyspark.sql.types import FloatType

from pyspark.ml.linalg import  VectorUDT, Vectors

from pyspark.ml.feature import StringIndexer, IndexToString

from pyspark.ml import Pipeline

from pyspark.ml.classification import LogisticRegression

from pyspark.mllib.evaluation import MulticlassMetrics

from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [2]:
# Constants

WORKERS = 'local[2]'

TRAIN_PATH = 's3a://fruits-images-proceded/Training_apples_featured-reducted.parquet'

TEST_PATH = 's3a://fruits-images-proceded/Test_apples_featured-reducted.parquet'

# Create spark session

In [3]:
# create a spark session
spark = (SparkSession.builder
.master(WORKERS)
.appName('logistic regression on apples')
.config('spark.driver.extraClassPath', 
        '/home/ec2-user/hadoop/share/hadoop/tools/lib/aws-java-sdk-bundle-1.11.375.jar:/home/ec2-user/hadoop/share/hadoop/tools/lib/hadoop-aws-3.2.0.jar')         
.config('spark.executor.heartbeatInterval', '300000')
.config('spark.network.timeout', '900000')
.config('spark.sql.execution.arrow.pyspark.enabled', 'true')
.config('spark.sql.execution.arrow.maxRecordsPerBatch', '128')
.getOrCreate()
        )

# Load images from local storage

In [4]:
train =(spark
         .read
         .format('parquet')
         .load(TRAIN_PATH)
        )

train.printSchema()

root
 |-- path: string (nullable = true)
 |-- feat_array: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- label: string (nullable = true)



In [5]:
# Count
train.count()

1700

In [6]:
# By label count
train.groupBy('label').count().show()

+---------------+-----+
|          label|count|
+---------------+-----+
| Apple Golden 3|  481|
|    Apple Red 2|  492|
|Apple Pink Lady|  231|
|    Apple Red 1|  248|
| Apple Braeburn|  248|
+---------------+-----+



# Logistic regression

In [7]:
# UDF array -> vector
list_to_vector_udf = udf(lambda vs: Vectors.dense([float(i) for i in vs]),
                         VectorUDT())
# Create new column with vectors
train = train.withColumn('Vect_features', list_to_vector_udf(train.feat_array))
train.printSchema()

root
 |-- path: string (nullable = true)
 |-- feat_array: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)



## Modelisation pipeline

In [8]:
steps = []

# from pyspark.ml.feature import StringIndexer, IndexToString
# Encode labels
label_stringIdx = StringIndexer(inputCol = 'label', outputCol = 'class')

steps += [label_stringIdx]

# from pyspark.ml.classification import LogisticRegression
# Logistic regression
lr = LogisticRegression(featuresCol='Vect_features',
                        labelCol='class',
                        )

steps += [lr]

# from pyspark.ml import Pipeline
# All steps in pipeline
pipeline = Pipeline(stages = steps)

## Trainning

In [9]:
# Fit the pipeline on train
lrPipe = pipeline.fit(train)

In [10]:
# Print the coefficients and intercept for multinomial logistic regression
print('Coefficients: \n' + str(lrPipe.stages[-1].coefficientMatrix))
print('Intercept: ' + str(lrPipe.stages[-1].interceptVector))

Coefficients: 
DenseMatrix([[-1.81009056e-01, -3.14266310e-02,  2.26220846e-01,
              -2.02279118e-01, -5.32867148e-01, -2.29359172e-01,
               6.59831468e-01,  4.00758021e-01,  1.97440141e-01,
               7.97378255e-02,  3.17976531e-01, -3.31628012e-02,
               1.08452781e-01,  2.40030736e-01, -1.97401267e-01,
              -2.83989119e-01,  1.17173271e-02, -3.24029628e-03,
              -1.47800171e-01,  5.61610528e-02,  3.02352806e-01,
               1.87263082e-01,  2.29987990e-02,  1.12217101e-02,
               5.97579886e-02, -1.38422198e-01,  1.07891272e-01,
              -1.58479835e-01,  2.10507010e-02,  1.60911960e-01,
              -2.38010607e-01,  1.00070063e-01, -6.33937575e-02,
               1.49867966e-01,  2.27913656e-01, -1.42663934e-01,
              -2.30617086e-01,  2.12112814e-01,  1.48943687e-01,
               7.58671263e-02, -7.01796065e-03, -1.37026883e-01,
              -3.04353914e-01, -2.68937743e-01, -3.73756733e-02,
          

## Training summary

In [11]:
trainingSummary = lrPipe.stages[-1].summary

In [12]:
# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print('objectiveHistory:')
for objective in objectiveHistory:
    print(objective)

objectiveHistory:
1.548911800312474
0.7989360816426184
0.0925721868905535
0.05671881344710751
0.025687849140910133
0.013126290660154777
0.0064775913247159065
0.003259728836620947
0.001633113048675623
0.0008208090650036979
0.00041231359631684696
0.00020721455193470096
0.00010412404192668625
5.232043190899936e-05
2.62864812653675e-05
1.3204941649360833e-05
6.632436142645931e-06
3.3307375785012325e-06
1.6723836505160752e-06
8.395721858878811e-07
4.214091413872973e-07
2.114774268267267e-07
1.0609642184617234e-07
5.319557949481096e-08
2.6613750312562575e-08
1.322289181155825e-08
6.387105544589264e-09


## Metrics per label

In [13]:
print('Precision by label:')
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print('label %d: %s' % (i, prec))

Precision by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
label 3: 1.0
label 4: 1.0


In [14]:
print('Recall by label:')
for i, rec in enumerate(trainingSummary.recallByLabel):
    print('label %d: %s' % (i, rec))

Recall by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
label 3: 1.0
label 4: 1.0


In [15]:
print('F-measure by label:')
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print('label %d: %s' % (i, f))

F-measure by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
label 3: 1.0
label 4: 1.0


In [16]:
accuracy = trainingSummary.accuracy
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print('Accuracy: %s\nF-measure: %s\nPrecision: %s\nRecall: %s'
      % (accuracy, fMeasure, precision, recall))

Accuracy: 1.0
F-measure: 1.0
Precision: 1.0
Recall: 1.0


# Validation

In [17]:
test =(spark
         .read
         .format('parquet')
         .load(TEST_PATH)
        )

test.printSchema()


root
 |-- path: string (nullable = true)
 |-- feat_array: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- label: string (nullable = true)



In [18]:
# Count
test.count()

805

In [19]:
# By label count
test.groupBy('label').count().show(13)

+---------------+-----+
|          label|count|
+---------------+-----+
| Apple Golden 3|  161|
|    Apple Red 2|  164|
|Apple Pink Lady|  152|
|    Apple Red 1|  164|
| Apple Braeburn|  164|
+---------------+-----+



In [20]:
# By label count
ordered = train.groupBy('label').count().sort('label').toPandas()
ordered

Unnamed: 0,label,count
0,Apple Braeburn,248
1,Apple Golden 3,481
2,Apple Pink Lady,231
3,Apple Red 1,248
4,Apple Red 2,492


In [21]:
# Create new column with vectors
test = test.withColumn('Vect_features', list_to_vector_udf(test.feat_array))
test.printSchema()


root
 |-- path: string (nullable = true)
 |-- feat_array: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)



## Predictions

In [22]:
# On test set
test_predictions = lrPipe.transform(test)

# pred_label from predicition
converter = IndexToString(inputCol='prediction',
                          outputCol='pred_label',
                          labels=lrPipe.stages[0].labels)
test_predictions = converter.transform(test_predictions)

# predictions
test_predictions.select('label',
                   'class',

                   'prediction',
                   'pred_label',
                   'probability'
                  ).show(10)

+--------------+-----+----------+--------------+--------------------+
|         label|class|prediction|    pred_label|         probability|
+--------------+-----+----------+--------------+--------------------+
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[4.31721350320844...|
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[1.36215563770017...|
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[3.09540740698309...|
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[2.81268013075582...|
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[3.85218851272382...|
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[2.48170258474959...|
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[2.99652373020313...|
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[5.32428325447978...|
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[3.81978237892726...|
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[9.08552858194626...|
+--------------+-----+----------+--------------+--------------------+
only showing top 10 

In [23]:
test_predictions.printSchema()

root
 |-- path: string (nullable = true)
 |-- feat_array: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)
 |-- class: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)
 |-- pred_label: string (nullable = true)



## Confusion matrix

In [24]:
#important: need to cast to float type, and order by prediction, else it won't work
preds_and_labels = test_predictions.select(['prediction','class']).withColumn('Class', col('class').cast(FloatType())).orderBy('class')

#select only prediction and label columns
preds_and_labels = preds_and_labels.select(['prediction','Class'])

metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

# Confusion matrix labels
pred = ordered.label.copy()
pred.name = 'predicted'

ini = ordered.label.copy()
ini.name = 'original'
# Matrix
cm = pd.DataFrame(metrics.confusionMatrix().toArray(),
                 columns=pred,
                 index=ini
                )
print('Confusion matrix on validation')
display( cm)

Confusion matrix on validation


predicted,Apple Braeburn,Apple Golden 3,Apple Pink Lady,Apple Red 1,Apple Red 2
original,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Apple Braeburn,152.0,0.0,12.0,0.0,0.0
Apple Golden 3,0.0,161.0,0.0,0.0,0.0
Apple Pink Lady,0.0,0.0,164.0,0.0,0.0
Apple Red 1,11.0,0.0,0.0,153.0,0.0
Apple Red 2,0.0,0.0,0.0,0.0,152.0


## Evaluation on validation set

In [25]:
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='class',
                                              probabilityCol='probability'
                                             )

metrique=['f1',
          'fMeasureByLabel',
          'weightedFMeasure',
          'accuracy',
          'precisionByLabel',
          'weightedPrecision',
          'recallByLabel',
          'weightedRecall',
          
          'logLoss',
          'hammingLoss']

for met in metrique:
    print('Validation '+met+' score :', evaluator.setMetricName(met).evaluate(test_predictions))


Validation f1 score : 0.971410863074718
Validation fMeasureByLabel score : 0.9296636085626913
Validation weightedFMeasure score : 0.971410863074718
Validation accuracy score : 0.9714285714285714
Validation precisionByLabel score : 0.9325153374233128
Validation weightedPrecision score : 0.9723611144790125
Validation recallByLabel score : 0.926829268292683
Validation weightedRecall score : 0.9714285714285715
Validation logLoss score : 0.06136007624312485
Validation hammingLoss score : 0.02857142857142857


# End Spark session

In [26]:
spark.stop()