In [1]:
import pandas as pd

import numpy as np

from pyspark.sql import SparkSession

from pyspark.sql.functions import  udf, col

from pyspark.sql.types import FloatType

from pyspark.ml.linalg import  VectorUDT, Vectors

from pyspark.ml.feature import StringIndexer, IndexToString

from pyspark.ml import Pipeline

from pyspark.ml.classification import LogisticRegression

from pyspark.mllib.evaluation import MulticlassMetrics

from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Create spark session

In [2]:
# create a spark session
spark = (SparkSession.builder
.master('local[6]')
.appName('logistic regression on apples')
.config('spark.driver.extraClassPath', 
        '/home/demo/hadoop/hadoop-3.2.2/share/hadoop/tools/lib/aws-java-sdk-bundle-1.11.375.jar:/home/demo/hadoop/hadoop-3.2.2/share/hadoop/tools/lib/hadoop-aws-3.2.0.jar:/home/demo/spark-avro_2.11:4.0.0.jar')         
.config('spark.executor.heartbeatInterval', '800000')
.config('spark.network.timeout', '900000')  
.config("spark.sql.execution.arrow.pyspark.enabled", "true")
.config("spark.sql.execution.arrow.maxRecordsPerBatch", "128") 
.getOrCreate()
        )

# Load images from local storage

In [3]:
train =(spark
         .read
         .format("parquet")
         .load('Features/Apples-by-label-Training-featured-reducted.parquet')
        )

train.printSchema()

root
 |-- path: string (nullable = true)
 |-- feat_array: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- label: string (nullable = true)



In [4]:
# Count
train.count()

1700

In [5]:
# By label count
ordered = train.groupBy('label').count().sort('label').toPandas()
ordered

Unnamed: 0,label,count
0,Apple Braeburn,248
1,Apple Golden 3,481
2,Apple Pink Lady,231
3,Apple Red 1,248
4,Apple Red 2,492


# Logistic regression

In [6]:
# UDF array -> vector
list_to_vector_udf = udf(lambda vs: Vectors.dense([float(i) for i in vs]),
                         VectorUDT())
# Create new column with vectors
train = train.withColumn('Vect_features', list_to_vector_udf(train.feat_array))
train.printSchema()

root
 |-- path: string (nullable = true)
 |-- feat_array: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)



In [7]:
steps = []

In [8]:
# from pyspark.ml.feature import StringIndexer, IndexToString
# Encode labels
label_stringIdx = StringIndexer(inputCol = 'label',
                                outputCol = 'class',
                                stringOrderType='alphabetAsc'
                               )

steps += [label_stringIdx]

In [9]:
# from pyspark.ml.classification import LogisticRegression
# Logistic regression
lr = LogisticRegression(featuresCol='Vect_features',
                        labelCol='class',
                        maxIter=20
                        )

steps += [lr]

In [10]:
# from pyspark.ml import Pipeline
# All steps in pipeline
pipeline = Pipeline(stages = steps)

In [11]:
# Fit the pipeline on train
lrPipe = pipeline.fit(train)

In [12]:
# Print the coefficients and intercept for multinomial logistic regression
print("Coefficients: \n" + str(lrPipe.stages[-1].coefficientMatrix))
print("Intercept: " + str(lrPipe.stages[-1].interceptVector))

Coefficients: 
DenseMatrix([[-6.99298100e-02, -6.66442140e-02, -2.93547331e-01,
               1.10329031e-01, -1.06608122e-01,  3.30844834e-01,
              -8.68863340e-02,  3.58525421e-02,  2.21125569e-01,
              -5.51610883e-02, -1.75373238e-01, -4.08275616e-02,
               4.16554186e-02,  4.81454232e-02,  2.77941558e-01,
               5.64607822e-02,  2.48964437e-01,  8.22557739e-02,
               4.92542426e-02, -1.19588812e-01, -8.96046484e-02,
               7.13879675e-02, -1.24718363e-01, -1.39793682e-01,
               1.03274998e-01, -7.92401988e-03,  3.50987877e-03,
               2.69617432e-01, -1.53214722e-01,  1.45829297e-01,
               2.05753310e-01, -1.08970912e-01,  9.13608703e-02,
               2.46368730e-02,  2.09903566e-02,  1.15326254e-01,
               1.42633097e-01, -5.25548146e-02, -1.21507435e-01,
              -1.31824633e-01,  3.23808749e-02,  2.09810542e-01,
               2.53651737e-01, -1.12291432e-01, -4.35100310e-02,
          

# Training summary

In [13]:
trainingSummary = lrPipe.stages[-1].summary

In [14]:
# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

objectiveHistory:
1.5489118003124742
0.7989360812799258
0.0925721861307303
0.05671881299356421
0.025687848908132003
0.013126290536260257
0.006477591259785401
0.0032597288023079214
0.001633113030684423
0.0008208090556118725
0.00041231359144483254
0.00020721454942169886
0.00010412404063700313
5.2320431250256236e-05
2.6286480930156422e-05
1.3204941479367663e-05
6.632436056645181e-06
3.33073753510588e-06
1.6723836286250893e-06
8.395721748316272e-07
4.2140913573116816e-07


# Metrics per label

In [15]:
print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

Precision by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
label 3: 1.0
label 4: 1.0


In [16]:
print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

Recall by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
label 3: 1.0
label 4: 1.0


In [17]:
print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))

F-measure by label:
label 0: 1.0
label 1: 1.0
label 2: 1.0
label 3: 1.0
label 4: 1.0


In [18]:
accuracy = trainingSummary.accuracy
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, fMeasure, precision, recall))

Accuracy: 1.0
F-measure: 1.0
Precision: 1.0
Recall: 1.0


# Validation

In [19]:
test =(spark
         .read
         .format("parquet")
         .load('Features/Apples-by-label-Test-featured-reducted.parquet')
        )

test.printSchema()


root
 |-- path: string (nullable = true)
 |-- feat_array: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- label: string (nullable = true)



In [20]:
# Count
test.count()

805

In [21]:
# By label count
test.groupBy('label').count().sort('label').show()

+---------------+-----+
|          label|count|
+---------------+-----+
| Apple Braeburn|  164|
| Apple Golden 3|  161|
|Apple Pink Lady|  152|
|    Apple Red 1|  164|
|    Apple Red 2|  164|
+---------------+-----+



In [22]:
# Create new column with vectors
test = test.withColumn('Vect_features', list_to_vector_udf(test.feat_array))
test.printSchema()


root
 |-- path: string (nullable = true)
 |-- feat_array: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)



## Predictions

In [23]:
# On test set
test_predictions = lrPipe.transform(test)

# pred_label from predicition
converter = IndexToString(inputCol="prediction",
                          outputCol="pred_label",
                          labels=lrPipe.stages[0].labels)
test_predictions = converter.transform(test_predictions)

# predictions
test_predictions.select('label',
                   'class',

                   'prediction',
                   'pred_label',
                   'probability'
                  ).show(10)

+--------------+-----+----------+--------------+--------------------+
|         label|class|prediction|    pred_label|         probability|
+--------------+-----+----------+--------------+--------------------+
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[6.96625499761634...|
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[5.69428771506003...|
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[6.86177142530016...|
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[2.19793788967867...|
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[2.66647003472617...|
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[9.48151258924894...|
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[7.96286517516796...|
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[9.05077921996437...|
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[5.09785454281415...|
|Apple Golden 3|  1.0|       1.0|Apple Golden 3|[1.22047282387532...|
+--------------+-----+----------+--------------+--------------------+
only showing top 10 

In [24]:
test_predictions.printSchema()

root
 |-- path: string (nullable = true)
 |-- feat_array: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)
 |-- class: double (nullable = false)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)
 |-- pred_label: string (nullable = true)



## Confusion matrix

In [25]:
#important: need to cast to float type, and order by prediction, else it won't work
preds_and_labels = test_predictions.select(['prediction','class']).withColumn('Class', col('class').cast(FloatType())).orderBy('class')

#select only prediction and label columns
preds_and_labels = preds_and_labels.select(['prediction','Class'])

metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

# Confusion matrix labels
pred = ordered.label.copy()
pred.name = 'predicted'

ini = ordered.label.copy()
ini.name = 'original'
# Matrix
cm = pd.DataFrame(metrics.confusionMatrix().toArray(),
                 columns=pred,
                 index=ini
                )
print('Confusion matrix on validation')
display( cm)

Confusion matrix on validation


predicted,Apple Braeburn,Apple Golden 3,Apple Pink Lady,Apple Red 1,Apple Red 2
original,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Apple Braeburn,164.0,0.0,0.0,0.0,0.0
Apple Golden 3,0.0,161.0,0.0,0.0,0.0
Apple Pink Lady,0.0,0.0,152.0,0.0,0.0
Apple Red 1,0.0,0.0,0.0,154.0,10.0
Apple Red 2,12.0,0.0,0.0,0.0,152.0


## Evaluation on validation set

In [26]:
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                              labelCol='class',
                                              probabilityCol='probability'
                                             )

metrique=['f1',
          'fMeasureByLabel',
          'weightedFMeasure',
          'accuracy',
          'precisionByLabel',
          'weightedPrecision',
          'recallByLabel',
          'weightedRecall',
          
          'logLoss',
          'hammingLoss']

for met in metrique:
    print('Validation '+met+' score :', evaluator.setMetricName(met).evaluate(test_predictions))


Validation f1 score : 0.9726547171911839
Validation fMeasureByLabel score : 0.9647058823529412
Validation weightedFMeasure score : 0.9726547171911839
Validation accuracy score : 0.9726708074534162
Validation precisionByLabel score : 0.9318181818181818
Validation weightedPrecision score : 0.9735338199106315
Validation recallByLabel score : 1.0
Validation weightedRecall score : 0.972670807453416
Validation logLoss score : 0.05751444207664989
Validation hammingLoss score : 0.02732919254658385


# End Spark session

In [27]:
spark.stop()