Steps performed in this notebook:
    
   * Create a Spark session
    
   * Load the featurized training images into a Spark DataFrame
    
   * Project the features onto 2048 PCA dimensions
    
   * Retrieve the number of significant eigenvalues
    
   * Apply the PCA dimensionality reduction
    
   * Store the path, label and PCA-reduced feature array, partitioned by label, as a parquet file on S3

In [1]:
# Smoke-test cell: prints a greeting. In the captured output below, running it
# is also what starts the Spark application and exposes the `spark` session.
print('Welcome to my EMR Notebook!')

VBox()

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
7,application_1631354174370_0008,pyspark,idle,Link,Link,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Welcome to my EMR Notebook!

# Install dependencies

In [2]:
# Install a pinned pyarrow release through the Spark context.
# NOTE(review): presumably needed for the Arrow setting enabled further down — confirm.
sc.install_pypi_package('pyarrow==2')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Collecting pyarrow==2
  Using cached https://files.pythonhosted.org/packages/c8/58/d07e7ee8b0cffe509f9e5a3742e09636a4a58b2113d193166615b934846f/pyarrow-2.0.0-cp37-cp37m-manylinux1_x86_64.whl
Installing collected packages: pyarrow
Successfully installed pyarrow-2.0.0

# Imports

In [3]:
# import pandas as pd

import numpy as np

# import io

# import os

from pyspark.sql import SparkSession

from pyspark.sql.functions import  udf
# col, pandas_udf,, PandasUDFType
from pyspark.ml import Pipeline

from pyspark.ml.functions import vector_to_array

from pyspark.ml.linalg import Vectors, VectorUDT

from pyspark.ml.feature import StandardScaler, PCA

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Constants: S3 locations read and written by this notebook

# Input: parquet file holding the featurized training images
LOAD_PATH = 's3a://fruits-images-proceded/Training_featured.parquet'

# Output: parquet file receiving path, label and the PCA-reduced features
SAVE_PATH = 's3a://fruits-images-proceded/Training_featured-reducted.parquet'

# Output: location where the fitted reduction pipeline is saved
MODEL_PATH = 's3a://pca-reduction-model/PCA reduction.model'

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Enable pyArrow

In [5]:
# Switch on the Arrow execution option of the PySpark session.
# NOTE(review): no pandas conversion or pandas UDF is visible in this notebook —
# confirm this setting is still needed.
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', 'true')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Load features from storage

In [6]:
# Read the featurized training images stored as parquet at LOAD_PATH
images = spark.read.parquet(LOAD_PATH)

# Display the columns of the loaded DataFrame
images.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- path: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- label: string (nullable = true)

# PCA on features
Steps :

* Convert the array of features to a dense vector stored in the column Vect_features
* Create Pipeline with stages :
    - Standardize (center and scale to unit variance)
    - PCA
* Fit PCA
* Set the reduced dimension (number of principal components to keep)
* Apply dimension reduction


In [7]:
def _to_dense_vector(values):
    """Build an ML dense vector from a sequence of numbers, casting each to float."""
    return Vectors.dense([float(value) for value in values])


# Spark UDF wrapping the conversion above; its result type is an ML vector
list_to_vector_udf = udf(_to_dense_vector, VectorUDT())

# Add the vector version of `features` as the column 'Vect_features'
images = images.withColumn('Vect_features', list_to_vector_udf(images['features']))
images.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- path: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)

In [8]:
# Stage 1: standardize the feature vectors (centered, scaled by standard deviation)
scaler = StandardScaler(
    inputCol='Vect_features',
    outputCol='Scaled_features',
    withMean=True,
    withStd=True,
)

# Stage 2: PCA on the standardized vectors, keeping 2048 components for now;
# the number actually retained is chosen later from the explained variance
pca = PCA(
    k=2048,
    inputCol='Scaled_features',
    outputCol='PCA_features',
)

# Assemble the stages in execution order
steps = [scaler, pca]
pipeline = Pipeline(stages=steps)

# Fit scaler and PCA on the training images
reduction = pipeline.fit(images)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
# Share of the total variance the retained components must exceed
VARIANCE_TO_EXPLAIN = 0.99

# Explained-variance ratio of each principal component, read from the last
# pipeline stage (the fitted PCA model)
explained = reduction.stages[-1].explainedVariance.toArray()
if explained.size == 0:
    raise ValueError('The PCA model exposes no explained variance values')

# cumulative[i] is the variance explained by the first i + 1 components;
# computed once instead of re-summing the prefix at every step
cumulative = np.cumsum(explained)
reached = cumulative > VARIANCE_TO_EXPLAIN

if reached.any():
    # argmax on a boolean array returns the position of the first True, i.e.
    # the smallest number of components exceeding VARIANCE_TO_EXPLAIN
    componentsNum = int(np.argmax(reached)) + 1
else:
    # The threshold is never exceeded (e.g. rounding keeps the total just
    # below it): keep every component instead of looping forever
    componentsNum = int(explained.size)

# Variance actually explained by the retained components
variance = float(cumulative[componentsNum - 1])

print('{} principal components explains {:%} of variance'.format(componentsNum,
                                                                variance))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

1468 principal components explains 99.001760% of variance

In [10]:
# Override only the PCA output dimension with the number of components selected
# above; every other pipeline parameter keeps its value
params = {pca.k: componentsNum}

# Fit the whole pipeline again with that override
reduction = pipeline.fit(images, params)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
# Apply the fitted pipeline to the images: the printed schema shows the added
# Scaled_features and PCA_features vector columns.
# NOTE(review): `images` is rebound to the transformed frame, so re-running this
# cell alone will presumably fail because the output columns already exist — confirm.
images = reduction.transform(images)
images.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- path: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)
 |-- Scaled_features: vector (nullable = true)
 |-- PCA_features: vector (nullable = true)

In [13]:
# Convert the PCA vector column into a plain array column named 'feat_array'
# (array of doubles in the printed schema); this is the column saved to parquet later.
images = images.withColumn('feat_array', vector_to_array('PCA_features'))
images.printSchema()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- path: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)
 |-- Scaled_features: vector (nullable = true)
 |-- PCA_features: vector (nullable = true)
 |-- feat_array: array (nullable = false)
 |    |-- element: double (containsNull = false)

# Save to storage

In [14]:
# Persist the fitted reduction pipeline at MODEL_PATH, replacing any earlier save
model_writer = reduction.write()
model_writer.overwrite().save(MODEL_PATH)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
# Keep only the path, the label and the reduced feature array, then write them
# as parquet to SAVE_PATH, partitioned by label and replacing any previous output
(images
 .select('path', 'label', 'feat_array')
 .write
 .parquet(SAVE_PATH, mode='overwrite', partitionBy='label')
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# End Spark session

In [16]:
# Stop the Spark session; this is the last cell, after the model and the
# reduced features have been saved.
spark.stop()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…