This notebook's steps:
    
   * Create a Spark session
    
   * Import the featurized training images into a Spark DataFrame
    
   * Project the features onto 2048 PCA dimensions
    
   * Retrieve the number of significant eigenvalues
    
   * Apply the PCA dimensionality reduction
    
   * Store the path, label and PCA-reduced feature array, partitioned by label, as a Parquet file on S3 

In [1]:
# import pandas as pd

import numpy as np

# import io

# import os

from pyspark.sql import SparkSession

from pyspark.sql.functions import  udf
# col, pandas_udf,, PandasUDFType
from pyspark.ml import Pipeline

from pyspark.ml.functions import vector_to_array

from pyspark.ml.linalg import Vectors, VectorUDT

from pyspark.ml.feature import StandardScaler, PCA

In [2]:
# Notebook-wide settings

# Spark master URL passed to SparkSession.builder.master()
WORKERS = "local[2]"

# Input: Parquet dataset holding the image features to reduce
LOAD_PATH = "s3a://fruits-images-proceded/Training_apples_featured.parquet"

# Output: Parquet dataset receiving the PCA-reduced features
SAVE_PATH = "s3a://fruits-images-proceded/Training_apples_featured-reducted.parquet"

# Output: location where the fitted reduction pipeline is persisted
MODEL_PATH = "s3a://pca-reduction-model/Apples PCA reduction.model"

# Create spark session

In [3]:
# Settings applied to the session builder, in this order.
# NOTE(review): the heartbeat and network timeout values carry no unit
# suffix — confirm how Spark interprets each of them.
spark_settings = {
    # Jars put on the driver classpath (AWS SDK bundle and hadoop-aws)
    'spark.driver.extraClassPath': (
        '/home/ec2-user/hadoop/share/hadoop/tools/lib/aws-java-sdk-bundle-1.11.375.jar'
        ':'
        '/home/ec2-user/hadoop/share/hadoop/tools/lib/hadoop-aws-3.2.0.jar'
    ),
    'spark.executor.heartbeatInterval': '300000',
    'spark.network.timeout': '900000',
    'spark.sql.execution.arrow.pyspark.enabled': 'true',
    'spark.sql.execution.arrow.maxRecordsPerBatch': '128',
}

# Build the session: master and application name first, then every setting
builder = SparkSession.builder.master(WORKERS).appName('PCA Reduction')
for setting_name, setting_value in spark_settings.items():
    builder = builder.config(setting_name, setting_value)

# Reuse an existing session if there is one, otherwise start a new one
spark = builder.getOrCreate()

# Load features from S3 storage

In [4]:
# Read the Parquet dataset stored at LOAD_PATH into a DataFrame
images = spark.read.parquet(LOAD_PATH)

# Show the columns and types that were loaded
images.printSchema()

root
 |-- path: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- label: string (nullable = true)



# PCA on features
Steps :

* Transform the array of features into a dense vector stored in column Vect_features
* Create a Pipeline with these stages:
    - Standardization (centering and scaling)
    - PCA
* Fit the PCA
* Set the number of components to keep
* Apply the dimensionality reduction


In [5]:
def _array_to_dense_vector(values):
    """Return a dense ML vector built from the numbers in *values*."""
    return Vectors.dense([float(value) for value in values])


# Wrap the converter as a Spark UDF whose return type is an ML vector
list_to_vector_udf = udf(_array_to_dense_vector, VectorUDT())

# Add the vector form of 'features' as a new column, then show the schema
images = images.withColumn('Vect_features',
                           list_to_vector_udf(images['features']))
images.printSchema()

root
 |-- path: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)



In [6]:
# Stage 1: standardize the feature vectors (withMean and withStd both on)
scaler = StandardScaler(withMean=True,
                        withStd=True,
                        inputCol='Vect_features',
                        outputCol='Scaled_features')

# Stage 2: PCA on the standardized vectors, keeping 2048 components
pca = PCA(k=2048,
          inputCol='Scaled_features',
          outputCol='PCA_features')

# Chain both stages, in order, into a single pipeline
steps = [scaler, pca]
pipeline = Pipeline(stages=steps)

# Fit the scaler and the PCA on the images DataFrame
reduction = pipeline.fit(images)

In [7]:
# Target share of the total variance that the kept components must exceed
VARIANCE_TO_EXPLAIN = 0.90

# Share of variance explained by each component of the fitted PCA stage
# (the last stage of the pipeline)
explained = reduction.stages[-1].explainedVariance.toArray()
if explained.size == 0:
    raise ValueError('The fitted PCA stage reports no explained variance')

# Running total of explained variance, computed once rather than
# re-summing a growing slice on every loop iteration
cumulative = np.cumsum(explained)

# Positions whose running total is strictly above the target
above_target = np.flatnonzero(cumulative > VARIANCE_TO_EXPLAIN)

# Smallest number of components exceeding the target; if the target is
# never exceeded (e.g. a target of 1.0), keep every component instead of
# looping forever
if above_target.size:
    componentsNum = int(above_target[0]) + 1
else:
    componentsNum = len(explained)

print('{} principal components explains {:%} of variance'.format(componentsNum,
                                                                cumulative[componentsNum - 1]
                                                               ))

183 principal components explains 90.041399% of variance


In [8]:
# Override the PCA stage's k with componentsNum for this fit only
params = {pca.k: componentsNum}

# Fit the pipeline again on the same DataFrame using that override
reduction = pipeline.fit(images, params)

In [9]:
# Apply reduction: run the fitted pipeline on the images DataFrame
images = reduction.transform(images)
# Show the schema after the transformation
images.printSchema()

root
 |-- path: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)
 |-- Scaled_features: vector (nullable = true)
 |-- PCA_features: vector (nullable = true)



In [10]:
# Print the schema again (same call as at the end of the previous cell)
images.printSchema()

root
 |-- path: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)
 |-- Scaled_features: vector (nullable = true)
 |-- PCA_features: vector (nullable = true)



In [11]:
# Build an array-typed version of the 'PCA_features' vector column
pca_as_array = vector_to_array('PCA_features')

# Store it as column 'feat_array', then show the schema
images = images.withColumn('feat_array', pca_as_array)
images.printSchema()

root
 |-- path: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)
 |-- Scaled_features: vector (nullable = true)
 |-- PCA_features: vector (nullable = true)
 |-- feat_array: array (nullable = false)
 |    |-- element: double (containsNull = false)



# Saving

In [12]:
# Save reduction pipeline: persist the fitted model at MODEL_PATH,
# overwriting anything already stored there
reduction.write().overwrite().save(MODEL_PATH)

In [13]:
# Keep only the path, the label and the reduced feature array
reduced = images.select('path', 'label', 'feat_array')

# Write as Parquet partitioned by label, replacing any existing output
# at SAVE_PATH
reduced.write.mode('overwrite').partitionBy('label').parquet(SAVE_PATH)

# End Spark session

In [14]:
# Stop the Spark session used by this notebook
spark.stop()