In [1]:
import pandas as pd

import numpy as np

import io

import os

from pyspark.sql import SparkSession

from pyspark.sql.functions import col, pandas_udf, udf, PandasUDFType

from pyspark.ml import Pipeline

from pyspark.ml.functions import vector_to_array

from pyspark.ml.linalg import Vectors, VectorUDT

from pyspark.ml.feature import StandardScaler, PCA

# from pyspark import SparkContext, SparkConf



# Create spark session

In [2]:
# Build (or reuse) the local Spark session used by every cell below.
# NOTE(review): the last classpath entry ('/home/demo/spark-avro_2.11:4.0.0.jar')
# contains a ':' which is presumably also the classpath separator here, so it would
# be read as two entries - confirm the actual jar file name.
# NOTE(review): the two timeout values carry no unit suffix; check against the Spark
# configuration reference which unit each setting assumes.
spark_options = {
    'spark.driver.extraClassPath':
        '/home/demo/hadoop/hadoop-3.2.2/share/hadoop/tools/lib/aws-java-sdk-bundle-1.11.375.jar:/home/demo/hadoop/hadoop-3.2.2/share/hadoop/tools/lib/hadoop-aws-3.2.0.jar:/home/demo/spark-avro_2.11:4.0.0.jar',
    'spark.executor.heartbeatInterval': '800000',
    'spark.network.timeout': '900000',
    "spark.sql.execution.arrow.pyspark.enabled": "true",
    "spark.sql.execution.arrow.maxRecordsPerBatch": "128",
}

# Local master with 6 worker threads
session_builder = SparkSession.builder.master('local[6]').appName('PCA Reduction')
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

# Load features from local storage

In [3]:
# Read the previously extracted image features from the local parquet dataset
images = spark.read.parquet('Features/Apples-by-label-Training-featured.parquet')

# Show the columns that were loaded
images.printSchema()

root
 |-- path: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- label: string (nullable = true)



# PCA on features
Steps:

* Convert the array of features into a dense vector stored in the column `Vect_features`
* Standardize the features (center on the mean and scale to unit variance)
* Fit PCA

Dimensionality-reduction experiment, applied after standardization of the features: features -> standard scaler -> Spark PCA

In [4]:
def _to_dense_vector(values):
    """Build a dense ML vector from an iterable of numbers, casting each one to float."""
    return Vectors.dense([float(value) for value in values])


# Spark UDF: array column -> ML vector column
list_to_vector_udf = udf(_to_dense_vector, VectorUDT())

# Add the vector version of 'features' as a new column
images = images.withColumn('Vect_features', list_to_vector_udf(col('features')))
images.printSchema()

root
 |-- path: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)



In [5]:
# Start of the pipeline experiment

In [6]:
# Stage 1 - standardization: subtract the mean and divide by the standard deviation
scaler = StandardScaler(withMean=True,
                        withStd=True,
                        inputCol='Vect_features',
                        outputCol='Scaled_features')

# Stage 2 - PCA on the standardized vectors.
# NOTE(review): k=2048 is presumably the full length of the feature vectors, so that
# the complete explained-variance spectrum is available afterwards - confirm.
pca = PCA(k=2048, inputCol='Scaled_features', outputCol='PCA_features')

# Ordered list of pipeline stages
steps = [scaler, pca]

# Assemble the pipeline and fit both stages on the image features
pipeline = Pipeline(stages=steps)
reduction = pipeline.fit(images)


In [7]:
# Target share of the total variance that the retained components must exceed
VARIANCE_TO_EXPLAIN = 0.90

# Per-component explained variance reported by the fitted PCA stage
# (the last stage of the fitted pipeline)
explained = reduction.stages[-1].explainedVariance.toArray()
if explained.size == 0:
    raise ValueError('The fitted PCA stage reports no explained variance')

# cumulative[i] is the variance explained by the first i + 1 components
cumulative = np.cumsum(explained)

# Smallest number of components whose cumulative variance is strictly above the
# target. If the target is never exceeded (e.g. a target of 1.0), keep every
# fitted component instead of looping forever.
above_target = np.flatnonzero(cumulative > VARIANCE_TO_EXPLAIN)
if above_target.size:
    componentsNum = int(above_target[0]) + 1
else:
    componentsNum = int(explained.size)

# Variance actually explained by the retained components
variance = cumulative[componentsNum - 1]

print('{} principal components explains {:%} of variance'.format(componentsNum,
                                                                variance))

183 principal components explains 90.041401% of variance


In [8]:
 
# Refit the pipeline with the PCA stage's k overridden by the number of
# components selected above
reduction = pipeline.fit(images, params={pca.k: componentsNum})

In [9]:
# Apply the fitted pipeline (standardization, then PCA) to the image features.
# NOTE(review): `images` is rebound to the transformed frame, so re-running this cell
# presumably fails because the output columns already exist - confirm.
images = reduction.transform(images)

In [10]:
# Inspect the columns of the transformed frame
images.printSchema()

root
 |-- path: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)
 |-- Scaled_features: vector (nullable = true)
 |-- PCA_features: vector (nullable = true)



In [11]:
# Convert the PCA output vectors into a plain array column named 'feat_array'
pca_as_array = vector_to_array(col('PCA_features'))
images = images.withColumn('feat_array', pca_as_array)

images.printSchema()

root
 |-- path: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- label: string (nullable = true)
 |-- Vect_features: vector (nullable = true)
 |-- Scaled_features: vector (nullable = true)
 |-- PCA_features: vector (nullable = true)
 |-- feat_array: array (nullable = false)
 |    |-- element: double (containsNull = false)



In [12]:
# End of the pipeline experiment

# Saving

In [13]:
# Persist the fitted pipeline model, replacing any model already saved at this path
reduction.write().overwrite().save('Features/Apples PCA reduction.model')

In [14]:
# Keep only the image path, its label and the reduced feature array
reduced_features = images.select('path', 'label', 'feat_array')

# Write them to parquet, one partition directory per label, replacing existing output
reduced_writer = reduced_features.write.partitionBy('label').mode("overwrite")
reduced_writer.parquet("Features/Apples-by-label-Training-featured-reducted.parquet")

# End Spark session

In [15]:
# Stop the Spark session now that the model and the reduced features are saved
spark.stop()