This Notebook steps :
    
   * Create a Spark session
    
   * Import training images to proceed, as binary, in a Spark DataFrame
    
   * Labeling images, by fruit name, extracted from images path 
    
   * Enhance image by tweeking color, sharpness, contrast, brightness
    
   * Extract 2048 features array by tranfert learning, using Keras Resnet50 CNN
    
   * Apply PCA reduction dimension
    
   * Stores path, label and PCA reducted feature array partitionned by label in parquet format file on S3 

In [1]:
import pandas as pd

from PIL import Image

from PIL import ImageEnhance

import numpy as np

import io

import os

from pyspark.sql import SparkSession

from pyspark.sql.functions import col, pandas_udf, udf, PandasUDFType, size

from pyspark import SparkContext, SparkConf

import tensorflow as tf

from tensorflow.keras.models import Model

from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input

from tensorflow.keras.preprocessing.image import img_to_array

from pyspark.ml import PipelineModel

from pyspark.ml.functions import vector_to_array

from pyspark.ml.linalg import Vectors, VectorUDT

from pyspark.ml.feature import StandardScaler, PCA

In [2]:
# Constants

WORKERS = 'local[2]'

LOAD_PATH = 's3a://fruits-images-to-proceed/Test_apples/'

SAVE_PATH = 's3a://fruits-images-proceded/Test_apples_featured-reducted.parquet'

MODEL_PATH = 's3a://pca-reduction-model/Apples PCA reduction.model'

# Create spark session

In [3]:
# create a spark session
spark = (SparkSession.builder
.master(WORKERS)
.appName('Test featuring on apples')
.config('spark.driver.extraClassPath', 
        '/home/ec2-user/hadoop/share/hadoop/tools/lib/aws-java-sdk-bundle-1.11.375.jar:/home/ec2-user/hadoop/share/hadoop/tools/lib/hadoop-aws-3.2.0.jar')         
.config('spark.executor.heartbeatInterval', '300000')
.config('spark.network.timeout', '900000') 
.config('spark.sql.execution.arrow.pyspark.enabled', 'true')
.config('spark.sql.execution.arrow.maxRecordsPerBatch', '128')
.getOrCreate()
        )

# Load images from local storage

In [4]:
%%time

images =(spark
         .read
         .format('binaryFile')
         .option('pathGlobFilter', '*.jpg')
         .option('recursiveFileLookup', 'true')
         .load(LOAD_PATH)
        )

images.printSchema()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 3.4 s


In [5]:
# Total number of images
totalMunber = images.count()
print('Total number of images in train set {}'.format(totalMunber))

Total number of images in train set 805


## Retrieve labels from image path

In [6]:
# Offset of starting image name
path_offset = len(LOAD_PATH)

In [7]:
#Get only fruit name from path
from pyspark.sql.functions import udf
from pyspark.sql import types 

col_label = udf(lambda s : extract_label(s), types.StringType())

def extract_label(s):
    last = s[path_offset :]
    return last[:last.rfind('/')]

In [8]:
images = images.withColumn('label',col_label(images.path))
images.printSchema()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)
 |-- label: string (nullable = true)



In [9]:
#Get only fruit name from path
images.select('label').show(truncate=False)

+--------------+
|label         |
+--------------+
|Apple Braeburn|
|Apple Braeburn|
|Apple Braeburn|
|Apple Braeburn|
|Apple Braeburn|
|Apple Braeburn|
|Apple Braeburn|
|Apple Braeburn|
|Apple Braeburn|
|Apple Braeburn|
|Apple Braeburn|
|Apple Braeburn|
|Apple Braeburn|
|Apple Braeburn|
|Apple Braeburn|
|Apple Red 1   |
|Apple Braeburn|
|Apple Red 1   |
|Apple Braeburn|
|Apple Braeburn|
+--------------+
only showing top 20 rows



In [10]:
# By label count
print('By label images count :')
images.groupBy('label').count().show()

By label images count :
+---------------+-----+
|          label|count|
+---------------+-----+
| Apple Golden 3|  161|
|    Apple Red 2|  164|
|Apple Pink Lady|  152|
|    Apple Red 1|  164|
| Apple Braeburn|  164|
+---------------+-----+



# Images enhancement

In [11]:
# Enhance image
def enhance(img,
            color = 1.25,
            sharpness = 4.5,
            contrast = 1.25,
            brigthness= 1.5):
    colorEnhancer = ImageEnhance.Color(img)
    img = colorEnhancer.enhance(color)
    
    sharpnessEnhancer = ImageEnhance.Sharpness(img)
    sharpnessEnhancer.enhance(sharpness)
    
    contrastEnhancer = ImageEnhance.Contrast(img)
    contrastEnhancer.enhance(contrast)
    
    brigthnessEnhancer = ImageEnhance.Brightness(img)
    brigthnessEnhancer.enhance(brigthness)
    
    return img

# Transfert learning (Resnet50)

In [12]:
def model_fn():
    '''
    Returns a ResNet50 model with top layer removed and broadcasted pretrained weights.
    '''
    
    resnet_full = ResNet50()

    resnet = Model(inputs = resnet_full.inputs,
                   outputs = resnet_full.layers[-2].output)
    return resnet

In [13]:
def preprocess(content):
    '''
    Preprocesses raw image bytes for prediction.
    '''
    # load raw image from dataframe and resize it to ResNet specifications
    img = Image.open(io.BytesIO(content)).resize([224, 224])
    # Enhance image
    img = enhance(img)
    # image to Tensor array
    arr = img_to_array(img)
    # return ResNet50 preprocessed image
    return preprocess_input(arr)


def featurize_series(model, content_series):
    '''
    Featurize a pd.Series of raw images using the input model.
    :return: a pd.Series of image features
    '''
    #   input = np.stack(content_series.map(preprocess))
    input = tf.convert_to_tensor(np.stack(content_series.map(preprocess)), dtype=tf.float32)
    # features from image
    preds = model.predict(input)
    # For some layers, output features will be multi-dimensional tensors.
    # We flatten the feature tensors to vectors for easier storage in Spark DataFrames.
    output = [p.flatten() for p in preds]
    # return features vector
    return pd.Series(output)


In [14]:
from typing import Iterator

@pandas_udf('array<float>')
def featurize_udf(content_series_iter: Iterator[pd.Series]) -> Iterator[pd.Series]:
  '''
  This method is a Scalar Iterator pandas UDF wrapping our featurization function.
  The decorator specifies that this returns a Spark DataFrame column of type ArrayType(FloatType).
  
  :param content_series_iter: This argument is an iterator over batches of data, where each batch
                              is a pandas Series of image data.
  ''' 
  # With Scalar Iterator pandas UDFs, we can load the model once and then re-use it
  # for multiple data batches.  This amortizes the overhead of loading big models.
  model = model_fn()
  for content_series in content_series_iter:
    yield featurize_series(model, content_series)

Apply featurization to the DataFrame of images

In [15]:
# Avoiding Out Of Memory (OOM) errors by reducing the Arrow batch size
spark.conf.set('spark.sql.execution.arrow.maxRecordsPerBatch', '12')

In [16]:
# Transfert learning 
images = images.withColumn('features', featurize_udf(images.content))
images.printSchema()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)
 |-- label: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)



# Standadization and PCA reduction

In [17]:
# UDF array -> vector
list_to_vector_udf = udf(lambda vs: Vectors.dense([float(i) for i in vs]),
                         VectorUDT())
# Create new column with vectors
images = images.withColumn('Vect_features', list_to_vector_udf(images.features))
images.printSchema()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)
 |-- label: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- Vect_features: vector (nullable = true)



In [18]:
reduction = PipelineModel.load(MODEL_PATH)

In [19]:
# Apply reduction
images = reduction.transform(images)

# Vector to array

In [20]:
# Transform denseVector to array

images = images.withColumn('feat_array', vector_to_array('PCA_features'))
images.printSchema()

root
 |-- path: string (nullable = true)
 |-- modificationTime: timestamp (nullable = true)
 |-- length: long (nullable = true)
 |-- content: binary (nullable = true)
 |-- label: string (nullable = true)
 |-- features: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- Vect_features: vector (nullable = true)
 |-- Scaled_features: vector (nullable = true)
 |-- PCA_features: vector (nullable = true)
 |-- feat_array: array (nullable = false)
 |    |-- element: double (containsNull = false)



In [21]:
images.show(1)

+--------------------+-------------------+------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                path|   modificationTime|length|             content|         label|            features|       Vect_features|     Scaled_features|        PCA_features|          feat_array|
+--------------------+-------------------+------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|s3a://fruits-imag...|2021-09-29 08:39:38|  5473|[FF D8 FF E0 00 1...|Apple Braeburn|[1.5647818, 0.324...|[1.56478178501129...|[1.09162302719634...|[-1.9048678493216...|[-1.9048678493216...|
+--------------------+-------------------+------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 1 row



# Enregistrement

In [22]:
# Save Spark DataFrame, partitionned by label, in S3 Bucket
(images
 .select('path','label','feat_array')
 .write
 .partitionBy('label')
 .mode('overwrite')
 .parquet(SAVE_PATH)
) 

# End Spark session

In [23]:
spark.stop()