In [18]:
from pyspark.sql import SparkSession

# Reuse the active session if one exists; otherwise start a local one using all cores.
spark = SparkSession.builder.master("local[*]").getOrCreate()

Read the images with Spark's `image` data source, using a glob pattern in the path.

In [19]:
import os

# The glob matches every .jpg one directory level below train/.
df = (
    spark.read
    .format("image")
    .load(os.path.join("/opt/workspace", "media/images/catsAndDogs40/train/*/*.jpg"))
)
df.show(n=5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

The output appears to show a single struct column, so let's look at the schema.

In [8]:
# Print the column tree to see which fields are nested inside the loaded column.
df.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)


Okay, so let's expand the `image` struct into separate columns.

In [9]:
# "image.*" promotes every field of the image struct to its own top-level column.
df.select("image.*").show(5)

+--------------------+------+-----+---------+----+--------------------+
|              origin|height|width|nChannels|mode|                data|
+--------------------+------+-----+---------+----+--------------------+
|file:///opt/works...|   160|  160|        3|  16|[0B 17 17 13 20 1...|
|file:///opt/works...|   160|  160|        3|  16|[32 3E 3E 33 42 3...|
|file:///opt/works...|   160|  160|        3|  16|[F2 EF EB F3 F0 E...|
|file:///opt/works...|   160|  160|        3|  16|[FA C5 82 FA C5 8...|
|file:///opt/works...|   160|  160|        3|  16|[40 53 58 54 6A 6...|
+--------------------+------+-----+---------+----+--------------------+


Extract image features: the mean pixel level of each channel.

In [29]:
import pyspark.sql.functions as F
from pyspark.sql.pandas.functions import pandas_udf
import pyspark.sql.types as T
import pandas as pd
from PIL import ImageStat, Image

@pandas_udf(returnType=T.ArrayType(T.FloatType()))
def mean_pixel_levels(height: pd.Series, width: pd.Series, bytes: pd.Series) -> pd.Series:
    """Compute the mean pixel level of every band for each image in a batch.

    Args:
        height: Image heights in pixels, one entry per row.
        width: Image widths in pixels, one entry per row.
        bytes: Raw pixel buffers, one entry per row. The name shadows the
            builtin but is kept so the UDF signature stays unchanged.

    Returns:
        A Series with one list of floats per image: the mean of each band,
        in the order the bands are stored in the buffer.
    """
    # The mode is used here only to tell Pillow to unpack three 8-bit bands per
    # pixel; the band means are taken over the bytes as stored.
    # NOTE(review): Spark's image source documents its data as OpenCV-style
    # BGR, so these are presumably B, G, R means rather than Y, Cb, Cr --
    # confirm before relabelling or reordering the output.
    mode = 'YCbCr'

    # Pillow's size tuple is (width, height). The previous (height, width)
    # order built a transposed-shape image for non-square inputs; the per-band
    # means happen to be unaffected by that, but any row/column-aware
    # statistic added later would have been wrong.
    return pd.Series(
        [
            ImageStat.Stat(Image.frombytes(mode, (w, h), b)).mean
            for h, w, b in zip(height, width, bytes)
        ]
    )
    

# Flatten the image struct, then append the per-band mean levels as an array column.
features_df = df.select("image.*").withColumn(
    "mean_pixel_levels",
    mean_pixel_levels(F.col("height"), F.col("width"), F.col("data")),
)
features_df.show(n=5)


+--------------------+------+-----+---------+----+--------------------+--------------------+
|              origin|height|width|nChannels|mode|                data|   mean_pixel_levels|
+--------------------+------+-----+---------+----+--------------------+--------------------+
|file:///opt/works...|   160|  160|        3|  16|[0B 17 17 13 20 1...|[105.68774, 122.5...|
|file:///opt/works...|   160|  160|        3|  16|[32 3E 3E 33 42 3...|[90.60562, 100.55...|
|file:///opt/works...|   160|  160|        3|  16|[F2 EF EB F3 F0 E...|[107.34871, 119.9...|
|file:///opt/works...|   160|  160|        3|  16|[FA C5 82 FA C5 8...|[117.33078, 136.5...|
|file:///opt/works...|   160|  160|        3|  16|[40 53 58 54 6A 6...|[65.62848, 96.271...|
+--------------------+------+-----+---------+----+--------------------+--------------------+
