In [1]:
import io
import numpy as np
import pandas as pd
from pyspark.sql.functions import col, pandas_udf, regexp_extract
from PIL import Image

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from os.path import abspath
import os

# SparkSession
URL_SPARK = "spark://spark-master:7077"
warehouse_location = './spark-warehouse'

spark = (
    SparkSession.builder
    .appName("spark-ml-multiVM")
    .config("executor.memory", "8g")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .master(URL_SPARK)
    .getOrCreate()
)

/usr/local/lib/python3.9/dist-packages/pyspark/bin/load-spark-env.sh: line 68: ps: command not found
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/02/01 01:15:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
images = spark.read.format("binaryFile") \
            .option("recursiveFileLookup", "true") \
            .option("pathGlobFilter", "*.jpg") \
            .load("./data")

In [4]:
# images.count()

In [5]:
def extract_label(path_col):
    return regexp_extract(path_col, "flower_photo/([^/]+)", 1)

In [6]:
def extract_size(content):
    image = Image.open(io.BytesIO(content))
    return image.size

In [7]:
@pandas_udf("width: int, height: int")
def extract_size_udf(content_series):
    sizes = content_series.apply(extract_size)
    return pd.DataFrame(list(sizes))

In [8]:
df = images.select(
    col("path"),
    extract_size_udf(col("content")).alias("size"),
    extract_label(col("path")).alias("label"),
    col("content")
)

In [9]:
# df.show(5)

In [10]:
import io
 
from tensorflow.keras.applications.imagenet_utils import decode_predictions
import pandas as pd
from pyspark.sql.functions import col, pandas_udf, PandasUDFType
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image

2024-02-01 01:15:32.546822: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-01 01:15:32.546930: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-01 01:15:32.548136: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-01 01:15:32.557537: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
class ImageNetDataset(Dataset):
  """
  Converts image contents into a PyTorch Dataset with standard ImageNet preprocessing.
  """
  def __init__(self, contents):
    self.contents = contents
 
  def __len__(self):
    return len(self.contents)
 
  def __getitem__(self, index):
    return self._preprocess(self.contents[index])
 
  def _preprocess(self, content):
    """
    Preprocesses the input image content using standard ImageNet normalization.
    
    See https://pytorch.org/docs/stable/torchvision/models.html.
    """
    image = Image.open(io.BytesIO(content))
    transform = transforms.Compose([
      transforms.Resize(256),
      transforms.CenterCrop(224),
      transforms.ToTensor(),
      transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    return transform(image)

In [12]:
def imagenet_model_udf(model_fn):
  """
  Wraps an ImageNet model into a Pandas UDF that makes predictions.
  
  You might consider the following customizations for your own use case:
    - Tune DataLoader's batch_size and num_workers for better performance.
    - Use GPU for acceleration.
    - Change prediction types.
  """
  def predict(content_series_iter):
    model = model_fn()
    model.eval()
    for content_series in content_series_iter:
      dataset = ImageNetDataset(list(content_series))
      loader = DataLoader(dataset, batch_size=64)
      with torch.no_grad():
        for image_batch in loader:
          predictions = model(image_batch).numpy()
          predicted_labels = [x[0] for x in decode_predictions(predictions, top=1)]
          yield pd.DataFrame(predicted_labels)
  return_type = "class: string, desc: string, score:float"
  return pandas_udf(return_type, PandasUDFType.SCALAR_ITER)(predict)

In [13]:
resnet50 = models.resnet50(pretrained=True)
bc_resnet50_state = spark.sparkContext.broadcast(resnet50.state_dict())

def resnet50_fn():
    model = models.resnet50(pretrained=True)
    model.load_state_dict(bc_resnet50_state.value)
    return model

resnet50_udf = imagenet_model_udf(resnet50_fn)



In [14]:
my_image = df.limit(5)
predictions = my_image.withColumn("prediction", resnet50_udf(col("content")))

In [15]:
predictions.show()

                                                                                

+--------------------+----------+-----+--------------------+--------------------+
|                path|      size|label|             content|          prediction|
+--------------------+----------+-----+--------------------+--------------------+
|file:/opt/workspa...|{500, 441}|     |[FF D8 FF E0 00 1...|{n11939491, daisy...|
|file:/opt/workspa...|{500, 333}|     |[FF D8 FF E0 00 1...|{n02206856, bee, ...|
|file:/opt/workspa...|{500, 333}|     |[FF D8 FF E0 00 1...|{n03837869, obeli...|
|file:/opt/workspa...|{500, 290}|     |[FF D8 FF E0 00 1...|{n11939491, daisy...|
|file:/opt/workspa...|{500, 322}|     |[FF D8 FF E0 00 1...|{n03134739, croqu...|
+--------------------+----------+-----+--------------------+--------------------+

