In [None]:
from pyspark.sql import SparkSession
import cv2
import numpy as np
from pyspark.sql.functions import udf
from pyspark.sql.types import BinaryType, StringType, StructType, StructField
import os
import pandas as pd

# Start a Spark session for the resize job, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("ResizeImages")
    .getOrCreate()
)

# Folder that holds the original images
input_directory = "/workspaces/a/Fake"

# Folder that receives the resized images (local file system path)
output_directory = "/workspaces/a"

# Define the resize function
def resize_image(image_path, width=250, height=250):
    """Load an image from disk, resize it, and return it as JPEG bytes.

    Args:
        image_path: Path of the image file to read with OpenCV.
        width: Target width in pixels.
        height: Target height in pixels.

    Returns:
        The JPEG-encoded resized image as ``bytes``, or ``None`` when the
        path is empty/null, the file cannot be decoded as an image, or the
        JPEG encoding step reports failure. When used as a Spark UDF a
        ``None`` result becomes a null cell instead of failing the job.
    """
    # The path column is nullable, so guard against null/empty input
    if not image_path:
        return None

    # cv2.imread does not raise on unreadable input (directories, non-image
    # files, missing paths); it returns None, which cv2.resize would reject
    img = cv2.imread(image_path)
    if img is None:
        return None

    # Resize the image
    resized_img = cv2.resize(img, (width, height))

    # Encode the resized image back into binary format; retval is the success flag
    retval, buffer = cv2.imencode('.jpg', resized_img)
    if not retval:
        return None
    return buffer.tobytes()

# Register the UDF (User Defined Function); its result is stored as a binary column
resize_image_udf = udf(resize_image, BinaryType())

# List of image file paths in the input directory.
# os.listdir also returns sub-directories (e.g. .ipynb_checkpoints), which are
# not images, so keep regular files only. Sorting makes the row order, and
# therefore the numbering of the output files, reproducible between runs.
image_paths = [
    path
    for path in (
        os.path.join(input_directory, file)
        for file in sorted(os.listdir(input_directory))
    )
    if os.path.isfile(path)
]

# Create a schema for the DataFrame
schema = StructType([StructField("image_path", StringType(), nullable=True)])

# Create a DataFrame with the image file paths and schema
image_paths_df = spark.createDataFrame([(path,) for path in image_paths], schema)

# Apply the resize function to the DataFrame
resized_images_df = image_paths_df.withColumn("resized_image", resize_image_udf(image_paths_df["image_path"]))

# Convert the DataFrame to a Pandas DataFrame
# NOTE: this collects every resized image into driver memory
pandas_df = resized_images_df.toPandas()

# Save the resized images.
# The column already holds JPEG-encoded bytes, so write them out unchanged:
# decoding and re-encoding would apply lossy JPEG compression a second time.
os.makedirs(output_directory, exist_ok=True)
for index, image_bytes in pandas_df["resized_image"].items():
    # A null cell means no image data was produced for this row
    if image_bytes is None:
        continue
    output_path = os.path.join(output_directory, f"resized_image_{index}.jpg")
    with open(output_path, "wb") as output_file:
        output_file.write(image_bytes)


In [8]:
# Show the first row of the "resized_image" column (the binary value returned by the resize UDF)
pandas_df.resized_image[0]

bytearray(b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00\xff\xdb\x00C\x00\x02\x01\x01\x01\x01\x01\x02\x01\x01\x01\x02\x02\x02\x02\x02\x04\x03\x02\x02\x02\x02\x05\x04\x04\x03\x04\x06\x05\x06\x06\x06\x05\x06\x06\x06\x07\t\x08\x06\x07\t\x07\x06\x06\x08\x0b\x08\t\n\n\n\n\n\x06\x08\x0b\x0c\x0b\n\x0c\t\n\n\n\xff\xdb\x00C\x01\x02\x02\x02\x02\x02\x02\x05\x03\x03\x05\n\x07\x06\x07\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\xff\xc0\x00\x11\x08\x00\xfa\x00\xfa\x03\x01"\x00\x02\x11\x01\x03\x11\x01\xff\xc4\x00\x1f\x00\x00\x01\x05\x01\x01\x01\x01\x01\x01\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\xff\xc4\x00\xb5\x10\x00\x02\x01\x03\x03\x02\x04\x03\x05\x05\x04\x04\x00\x00\x01}\x01\x02\x03\x00\x04\x11\x05\x12!1A\x06\x13Qa\x07"q\x142\x81\x91\xa1\x08#B\xb1\xc1\x15R\xd1\xf0$3br\x82\t\n\x16\x17\x18\x19\x1a%&\'()*456789:CDEFGHIJSTUVWXYZcdefghijstuvwxyz\x83\x84\x85\x86\x87\x88\x89\x8a\x92\x93\x94\x9