In [2]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType
import pandas as pd

# Create a SparkSession
spark = SparkSession.builder.master("local").appName("TimingComparison").getOrCreate()

# Read data from a CSV file
df = spark.read.csv("data.csv", header=True, inferSchema=True)

# Register the PySpark DataFrame as a temporary table
df.createOrReplaceTempView("my_table")

# Define a PySpark Pandas UDF
@pandas_udf(df.schema, PandasUDFType.GROUPED_MAP)
def my_pandas_udf(data: pd.DataFrame) -> pd.DataFrame:
    # Perform data processing using Pandas DataFrame operations
    processed_data = data[data['column'] > 10][['column1', 'column2']].groupby('column1').sum()
    return processed_data

# Start the timer for PySpark Pandas API
start_time_pyspark = time.time()

# Apply the PySpark Pandas UDF and count the number of rows
processed_df = df.groupby('column').apply(my_pandas_udf)
pandas_count_pyspark = processed_df.count()

# Calculate the execution time for PySpark Pandas API
execution_time_pyspark = time.time() - start_time_pyspark

# Convert the processed DataFrame to Pandas DataFrame
pandas_df = processed_df.toPandas()

# Start the timer for pure Pandas
start_time_pandas = time.time()

# Perform additional data processing using Pandas DataFrame operations and count the number of rows
pandas_result = pandas_df.groupby('column1').sum()
pandas_count_pandas = pandas_result.shape[0]

# Calculate the execution time for pure Pandas
execution_time_pandas = time.time() - start_time_pandas

# Display the counts and execution times
print("PySpark Pandas API Count:", pandas_count_pyspark)
print("PySpark Pandas API Execution Time:", execution_time_pyspark)

print("Pure Pandas Count:", pandas_count_pandas)
print("Pure Pandas Execution Time:", execution_time_pandas)

# Stop the SparkSession
spark.stop()


23/05/26 16:26:09 ERROR Executor: Exception in task 0.0 in stage 6.0 (TID 5)/ 1]
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/alin/.local/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 830, in main
    process()
  File "/home/alin/.local/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 822, in process
    serializer.dump_stream(out_iter, outfile)
  File "/home/alin/.local/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 345, in dump_stream
    return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/alin/.local/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 86, in dump_stream
    for batch in iterator:
  File "/home/alin/.local/lib/py

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/home/alin/.local/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 830, in main
    process()
  File "/home/alin/.local/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 822, in process
    serializer.dump_stream(out_iter, outfile)
  File "/home/alin/.local/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 345, in dump_stream
    return ArrowStreamSerializer.dump_stream(self, init_stream_yield_batches(), stream)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/alin/.local/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 86, in dump_stream
    for batch in iterator:
  File "/home/alin/.local/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/sql/pandas/serializers.py", line 338, in init_stream_yield_batches
    for series in iterator:
  File "/home/alin/.local/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 593, in mapper
    return f(keys, vals)
           ^^^^^^^^^^^^^
  File "/home/alin/.local/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 209, in <lambda>
    return lambda k, v: [(wrapped(k, v), to_arrow_type(return_type))]
                          ^^^^^^^^^^^^^
  File "/home/alin/.local/lib/python3.11/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 202, in wrapped
    raise RuntimeError(
RuntimeError: Number of columns of the returned pandas.DataFrame doesn't match specified schema. Expected: 3 Actual: 1


In [22]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import IntegerType, StringType, StructField, StructType
import pandas as pd

# Create a SparkSession
spark = SparkSession.builder.master("local").appName("TimingComparison").config("spark.cores.max", "1").getOrCreate()

# Read data from a CSV file
df = spark.read.csv("data.csv", header=True, inferSchema=True)

# Register the PySpark DataFrame as a temporary table
df.createOrReplaceTempView("my_table")

# Define the schema for the output DataFrame
output_schema = StructType([
    StructField("column1", StringType(), nullable=False),
    StructField("column2", IntegerType(), nullable=False)
])

# Define a PySpark Pandas UDF
@pandas_udf(output_schema, PandasUDFType.GROUPED_MAP)
def my_pandas_udf(data: pd.DataFrame) -> pd.DataFrame:
    # Perform data processing using Pandas DataFrame operations
    processed_data = data.groupby('column1')['column2'].sum().reset_index()
    return processed_data

# Start the timer for PySpark Pandas API
start_time_pyspark = time.time()

# Apply the PySpark Pandas UDF and count the number of rows
processed_df = df.groupby('column1').apply(my_pandas_udf)
pandas_count_pyspark = processed_df.count()

# Calculate the execution time for PySpark Pandas API
execution_time_pyspark = time.time() - start_time_pyspark

# Convert the processed DataFrame to Pandas DataFrame
pandas_df = processed_df.toPandas()

# Start the timer for pure Pandas
start_time_pandas = time.time()

# Perform additional data processing using Pandas DataFrame operations and count the number of rows
pandas_result = pandas_df.groupby('column1')['column2'].sum().reset_index()
pandas_count_pandas = pandas_result.shape[0]

# Calculate the execution time for pure Pandas
execution_time_pandas = time.time() - start_time_pandas

# Display the counts and execution times
print("PySpark Pandas API Count:", pandas_count_pyspark)
print("PySpark Pandas API Execution Time:", execution_time_pyspark)

print("Pure Pandas Count:", pandas_count_pandas)
print("Pure Pandas Execution Time:", execution_time_pandas)

# Stop the SparkSession
spark.stop()


23/05/26 16:55:46 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


ArithmeticException: / by zero

In [5]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import IntegerType, StringType, StructField, StructType
import pandas as pd

# Create a SparkSession
spark = SparkSession.builder.master("local").appName("TimingComparison").getOrCreate()

spark.conf.set("spark.executor.memory", "1g")
spark.conf.set("spark.executor.cores", "1")

# Read data from a CSV file
df = spark.read.csv("data.csv", header=True, inferSchema=True)

# Register the PySpark DataFrame as a temporary table
df.createOrReplaceTempView("my_table")

# Define the schema for the output DataFrame
output_schema = StructType([
    StructField("column1", StringType(), nullable=False),
    StructField("column2", IntegerType(), nullable=False)
])

# Define a PySpark Pandas UDF
@pandas_udf(output_schema, PandasUDFType.GROUPED_MAP)
def my_pandas_udf(data: pd.DataFrame) -> pd.DataFrame:
    # Perform data processing using Pandas DataFrame operations
    processed_data = data.groupby('column1')['column2'].sum().reset_index()
    return processed_data

# Start the timer for PySpark Pandas API
start_time_pyspark = time.time()

# Apply the PySpark Pandas UDF and count the number of rows
processed_df = df.groupby('column1').apply(my_pandas_udf)
pandas_count_pyspark = processed_df.count()

# Calculate the execution time for PySpark Pandas API
execution_time_pyspark = time.time() - start_time_pyspark

# Convert the processed DataFrame to Pandas DataFrame
pandas_df = processed_df.toPandas()

# Start the timer for pure Pandas
start_time_pandas = time.time()

# Perform additional data processing using Pandas DataFrame operations and count the number of rows
pandas_result = pandas_df.groupby('column1')['column2'].sum().reset_index()
pandas_count_pandas = pandas_result.shape[0]

# Calculate the execution time for pure Pandas
execution_time_pandas = time.time() - start_time_pandas

# Display the counts and execution times
print("PySpark Pandas API Count:", pandas_count_pyspark)
print("PySpark Pandas API Execution Time:", execution_time_pyspark)

print("Pure Pandas Count:", pandas_count_pandas)
print("Pure Pandas Execution Time:", execution_time_pandas)

# Stop the SparkSession
spark.stop()


23/05/26 17:06:15 WARN SparkConf: Total executor cores: 1 is not divisible by cores per executor: 4, the left cores: 1 will not be allocated
                                                                                

PySpark Pandas API Count: 4
PySpark Pandas API Execution Time: 13.98938250541687
Pure Pandas Count: 4
Pure Pandas Execution Time: 0.0017251968383789062
