# Testing with PySpark

More information:

https://spark.apache.org/docs/latest/api/python/getting_started/testing_pyspark.html


## Build a simple PySpark application

In [None]:
!pip install pyspark==3.5.1

!pip install numpy==1.26.4

In [None]:
from pyspark.sql import SparkSession

# Obtain the session used by every example below; getOrCreate() reuses an
# already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Testing PySpark Example")
    .getOrCreate()
)

## Test using PySpark functions

### assertDataFrameEqual

In [None]:
from pyspark.testing.utils import assertDataFrameEqual

# Example 1: two DataFrames built independently from the same rows and columns
df1 = spark.createDataFrame(
    data=[("1", 1000), ("2", 3000)],
    schema=["id", "amount"],
)
df2 = spark.createDataFrame(
    data=[("1", 1000), ("2", 3000)],
    schema=["id", "amount"],
)

assertDataFrameEqual(df1, df2)  # pass, DataFrames are identical

TO DO: make the above assertion fail (e.g. by changing the amount in one of the DataFrames).

In [None]:
# Example 2: the first "amount" differs slightly (0.1 vs 0.109); the comparison
# is made with a relative tolerance instead of requiring exact equality
df1 = spark.createDataFrame(
    data=[("1", 0.1), ("2", 3.23)],
    schema=["id", "amount"],
)
df2 = spark.createDataFrame(
    data=[("1", 0.109), ("2", 3.23)],
    schema=["id", "amount"],
)

assertDataFrameEqual(df1, df2, rtol=1e-1)  # pass, DataFrames are approx equal by rtol

TO DO: make the above assertion fail by changing the tolerance value, not the entries of the DataFrames.

### assertSchemaEqual

In [None]:
from pyspark.sql.types import ArrayType, DoubleType, StructField, StructType
from pyspark.testing.utils import assertSchemaEqual

# Two independently constructed schemas: a single nullable column "names"
# holding an array of doubles whose elements may be null.
s1 = StructType(
    [StructField("names", ArrayType(DoubleType(), containsNull=True), nullable=True)]
)
s2 = StructType(
    [StructField("names", ArrayType(DoubleType(), containsNull=True), nullable=True)]
)

assertSchemaEqual(s1, s2)  # pass, schemas are identical

TO DO: make the above assertion fail by changing one of the two schemas (e.g. its element type or nullability).

In [None]:
# Only the "actual" DataFrame is given an explicit schema; the "expected" one
# is created from the raw rows alone, so Spark assigns its schema automatically.
schema_actual = "name STRING, amount DOUBLE"

data_actual = [
    ["Alfred", 1500.0],
    ["Alfred", 2500.0],
    ["Anna", 500.0],
    ["Anna", 3000.0],
]
data_expected = [
    ["Alfred", 1500],
    ["Alfred", 2500],
    ["Anna", 500],
    ["Anna", 3000],
]

df_actual = spark.createDataFrame(data=data_actual, schema=schema_actual)
df_expected = spark.createDataFrame(data=data_expected)

# Compare the two schemas only (not the row contents).
assertSchemaEqual(df_actual.schema, df_expected.schema)

TO DO: check whether the automatically created schema matches the pre-defined one. For the test to pass, either change the pre-defined schema to match the automatically assigned one, or pass the pre-defined schema when creating the first DataFrame (`df_expected`).

### PySparkAssertionError

In [None]:
# Expected vs. actual data: identical except for the first "Alfred" amount
# (1500 expected, 1200 actual).
df_expected = spark.createDataFrame(
    data=[("Alfred", 1500), ("Alfred", 2500), ("Anna", 500), ("Anna", 3000)],
    schema=["name", "amount"],
)
df_actual = spark.createDataFrame(
    data=[("Alfred", 1200), ("Alfred", 2500), ("Anna", 500), ("Anna", 3000)],
    schema=["name", "amount"],
)

# Print the expected DataFrame for visual inspection.
df_expected.show()

In [None]:
from pyspark.errors import PySparkAssertionError
from pyspark.testing import assertDataFrameEqual

# Catch the assertion failure instead of letting it abort the cell, then
# print a short notice followed by the exception's own message.
try:
    assertDataFrameEqual(df_actual, df_expected)
except PySparkAssertionError as err:
    print("Error: DataFrames are not equal!")
    print(err)