In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession running in local mode ("local[*]").
# NOTE(review): the recorded output of this cell shows
# `ClassNotFoundException: io.delta.sql.DeltaSparkSessionExtension`, so the
# Delta session extension was not applied when this ran. Confirm the Delta
# Lake jars are on the classpath if Delta tables are needed later.
spark = (
    SparkSession.builder
    .appName("Test Bronze Ingest")
    .master("local[*]")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/29 13:46:05 WARN Utils: Your hostname, pc, resolves to a loopback address: 127.0.1.1; using 192.168.0.5 instead (on interface wlxc04a00119951)
25/11/29 13:46:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/29 13:46:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/11/29 13:46:07 WARN SparkSession: Cannot use io.delta.sql.DeltaSparkSessionExtension to configure session extensions.
java.lang.ClassNotFoundException: io.delta.sql.DeltaSparkSessionExtension
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.ja

In [3]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, expr, count
from datetime import datetime
from pyspark.sql.functions import to_timestamp, col
from pyspark.sql.utils import AnalysisException


In [4]:
# Load the raw schedule CSV, treating the first line as the header row.
# No schema is supplied and schema inference is not enabled, so every column
# is read as a string.
schedule_df = spark.read.csv("../data/schedule.csv", header=True)

In [7]:
# Print the column names and types Spark assigned when reading the CSV.
schedule_df.printSchema()

root
 |-- game_id: string (nullable = true)
 |-- round: string (nullable = true)
 |-- home_team: string (nullable = true)
 |-- away_team: string (nullable = true)
 |-- game_start_time: string (nullable = true)



In [20]:
# Total number of rows in the schedule.
schedule_df.count()

380

In [8]:
# Preview the first rows; show() truncates long cell values such as game_id.
schedule_df.show()

+--------------------+-----+-----------------+---------------+--------------------+
|             game_id|round|        home_team|      away_team|     game_start_time|
+--------------------+-----+-----------------+---------------+--------------------+
|9a7624be518dffb4c...|    1|         West Ham| Crystal Palace|2025-08-22T15:00:00Z|
|55970f64637d0f56d...|    1|           Wolves| Manchester Utd|2025-08-22T20:00:00Z|
|15feb050f27b2c46e...|    1|      Aston Villa|      Brentford|2025-08-23T18:00:00Z|
|f2bc560c91217f7e3...|    1|          Everton|       Brighton|2025-08-23T19:00:00Z|
|fcc94bed9cfb4412d...|    1|Tottenham Hotspur|        Burnley|2025-08-23T20:00:00Z|
|de44a47a4dfb2803d...|    1|           Fulham|     Nottingham|2025-08-23T20:00:00Z|
|73a4567666ebd9fcc...|    1|          Arsenal|        Chelsea|2025-08-24T15:00:00Z|
|f21b2961c297b2c60...|    1|        Liverpool|   Leeds United|2025-08-24T19:00:00Z|
|7b231677734908a6a...|    1|      Bournemouth|Manchester City|2025-08-24T20:

Check how many different teams we have

In [25]:
# Number of distinct teams that appear in the home_team column.
print(schedule_df.select("home_team").distinct().count())

20


In [26]:
# Number of distinct teams that appear in the away_team column.
print(schedule_df.select("away_team").distinct().count())

20


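Since there are 20 teams and 38 rounds, each round has 10 games, so we expect 10 x 38 = 380 games in total (20 x 38 = 760 team appearances). This matches the row count above.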

# Check if game_id has unique values

In [10]:
def column_unique(df: DataFrame, column: str) -> tuple[bool, str]:
    """Check that `column` contains no repeated values.

    Compares the total number of rows in `df` with the number of distinct
    values found in `column`.

    Args:
        df: DataFrame to validate.
        column: Name of the column expected to be unique.

    Returns:
        (True, message) when both counts match; otherwise (False, message)
        reporting the difference between the two counts.
    """
    row_count = df.count()
    unique_count = df.select(column).distinct().count()
    surplus = row_count - unique_count

    if surplus == 0:
        return True, f"Column '{column}' is unique"
    return False, f"Column '{column}' has {surplus} duplicate values"

In [11]:
column_unique(schedule_df, column="game_id")

(True, "Column 'game_id' is unique")

# Check if round contains only numbers (integers) and if it is between 1 and 38

In [13]:
def column_values_between(
    df: DataFrame, column: str, min_val: int, max_val: int
) -> tuple[bool, str]:
    """Check that every value in `column` is an integer in [min_val, max_val].

    Each value is converted with SQL `try_cast(... as int)`, which yields NULL
    instead of raising when the value cannot be converted. A row is counted as
    invalid when the converted value is NULL (non-numeric or missing), below
    `min_val`, or above `max_val`.

    Args:
        df: DataFrame to validate. It is not modified; the cast is evaluated
            as a filter expression rather than overwriting the column.
        column: Name of the column to check.
        min_val: Smallest accepted value (inclusive).
        max_val: Largest accepted value (inclusive).

    Returns:
        (True, message) when no invalid rows exist, otherwise (False, message)
        with the number of invalid rows.
    """
    # Backtick-quote the identifier (doubling any embedded backtick) so column
    # names containing spaces or other special characters still parse.
    quoted = "`" + column.replace("`", "``") + "`"
    as_int = expr(f"try_cast({quoted} as int)")

    invalid = df.filter(
        as_int.isNull() | (as_int < min_val) | (as_int > max_val)
    ).count()

    if invalid > 0:
        return False, (
            f"Column '{column}' has {invalid} invalid values "
            f"(non-numeric or outside range [{min_val}, {max_val}])"
        )

    return True, f"Column '{column}' values are valid"

In [15]:
column_values_between(schedule_df, "round", min_val=1, max_val=38)

(True, "Column 'round' values are valid")

#### Check that home_team and away_team are not the same in one row

In [17]:
def diff_columns(df: DataFrame, col1: str, col2: str) -> tuple[bool, str]:
    """Check that `col1` and `col2` never hold the same value in a row.

    A row is counted as a clash when the two values are equal or when both
    are NULL. A row where only one side is NULL is not counted.

    Args:
        df: DataFrame to validate.
        col1: Name of the first column.
        col2: Name of the second column.

    Returns:
        (False, message) when either column is absent from `df` or at least
        one clashing row exists; (True, message) otherwise.
    """
    available = df.columns
    if not (col1 in available and col2 in available):
        return False, f"Missing one or both columns: '{col1}', '{col2}'"

    left, right = col(col1), col(col2)
    both_null = left.isNull() & right.isNull()
    same_rows = df.filter((left == right) | both_null).count()

    if same_rows == 0:
        return True, f"All rows have different values for '{col1}' and '{col2}'"
    return False, f"Found {same_rows} rows where '{col1}' == '{col2}'"


In [18]:
diff_columns(schedule_df, col1="home_team", col2="away_team")

(True, "All rows have different values for 'home_team' and 'away_team'")

Check that, within each round, every team appears at most once in home_team and at most once in away_team

In [29]:
def team_unique_per_round(
    df: DataFrame, column: str, round_column: str = "round"
) -> tuple[bool, str]:
    """Check that no value of `column` repeats within the same round.

    Rows are grouped by (`round_column`, `column`); any group with more than
    one row is reported as a duplicate.

    Only the single column passed in is inspected. A team that appears once
    as home_team and once as away_team in the same round is NOT detected by
    one call; that needs a separate cross-column check.

    Args:
        df: DataFrame to validate.
        column: Name of the team column to check (e.g. "home_team").
        round_column: Name of the column identifying the round. Defaults to
            "round", the value that was previously hard-coded.

    Returns:
        (True, message) when every (round, team) pair occurs once, otherwise
        (False, message) with the number of repeated pairs.
    """
    duplicates = (
        df.groupBy(round_column, column)
        .agg(count("*").alias("occurrences"))
        .filter(col("occurrences") > 1)
    )

    dupe_count = duplicates.count()

    if dupe_count > 0:
        return False, f"Found {dupe_count} duplicate {column}(s) within the same round"
    return True, f"Each {column} appears only once per round"



In [30]:
team_unique_per_round(schedule_df, column="home_team")

(True, 'Each home_team appears only once per round')

In [31]:
team_unique_per_round(schedule_df, column="away_team")

(True, 'Each away_team appears only once per round')

Focus on game_start_time
1. All the values should be valid datetimes.
2. Values should be between 22/8/2025 and 18/5/2026.

In [7]:
def column_datetime_in_range(
    df: DataFrame,
    column: str,
    start_date: datetime,
    end_date: datetime
) -> tuple[bool, str]:
    """Check that `column` holds parsable timestamps within a date range.

    The column is converted with `to_timestamp`. The check fails when:
    1. any converted value is NULL (unparseable or missing input), or
    2. any converted value is earlier than `start_date` or later than
       `end_date` (both bounds are inclusive).

    Args:
        df: DataFrame to validate. It is not modified; the conversion is
            evaluated as a filter expression rather than added as a column.
        column: Name of the column to check.
        start_date: Earliest accepted timestamp.
        end_date: Latest accepted timestamp. The comparison uses the full
            datetime, so a bound at midnight rejects later times on that day.

    Returns:
        (True, message) when all values parse and fall inside the range;
        otherwise (False, message) describing the first failing check. Any
        exception raised while evaluating is caught and reported as
        (False, message) instead of propagating.
    """
    try:
        # NOTE(review): depending on Spark's ANSI setting, to_timestamp may
        # raise on malformed input rather than yield NULL; that case would be
        # reported by the generic handler below - confirm for this cluster.
        parsed_ts = to_timestamp(col(column))

        # Check unparseable rows
        invalid_ts_count = df.filter(parsed_ts.isNull()).count()
        if invalid_ts_count > 0:
            return False, f"Column '{column}' has {invalid_ts_count} unparseable datetime values"

        # Now check date range (inclusive)
        out_of_range_df = df.filter(
            (parsed_ts < start_date) | (parsed_ts > end_date)
        )
        out_of_range = out_of_range_df.count()

        if out_of_range > 0:
            # DataFrame.show() prints and returns None, so it cannot be used
            # inside the message; collect a few offending values instead.
            sample = [
                row[0]
                for row in out_of_range_df.select(col(column)).limit(5).collect()
            ]
            return False, (
                f"Column '{column}' has {out_of_range} values outside range "
                f"[{start_date.date()} to {end_date.date()}]; "
                f"sample offending values: {sample}"
            )

        return True, f"All values in column '{column}' are valid datetimes within range."

    except AnalysisException as e:
        return False, f"Failed to analyze column '{column}': {str(e)}"
    except Exception as e:
        return False, f"Unexpected error validating column '{column}': {str(e)}"


In [9]:
column_datetime_in_range(
    schedule_df,
    "game_start_time",
    start_date=datetime(2025, 8, 22),
    end_date=datetime(2026, 5, 18),
)

(True,
 "All values in column 'game_start_time' are valid datetimes within range.")