In [35]:
from pyspark.sql import SparkSession

# Local-mode session (master "local[*]") with the Delta Lake SQL extension and
# the Delta catalog registered as `spark_catalog`. getOrCreate() reuses an
# already-running session if one exists.
spark = (
    SparkSession.builder
    .appName("Test Bronze Ingest")
    .master("local[*]")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)


In [39]:
from pyspark.sql import functions as F

In [36]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, expr, count
from pyspark.sql.functions import col, when, sum as spark_sum

In [40]:
from datetime import datetime, timedelta

START_DATE = datetime(2025, 8, 18, 8, 0)  # Monday 08:00 -- run time of iteration 0


def get_weekly_run_datetime(iteration: int) -> datetime:
    """Return the weekly run datetime for a zero-based iteration number.

    Iteration 0 maps to START_DATE; every following iteration is exactly one
    week later, so each result is a Monday at 08:00.

    Args:
        iteration: Zero-based week index. The intended range is 0-38, but
            only the lower bound is enforced here.

    Returns:
        START_DATE shifted forward by ``iteration`` weeks.

    Raises:
        ValueError: If ``iteration`` is negative.
    """
    if iteration < 0:
        raise ValueError("Iteration must be >= 0")

    weeks_elapsed = timedelta(weeks=iteration)
    return START_DATE + weeks_elapsed


In [206]:
# Load the scores CSV, treating the first line as column names. No schema is
# supplied and inference is off, so every column is read as a string.
scores_df = spark.read.csv("../data/scores.csv", header=True)

In [207]:
# Load the schedule CSV, treating the first line as column names. No schema is
# supplied and inference is off, so every column is read as a string.
schedule_df = spark.read.csv("../data/schedule.csv", header=True)

In [220]:
# Window bounds: the run datetimes of iterations 1 and 2 (one week apart).
start_time, end_time = (get_weekly_run_datetime(week) for week in (1, 2))
print(f"start_time: {start_time}, end_time: {end_time}")

start_time: 2025-08-25 08:00:00, end_time: 2025-09-01 08:00:00


In [215]:
# Keep only the scores ingested in the half-open window (start_time, end_time].
# `ingestion_time` is cast to a timestamp in a temporary column used only for
# the comparison; the column is dropped again so the output schema matches
# scores_df.
scores_df_2 = (
    scores_df
    .withColumn("ingestion_timestamp", F.col("ingestion_time").cast("timestamp"))
    .where(F.col("ingestion_timestamp") > start_time)
    .where(F.col("ingestion_timestamp") <= end_time)
    .drop("ingestion_timestamp")
)

In [216]:
# Number of score rows whose ingestion_time falls inside (start_time, end_time].
scores_df_2.count()

9

In [217]:
# Semi join: keep the schedule rows whose game_id appears in the filtered
# scores. Only schedule_df's columns are returned, and a schedule row is not
# multiplied when several score rows share its game_id.
schedule_df_2 = schedule_df.join(scores_df_2.select("game_id"), "game_id", "left_semi")

In [218]:
# Number of schedule rows kept by the semi join against scores_df_2.
schedule_df_2.count()

9

In [219]:
# Print the schedule rows kept by the semi join for visual inspection.
schedule_df_2.show()

+--------------------+-----+------------+---------------+--------------------+
|             game_id|round|   home_team|      away_team|     game_start_time|
+--------------------+-----+------------+---------------+--------------------+
|07b3a6ff878548296...|    1|   Newcastle|     Sunderland|2025-08-25T18:00:00Z|
|5274552fdd6188eeb...|    2| Aston Villa|     Sunderland|2025-08-29T15:00:00Z|
|ba62488f3b28d8fc1...|    2|Leeds United|        Arsenal|2025-08-29T19:00:00Z|
|f04f165c57b424f86...|    2|    West Ham|       Brighton|2025-08-29T20:00:00Z|
|f47b46d09f6959bee...|    2|     Everton|     Nottingham|2025-08-29T20:00:00Z|
|fdd011e399610f2c0...|    2|      Wolves|Manchester City|2025-08-30T15:00:00Z|
|0fb3dcae1fe83c76d...|    2|   Liverpool|      Brentford|2025-08-31T15:00:00Z|
|9c9f1674b2d1f2400...|    2|   Newcastle|        Burnley|2025-08-31T18:00:00Z|
|09be137260fd71d52...|    2|      Fulham| Manchester Utd|2025-08-31T18:00:00Z|
+--------------------+-----+------------+-----------

In [203]:
# Count distinct game_id values; a result lower than schedule_df_2.count()
# means the same game_id occurs on more than one schedule row.
schedule_df_2.select("game_id").distinct().count()

9

In [205]:
# NOTE(review): repeats the earlier schedule_df_2.show() cell. The saved output
# of this cell appears to list one game twice (ids are truncated in the
# display) -- verify whether schedule.csv contains duplicate rows.
schedule_df_2.show()

+--------------------+-----+-----------------+---------------+--------------------+
|             game_id|round|        home_team|      away_team|     game_start_time|
+--------------------+-----+-----------------+---------------+--------------------+
|9a7624be518dffb4c...|    1|         West Ham| Crystal Palace|2025-08-22T15:00:00Z|
|55970f64637d0f56d...|    1|           Wolves| Manchester Utd|2025-08-22T20:00:00Z|
|15feb050f27b2c46e...|    1|      Aston Villa|      Brentford|2025-08-23T18:00:00Z|
|f2bc560c91217f7e3...|    1|          Everton|       Brighton|2025-08-23T19:00:00Z|
|fcc94bed9cfb4412d...|    1|Tottenham Hotspur|        Burnley|2025-08-23T20:00:00Z|
|de44a47a4dfb2803d...|    1|           Fulham|     Nottingham|2025-08-23T20:00:00Z|
|73a4567666ebd9fcc...|    1|          Arsenal|        Chelsea|2025-08-24T15:00:00Z|
|73a4567666ebd9fcc...|    1|          Arsenal|        Chelsea|2025-08-24T15:00:00Z|
|f21b2961c297b2c60...|    1|        Liverpool|   Leeds United|2025-08-24T19: