In [None]:
-- Install required libraries (if not already installed)
!pip install pandas pyspark

In [None]:
# Import necessary modules (kept together so a fresh kernel has everything
# in scope after running this single cell).
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd

In [None]:
# Start (or reuse) the Spark session for this notebook.
# getOrCreate() returns the already-running session if one exists, so
# re-running this cell does not spawn a second session.
spark = SparkSession.builder.appName("DraftAnalysis").getOrCreate()

In [None]:
# Load the two input CSV files into Spark DataFrames.
# header=True takes column names from the first row; inferSchema=True makes
# Spark scan the data to guess column types instead of reading all as strings.
draft_data = spark.read.csv("/mnt/data/2024_draft_leagues_1283_recap_combined.csv", header=True, inferSchema=True)
player_data = spark.read.csv("/mnt/data/2024_SC_Player_list.csv", header=True, inferSchema=True)

In [None]:
# Import the PySpark column functions under the conventional alias F.
from pyspark.sql import functions as F

In [None]:
# Calculate the (approximate) median of `points` for each player_id,
# sorted from highest to lowest median.
median_data = player_data.groupBy("player_id").agg(
    F.expr('percentile_approx(points, 0.5)').alias('median_score')
).orderBy(F.desc('median_score'))

# Replace missing medians with 0 so downstream arithmetic never sees a null
# for a player that is present in player_data.
median_data = median_data.fillna({'median_score': 0})

In [None]:
# Attach each player's median score to the draft rows.
# Joining on the column *name* (instead of an equality expression between the
# two frames) keeps a single `player_id` column in the result; with the
# expression form both copies survive and any later reference to "player_id"
# fails as ambiguous. The left join keeps every draft row; players missing
# from median_data get a null median_score.
heat_map_data = draft_data.join(median_data, on="player_id", how="left")

In [None]:
# Normalize scores by position: for every position, add a
# "<POS>_norm_score" column holding the z-score of median_score among the rows
# eligible for that position (pos_1 or pos_2 equals the position). Rows that
# are not eligible get null in that column.
positions = ['DEF', 'MID', 'RUC', 'FWD']
# NOTE(review): pos_count is not used in the visible cells; presumably roster
# slots per position times 8 teams -- TODO confirm or remove.
pos_count = [8*5, 8*7, 8*1, 8*5]

for pos in positions:
    # Eligibility is evaluated per row, so no join back onto heat_map_data is
    # needed. The previous self-join on player_id multiplied rows whenever a
    # player appeared on more than one draft row.
    in_pos = (F.col("pos_1") == pos) | (F.col("pos_2") == pos)
    pos_data = heat_map_data.filter(in_pos)

    # Compute both statistics in a single Spark job.
    stats = pos_data.agg(
        F.mean("median_score").alias("mean_score"),
        F.stddev("median_score").alias("stdev_score"),
    ).first()
    mean_score = stats["mean_score"]
    stdev_score = stats["stdev_score"]

    # Calculate normalized scores. The z-score is undefined when there are
    # too few rows (null statistics) or no spread (stddev of 0); emit null
    # explicitly instead of relying on engine-specific division behaviour.
    if mean_score is None or not stdev_score:
        norm_score = F.lit(None).cast("double")
    else:
        norm_score = F.round(
            (F.col("median_score") - F.lit(mean_score)) / F.lit(stdev_score), 3
        )

    # Add normalized scores to main data (null for rows outside the position).
    heat_map_data = heat_map_data.withColumn(
        f"{pos}_norm_score",
        F.when(in_pos, norm_score)
    )

In [None]:
# Calculate final score: the best (largest) of the four per-position
# normalized scores for each row, rounded to 4 decimals. F.greatest skips
# nulls, so a row only gets a null final_score when all four are null.
heat_map_data = heat_map_data.withColumn(
    "final_score",
    F.round(
        F.greatest(
            F.col("DEF_norm_score"),
            F.col("MID_norm_score"),
            F.col("RUC_norm_score"),
            F.col("FWD_norm_score")
        ), 4
    )
)

In [None]:
# Save the final dataframe as CSV. Spark writes a *directory* at this path
# containing one part file per partition (not a single .csv file);
# mode="overwrite" replaces any existing output at the path.
heat_map_data.write.csv("/mnt/data/2024_draft_heat_map.csv", header=True, mode="overwrite")