In [None]:
import numpy as np
from dask.distributed import Client, LocalCluster
import dask.array as da
import dask.dataframe as dd
import pandas as pd

import paths
import math

In [None]:
# Housekeeping: start a local Dask cluster with six workers and connect a
# client to it.
cluster = LocalCluster(n_workers=6)
client = Client(cluster)

In [None]:
# Show every column when a frame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [None]:
# Season statistics: read every "*.part" file in the interim SeasonStats
# directory and discard the "Unnamed: 0" column (presumably an index column
# written out by an earlier to_csv step -- verify against the producer).
ss = dd.read_csv(f"{paths.interim}/SeasonStats/*.part")
ss = ss.drop("Unnamed: 0", axis=1)

# Detailed regular-season game results from the raw data directory.
reg_season_results = dd.read_csv(f"{paths.raw}/MRegularSeasonDetailedResults.csv")

In [None]:
# Preview the first rows of the season statistics.
ss.head()

In [None]:
# Preview the first rows of the regular-season results.
reg_season_results.head()

In [None]:
# For model training, I am scrambling the order of columns, so team 1 doesn't always win.
# A fixed seed keeps the scrambled training set identical from run to run.
SCRAMBLE_SEED = 42

scrambled_games = reg_season_results[["Season", "WTeamID", "LTeamID"]].copy()

# Draw one 0/1 swap flag per game. The array is chunked to the frame's
# partition lengths so it can be attached as a column whether the CSV was read
# into one partition or several.
partition_lengths = tuple(
    int(n) for n in scrambled_games.map_partitions(len).compute()
)
swap_arr = da.random.RandomState(SCRAMBLE_SEED).randint(
    0, 2, size=sum(partition_lengths), chunks=(partition_lengths,)
)

scrambled_games["Swap"] = swap_arr

# XOR swap: Parity = W ^ L, so W ^ Parity == L and L ^ Parity == W, while
# x ^ 0 == x. Multiplying Parity by the 0/1 Swap flag therefore exchanges the
# two IDs on rows where Swap is 1 and leaves them alone where Swap is 0.
scrambled_games["Parity"] = scrambled_games["WTeamID"] ^ scrambled_games["LTeamID"]
swap_mask = scrambled_games["Swap"] * scrambled_games["Parity"]

scrambled_games["T1_ID"] = scrambled_games["WTeamID"] ^ swap_mask
scrambled_games["T2_ID"] = scrambled_games["LTeamID"] ^ swap_mask
# Team 1 is the winner exactly when the row was not swapped.
scrambled_games["T1_Win_Indicator"] = 1 ^ scrambled_games["Swap"]

# Keep only the columns needed downstream; the helper columns are dropped.
scrambled_games = scrambled_games[["Season", "T1_ID", "T2_ID", "T1_Win_Indicator"]]

scrambled_games.head()

In [None]:
# Scrambled games with season statistics.
# The season statistics are joined twice: once on team 1's ID and once on
# team 2's ID (both within the same Season). On the second join, column names
# that overlap between the two sides receive the "_T1" / "_T2" suffixes; the
# two suffixed TeamID columns are then dropped.
sg_with_ss = (
    scrambled_games.copy()
    .merge(ss, left_on=["Season", "T1_ID"], right_on=["Season", "TeamID"])
    .merge(
        ss,
        left_on=["Season", "T2_ID"],
        right_on=["Season", "TeamID"],
        suffixes=["_T1", "_T2"],
    )
    .drop(["TeamID_T1", "TeamID_T2"], axis=1)
)


In [None]:
# Preview the merged frame (games plus both teams' season statistics).
sg_with_ss.head()

In [None]:
# Persist the merged games + season statistics under the interim data directory.
sg_with_ss.to_csv(f"{paths.interim}/ScrambledGamesWithSeasonStatistics")

In [None]:

# For all statistics, find the ratio and difference between team 1 and team 2.
# This code block sets the label mappings to make that processing possible.

# Pick the paired statistic columns by their merge suffix rather than by fixed
# positions, so the mapping stays correct if the number of season statistics
# changes.
all_columns = sg_with_ss.columns
t1_columns = all_columns[all_columns.str.endswith("_T1")]
t2_columns = all_columns[all_columns.str.endswith("_T2")]

if len(t1_columns) != len(t2_columns):
    raise ValueError(
        "Expected the same number of '_T1' and '_T2' columns, got "
        f"{len(t1_columns)} and {len(t2_columns)}"
    )

ratio_columns = []
diff_columns = []

# Maps each team-1 statistic column to the team-2 column at the same position.
column_match = {}
for t1_col, t2_col in zip(t1_columns, t2_columns):
    column_match[t1_col] = t2_col

    # The output label is the part of the name before the first underscore.
    base_label = t1_col.split('_')[0]

    ratio_columns.append(base_label + '_R')
    diff_columns.append(base_label + '_D')

# Two statistics sharing the same prefix would silently overwrite each other's
# ratio/difference column later on, so fail loudly instead.
if len(set(ratio_columns)) != len(ratio_columns):
    raise ValueError(
        "Statistic names are not unique before their first underscore; "
        "ratio/difference labels would collide"
    )
    


In [None]:
# Work on a copy of the merged frame and append the engineered columns to it.
sg_with_ss_calcs = sg_with_ss.copy()

# (team-1 column, team-2 column) pairs, in the order the labels were built.
stat_pairs = list(column_match.items())

# Ratios between the season statistics of the opposing teams (team 1 / team 2).
for ratio_label, (t1_col, t2_col) in zip(ratio_columns, stat_pairs):
    sg_with_ss_calcs[ratio_label] = sg_with_ss_calcs[t1_col] / sg_with_ss_calcs[t2_col]

# Differences between the season statistics of the opposing teams (team 1 - team 2).
# Added after all ratios so the ratio columns come first in the frame.
for diff_label, (t1_col, t2_col) in zip(diff_columns, stat_pairs):
    sg_with_ss_calcs[diff_label] = sg_with_ss_calcs[t1_col] - sg_with_ss_calcs[t2_col]

sg_with_ss_calcs

In [None]:
# Only include ratios and differences in the data we will use to train.
# The leading four columns are kept as-is (expected to be Season, T1_ID, T2_ID
# and T1_Win_Indicator -- confirm against the merged frame). The engineered
# columns are selected by the labels built earlier instead of by counting back
# a fixed number of columns, so the selection follows the number of statistics.
engineered_cols = pd.Index(ratio_columns + diff_columns)
rd_cols = sg_with_ss_calcs.columns[:4].append(engineered_cols)

In [None]:
# Restrict the frame to the columns listed in rd_cols.
sg_with_ss_calcs = sg_with_ss_calcs[rd_cols]

In [None]:
# Use five partitions for the output. The count is passed by keyword because
# the first positional parameter of repartition is `divisions`, not
# `npartitions`.
sg_with_ss_calcs = sg_with_ss_calcs.repartition(npartitions=5)

# Scrambled Games With Ratios, Differences
sg_with_ss_calcs.to_csv(paths.processed + "/EngineeredTrainData")