In [101]:
import pandas as pd
import numpy as np
from helper_functions import get_master_df, save_df, drop_extraneous_col

In [102]:
# Read the cached master table from disk rather than rebuilding it.
# df = get_master_df()

df = pd.read_csv(
    filepath_or_buffer="csvs/master_df.csv",
    parse_dates=["date"],
)
# The return value is not kept, so the helper presumably edits df in place — TODO confirm.
drop_extraneous_col(df)

In [103]:
# Teams listed division by division; a team's integer code is its 1-based
# position in this listing.
team_order = [
    # ATLANTIC
    "TOR", "BOS", "NYK", "BRK", "PHI",
    # CENTRAL
    "CLE", "IND", "DET", "CHI", "MIL",
    # SOUTHEAST
    "MIA", "ATL", "CHO", "WAS", "ORL",
    # NORTHWEST
    "OKC", "POR", "UTA", "DEN", "MIN",
    # PACIFIC
    "GSW", "LAC", "SAC", "PHO", "LAL",
    # SOUTHWEST
    "SAS", "DAL", "MEM", "HOU", "NOP",
]

# Maps team abbreviation -> integer code (1..30).
team_encoding = {team: code for code, team in enumerate(team_order, start=1)}

In [104]:
teams = team_encoding.keys()
# Most recent game date seen so far, keyed by "<team>_<season>".
last_game_date = {}

# Placeholder columns, filled in by the rest-day computation below.
# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError.
df['rest_days_team0'] = np.nan
df['rest_days_team1'] = np.nan

In [105]:
# Rest days = whole days off between a team's consecutive games in one season
# (games on consecutive days -> 0). A team's first game of a season gets 0.

# Start from an empty tracker so that re-running this cell on its own never
# reuses dates left over from a previous run.
last_game_date = {}

team0_col = df['team0'].tolist()
team1_col = df['team1'].tolist()
season_col = df['season'].tolist()
date_col = df['date'].tolist()

# Results are collected positionally and written to the frame once at the end.
rest_days = {
    'rest_days_team0': np.zeros(len(df), dtype='float64'),
    'rest_days_team1': np.zeros(len(df), dtype='float64'),
}

# Visit games chronologically even if the frame is not date-sorted; the stable
# sort keeps same-day games in their original row order.
for pos in np.argsort(df['date'].to_numpy(), kind='stable'):
    game_date = date_col[pos]

    # One tracker key per team and season, so the gap between seasons is never
    # counted as rest.
    keys = {
        'rest_days_team0': f"{team0_col[pos]}_{season_col[pos]}",
        'rest_days_team1': f"{team1_col[pos]}_{season_col[pos]}",
    }

    for column, key in keys.items():
        previous_date = last_game_date.get(key)
        if previous_date is not None:
            rest_days[column][pos] = (game_date - previous_date).days - 1

    # Record this game only after both teams' rest days were computed.
    for key in keys.values():
        last_game_date[key] = game_date

for column, values in rest_days.items():
    df[column] = values

In [106]:
# Display the frame to inspect the new rest_days_team0 / rest_days_team1 columns.
df

Unnamed: 0,mp_team0,fg_team0,fga_team0,fg%_team0,3p_team0,3pa_team0,3p%_team0,ft_team0,fta_team0,ft%_team0,...,drtg_team1,ft/fga_team1,team1,winner,season,date,team0_encoded,team1_encoded,rest_days_team0,rest_days_team1
0,240.0,36.0,88.0,0.409,8.0,32.0,0.250,19.0,25.0,0.760,...,99.7,0.253,CLE,CLE,2018,2017-10-17,2,6,0.0,0.0
1,240.0,47.0,97.0,0.485,15.0,41.0,0.366,13.0,19.0,0.684,...,119.6,0.238,GSW,HOU,2018,2017-10-17,29,21,0.0,0.0
2,240.0,30.0,79.0,0.380,7.0,25.0,0.280,24.0,29.0,0.828,...,97.6,0.174,MEM,MEM,2018,2017-10-18,30,28,0.0,0.0
3,240.0,37.0,85.0,0.435,8.0,19.0,0.421,17.0,23.0,0.739,...,104.8,0.191,SAS,SAS,2018,2017-10-18,20,26,0.0,0.0
4,240.0,38.0,76.0,0.500,7.0,21.0,0.333,25.0,30.0,0.833,...,110.4,0.121,BOS,MIL,2018,2017-10-18,10,2,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8463,240.0,45.0,84.0,0.536,17.0,35.0,0.486,12.0,14.0,0.857,...,127.9,0.282,BOS,BOS,2024,2024-03-20,10,2,2.0,1.0
8464,240.0,41.0,95.0,0.432,13.0,37.0,0.351,7.0,18.0,0.389,...,103.5,0.067,PHO,PHO,2024,2024-03-20,5,24,1.0,2.0
8465,240.0,38.0,83.0,0.458,13.0,36.0,0.361,18.0,19.0,0.947,...,121.3,0.253,CLE,MIA,2024,2024-03-20,11,6,1.0,1.0
8466,240.0,45.0,86.0,0.523,13.0,31.0,0.419,13.0,15.0,0.867,...,122.6,0.123,POR,LAC,2024,2024-03-20,22,17,2.0,1.0


In [107]:
# Summary statistics for team0 rest days; compare max against the quartiles to spot extreme gaps.
df['rest_days_team0'].describe()

count    8468.000000
mean        1.298536
std         4.956565
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max       144.000000
Name: rest_days_team0, dtype: float64

In [108]:
# Summary statistics for team1 rest days; compare max against the quartiles to spot extreme gaps.
df['rest_days_team1'].describe()

count    8468.000000
mean        1.443788
std         5.421811
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max       145.000000
Name: rest_days_team1, dtype: float64

In [109]:
def find_stats(data: pd.Series, num_sd: float = 3) -> tuple:
    """Return the mean of ``data`` and a symmetric band around it.

    Parameters
    ----------
    data : pd.Series
        Numeric values to summarise.
    num_sd : float, default 3
        Half-width of the band, expressed in standard deviations.

    Returns
    -------
    tuple
        ``(mean, lower_bound, upper_bound)`` where the bounds are
        ``mean - num_sd * sd`` and ``mean + num_sd * sd`` and ``sd`` is
        ``data.std()``.
    """
    mean = data.mean()
    spread = num_sd * data.std()

    return mean, mean - spread, mean + spread

In [110]:
# Flag team0 rest-day values that fall outside the band returned by find_stats.
rest_days_team0 = df["rest_days_team0"]
mean, lower_bound, upper_bound = find_stats(rest_days_team0)

outliers_mask = rest_days_team0.gt(upper_bound) | rest_days_team0.lt(lower_bound)

# Flagged values are overwritten with 0.
# NOTE(review): this makes a very long break look like zero rest — confirm intended.
df.loc[outliers_mask, 'rest_days_team0'] = 0

In [111]:
# Flag team1 rest-day values that fall outside the band returned by find_stats.
rest_days_team1 = df["rest_days_team1"]
mean, lower_bound, upper_bound = find_stats(rest_days_team1)

outliers_mask = rest_days_team1.gt(upper_bound) | rest_days_team1.lt(lower_bound)

# Flagged values are overwritten with 0.
# NOTE(review): this makes a very long break look like zero rest — confirm intended.
df.loc[outliers_mask, 'rest_days_team1'] = 0

In [113]:
# Display the frame again after the outlier rest-day values were overwritten.
df

Unnamed: 0,mp_team0,fg_team0,fga_team0,fg%_team0,3p_team0,3pa_team0,3p%_team0,ft_team0,fta_team0,ft%_team0,...,drtg_team1,ft/fga_team1,team1,winner,season,date,team0_encoded,team1_encoded,rest_days_team0,rest_days_team1
0,240.0,36.0,88.0,0.409,8.0,32.0,0.250,19.0,25.0,0.760,...,99.7,0.253,CLE,CLE,2018,2017-10-17,2,6,0.0,0.0
1,240.0,47.0,97.0,0.485,15.0,41.0,0.366,13.0,19.0,0.684,...,119.6,0.238,GSW,HOU,2018,2017-10-17,29,21,0.0,0.0
2,240.0,30.0,79.0,0.380,7.0,25.0,0.280,24.0,29.0,0.828,...,97.6,0.174,MEM,MEM,2018,2017-10-18,30,28,0.0,0.0
3,240.0,37.0,85.0,0.435,8.0,19.0,0.421,17.0,23.0,0.739,...,104.8,0.191,SAS,SAS,2018,2017-10-18,20,26,0.0,0.0
4,240.0,38.0,76.0,0.500,7.0,21.0,0.333,25.0,30.0,0.833,...,110.4,0.121,BOS,MIL,2018,2017-10-18,10,2,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8463,240.0,45.0,84.0,0.536,17.0,35.0,0.486,12.0,14.0,0.857,...,127.9,0.282,BOS,BOS,2024,2024-03-20,10,2,2.0,1.0
8464,240.0,41.0,95.0,0.432,13.0,37.0,0.351,7.0,18.0,0.389,...,103.5,0.067,PHO,PHO,2024,2024-03-20,5,24,1.0,2.0
8465,240.0,38.0,83.0,0.458,13.0,36.0,0.361,18.0,19.0,0.947,...,121.3,0.253,CLE,MIA,2024,2024-03-20,11,6,1.0,1.0
8466,240.0,45.0,86.0,0.523,13.0,31.0,0.419,13.0,15.0,0.867,...,122.6,0.123,POR,LAC,2024,2024-03-20,22,17,2.0,1.0


In [115]:
# Persist the updated frame through the project helper.
# NOTE(review): the frame was read from "csvs/master_df.csv" but only "master_df.csv"
# is passed here — presumably save_df adds the directory itself; confirm.
save_df(df, "master_df.csv")