In [75]:
import pandas as pd
from helper_functions import get_master_df, save_df, drop_extraneous_col

In [76]:
df = get_master_df()

In [77]:
drop_extraneous_col(df)

In [78]:
original_columns = df.columns

original_columns

Index(['mp_team0', 'fg_team0', 'fga_team0', 'fg%_team0', '3p_team0',
       '3pa_team0', '3p%_team0', 'ft_team0', 'fta_team0', 'ft%_team0',
       'orb_team0', 'drb_team0', 'trb_team0', 'ast_team0', 'stl_team0',
       'blk_team0', 'tov_team0', 'pf_team0', 'pts_team0', 'ts%_team0',
       'efg%_team0', '3par_team0', 'ftr_team0', 'orb%_team0', 'drb%_team0',
       'trb%_team0', 'ast%_team0', 'stl%_team0', 'blk%_team0', 'tov%_team0',
       'ortg_team0', 'drtg_team0', 'ft/fga_team0', 'team0', 'mp_team1',
       'fg_team1', 'fga_team1', 'fg%_team1', '3p_team1', '3pa_team1',
       '3p%_team1', 'ft_team1', 'fta_team1', 'ft%_team1', 'orb_team1',
       'drb_team1', 'trb_team1', 'ast_team1', 'stl_team1', 'blk_team1',
       'tov_team1', 'pf_team1', 'pts_team1', 'ts%_team1', 'efg%_team1',
       '3par_team1', 'ftr_team1', 'orb%_team1', 'drb%_team1', 'trb%_team1',
       'ast%_team1', 'stl%_team1', 'blk%_team1', 'tov%_team1', 'ortg_team1',
       'drtg_team1', 'ft/fga_team1', 'team1', 'winner'

In [79]:
team_encoding = { 
    # ATLANTIC
    "TOR": 1,
    "BOS": 2,
    "NYK": 3, 
    "BRK": 4,
    "PHI": 5,

    # CENTRAL
    "CLE": 6,
    "IND": 7,
    "DET": 8,
    "CHI": 9,
    "MIL": 10,

    # SOUTHEAST
    "MIA": 11,
    "ATL": 12,
    "CHO": 13,
    "WAS": 14,
    "ORL": 15,

    # NORTHWEST
    "OKC": 16,
    "POR": 17,
    "UTA": 18,
    "DEN": 19,
    "MIN": 20,

    # PACIFIC
    "GSW": 21, 
    "LAC": 22,
    "SAC": 23,
    "PHO": 24,
    "LAL": 25,

    # SOUTH WEST
    "SAS": 26,
    "DAL": 27,
    "MEM": 28,
    "HOU": 29,
    "NOP": 30
}

In [80]:
unique_stats = [col.split("_")[0] + "_cumulative" for col in df.columns if "_team0" in col and "restDays" not in col]
stats = [col.split("_")[0] for col in df.columns if "_team0" in col and "restDays" not in col]
teams = team_encoding.keys()

In [81]:
def initialize_cumulative_average():
    cumulative_averages = {}
    for team in teams:
        team_cumulative_average = {}
        for stat in unique_stats:
            team_cumulative_average[stat] = 0
        cumulative_averages[team] = team_cumulative_average
        cumulative_averages[team]["games_played"] = 0
    
    return cumulative_averages

In [82]:
for stat in unique_stats:
    df[f'{stat}_team0'] = 0.0
    df[f'{stat}_team1'] = 0.0

In [87]:
cumulative_averages = initialize_cumulative_average()
current_season = None

first_game_indices = set()
    
for index, row in df.iterrows():
    team0 = row['team0']
    team1 = row['team1']
    game_season = row['season']

    if game_season != current_season:
        cumulative_averages = initialize_cumulative_average()
        current_season = game_season

    games_played_team0 = cumulative_averages[team0]["games_played"]
    games_played_team1 = cumulative_averages[team1]["games_played"]

    if games_played_team0 == 0:
        first_game_indices.add(index)
    
    if games_played_team1 == 0:
        first_game_indices.add(index)

    if games_played_team0 > 0:
        for stat in unique_stats:
            prev_avg_team0 = cumulative_averages[team0][stat] / games_played_team0
            df.at[index, f"{stat}_team0"] = prev_avg_team0
    
    if games_played_team1 > 0:
        for stat in unique_stats:
            prev_avg_team1 = cumulative_averages[team1][stat] / games_played_team1
            df.at[index, f"{stat}_team1"] = prev_avg_team1
    
    for stat in stats:
        cumulative_averages[team0][f"{stat}_cumulative"] += row[f"{stat}_team0"]
        cumulative_averages[team1][f"{stat}_cumulative"] += row[f"{stat}_team1"]

    cumulative_averages[team0]['games_played'] += 1
    cumulative_averages[team1]['games_played'] += 1

    if index % 100 == 0:
         print(f"{index} / {len(df)}")

100 / 8355
200 / 8355
300 / 8355
400 / 8355
500 / 8355
600 / 8355
700 / 8355
800 / 8355
900 / 8355
1000 / 8355
1100 / 8355
1200 / 8355
1400 / 8355
1500 / 8355
1600 / 8355
1700 / 8355
1800 / 8355
1900 / 8355
2000 / 8355
2100 / 8355
2200 / 8355
2300 / 8355
2400 / 8355
2500 / 8355
2600 / 8355
2700 / 8355
2800 / 8355
2900 / 8355
3000 / 8355
3100 / 8355
3200 / 8355
3300 / 8355
3400 / 8355
3500 / 8355
3600 / 8355
3700 / 8355
3800 / 8355
3900 / 8355
4000 / 8355
4100 / 8355
4200 / 8355
4300 / 8355
4400 / 8355
4500 / 8355
4600 / 8355
4700 / 8355
4800 / 8355
4900 / 8355
5000 / 8355
5100 / 8355
5200 / 8355
5300 / 8355
5400 / 8355
5500 / 8355
5600 / 8355
5700 / 8355
5800 / 8355
5900 / 8355
6000 / 8355
6100 / 8355
6200 / 8355
6300 / 8355
6400 / 8355
6500 / 8355
6600 / 8355
6700 / 8355
6800 / 8355
6900 / 8355
7000 / 8355
7100 / 8355
7200 / 8355
7300 / 8355
7400 / 8355
7500 / 8355
7600 / 8355
7700 / 8355
7800 / 8355
7900 / 8355
8000 / 8355
8100 / 8355
8200 / 8355
8300 / 8355
8400 / 8355


In [88]:
first_game_indices
df = df.drop(first_game_indices)

In [89]:
df

Unnamed: 0,mp_team0,fg_team0,fga_team0,fg%_team0,3p_team0,3pa_team0,3p%_team0,ft_team0,fta_team0,ft%_team0,...,blk%_cumulative_team0,blk%_cumulative_team1,tov%_cumulative_team0,tov%_cumulative_team1,ortg_cumulative_team0,ortg_cumulative_team1,drtg_cumulative_team0,drtg_cumulative_team1,ft/fga_cumulative_team0,ft/fga_cumulative_team1
26,240.0,40.0,90.0,0.444,17.0,35.0,0.486,17.0,20.0,0.850,...,4.600000,6.400000,18.700000,16.100000,112.600000,125.200000,117.300000,104.700000,0.146000,0.210000
33,240.0,34.0,81.0,0.420,11.0,28.0,0.393,31.0,35.0,0.886,...,10.000000,3.600000,8.100000,13.800000,117.000000,104.700000,98.500000,125.200000,0.137000,0.122000
37,240.0,47.0,83.0,0.566,10.0,32.0,0.313,15.0,21.0,0.714,...,7.800000,4.800000,13.600000,14.300000,111.800000,112.700000,119.300000,110.900000,0.143000,0.258000
38,240.0,43.0,86.0,0.500,10.0,25.0,0.400,19.0,23.0,0.826,...,3.600000,8.300000,14.000000,16.400000,105.200000,93.600000,102.000000,103.300000,0.259000,0.154000
39,240.0,32.0,94.0,0.340,7.0,27.0,0.259,33.0,36.0,0.917,...,8.000000,10.700000,11.400000,14.400000,86.300000,117.300000,103.400000,112.600000,0.126000,0.272000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8463,240.0,45.0,84.0,0.536,17.0,35.0,0.486,12.0,14.0,0.857,...,8.633333,11.823881,10.837879,10.302985,120.007576,124.274627,116.674242,112.116418,0.220303,0.188418
8464,240.0,41.0,95.0,0.432,13.0,37.0,0.351,7.0,18.0,0.389,...,10.917910,10.908955,10.025373,12.538806,118.026866,119.164179,115.497015,116.835821,0.229866,0.233045
8465,240.0,38.0,83.0,0.458,13.0,36.0,0.361,18.0,19.0,0.947,...,7.064063,8.768657,11.051563,11.595522,114.259375,116.595522,113.482812,112.425373,0.216172,0.184597
8466,240.0,45.0,86.0,0.523,13.0,31.0,0.419,13.0,15.0,0.867,...,9.454545,8.428788,11.219697,12.681818,120.398485,110.204545,116.680303,118.416667,0.217288,0.193727


In [90]:
exceptions = ['team1', 'winner', 'season', 'date', 'team0', 'team0_encoded', 'team1_encoded', 'restDays_team0', 'restDays_team1']

cols_to_keep = [col for col in df.columns if "cumulative" in col or col in exceptions]

df_filtered = df[cols_to_keep]


In [91]:
df['team1_winner'] = df.apply(lambda row: 1 if row['winner'] == row['team1'] else 0, axis=1)
df_filtered['team1_winner'] = df_filtered.apply(lambda row: 1 if row['winner'] == row['team1'] else 0, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['team1_winner'] = df_filtered.apply(lambda row: 1 if row['winner'] == row['team1'] else 0, axis=1)


In [92]:
df_filtered.reset_index(drop=True, inplace=True)

In [93]:
df

Unnamed: 0,mp_team0,fg_team0,fga_team0,fg%_team0,3p_team0,3pa_team0,3p%_team0,ft_team0,fta_team0,ft%_team0,...,blk%_cumulative_team1,tov%_cumulative_team0,tov%_cumulative_team1,ortg_cumulative_team0,ortg_cumulative_team1,drtg_cumulative_team0,drtg_cumulative_team1,ft/fga_cumulative_team0,ft/fga_cumulative_team1,team1_winner
26,240.0,40.0,90.0,0.444,17.0,35.0,0.486,17.0,20.0,0.850,...,6.400000,18.700000,16.100000,112.600000,125.200000,117.300000,104.700000,0.146000,0.210000,0
33,240.0,34.0,81.0,0.420,11.0,28.0,0.393,31.0,35.0,0.886,...,3.600000,8.100000,13.800000,117.000000,104.700000,98.500000,125.200000,0.137000,0.122000,1
37,240.0,47.0,83.0,0.566,10.0,32.0,0.313,15.0,21.0,0.714,...,4.800000,13.600000,14.300000,111.800000,112.700000,119.300000,110.900000,0.143000,0.258000,0
38,240.0,43.0,86.0,0.500,10.0,25.0,0.400,19.0,23.0,0.826,...,8.300000,14.000000,16.400000,105.200000,93.600000,102.000000,103.300000,0.259000,0.154000,0
39,240.0,32.0,94.0,0.340,7.0,27.0,0.259,33.0,36.0,0.917,...,10.700000,11.400000,14.400000,86.300000,117.300000,103.400000,112.600000,0.126000,0.272000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8463,240.0,45.0,84.0,0.536,17.0,35.0,0.486,12.0,14.0,0.857,...,11.823881,10.837879,10.302985,120.007576,124.274627,116.674242,112.116418,0.220303,0.188418,1
8464,240.0,41.0,95.0,0.432,13.0,37.0,0.351,7.0,18.0,0.389,...,10.908955,10.025373,12.538806,118.026866,119.164179,115.497015,116.835821,0.229866,0.233045,1
8465,240.0,38.0,83.0,0.458,13.0,36.0,0.361,18.0,19.0,0.947,...,8.768657,11.051563,11.595522,114.259375,116.595522,113.482812,112.425373,0.216172,0.184597,0
8466,240.0,45.0,86.0,0.523,13.0,31.0,0.419,13.0,15.0,0.867,...,8.428788,11.219697,12.681818,120.398485,110.204545,116.680303,118.416667,0.217288,0.193727,0


In [94]:
save_df(df_filtered, "cumulative_averages.csv")