In [5]:
import pandas as pd
from helper_functions import get_master_df, save_df, drop_extraneous_col

In [6]:
df = get_master_df()

In [7]:
drop_extraneous_col(df)

In [8]:
original_columns = df.columns

original_columns

Index(['mp_team0', 'fg_team0', 'fga_team0', 'fg%_team0', '3p_team0',
       '3pa_team0', '3p%_team0', 'ft_team0', 'fta_team0', 'ft%_team0',
       'orb_team0', 'drb_team0', 'trb_team0', 'ast_team0', 'stl_team0',
       'blk_team0', 'tov_team0', 'pf_team0', 'pts_team0', 'ts%_team0',
       'efg%_team0', '3par_team0', 'ftr_team0', 'orb%_team0', 'drb%_team0',
       'trb%_team0', 'ast%_team0', 'stl%_team0', 'blk%_team0', 'tov%_team0',
       'ortg_team0', 'drtg_team0', 'ft/fga_team0', 'team0', 'mp_team1',
       'fg_team1', 'fga_team1', 'fg%_team1', '3p_team1', '3pa_team1',
       '3p%_team1', 'ft_team1', 'fta_team1', 'ft%_team1', 'orb_team1',
       'drb_team1', 'trb_team1', 'ast_team1', 'stl_team1', 'blk_team1',
       'tov_team1', 'pf_team1', 'pts_team1', 'ts%_team1', 'efg%_team1',
       '3par_team1', 'ftr_team1', 'orb%_team1', 'drb%_team1', 'trb%_team1',
       'ast%_team1', 'stl%_team1', 'blk%_team1', 'tov%_team1', 'ortg_team1',
       'drtg_team1', 'ft/fga_team1', 'team1', 'winner'

In [9]:
team_encoding = { 
    # ATLANTIC
    "TOR": 1,
    "BOS": 2,
    "NYK": 3, 
    "BRK": 4,
    "PHI": 5,

    # CENTRAL
    "CLE": 6,
    "IND": 7,
    "DET": 8,
    "CHI": 9,
    "MIL": 10,

    # SOUTHEAST
    "MIA": 11,
    "ATL": 12,
    "CHO": 13,
    "WAS": 14,
    "ORL": 15,

    # NORTHWEST
    "OKC": 16,
    "POR": 17,
    "UTA": 18,
    "DEN": 19,
    "MIN": 20,

    # PACIFIC
    "GSW": 21, 
    "LAC": 22,
    "SAC": 23,
    "PHO": 24,
    "LAL": 25,

    # SOUTH WEST
    "SAS": 26,
    "DAL": 27,
    "MEM": 28,
    "HOU": 29,
    "NOP": 30
}

In [10]:
unique_stats = [col.split("_")[0] + "_cumulative" for col in df.columns if "_team0" in col]
stats = [col.split("_")[0] for col in df.columns if "_team0" in col]
teams = team_encoding.keys()

In [11]:
def initialize_cumulative_average():
    cumulative_averages = {}
    for team in teams:
        team_cumulative_average = {}
        for stat in unique_stats:
            team_cumulative_average[stat] = 0
        cumulative_averages[team] = team_cumulative_average
        cumulative_averages[team]["games_played"] = 0
    
    return cumulative_averages

In [12]:
for stat in unique_stats:
    df[f'{stat}_team0'] = 0.0
    df[f'{stat}_team1'] = 0.0

In [13]:
cumulative_averages = initialize_cumulative_average()
current_season = None

first_game_indices = set()
    
for index, row in df.iterrows():
    team0 = row['team0']
    team1 = row['team1']
    game_season = row['season']

    if game_season != current_season:
        cumulative_averages = initialize_cumulative_average()
        current_season = game_season

    games_played_team0 = cumulative_averages[team0]["games_played"]
    games_played_team1 = cumulative_averages[team1]["games_played"]

    if games_played_team0 == 0:
        first_game_indices.add(index)
    
    if games_played_team1 == 0:
        first_game_indices.add(index)

    if games_played_team0 > 0:
        for stat in unique_stats:
            prev_avg_team0 = cumulative_averages[team0][stat] / games_played_team0
            df.at[index, f"{stat}_team0"] = prev_avg_team0
    
    if games_played_team1 > 0:
        for stat in unique_stats:
            prev_avg_team1 = cumulative_averages[team1][stat] / games_played_team1
            df.at[index, f"{stat}_team1"] = prev_avg_team1
    
    for stat in stats:
        cumulative_averages[team0][f"{stat}_cumulative"] += row[f"{stat}_team0"]
        cumulative_averages[team1][f"{stat}_cumulative"] += row[f"{stat}_team1"]

    cumulative_averages[team0]['games_played'] += 1
    cumulative_averages[team1]['games_played'] += 1

    if index % 100 == 0:
         print(f"{index} / {len(df)}")

0 / 8468
100 / 8468
200 / 8468
300 / 8468
400 / 8468
500 / 8468
600 / 8468
700 / 8468
800 / 8468
900 / 8468
1000 / 8468
1100 / 8468
1200 / 8468
1300 / 8468
1400 / 8468
1500 / 8468
1600 / 8468
1700 / 8468
1800 / 8468
1900 / 8468
2000 / 8468
2100 / 8468
2200 / 8468
2300 / 8468
2400 / 8468
2500 / 8468
2600 / 8468
2700 / 8468
2800 / 8468
2900 / 8468
3000 / 8468
3100 / 8468
3200 / 8468
3300 / 8468
3400 / 8468
3500 / 8468
3600 / 8468
3700 / 8468
3800 / 8468
3900 / 8468
4000 / 8468
4100 / 8468
4200 / 8468
4300 / 8468
4400 / 8468
4500 / 8468
4600 / 8468
4700 / 8468
4800 / 8468
4900 / 8468
5000 / 8468
5100 / 8468
5200 / 8468
5300 / 8468
5400 / 8468
5500 / 8468
5600 / 8468
5700 / 8468
5800 / 8468
5900 / 8468
6000 / 8468
6100 / 8468
6200 / 8468
6300 / 8468
6400 / 8468
6500 / 8468
6600 / 8468
6700 / 8468
6800 / 8468
6900 / 8468
7000 / 8468
7100 / 8468
7200 / 8468
7300 / 8468
7400 / 8468
7500 / 8468
7600 / 8468
7700 / 8468
7800 / 8468
7900 / 8468
8000 / 8468
8100 / 8468
8200 / 8468
8300 / 8468
8400

In [14]:
first_game_indices

df = df.drop(first_game_indices)

In [15]:
if "Unnamed: 0" in df.columns:
    df.drop("Unnamed: 0", axis=1, inplace=True)

In [16]:
df

Unnamed: 0,mp_team0,fg_team0,fga_team0,fg%_team0,3p_team0,3pa_team0,3p%_team0,ft_team0,fta_team0,ft%_team0,...,blk%_cumulative_team0,blk%_cumulative_team1,tov%_cumulative_team0,tov%_cumulative_team1,ortg_cumulative_team0,ortg_cumulative_team1,drtg_cumulative_team0,drtg_cumulative_team1,ft/fga_cumulative_team0,ft/fga_cumulative_team1
16,240.0,44.0,81.0,0.543,11.0,26.0,0.423,17.0,17.0,1.000,...,7.100000,6.300000,15.300000,14.400000,102.700000,110.400000,99.700000,102.200000,0.253000,0.329000
17,240.0,48.0,93.0,0.516,12.0,26.0,0.462,24.0,33.0,0.727,...,9.500000,12.100000,15.800000,14.200000,86.900000,77.000000,102.000000,125.600000,0.154000,0.146000
18,240.0,47.0,92.0,0.511,18.0,41.0,0.439,16.0,20.0,0.800,...,16.100000,19.000000,16.000000,15.600000,118.600000,97.600000,119.600000,110.400000,0.238000,0.304000
19,240.0,45.0,89.0,0.506,18.0,33.0,0.545,13.0,15.0,0.867,...,12.700000,2.900000,12.000000,15.600000,110.300000,115.700000,103.600000,123.600000,0.244000,0.309000
20,240.0,35.0,84.0,0.417,10.0,29.0,0.345,22.0,32.0,0.688,...,5.100000,8.000000,9.950000,14.400000,100.950000,110.300000,106.550000,115.100000,0.168500,0.151000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8463,240.0,45.0,84.0,0.536,17.0,35.0,0.486,12.0,14.0,0.857,...,8.637313,11.938235,10.864179,10.339706,119.949254,124.129412,116.650746,112.086765,0.221627,0.189853
8464,240.0,41.0,95.0,0.432,13.0,37.0,0.351,7.0,18.0,0.389,...,10.872059,10.926471,10.083824,12.583824,117.983824,118.973529,115.505882,116.622059,0.229985,0.231632
8465,240.0,38.0,83.0,0.458,13.0,36.0,0.361,18.0,19.0,0.947,...,7.032308,8.794118,10.978462,11.560294,114.193846,116.600000,113.412308,112.476471,0.216354,0.183294
8466,240.0,45.0,86.0,0.523,13.0,31.0,0.419,13.0,15.0,0.867,...,9.485075,8.408955,11.274627,12.708955,120.468657,110.244776,116.623881,118.516418,0.216194,0.192925


In [17]:
exceptions = ['team1', 'winner', 'season', 'date', 'team0', 'team0_encoded', 'team1_encoded']

cols_to_keep = [col for col in df.columns if "cumulative" in col or col in exceptions]

df_filtered = df[cols_to_keep]


In [18]:
df['team1_winner'] = df.apply(lambda row: 1 if row['winner'] == row['team1'] else 0, axis=1)
df_filtered['team1_winner'] = df_filtered.apply(lambda row: 1 if row['winner'] == row['team1'] else 0, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['team1_winner'] = df_filtered.apply(lambda row: 1 if row['winner'] == row['team1'] else 0, axis=1)


In [19]:
df_filtered.reset_index(drop=True, inplace=True)

In [20]:
df_filtered

Unnamed: 0,team0,team1,winner,season,date,team0_encoded,team1_encoded,mp_cumulative_team0,mp_cumulative_team1,fg_cumulative_team0,...,blk%_cumulative_team1,tov%_cumulative_team0,tov%_cumulative_team1,ortg_cumulative_team0,ortg_cumulative_team1,drtg_cumulative_team0,drtg_cumulative_team1,ft/fga_cumulative_team0,ft/fga_cumulative_team1,team1_winner
0,CLE,MIL,CLE,2018,2017-10-20,6,10,240.000000,240.000000,38.000000,...,6.300000,15.300000,14.400000,102.700000,110.400000,99.700000,102.200000,0.253000,0.329000,0
1,LAL,PHO,LAL,2018,2017-10-20,25,24,240.000000,240.000000,37.000000,...,12.100000,15.800000,14.200000,86.900000,77.000000,102.000000,125.600000,0.154000,0.146000,0
2,GSW,NOP,GSW,2018,2017-10-20,21,30,240.000000,240.000000,43.000000,...,19.000000,16.000000,15.600000,118.600000,97.600000,119.600000,110.400000,0.238000,0.304000,0
3,ORL,BRK,BRK,2018,2017-10-20,15,4,240.000000,240.000000,43.000000,...,2.900000,12.000000,15.600000,110.300000,115.700000,103.600000,123.600000,0.244000,0.309000,1
4,BOS,PHI,BOS,2018,2017-10-20,2,5,240.000000,240.000000,37.500000,...,8.000000,9.950000,14.400000,100.950000,110.300000,106.550000,115.100000,0.168500,0.151000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8350,MIL,BOS,BOS,2024,2024-03-20,10,2,241.119403,241.838235,43.268657,...,11.938235,10.864179,10.339706,119.949254,124.129412,116.650746,112.086765,0.221627,0.189853,1
8351,PHI,PHO,PHO,2024,2024-03-20,5,24,240.735294,241.470588,41.455882,...,10.926471,10.083824,12.583824,117.983824,118.973529,115.505882,116.622059,0.229985,0.231632,1
8352,MIA,CLE,MIA,2024,2024-03-20,11,6,240.384615,241.838235,39.461538,...,8.794118,10.978462,11.560294,114.193846,116.600000,113.412308,112.476471,0.216354,0.183294,0
8353,LAC,POR,LAC,2024,2024-03-20,22,17,240.373134,242.985075,42.507463,...,8.408955,11.274627,12.708955,120.468657,110.244776,116.623881,118.516418,0.216194,0.192925,0


In [21]:
save_df(df_filtered, "cumulative_averages.csv")

In [22]:
df_filtered

Unnamed: 0,team0,team1,winner,season,date,team0_encoded,team1_encoded,mp_cumulative_team0,mp_cumulative_team1,fg_cumulative_team0,...,blk%_cumulative_team1,tov%_cumulative_team0,tov%_cumulative_team1,ortg_cumulative_team0,ortg_cumulative_team1,drtg_cumulative_team0,drtg_cumulative_team1,ft/fga_cumulative_team0,ft/fga_cumulative_team1,team1_winner
0,CLE,MIL,CLE,2018,2017-10-20,6,10,240.000000,240.000000,38.000000,...,6.300000,15.300000,14.400000,102.700000,110.400000,99.700000,102.200000,0.253000,0.329000,0
1,LAL,PHO,LAL,2018,2017-10-20,25,24,240.000000,240.000000,37.000000,...,12.100000,15.800000,14.200000,86.900000,77.000000,102.000000,125.600000,0.154000,0.146000,0
2,GSW,NOP,GSW,2018,2017-10-20,21,30,240.000000,240.000000,43.000000,...,19.000000,16.000000,15.600000,118.600000,97.600000,119.600000,110.400000,0.238000,0.304000,0
3,ORL,BRK,BRK,2018,2017-10-20,15,4,240.000000,240.000000,43.000000,...,2.900000,12.000000,15.600000,110.300000,115.700000,103.600000,123.600000,0.244000,0.309000,1
4,BOS,PHI,BOS,2018,2017-10-20,2,5,240.000000,240.000000,37.500000,...,8.000000,9.950000,14.400000,100.950000,110.300000,106.550000,115.100000,0.168500,0.151000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8350,MIL,BOS,BOS,2024,2024-03-20,10,2,241.119403,241.838235,43.268657,...,11.938235,10.864179,10.339706,119.949254,124.129412,116.650746,112.086765,0.221627,0.189853,1
8351,PHI,PHO,PHO,2024,2024-03-20,5,24,240.735294,241.470588,41.455882,...,10.926471,10.083824,12.583824,117.983824,118.973529,115.505882,116.622059,0.229985,0.231632,1
8352,MIA,CLE,MIA,2024,2024-03-20,11,6,240.384615,241.838235,39.461538,...,8.794118,10.978462,11.560294,114.193846,116.600000,113.412308,112.476471,0.216354,0.183294,0
8353,LAC,POR,LAC,2024,2024-03-20,22,17,240.373134,242.985075,42.507463,...,8.408955,11.274627,12.708955,120.468657,110.244776,116.623881,118.516418,0.216194,0.192925,0
