In [1]:
import pandas as pd
from helper_functions import get_master_df, save_df

In [2]:
df = get_master_df()
if "Unnamed: 0" in df.columns:
    df.drop("Unnamed: 0", axis=1, inplace=True)
df

Unnamed: 0,mp_team0,fg_team0,fga_team0,fg%_team0,3p_team0,3pa_team0,3p%_team0,ft_team0,fta_team0,ft%_team0,...,blk%_team1,tov%_team1,ortg_team1,drtg_team1,team1,winner,season,date,team0_encoded,team1_encoded
0,240.0,47.0,97.0,0.485,15.0,41.0,0.366,13.0,19.0,0.684,...,16.1,16.0,118.6,119.6,GSW,HOU,2018,2017-10-17,29,21
1,240.0,36.0,88.0,0.409,8.0,32.0,0.250,19.0,25.0,0.760,...,7.1,15.3,102.7,99.7,CLE,CLE,2018,2017-10-17,2,6
2,240.0,33.0,83.0,0.398,12.0,45.0,0.267,27.0,29.0,0.931,...,10.5,15.5,103.9,109.1,SAC,HOU,2018,2017-10-18,29,23
3,240.0,29.0,73.0,0.397,9.0,30.0,0.300,23.0,29.0,0.793,...,7.0,7.3,103.6,91.4,DET,DET,2018,2017-10-18,13,8
4,240.0,48.0,94.0,0.511,9.0,18.0,0.500,12.0,15.0,0.800,...,5.3,13.6,112.6,118.7,DAL,ATL,2018,2017-10-18,12,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8463,240.0,41.0,95.0,0.432,13.0,37.0,0.351,7.0,18.0,0.389,...,20.7,19.7,116.7,103.5,PHO,PHO,2024,2024-03-20,5,24
8464,240.0,40.0,90.0,0.444,10.0,36.0,0.278,17.0,18.0,0.944,...,13.0,9.0,119.9,107.8,OKC,OKC,2024,2024-03-20,18,16
8465,240.0,50.0,88.0,0.568,11.0,29.0,0.379,11.0,18.0,0.611,...,1.7,12.7,100.7,119.3,DET,IND,2024,2024-03-20,7,8
8466,240.0,45.0,84.0,0.536,17.0,35.0,0.486,12.0,14.0,0.857,...,10.2,8.4,131.1,127.9,BOS,BOS,2024,2024-03-20,10,2


In [3]:
team_encoding = { 
    # ATLANTIC
    "TOR": 1,
    "BOS": 2,
    "NYK": 3, 
    "BRK": 4,
    "PHI": 5,

    # CENTRAL
    "CLE": 6,
    "IND": 7,
    "DET": 8,
    "CHI": 9,
    "MIL": 10,

    # SOUTHEAST
    "MIA": 11,
    "ATL": 12,
    "CHO": 13,
    "WAS": 14,
    "ORL": 15,

    # NORTHWEST
    "OKC": 16,
    "POR": 17,
    "UTA": 18,
    "DEN": 19,
    "MIN": 20,

    # PACIFIC
    "GSW": 21, 
    "LAC": 22,
    "SAC": 23,
    "PHO": 24,
    "LAL": 25,

    # SOUTH WEST
    "SAS": 26,
    "DAL": 27,
    "MEM": 28,
    "HOU": 29,
    "NOP": 30
}

In [4]:
unique_stats = [col.split("_")[0] + "_prev_game" for col in df.columns if "_team0" in col]
stats = [col.split("_")[0] for col in df.columns if "_team0" in col]
teams = team_encoding.keys()

In [5]:
def initialize_prev_game_stats():
    prev_game = {}
    for team in teams:
        team_prev_game = {}
        for stat in unique_stats:
            team_prev_game[stat] = 0
        prev_game[team] = team_prev_game
        prev_game[team]["games_played"] = 0
    
    return prev_game

In [6]:
prev_game = initialize_prev_game_stats()
current_season = None

first_game_indices = set()
    
for index, row in df.iterrows():
    team0 = row['team0']
    team1 = row['team1']
    game_season = row['season']

    if game_season != current_season:
        prev_game = initialize_prev_game_stats()
        current_season = game_season

    games_played_team0 = prev_game[team0]["games_played"]
    games_played_team1 = prev_game[team1]["games_played"]

    if games_played_team0 == 0:
        first_game_indices.add(index)
    
    if games_played_team1 == 0:
        first_game_indices.add(index)

    if games_played_team0 > 0:
        for stat in unique_stats:
            prev_game_team0 = prev_game[team0][stat]
            df.at[index, f"{stat}_team0"] = prev_game_team0
    
    if games_played_team1 > 0:
        for stat in unique_stats:
            prev_avg_team1 = prev_game[team1][stat]
            df.at[index, f"{stat}_team1"] = prev_avg_team1
    
    for stat in stats:
        prev_game[team0][f"{stat}_prev_game"] = row[f"{stat}_team0"]
        prev_game[team1][f"{stat}_prev_game"] = row[f"{stat}_team1"]

    prev_game[team0]['games_played'] += 1
    prev_game[team1]['games_played'] += 1

    if index % 100 == 0:
         print(f"{index} / {len(df)}")

0 / 8468
100 / 8468
200 / 8468
300 / 8468
400 / 8468
500 / 8468
600 / 8468
700 / 8468
800 / 8468
900 / 8468
1000 / 8468
1100 / 8468
1200 / 8468
1300 / 8468
1400 / 8468
1500 / 8468
1600 / 8468
1700 / 8468
1800 / 8468
1900 / 8468
2000 / 8468
2100 / 8468
2200 / 8468
2300 / 8468
2400 / 8468
2500 / 8468
2600 / 8468
2700 / 8468
2800 / 8468
2900 / 8468
3000 / 8468
3100 / 8468
3200 / 8468
3300 / 8468
3400 / 8468
3500 / 8468
3600 / 8468
3700 / 8468
3800 / 8468
3900 / 8468
4000 / 8468
4100 / 8468
4200 / 8468
4300 / 8468
4400 / 8468
4500 / 8468
4600 / 8468
4700 / 8468
4800 / 8468
4900 / 8468
5000 / 8468
5100 / 8468
5200 / 8468
5300 / 8468
5400 / 8468
5500 / 8468
5600 / 8468
5700 / 8468
5800 / 8468
5900 / 8468
6000 / 8468
6100 / 8468
6200 / 8468
6300 / 8468
6400 / 8468
6500 / 8468
6600 / 8468
6700 / 8468
6800 / 8468
6900 / 8468
7000 / 8468
7100 / 8468
7200 / 8468
7300 / 8468
7400 / 8468
7500 / 8468
7600 / 8468
7700 / 8468
7800 / 8468
7900 / 8468
8000 / 8468
8100 / 8468
8200 / 8468
8300 / 8468
8400

In [7]:
first_game_indices

df = df.drop(first_game_indices)

In [8]:
df

Unnamed: 0,mp_team0,fg_team0,fga_team0,fg%_team0,3p_team0,3pa_team0,3p%_team0,ft_team0,fta_team0,ft%_team0,...,ftr_prev_game_team1,orb%_prev_game_team1,drb%_prev_game_team1,trb%_prev_game_team1,ast%_prev_game_team1,stl%_prev_game_team1,blk%_prev_game_team1,tov%_prev_game_team1,ortg_prev_game_team1,drtg_prev_game_team1
16,240.0,37.0,87.0,0.425,10.0,23.0,0.435,9.0,13.0,0.692,...,0.244,20.0,69.6,45.1,71.1,5.1,5.3,13.6,112.6,118.7
17,240.0,46.0,95.0,0.484,9.0,19.0,0.474,13.0,17.0,0.765,...,0.314,30.4,75.0,52.2,54.7,10.6,14.1,10.8,123.6,115.7
18,240.0,45.0,89.0,0.506,18.0,33.0,0.545,13.0,15.0,0.867,...,0.340,25.0,69.6,47.8,48.9,6.2,2.9,15.6,115.7,123.6
19,240.0,35.0,84.0,0.417,10.0,29.0,0.345,22.0,32.0,0.688,...,0.204,23.4,69.8,48.0,58.1,5.8,8.0,14.4,110.3,115.1
20,240.0,43.0,87.0,0.494,10.0,23.0,0.435,15.0,20.0,0.750,...,0.392,30.2,76.6,52.0,50.0,7.7,17.2,7.3,115.1,110.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8463,240.0,41.0,95.0,0.432,13.0,37.0,0.351,7.0,18.0,0.389,...,0.172,21.9,76.2,52.7,70.6,7.2,9.4,11.4,132.2,143.4
8464,240.0,40.0,90.0,0.444,10.0,36.0,0.278,17.0,18.0,0.944,...,0.241,21.6,84.1,55.6,55.6,7.2,21.6,10.3,121.3,115.1
8465,240.0,50.0,88.0,0.568,11.0,29.0,0.379,11.0,18.0,0.611,...,0.179,17.4,70.0,44.8,58.3,6.5,6.5,11.7,101.3,128.2
8466,240.0,45.0,84.0,0.536,17.0,35.0,0.486,12.0,14.0,0.857,...,0.137,30.0,82.6,55.2,69.8,8.6,14.5,8.2,128.2,101.3


In [9]:
exceptions = ['team1', 'winner', 'season', 'date', 'team0', 'team0_encoded', 'team1_encoded']

cols_to_keep = [col for col in df.columns if "_prev_game" in col or col in exceptions]

df_filtered = df[cols_to_keep]

In [10]:
df['team1_winner'] = df.apply(lambda row: 1 if row['winner'] == row['team1'] else 0, axis=1)
df_filtered['team1_winner'] = df_filtered.apply(lambda row: 1 if row['winner'] == row['team1'] else 0, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['team1_winner'] = df_filtered.apply(lambda row: 1 if row['winner'] == row['team1'] else 0, axis=1)


In [11]:
df_filtered

Unnamed: 0,team0,team1,winner,season,date,team0_encoded,team1_encoded,mp_prev_game_team0,fg_prev_game_team0,fga_prev_game_team0,...,orb%_prev_game_team1,drb%_prev_game_team1,trb%_prev_game_team1,ast%_prev_game_team1,stl%_prev_game_team1,blk%_prev_game_team1,tov%_prev_game_team1,ortg_prev_game_team1,drtg_prev_game_team1,team1_winner
16,SAC,DAL,SAC,2018,2017-10-20,23,27,240.0,42.0,88.0,...,20.0,69.6,45.1,71.1,5.1,5.3,13.6,112.6,118.7,0
17,POR,IND,POR,2018,2017-10-20,17,7,240.0,44.0,90.0,...,30.4,75.0,52.2,54.7,10.6,14.1,10.8,123.6,115.7,0
18,ORL,BRK,BRK,2018,2017-10-20,15,4,240.0,43.0,90.0,...,25.0,69.6,47.8,48.9,6.2,2.9,15.6,115.7,123.6,1
19,BOS,PHI,BOS,2018,2017-10-20,2,5,240.0,39.0,91.0,...,23.4,69.8,48.0,58.1,5.8,8.0,14.4,110.3,115.1,0
20,DET,WAS,WAS,2018,2017-10-20,8,14,240.0,41.0,96.0,...,30.2,76.6,52.0,50.0,7.7,17.2,7.3,115.1,110.3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8463,PHI,PHO,PHO,2024,2024-03-20,5,24,240.0,37.0,85.0,...,21.9,76.2,52.7,70.6,7.2,9.4,11.4,132.2,143.4,1
8464,UTA,OKC,OKC,2024,2024-03-20,18,16,240.0,39.0,84.0,...,21.6,84.1,55.6,55.6,7.2,21.6,10.3,121.3,115.1,1
8465,IND,DET,IND,2024,2024-03-20,7,8,240.0,42.0,91.0,...,17.4,70.0,44.8,58.3,6.5,6.5,11.7,101.3,128.2,0
8466,MIL,BOS,BOS,2024,2024-03-20,10,2,240.0,51.0,94.0,...,30.0,82.6,55.2,69.8,8.6,14.5,8.2,128.2,101.3,1


In [12]:
df_filtered.reset_index(drop=True, inplace=True)

In [13]:
df_filtered

Unnamed: 0,team0,team1,winner,season,date,team0_encoded,team1_encoded,mp_prev_game_team0,fg_prev_game_team0,fga_prev_game_team0,...,orb%_prev_game_team1,drb%_prev_game_team1,trb%_prev_game_team1,ast%_prev_game_team1,stl%_prev_game_team1,blk%_prev_game_team1,tov%_prev_game_team1,ortg_prev_game_team1,drtg_prev_game_team1,team1_winner
0,SAC,DAL,SAC,2018,2017-10-20,23,27,240.0,42.0,88.0,...,20.0,69.6,45.1,71.1,5.1,5.3,13.6,112.6,118.7,0
1,POR,IND,POR,2018,2017-10-20,17,7,240.0,44.0,90.0,...,30.4,75.0,52.2,54.7,10.6,14.1,10.8,123.6,115.7,0
2,ORL,BRK,BRK,2018,2017-10-20,15,4,240.0,43.0,90.0,...,25.0,69.6,47.8,48.9,6.2,2.9,15.6,115.7,123.6,1
3,BOS,PHI,BOS,2018,2017-10-20,2,5,240.0,39.0,91.0,...,23.4,69.8,48.0,58.1,5.8,8.0,14.4,110.3,115.1,0
4,DET,WAS,WAS,2018,2017-10-20,8,14,240.0,41.0,96.0,...,30.2,76.6,52.0,50.0,7.7,17.2,7.3,115.1,110.3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8350,PHI,PHO,PHO,2024,2024-03-20,5,24,240.0,37.0,85.0,...,21.9,76.2,52.7,70.6,7.2,9.4,11.4,132.2,143.4,1
8351,UTA,OKC,OKC,2024,2024-03-20,18,16,240.0,39.0,84.0,...,21.6,84.1,55.6,55.6,7.2,21.6,10.3,121.3,115.1,1
8352,IND,DET,IND,2024,2024-03-20,7,8,240.0,42.0,91.0,...,17.4,70.0,44.8,58.3,6.5,6.5,11.7,101.3,128.2,0
8353,MIL,BOS,BOS,2024,2024-03-20,10,2,240.0,51.0,94.0,...,30.0,82.6,55.2,69.8,8.6,14.5,8.2,128.2,101.3,1


In [14]:
save_df(df_filtered, "prev_game_df.csv")