# Feature Engineering – Rolling Average Dataframe

The purpose of this notebook is to create the rolling average dataframe. As mentioned in our presentation, This dataframe averages statistics over the last 5 or 10 games (configure the window_size variable) for each team to capture short-term performance trends. We will be using an iterative approach to loop over every row in the dataframe and accordingly calculate the new values for every column. A hashmap will be leveraged to maintain each team's statistics. 

In [65]:
import pandas as pd
from helper_functions import get_master_df, save_df, drop_extraneous_col

### Begin Feature Engineering with Master DF

In [66]:
df = get_master_df()
drop_extraneous_col(df)

In [68]:
df.columns

Index(['mp_team0', 'fg_team0', 'fga_team0', 'fg%_team0', '3p_team0',
       '3pa_team0', '3p%_team0', 'ft_team0', 'fta_team0', 'ft%_team0',
       'orb_team0', 'drb_team0', 'trb_team0', 'ast_team0', 'stl_team0',
       'blk_team0', 'tov_team0', 'pf_team0', 'pts_team0', 'ts%_team0',
       'efg%_team0', '3par_team0', 'ftr_team0', 'orb%_team0', 'drb%_team0',
       'trb%_team0', 'ast%_team0', 'stl%_team0', 'blk%_team0', 'tov%_team0',
       'ortg_team0', 'drtg_team0', 'ft/fga_team0', 'team0', 'mp_team1',
       'fg_team1', 'fga_team1', 'fg%_team1', '3p_team1', '3pa_team1',
       '3p%_team1', 'ft_team1', 'fta_team1', 'ft%_team1', 'orb_team1',
       'drb_team1', 'trb_team1', 'ast_team1', 'stl_team1', 'blk_team1',
       'tov_team1', 'pf_team1', 'pts_team1', 'ts%_team1', 'efg%_team1',
       '3par_team1', 'ftr_team1', 'orb%_team1', 'drb%_team1', 'trb%_team1',
       'ast%_team1', 'stl%_team1', 'blk%_team1', 'tov%_team1', 'ortg_team1',
       'drtg_team1', 'ft/fga_team1', 'team1', 'winner'

### Team Encoding Object to Create Hashmap

In [69]:
team_encoding = { 
    # ATLANTIC
    "TOR": 1,
    "BOS": 2,
    "NYK": 3, 
    "BRK": 4,
    "PHI": 5,

    # CENTRAL
    "CLE": 6,
    "IND": 7,
    "DET": 8,
    "CHI": 9,
    "MIL": 10,

    # SOUTHEAST
    "MIA": 11,
    "ATL": 12,
    "CHO": 13,
    "WAS": 14,
    "ORL": 15,

    # NORTHWEST
    "OKC": 16,
    "POR": 17,
    "UTA": 18,
    "DEN": 19,
    "MIN": 20,

    # PACIFIC
    "GSW": 21, 
    "LAC": 22,
    "SAC": 23,
    "PHO": 24,
    "LAL": 25,

    # SOUTH WEST
    "SAS": 26,
    "DAL": 27,
    "MEM": 28,
    "HOU": 29,
    "NOP": 30
}

### Variables Required while Constructing New Columns

In [70]:
unique_stats = [col.split("_")[0] + "_rolling" for col in df.columns if "_team0" in col and "restDays" not in col]
stats = [col.split("_")[0] for col in df.columns if "_team0" in col and "restDays" not in col]
teams = team_encoding.keys()

unique_stats

['mp_rolling',
 'fg_rolling',
 'fga_rolling',
 'fg%_rolling',
 '3p_rolling',
 '3pa_rolling',
 '3p%_rolling',
 'ft_rolling',
 'fta_rolling',
 'ft%_rolling',
 'orb_rolling',
 'drb_rolling',
 'trb_rolling',
 'ast_rolling',
 'stl_rolling',
 'blk_rolling',
 'tov_rolling',
 'pf_rolling',
 'pts_rolling',
 'ts%_rolling',
 'efg%_rolling',
 '3par_rolling',
 'ftr_rolling',
 'orb%_rolling',
 'drb%_rolling',
 'trb%_rolling',
 'ast%_rolling',
 'stl%_rolling',
 'blk%_rolling',
 'tov%_rolling',
 'ortg_rolling',
 'drtg_rolling',
 'ft/fga_rolling']

### Leveraging Hashmaps

While we iterate through each row in the dataframe, we will be using a hashmap to keep track of each team's statistics.

In [71]:
from collections import defaultdict, deque

def initialize_rolling_average(window_size):
    return defaultdict(lambda: defaultdict(lambda: deque(maxlen=window_size)))

### Initialize New Columns

In [72]:
for stat in unique_stats:
    df[f'{stat}_team0'] = 0.0
    df[f'{stat}_team1'] = 0.0

### Begin filling New Columns with Data

To change the rolling_average window size, simply change the value in line 1 of the next code block. We set it to 5 and 10 to create both of our dataframes. 

In [None]:
window_size = 5 # previously set to 10

rolling_averages = initialize_rolling_average(window_size)
current_season = None

first_game_indices = set()

for index, row in df.iterrows():
    team0 = row['team0']
    team1 = row['team1']
    game_season = row['season']

    if game_season != current_season:
        rolling_averages = initialize_rolling_average(window_size)
        current_season = game_season

    games_played_team0 = len(rolling_averages[team0]['games_played'])
    games_played_team1 = len(rolling_averages[team1]['games_played'])

    if games_played_team0 < window_size:
        first_game_indices.add(index)

    if games_played_team1 < window_size:
        first_game_indices.add(index)

    for stat in unique_stats:
        if games_played_team0 >= window_size:
            rolling_avg_team0 = sum(rolling_averages[team0][stat]) / window_size
            df.at[index, f"{stat}_team0"] = rolling_avg_team0
        if games_played_team1 >= window_size:
            rolling_avg_team1 = sum(rolling_averages[team1][stat]) / window_size
            df.at[index, f"{stat}_team1"] = rolling_avg_team1

    for stat in stats:
        rolling_averages[team0][f"{stat}_rolling"].append(row[f"{stat}_team0"])
        rolling_averages[team1][f"{stat}_rolling"].append(row[f"{stat}_team1"])

    # Update games played
    rolling_averages[team0]['games_played'].append(1)
    rolling_averages[team1]['games_played'].append(1)

    if index % 100 == 0:
        print(f"{index} / {len(df)}")


0 / 8468
100 / 8468
200 / 8468
300 / 8468
400 / 8468
500 / 8468
600 / 8468
700 / 8468
800 / 8468
900 / 8468
1000 / 8468
1100 / 8468
1200 / 8468
1300 / 8468
1400 / 8468
1500 / 8468
1600 / 8468
1700 / 8468
1800 / 8468
1900 / 8468
2000 / 8468
2100 / 8468
2200 / 8468
2300 / 8468
2400 / 8468
2500 / 8468
2600 / 8468
2700 / 8468
2800 / 8468
2900 / 8468
3000 / 8468
3100 / 8468
3200 / 8468
3300 / 8468
3400 / 8468
3500 / 8468
3600 / 8468
3700 / 8468
3800 / 8468
3900 / 8468
4000 / 8468
4100 / 8468
4200 / 8468
4300 / 8468
4400 / 8468
4500 / 8468
4600 / 8468
4700 / 8468
4800 / 8468
4900 / 8468
5000 / 8468
5100 / 8468
5200 / 8468
5300 / 8468
5400 / 8468
5500 / 8468
5600 / 8468
5700 / 8468
5800 / 8468
5900 / 8468
6000 / 8468
6100 / 8468
6200 / 8468
6300 / 8468
6400 / 8468
6500 / 8468
6600 / 8468
6700 / 8468
6800 / 8468
6900 / 8468
7000 / 8468
7100 / 8468
7200 / 8468
7300 / 8468
7400 / 8468
7500 / 8468
7600 / 8468
7700 / 8468
7800 / 8468
7900 / 8468
8000 / 8468
8100 / 8468
8200 / 8468
8300 / 8468
8400

defaultdict(<function __main__.initialize_rolling_average.<locals>.<lambda>()>,
            {'LAL': defaultdict(<function __main__.initialize_rolling_average.<locals>.<lambda>.<locals>.<lambda>()>,
                         {'games_played': deque([1, 1, 1, 1, 1], maxlen=5),
                          'mp_rolling': deque([240.0,
                                 240.0,
                                 240.0,
                                 240.0,
                                 240.0],
                                maxlen=5),
                          'fg_rolling': deque([47.0, 44.0, 38.0, 47.0, 52.0],
                                maxlen=5),
                          'fga_rolling': deque([90.0, 83.0, 78.0, 92.0, 90.0],
                                maxlen=5),
                          'fg%_rolling': deque([0.522,
                                 0.53,
                                 0.487,
                                 0.511,
                                 0.578],
          

### Dropping Rows

For each team's first 5/10 games of each season, we need to drop its row. This is because there is no data we can use to predict the outcome of the game. Although we could use previous season stats, teams generally change quite a bit between each season. As a result, we made the decision to simply drop these rows.

In [74]:
first_game_indices
df = df.drop(first_game_indices)

In [75]:
df

Unnamed: 0,mp_team0,fg_team0,fga_team0,fg%_team0,3p_team0,3pa_team0,3p%_team0,ft_team0,fta_team0,ft%_team0,...,blk%_rolling_team0,blk%_rolling_team1,tov%_rolling_team0,tov%_rolling_team1,ortg_rolling_team0,ortg_rolling_team1,drtg_rolling_team0,drtg_rolling_team1,ft/fga_rolling_team0,ft/fga_rolling_team1
76,240.0,29.0,77.0,0.377,14.0,38.0,0.368,17.0,21.0,0.810,...,9.54,8.56,14.62,13.88,107.76,106.94,101.62,101.52,0.2350,0.3258
78,240.0,36.0,78.0,0.462,12.0,32.0,0.375,17.0,23.0,0.739,...,9.32,10.30,14.56,14.98,111.52,106.46,110.56,109.54,0.2222,0.2284
79,240.0,37.0,87.0,0.425,9.0,26.0,0.346,24.0,30.0,0.800,...,9.28,10.88,15.82,11.82,97.08,115.18,113.24,100.62,0.2390,0.2568
80,240.0,43.0,87.0,0.494,16.0,35.0,0.457,10.0,14.0,0.714,...,8.72,5.68,15.42,11.76,99.38,100.78,106.62,110.18,0.1702,0.2058
83,240.0,36.0,90.0,0.400,9.0,28.0,0.321,13.0,18.0,0.722,...,13.78,6.66,12.78,12.66,104.44,111.82,101.60,113.66,0.2076,0.2278
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8463,240.0,45.0,84.0,0.536,17.0,35.0,0.486,12.0,14.0,0.857,...,6.14,12.90,9.52,7.76,123.72,133.44,125.26,111.12,0.2296,0.1402
8464,240.0,41.0,95.0,0.432,13.0,37.0,0.351,7.0,18.0,0.389,...,12.62,8.00,11.72,11.66,104.76,120.48,107.80,124.40,0.1580,0.1348
8465,240.0,38.0,83.0,0.458,13.0,36.0,0.361,18.0,19.0,0.947,...,8.36,7.20,9.82,12.02,106.04,115.26,107.38,118.06,0.1430,0.1680
8466,240.0,45.0,86.0,0.523,13.0,31.0,0.419,13.0,15.0,0.867,...,10.86,8.22,12.54,14.36,117.64,109.26,125.26,120.46,0.2178,0.1768


### Add Back Columns

Certain columns listed in 'exceptions' cannot be averaged throughout the season. As a result, they are added back into the dataframe once the feature engineering is completed.

In [76]:
exceptions = ['team1', 'winner', 'season', 'date', 'team0', 'team0_encoded', 'team1_encoded', 'restDays_team0', 'restDays_team1']

cols_to_keep = [col for col in df.columns if "rolling" in col or col in exceptions]

df_filtered = df[cols_to_keep]

### Determine Team 1 Winner

As mentioned in our presentation, each dataframe possesses a column relating to team1_winner. This will have a value of 1 if team1 wins and a value of 0 if team1 loses.

In [77]:
df['team1_winner'] = df.apply(lambda row: 1 if row['winner'] == row['team1'] else 0, axis=1)
df_filtered['team1_winner'] = df_filtered.apply(lambda row: 1 if row['winner'] == row['team1'] else 0, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['team1_winner'] = df_filtered.apply(lambda row: 1 if row['winner'] == row['team1'] else 0, axis=1)


In [78]:
df_filtered

Unnamed: 0,team0,team1,winner,season,date,team0_encoded,team1_encoded,restDays_team0,restDays_team1,mp_rolling_team0,...,blk%_rolling_team1,tov%_rolling_team0,tov%_rolling_team1,ortg_rolling_team0,ortg_rolling_team1,drtg_rolling_team0,drtg_rolling_team1,ft/fga_rolling_team0,ft/fga_rolling_team1,team1_winner
76,HOU,MEM,MEM,2018,2017-10-28,29,28,0.0,1.0,240.0,...,8.56,14.62,13.88,107.76,106.94,101.62,101.52,0.2350,0.3258,1
78,CLE,NOP,NOP,2018,2017-10-28,6,30,2.0,1.0,240.0,...,10.30,14.56,14.98,111.52,106.46,110.56,109.54,0.2222,0.2284,1
79,PHO,POR,POR,2018,2017-10-28,24,17,2.0,1.0,240.0,...,10.88,15.82,11.82,97.08,115.18,113.24,100.62,0.2390,0.2568,1
80,PHI,DAL,PHI,2018,2017-10-28,5,27,2.0,1.0,240.0,...,5.68,15.42,11.76,99.38,100.78,106.62,110.18,0.1702,0.2058,0
83,SAS,IND,IND,2018,2017-10-29,26,7,1.0,3.0,240.0,...,6.66,12.78,12.66,104.44,111.82,101.60,113.66,0.2076,0.2278,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8463,MIL,BOS,BOS,2024,2024-03-20,10,2,2.0,1.0,240.0,...,12.90,9.52,7.76,123.72,133.44,125.26,111.12,0.2296,0.1402,1
8464,PHI,PHO,PHO,2024,2024-03-20,5,24,1.0,2.0,240.0,...,8.00,11.72,11.66,104.76,120.48,107.80,124.40,0.1580,0.1348,1
8465,MIA,CLE,MIA,2024,2024-03-20,11,6,1.0,1.0,240.0,...,7.20,9.82,12.02,106.04,115.26,107.38,118.06,0.1430,0.1680,0
8466,LAC,POR,LAC,2024,2024-03-20,22,17,2.0,1.0,240.0,...,8.22,12.54,14.36,117.64,109.26,125.26,120.46,0.2178,0.1768,0


In [79]:
df_filtered.reset_index(drop=True, inplace=True)

In [80]:
df_filtered

Unnamed: 0,team0,team1,winner,season,date,team0_encoded,team1_encoded,restDays_team0,restDays_team1,mp_rolling_team0,...,blk%_rolling_team1,tov%_rolling_team0,tov%_rolling_team1,ortg_rolling_team0,ortg_rolling_team1,drtg_rolling_team0,drtg_rolling_team1,ft/fga_rolling_team0,ft/fga_rolling_team1,team1_winner
0,HOU,MEM,MEM,2018,2017-10-28,29,28,0.0,1.0,240.0,...,8.56,14.62,13.88,107.76,106.94,101.62,101.52,0.2350,0.3258,1
1,CLE,NOP,NOP,2018,2017-10-28,6,30,2.0,1.0,240.0,...,10.30,14.56,14.98,111.52,106.46,110.56,109.54,0.2222,0.2284,1
2,PHO,POR,POR,2018,2017-10-28,24,17,2.0,1.0,240.0,...,10.88,15.82,11.82,97.08,115.18,113.24,100.62,0.2390,0.2568,1
3,PHI,DAL,PHI,2018,2017-10-28,5,27,2.0,1.0,240.0,...,5.68,15.42,11.76,99.38,100.78,106.62,110.18,0.1702,0.2058,0
4,SAS,IND,IND,2018,2017-10-29,26,7,1.0,3.0,240.0,...,6.66,12.78,12.66,104.44,111.82,101.60,113.66,0.2076,0.2278,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7903,MIL,BOS,BOS,2024,2024-03-20,10,2,2.0,1.0,240.0,...,12.90,9.52,7.76,123.72,133.44,125.26,111.12,0.2296,0.1402,1
7904,PHI,PHO,PHO,2024,2024-03-20,5,24,1.0,2.0,240.0,...,8.00,11.72,11.66,104.76,120.48,107.80,124.40,0.1580,0.1348,1
7905,MIA,CLE,MIA,2024,2024-03-20,11,6,1.0,1.0,240.0,...,7.20,9.82,12.02,106.04,115.26,107.38,118.06,0.1430,0.1680,0
7906,LAC,POR,LAC,2024,2024-03-20,22,17,2.0,1.0,240.0,...,8.22,12.54,14.36,117.64,109.26,125.26,120.46,0.2178,0.1768,0


In [82]:
#save_df(df_filtered, "rolling_average_10_day.csv")
save_df(df_filtered, "rolling_average_5_day.csv")