In [25]:
import pandas as pd
import numpy as np
import cbbpy.mens_scraper as s

from sklearn.preprocessing import MinMaxScaler

In [26]:
# game_info_df, boxscore_df, pbp_df = s.get_games_season(2024)

In [27]:
try:
    game_info_df = pd.read_csv('raw_data/game_info_df.csv')
    boxscore_df = pd.read_csv('raw_data/boxscore_df.csv')
    pbp_df = pd.read_csv('raw_data/pbp_df.csv')
except:
    game_info_df, boxscore_df, pbp_df = s.get_games_season(2024)
    game_info_df.to_csv('game_info_df.csv',index=False)
    boxscore_df.to_csv('boxscore_df.csv',index=False)
    pbp_df.to_csv('pbp_df.csv',index=False)


## Summary

Assumptions:
* Refs, networks, arena capacity, date, time of day and tournament do not impact score

## Games

### Filter

In [28]:
game_info_df.shape

(5117, 27)

In [29]:
game_info_df.head()

Unnamed: 0,game_id,home_team,home_id,home_rank,home_record,home_score,away_team,away_id,away_rank,away_record,...,game_day,game_time,game_loc,arena,arena_capacity,attendance,tv_network,referee_1,referee_2,referee_3
0,401575451,Kansas Jayhawks,2305,1.0,1-0,99,North Carolina Central Eagles,2428,,0-1,...,"November 06, 2023",05:00 PM PST,"Lawrence, KS",Allen Fieldhouse,0,16300,BIG12|ESPN+,Gerry Pollard,Amy Bonner,Chance Moore
1,401576147,Duke Blue Devils,150,2.0,1-0,92,Dartmouth Big Green,159,,0-1,...,"November 06, 2023",06:00 PM PST,"Durham, NC",Cameron Indoor Stadium,0,9314,ACCN,Jamie Luckie,Ryan Sassano,Clare Aubry
2,401577640,Purdue Boilermakers,2509,3.0,1-0,98,Samford Bulldogs,2535,,0-1,...,"November 06, 2023",03:30 PM PST,"West Lafayette, IN",Mackey Arena,0,14876,BTN,D.J. Carstensen,Edwin Young,Jourdan Love
3,401581835,Michigan State Spartans,127,4.0,0-1,76,James Madison Dukes,256,,1-0,...,"November 06, 2023",05:30 PM PST,"East Lansing, MI",Breslin Center,0,14797,BTN,Brian Dorsey,Chad Barlow,Brian McNutt
4,401577598,Marquette Golden Eagles,269,5.0,1-0,92,Northern Illinois Huskies,2459,,0-1,...,"November 06, 2023",05:30 PM PST,"Milwaukee, WI",Fiserv Forum,0,16352,,Lamar Simpson,Peter Larson,Ed Corliss


In [30]:
game_info_df.columns

Index(['game_id', 'home_team', 'home_id', 'home_rank', 'home_record',
       'home_score', 'away_team', 'away_id', 'away_rank', 'away_record',
       'away_score', 'home_win', 'num_ots', 'is_conference', 'is_neutral',
       'is_postseason', 'tournament', 'game_day', 'game_time', 'game_loc',
       'arena', 'arena_capacity', 'attendance', 'tv_network', 'referee_1',
       'referee_2', 'referee_3'],
      dtype='object')

In [31]:
print('Shape before filtering:', game_info_df.shape)
filtered_df = game_info_df[game_info_df['is_postseason']==False]
# filtered_df = filtered_df[(game_info_df['home_record']!='') & (game_info_df['away_record']!='')]
filtered_df = filtered_df.dropna(subset=['home_record','away_record'])
print('Shape after filtering:', filtered_df.shape)

Shape before filtering: (5117, 27)
Shape after filtering: (4607, 27)


In [32]:
exclude_cols = ['home_win','home_score', 'away_score', 'home_id', 'away_id','num_ots', 'is_postseason', 'tournament', 'game_time', 'game_loc',
       'arena', 'arena_capacity', 'attendance', 'tv_network', 'referee_1',
       'referee_2', 'referee_3']
       
date_col = 'game_day'
onehot_cols = ['home_team','away_team']
bool_cols = ['is_conference','is_neutral']
rank_cols = ['home_rank','away_rank']
records_cols = ['home_record','away_record']
# target_cols = ['home_score','away_score']


### Transform

In [33]:
# Exclude Cols
transformed_df = filtered_df.drop(exclude_cols, axis=1)

# Date col
transformed_df['game_day'] = pd.to_datetime(transformed_df['game_day'])

# One Hot Encode
# transformed_df = pd.get_dummies(transformed_df, columns=onehot_cols, prefix = ['home','away'], dtype=int)

# Convert Bool to Int
for col in bool_cols:
    transformed_df[col] = transformed_df[col].astype(int)

# Replace NULL Ranks with 99
for col in rank_cols:
    transformed_df[col] = transformed_df[col].fillna(99)

# Conver Record cols to Wins and Losses cols
for col in records_cols:
    team_type = col.split('_')[0]
    transformed_df[col] = transformed_df[col].astype(str)
    wins = transformed_df.loc[:,col].apply(lambda x: x.split('-')[0])
    losses = transformed_df.loc[:,col].apply(lambda x: x.split('-')[1])    
    transformed_df[team_type+'Team_wins'] = wins
    transformed_df[team_type+'Team_losses'] = losses
    transformed_df = transformed_df.drop(col,axis=1)

print(transformed_df.shape)

(4607, 12)


### Transformed Preview

In [34]:
transformed_df.head()

Unnamed: 0,game_id,home_team,home_rank,away_team,away_rank,is_conference,is_neutral,game_day,homeTeam_wins,homeTeam_losses,awayTeam_wins,awayTeam_losses
0,401575451,Kansas Jayhawks,1.0,North Carolina Central Eagles,99.0,0,0,2023-11-06,1,0,0,1
1,401576147,Duke Blue Devils,2.0,Dartmouth Big Green,99.0,0,0,2023-11-06,1,0,0,1
2,401577640,Purdue Boilermakers,3.0,Samford Bulldogs,99.0,0,0,2023-11-06,1,0,0,1
3,401581835,Michigan State Spartans,4.0,James Madison Dukes,99.0,0,0,2023-11-06,0,1,1,0
4,401577598,Marquette Golden Eagles,5.0,Northern Illinois Huskies,99.0,0,0,2023-11-06,1,0,0,1


## Boxscores

In [35]:
#TODO: convert home and away teams to vectors in transformed df?
#TODO; convert boxscores to vectors
    # TODO: find top 7-8 players by min played and find avg stats using those players and not whole team

In [36]:
boxscore_df.columns

Index(['game_id', 'team', 'player', 'player_id', 'position', 'starter', 'min',
       'fgm', 'fga', '2pm', '2pa', '3pm', '3pa', 'ftm', 'fta', 'oreb', 'dreb',
       'reb', 'ast', 'stl', 'blk', 'to', 'pf', 'pts'],
      dtype='object')

In [37]:
stat_cols = ['fgm', 'fga', '2pm', '2pa', '3pm', '3pa', 'ftm', 'fta', 'oreb', 'dreb',
       'reb', 'ast', 'stl', 'blk', 'to', 'pf']
exclude_boxscore_cols = ['player', 'player_id', 'position', 'starter', 'min']


### Filter to Boxscore Totals

In [38]:
boxscores_filtered = boxscore_df[(boxscore_df.player_id == 'TOTAL')]

### Normalize Boxscore Stats

In [39]:
# Normalize Stats
scaler = MinMaxScaler()
boxscores_filtered[stat_cols] = scaler.fit_transform(boxscores_filtered[stat_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  boxscores_filtered[stat_cols] = scaler.fit_transform(boxscores_filtered[stat_cols])


In [40]:
boxscores_filtered.head()

Unnamed: 0,game_id,team,player,player_id,position,starter,min,fgm,fga,2pm,...,fta,oreb,dreb,reb,ast,stl,blk,to,pf,pts
12,401575451,North Carolina Central Eagles,TEAM,TOTAL,TOTAL,False,,0.363636,0.558824,0.285714,...,0.352941,0.272727,0.25,0.2875,0.189189,0.304348,0.055556,0.257143,0.325,56
25,401575451,Kansas Jayhawks,TEAM,TOTAL,TOTAL,False,,0.709091,0.54902,0.530612,...,0.196078,0.121212,0.589286,0.4625,0.918919,0.173913,0.166667,0.285714,0.325,99
41,401576147,Dartmouth Big Green,TEAM,TOTAL,TOTAL,False,,0.345455,0.54902,0.306122,...,0.333333,0.30303,0.339286,0.3625,0.297297,0.217391,0.055556,0.371429,0.4,54
54,401576147,Duke Blue Devils,TEAM,TOTAL,TOTAL,False,,0.690909,0.578431,0.632653,...,0.235294,0.151515,0.5,0.4125,0.486486,0.26087,0.333333,0.257143,0.45,92
68,401577640,Samford Bulldogs,TEAM,TOTAL,TOTAL,False,,0.309091,0.666667,0.244898,...,0.27451,0.424242,0.339286,0.4125,0.297297,0.478261,0.0,0.428571,0.475,45


### Convert Stat Columns into One stats_vector

In [41]:
# Get top 8 guys by minutes. Convert their stats into matrices. For input when predicting, will need to find averages and fill in each value
# OR just get their totals?
# boxscores_filtered['stats_vector'] = list(np.array(boxscores_filtered.loc[:,stat_cols]))
# bs_v = boxscores_filtered.drop(stat_cols+exclude_boxscore_cols,axis=1).reset_index(drop=True)
# bs_v
# test_boxscore.assign(new=pd.factorize(df.))

# Ignore vectors for now... 
bs_v = boxscores_filtered.drop(exclude_boxscore_cols,axis=1).reset_index(drop=True)
bs=bs_v.copy()

In [42]:
bs.shape

(10234, 19)

In [58]:
# df_rolling  = df_rolling.groupby(['team'], group_keys=False).apply(find_team_averages)
# Use cumulative instead of rolling
def find_team_averages(team):
    rolling = team.expanding().mean()
    return rolling

def shift_score(team):
    team['prev_scores_cum_avg'] = team['score'].expanding().mean()
    team['prev_scores_cum_avg'] = team['prev_scores_cum_avg'].shift(1)
    return team


In [44]:
pd.DataFrame(data=[('a',1),('a',1),('a',2),('a',1),('a',1),('a',2),('a',1),('a',1),('a',2)],columns=['team','x'])

Unnamed: 0,team,x
0,a,1
1,a,1
2,a,2
3,a,1
4,a,1
5,a,2
6,a,1
7,a,1
8,a,2


In [45]:
a = pd.DataFrame(data=[('a',3),('a',1),('a',2),('a',5),('a',5),('b',2),('b',1),('b',1),('b',2)],columns=['team','x']).groupby('team',group_keys=False).expanding().mean() 
a.groupby('team',group_keys=False)['x'].shift(1)

team   
a     0         NaN
      1    3.000000
      2    2.000000
      3    2.000000
      4    2.750000
b     5         NaN
      6    2.000000
      7    1.500000
      8    1.333333
Name: x, dtype: float64

In [46]:
# bs_grouped = bs.groupby(['team'], group_keys=False)[stat_cols].apply(find_team_averages)
# rolling_cols = [f'{col}_cumul' for col in bs_grouped.columns]
# bs_grouped.columns = rolling_cols

In [47]:
# include_cols = ['game_id','team','pts']+rolling_cols
# bs_grouped_with_game_info = pd.concat([bs_v,bs_grouped],axis=1)#[include_cols]
# bs_grouped_with_game_info.head()

Unnamed: 0,game_id,team,fgm,fga,2pm,2pa,3pm,3pa,ftm,fta,...,ftm_cumul,fta_cumul,oreb_cumul,dreb_cumul,reb_cumul,ast_cumul,stl_cumul,blk_cumul,to_cumul,pf_cumul
0,401575451,North Carolina Central Eagles,0.363636,0.558824,0.285714,0.453333,0.272727,0.442308,0.25,0.352941,...,0.25,0.352941,0.272727,0.25,0.2875,0.189189,0.304348,0.055556,0.257143,0.325
1,401575451,Kansas Jayhawks,0.709091,0.54902,0.530612,0.44,0.590909,0.442308,0.2,0.196078,...,0.2,0.196078,0.121212,0.589286,0.4625,0.918919,0.173913,0.166667,0.285714,0.325
2,401576147,Dartmouth Big Green,0.345455,0.54902,0.306122,0.506667,0.181818,0.346154,0.3,0.333333,...,0.3,0.333333,0.30303,0.339286,0.3625,0.297297,0.217391,0.055556,0.371429,0.4
3,401576147,Duke Blue Devils,0.690909,0.578431,0.632653,0.56,0.318182,0.326923,0.225,0.235294,...,0.225,0.235294,0.151515,0.5,0.4125,0.486486,0.26087,0.333333,0.257143,0.45
4,401577640,Samford Bulldogs,0.309091,0.666667,0.244898,0.493333,0.227273,0.596154,0.15,0.27451,...,0.15,0.27451,0.424242,0.339286,0.4125,0.297297,0.478261,0.0,0.428571,0.475


In [80]:
# Join in transformed on game ID and the teams

include_cols = ['game_id','team','pts']+stat_cols
home_boxscores_df = transformed_df.rename(columns={'home_team':'team'})
home_boxscores_df['home_team']=home_boxscores_df['team']

away_boxscores_df = transformed_df.rename(columns={'away_team':'team'})
away_boxscores_df['away_team']=away_boxscores_df['team']

home_boxscores = pd.merge(bs, home_boxscores_df, how='left', on=['game_id','team'], validate='one_to_one').dropna()
# home_boxscores = home_boxscores.drop(['team'],axis=1)
away_boxscores = pd.merge(bs, away_boxscores_df, how='left', on=['game_id','team'], validate='one_to_one').dropna()[include_cols]
# combined_boxscores = pd.merge(home_boxscores,away_boxscores,on='game_id',validate='one_to_one',suffixes=('_home','_away'))
combined_boxscores = pd.merge(home_boxscores,away_boxscores,on='game_id',validate='one_to_one',suffixes=('_home','_away'))
combined_boxscores

Unnamed: 0,game_id,team_home,fgm_home,fga_home,2pm_home,2pa_home,3pm_home,3pa_home,ftm_home,fta_home,...,ftm_away,fta_away,oreb_away,dreb_away,reb_away,ast_away,stl_away,blk_away,to_away,pf_away
0,401575451,Kansas Jayhawks,0.709091,0.549020,0.530612,0.440000,0.590909,0.442308,0.200,0.196078,...,0.250,0.352941,0.272727,0.250000,0.2875,0.189189,0.304348,0.055556,0.257143,0.325
1,401576147,Duke Blue Devils,0.690909,0.578431,0.632653,0.560000,0.318182,0.326923,0.225,0.235294,...,0.300,0.333333,0.303030,0.339286,0.3625,0.297297,0.217391,0.055556,0.371429,0.400
2,401577640,Purdue Boilermakers,0.618182,0.607843,0.367347,0.440000,0.727273,0.557692,0.350,0.431373,...,0.150,0.274510,0.424242,0.339286,0.4125,0.297297,0.478261,0.000000,0.428571,0.475
3,401581835,Michigan State Spartans,0.472727,0.705882,0.510204,0.693333,0.045455,0.384615,0.575,0.725490,...,0.525,0.549020,0.424242,0.660714,0.6375,0.378378,0.217391,0.222222,0.428571,0.800
4,401577598,Marquette Golden Eagles,0.636364,0.656863,0.530612,0.560000,0.409091,0.480769,0.325,0.411765,...,0.275,0.313725,0.545455,0.392857,0.5000,0.270270,0.260870,0.222222,0.400000,0.325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4602,401577676,Columbia Lions,0.472727,0.539216,0.469388,0.480000,0.136364,0.365385,0.225,0.294118,...,0.275,0.313725,0.333333,0.464286,0.4625,0.405405,0.347826,0.277778,0.514286,0.350
4603,401587749,Milwaukee Panthers,0.472727,0.715686,0.326531,0.520000,0.454545,0.653846,0.450,0.411765,...,0.375,0.372549,0.606061,0.500000,0.6000,0.432432,0.347826,0.444444,0.285714,0.475
4604,401596915,Fordham Rams,0.490909,0.509804,0.387755,0.440000,0.363636,0.365385,0.425,0.470588,...,0.575,0.529412,0.303030,0.321429,0.3500,0.270270,0.521739,0.222222,0.314286,0.500
4605,401594303,Quinnipiac Bobcats,0.563636,0.647059,0.326531,0.440000,0.681818,0.634615,0.100,0.117647,...,0.400,0.392157,0.272727,0.446429,0.4250,0.378378,0.434783,0.277778,0.257143,0.275


In [81]:
combined_boxscores.columns

Index(['game_id', 'team_home', 'fgm_home', 'fga_home', '2pm_home', '2pa_home',
       '3pm_home', '3pa_home', 'ftm_home', 'fta_home', 'oreb_home',
       'dreb_home', 'reb_home', 'ast_home', 'stl_home', 'blk_home', 'to_home',
       'pf_home', 'pts_home', 'home_rank', 'away_team', 'away_rank',
       'is_conference', 'is_neutral', 'game_day', 'homeTeam_wins',
       'homeTeam_losses', 'awayTeam_wins', 'awayTeam_losses', 'home_team',
       'team_away', 'pts_away', 'fgm_away', 'fga_away', '2pm_away', '2pa_away',
       '3pm_away', '3pa_away', 'ftm_away', 'fta_away', 'oreb_away',
       'dreb_away', 'reb_away', 'ast_away', 'stl_away', 'blk_away', 'to_away',
       'pf_away'],
      dtype='object')

In [82]:
combined_boxscores.loc[combined_boxscores['game_id']==401575451,:]

Unnamed: 0,game_id,team_home,fgm_home,fga_home,2pm_home,2pa_home,3pm_home,3pa_home,ftm_home,fta_home,...,ftm_away,fta_away,oreb_away,dreb_away,reb_away,ast_away,stl_away,blk_away,to_away,pf_away
0,401575451,Kansas Jayhawks,0.709091,0.54902,0.530612,0.44,0.590909,0.442308,0.2,0.196078,...,0.25,0.352941,0.272727,0.25,0.2875,0.189189,0.304348,0.055556,0.257143,0.325


In [83]:
combined_boxscores.to_csv('processed_data/combined_boxscores.csv',index=False)

In [49]:
# combined_boxscores.loc[combined_boxscores['game_id']==401606133,['fgm_rolling_home','away_team','team_away','home_team','team_home','pts_home','pts_away']]

KeyError: "['fgm_rolling_home'] not in index"

In [60]:
# new_bs = combined_boxscores.copy()
team_cols = ['away_team','team_away','home_team','team_home','pts_home','pts_away']
home_rows = []
away_rows = []
for i, row in combined_boxscores.iterrows():    
    row['is_home']=1
    row['team']=row['home_team']
    row['score_other'] = row['pts_away']
    row['score']=row['pts_home']
    home_rows.append(row.copy())

    row['is_home']=0
    row['team']=row['away_team']
    row['score_other'] = row['pts_home']
    row['score']=row['pts_away']
    away_rows.append(row.copy())

final_df = pd.concat([pd.DataFrame(home_rows), pd.DataFrame(away_rows)])
final_df = final_df.drop(team_cols,axis=1)
    


In [61]:
final_df.columns

Index(['game_id', 'fgm', 'fga', '2pm', '2pa', '3pm', '3pa', 'ftm', 'fta',
       'oreb', 'dreb', 'reb', 'ast', 'stl', 'blk', 'to', 'pf',
       'fgm_cumul_home', 'fga_cumul_home', '2pm_cumul_home', '2pa_cumul_home',
       '3pm_cumul_home', '3pa_cumul_home', 'ftm_cumul_home', 'fta_cumul_home',
       'oreb_cumul_home', 'dreb_cumul_home', 'reb_cumul_home',
       'ast_cumul_home', 'stl_cumul_home', 'blk_cumul_home', 'to_cumul_home',
       'pf_cumul_home', 'home_rank', 'away_rank', 'is_conference',
       'is_neutral', 'game_day', 'homeTeam_wins', 'homeTeam_losses',
       'awayTeam_wins', 'awayTeam_losses', 'fgm_cumul_away', 'fga_cumul_away',
       '2pm_cumul_away', '2pa_cumul_away', '3pm_cumul_away', '3pa_cumul_away',
       'ftm_cumul_away', 'fta_cumul_away', 'oreb_cumul_away',
       'dreb_cumul_away', 'reb_cumul_away', 'ast_cumul_away', 'stl_cumul_away',
       'blk_cumul_away', 'to_cumul_away', 'pf_cumul_away', 'is_home', 'team',
       'score_other', 'score'],
      dtype='objec

In [62]:
final_df.loc[final_df['team']=='Kansas Jayhawks',['game_day','game_id','score_other','score']].sort_values(by='game_day')

Unnamed: 0,game_day,game_id,score_other,score
0,2023-11-06,401575451,56,99
170,2023-11-10,401575452,61,99
346,2023-11-14,401575453,84,89
709,2023-11-21,401581595,73,59
758,2023-11-22,401581599,60,69
1021,2023-11-28,401575454,63,71
1146,2023-12-01,401574563,65,69
1304,2023-12-05,401575455,69,88
1422,2023-12-09,401575456,64,73
1624,2023-12-16,401575457,71,75


In [63]:
final_df = final_df.groupby('team',group_keys=False).apply(shift_score)

  final_df = final_df.groupby('team',group_keys=False).apply(shift_score)


In [64]:
final_df.loc[final_df['team']=='Kansas Jayhawks',['game_day','game_id','score_other','score','prev_scores_cum_avg']].sort_values(by='game_day')

Unnamed: 0,game_day,game_id,score_other,score,prev_scores_cum_avg
0,2023-11-06,401575451,56,99,
170,2023-11-10,401575452,61,99,99.0
346,2023-11-14,401575453,84,89,99.0
709,2023-11-21,401581595,73,59,79.875
758,2023-11-22,401581599,60,69,95.666667
1021,2023-11-28,401575454,63,71,89.0
1146,2023-12-01,401574563,65,69,85.4
1304,2023-12-05,401575455,69,88,82.666667
1422,2023-12-09,401575456,64,73,83.428571
1624,2023-12-16,401575457,71,75,78.647059


In [None]:
final_df.head()

Unnamed: 0,game_id,fgm,fga,2pm,2pa,3pm,3pa,ftm,fta,oreb,...,stl_rolling_away,blk_rolling_away,to_rolling_away,pf_rolling_away,is_home,team,score_other,score,target,other_team_score
0,401602630,0.472727,0.568627,0.530612,0.653333,0.0,0.173077,0.625,0.627451,0.454545,...,0.26087,0.166667,0.342857,0.516667,1,Baylor Bears,62,77,99.0,61.0
0,401602630,0.472727,0.568627,0.530612,0.653333,0.0,0.173077,0.625,0.627451,0.454545,...,0.26087,0.166667,0.342857,0.516667,0,Gardner-Webb Runnin' Bulldogs,77,62,52.0,59.0
1,401600448,0.345455,0.598039,0.306122,0.52,0.181818,0.423077,0.6,0.666667,0.575758,...,0.318841,0.425926,0.342857,0.45,0,Bryant Bulldogs,66,57,79.0,95.0
1,401600448,0.345455,0.598039,0.306122,0.52,0.181818,0.423077,0.6,0.666667,0.575758,...,0.318841,0.425926,0.342857,0.45,1,Rutgers Scarlet Knights,57,66,71.0,60.0
2,401593971,0.509091,0.588235,0.408163,0.52,0.363636,0.403846,0.525,0.607843,0.212121,...,0.246377,0.055556,0.4,0.6,1,Cincinnati Bearcats,73,85,90.0,66.0


### Save Data

In [65]:
final_df.to_csv('processed_data/data.csv',index=False)