In [267]:
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt

import numpy as np
from sklearn.preprocessing import LabelEncoder

In [184]:
# Render every column, with no width-based wrapping, whenever a frame is
# displayed (e.g. df.head()).
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

# To go back to the pandas defaults, run:
# pd.reset_option('display.max_columns')
# pd.reset_option('display.width')

### Load Dataset

In [346]:
# Read the raw game log; lines the CSV parser rejects are skipped.
games = pd.read_csv('data/game2.csv', on_bad_lines='skip')

# Parse the game date, derive the calendar year, order the games by date and
# discard the original text date column.
game_dates = pd.to_datetime(games['gmDate'])
games = (
    games
    .assign(game_date=game_dates, year=game_dates.dt.year)
    .sort_values(by='game_date')
    .drop(columns=['gmDate'])
)

# Rename 'team*' columns to 'away_*' and 'oppt*' columns to 'home_*'
# (the remainder of the name is lower-cased).
side_prefix = {'team': 'away_', 'oppt': 'home_'}
games = games.rename(columns={
    name: side_prefix[name[:4]] + name[4:].lower()
    for name in games.columns
    if name[:4] in side_prefix
})

# 'winner' is 1 when the home team won the game, 0 otherwise.
games['winner'] = games['matchWinner'].eq(games['home_abbr']).astype(int)

# Keep games from 1957 onwards, excluding the 1956-57 season.
from_1957 = games['year'] >= 1957
not_1956_57 = games['season'] != '1956-57'
games = games[from_1957 & not_1956_57]

# Remove exact duplicate rows and renumber the index 0..n-1.
games = games.drop_duplicates().reset_index(drop=True)

In [347]:
# Basic preprocessing: replace categorical columns with integer codes.

# Season type -> integer code.
season_le = LabelEncoder().fit(games['seasonType'])
games['seasonType'] = season_le.transform(games['seasonType'])

# Team abbreviations: fit one encoder on every team that appears as either
# home or away, so both columns share the same code for the same team.
home_teams = games['home_abbr'].unique()
away_teams = games['away_abbr'].unique()
all_teams = list(set(home_teams).union(away_teams))

teams_le = LabelEncoder().fit(all_teams)
for side_column in ('home_abbr', 'away_abbr'):
    games[side_column] = teams_le.transform(games[side_column])

In [348]:
# Remove the columns that are not used as model features.
unused_columns = [
    'season', 'away_loc', 'away_rslt', 'home_loc', 'home_rslt',
    'matchWinner', 'game_date', 'year',
]
games = games.drop(columns=unused_columns)

### Past 10 games

In [None]:
# For every game, collect the 10 most recent EARLIER games of the home team and
# of the away team (games where that team played in either role), oldest first.
# Windows with fewer than 10 earlier games are left-padded with NaN rows.
#
# Outputs:
#   last_home_games, last_away_games : float arrays (n_games, 10, n_columns)
#   labels                           : float array (n_games, 1), the 'winner' column
#
# Notes:
# * `games` was sorted by date and re-indexed 0..n-1 when it was loaded, so
#   row position is used as the time order.
# * The previous implementation re-filtered the whole frame for every row
#   (quadratic in the number of games). A running per-team history gives the
#   same windows in a single pass.
# * The previous `prev_home.drop('winner', axis=1)` discarded its result, so
#   the 'winner' column of past games has always been part of each window.
#   That is kept, so the array width stays games.shape[1].
N_PAST_GAMES = 10

# One float row per game. All remaining columns are expected to be numeric,
# as the float output arrays below already require.
game_rows = games.to_numpy(dtype=float)
n_games, n_columns = game_rows.shape
home_ids = games['home_abbr'].to_numpy()
away_ids = games['away_abbr'].to_numpy()

last_home_games = np.full((n_games, N_PAST_GAMES, n_columns), np.nan)
last_away_games = np.full((n_games, N_PAST_GAMES, n_columns), np.nan)
labels = games['winner'].to_numpy(dtype=float).reshape(-1, 1)

# team id -> positions of that team's most recent games (at most N_PAST_GAMES),
# oldest first. Only games strictly before the current one are present when a
# window is read, so the current game never leaks into its own features.
recent_by_team = {}

for pos in range(n_games):
    sides = ((home_ids[pos], last_home_games), (away_ids[pos], last_away_games))
    for team, windows in sides:
        recent = recent_by_team.get(team)
        if recent:
            # Right-align the history; the leading rows stay NaN.
            windows[pos, N_PAST_GAMES - len(recent):] = game_rows[recent]

    # Record the current game once per distinct participant.
    for team in {home_ids[pos], away_ids[pos]}:
        recent_by_team[team] = (recent_by_team.get(team, []) + [pos])[-N_PAST_GAMES:]

    # Progress indicator, throttled so printing does not dominate the runtime.
    if pos % 1000 == 0 or pos == n_games - 1:
        print("\rIndex " + str(pos) + "/" + str(n_games - 1), end='')

Index 69933/69933
Last row with nan values: 61380


In [None]:
# mask = np.zeros((games.shape[0], ))

# for seq in range(last_home_games.shape[0]):
# 	if np.isnan(last_home_games[seq]).any() or np.isnan(last_away_games[seq]).any():
# 		mask[seq] = 1

# last_home_games_test = last_home_games[mask == 0]
# last_away_games_test = last_away_games[mask == 0]
# labels_test = labels[mask == 0]

In [None]:
# np.save("data/data_np/home_last10_X.npy", last_home_games_test)
# np.save("data/data_np/away_last10_X.npy", last_away_games_test)
# np.save("data/data_np/last10_labels.npy", labels_test)

### Past 3 Matchups

In [410]:
# For every game, collect the 3 most recent EARLIER games between the same two
# teams (regardless of which one was home), oldest first. Windows with fewer
# than 3 earlier matchups are left-padded with NaN rows.
#
# Outputs:
#   last_matchups : float array (n_games, 3, n_columns)
#   labels2       : float array (n_games, 1), the 'winner' column
#
# Notes:
# * `games` was sorted by date and re-indexed 0..n-1 when it was loaded, so
#   row position is used as the time order.
# * The previous implementation re-filtered the whole frame for every row
#   (quadratic in the number of games). A running per-pairing history gives
#   the same windows in a single pass.
# * The previous `prev_match.drop('winner', axis=1)` discarded its result, so
#   the 'winner' column of past matchups has always been part of each window.
#   That is kept, so the array width stays games.shape[1].
N_PAST_MATCHUPS = 3

# One float row per game. All remaining columns are expected to be numeric,
# as the float output array below already requires.
matchup_rows = games.to_numpy(dtype=float)
n_matchup_games, n_matchup_columns = matchup_rows.shape

last_matchups = np.full((n_matchup_games, N_PAST_MATCHUPS, n_matchup_columns), np.nan)
labels2 = games['winner'].to_numpy(dtype=float).reshape(-1, 1)

# unordered team pair -> positions of their most recent meetings (at most
# N_PAST_MATCHUPS), oldest first. The current game is appended only after its
# window has been read, so it never appears in its own features.
recent_by_pair = {}

team_pairs = zip(games['home_abbr'].to_numpy(), games['away_abbr'].to_numpy())
for pos, (home_team, away_team) in enumerate(team_pairs):
    pair = frozenset((home_team, away_team))
    recent = recent_by_pair.get(pair, [])
    if recent:
        # Right-align the history; the leading rows stay NaN.
        last_matchups[pos, N_PAST_MATCHUPS - len(recent):] = matchup_rows[recent]

    recent_by_pair[pair] = (recent + [pos])[-N_PAST_MATCHUPS:]

    # Progress indicator, throttled so printing does not dominate the runtime.
    if pos % 1000 == 0 or pos == n_matchup_games - 1:
        print("\rIndex " + str(pos) + "/" + str(n_matchup_games - 1), end='')

Index 69933/69933

In [None]:
# mask2 = np.zeros((games.shape[0], ))

# for seq in range(last_matchups.shape[0]):
# 	if np.isnan(last_matchups[seq]).any():
# 		mask2[seq] = 1

# last_matchups_test = last_matchups[mask2 == 0]
# labels2_test = labels2[mask2 == 0]

### Summary Stats

In [None]:
# seasonal_games['home_seasonal_win_pct'] = np.nan
# seasonal_games['away_seasonal_win_pct'] = np.nan
# seasonal_games['away_seasonal_pt_diff'] = np.nan
# seasonal_games['home_seasonal_pt_diff'] = np.nan
# seasonal_games['home_seasonal_home_pct'] = np.nan
# seasonal_games['away_seasonal_home_pct'] = np.nan
# seasonal_games['home_seasonal_away_pct'] = np.nan
# seasonal_games['away_seasonal_away_pct'] = np.nan

# count = 0

# teams = seasonal_games['team_abbreviation_home'].unique()
# teams = list(set(seasonal_games['team_abbreviation_home'].unique()) | set(seasonal_games['team_abbreviation_away'].unique()))

# for team in teams:
#     # team_games contains every game team played in, sorted by date
#     team_games = seasonal_games[(seasonal_games["team_abbreviation_home"] == team) | (seasonal_games["team_abbreviation_away"] == team)].sort_values(by='game_date')

#     # home
#     team_games['team_home_wins'] = team_games[team_games["team_abbreviation_home"] == team]['wl_home']
#     team_games['team_home_wins'] = team_games['team_home_wins'].fillna(0)

#     team_games['team_seasonal_home_pct'] = team_games.groupby('season_id')['team_home_wins'].expanding().mean().reset_index(level=0, drop=True).shift(1)
#     team_games.loc[team_games.groupby('season_id')['team_home_wins'].head(1).index, 'team_seasonal_home_pct'] = 0.0

#     team_games['team_seasonal_home_pct'] = round(team_games['team_seasonal_home_pct'], 5)

#     team_games.loc[team_games['team_abbreviation_home'] == team, 'home_seasonal_home_pct'] = team_games['team_seasonal_home_pct']
#     team_games.loc[team_games['team_abbreviation_away'] == team, 'away_seasonal_home_pct'] = team_games['team_seasonal_home_pct']
#     team_games.drop(['team_home_wins', 'team_seasonal_home_pct'], axis=1, inplace=True)

#     # away
#     team_games['team_away_wins'] = team_games[team_games["team_abbreviation_away"] == team]['wl_away']
#     team_games['team_away_wins'] = team_games['team_away_wins'].fillna(0)

#     team_games['team_seasonal_away_pct'] = team_games.groupby('season_id')['team_away_wins'].expanding().mean().reset_index(level=0, drop=True).shift(1)
#     team_games.loc[team_games.groupby('season_id')['team_away_wins'].head(1).index, 'team_seasonal_away_pct'] = 0.0

#     team_games['team_seasonal_away_pct'] = round(team_games['team_seasonal_away_pct'], 5)

#     team_games.loc[team_games['team_abbreviation_home'] == team, 'home_seasonal_away_pct'] = team_games['team_seasonal_away_pct']
#     team_games.loc[team_games['team_abbreviation_away'] == team, 'away_seasonal_away_pct'] = team_games['team_seasonal_away_pct']
#     team_games.drop(['team_away_wins', 'team_seasonal_away_pct'], axis=1, inplace=True)

#     # add cumulative mean of wins (w/l) throughout each season
#     team_games['team_wins'] = team_games[team_games["team_abbreviation_home"] == team]['wl_home']
#     team_games['team_wins'] = team_games['team_wins'].fillna(team_games[team_games["team_abbreviation_away"] == team]['wl_away'])

#     team_games['team_seasonal_win_pct'] = team_games.groupby('season_id')['team_wins'].expanding().mean().reset_index(level=0, drop=True).shift(1)
#     team_games.loc[team_games.groupby('season_id')['team_wins'].head(1).index, 'team_seasonal_win_pct'] = 0.0

#     team_games['team_seasonal_win_pct'] = round(team_games['team_seasonal_win_pct'], 5)

#     team_games.loc[team_games['team_abbreviation_home'] == team, 'home_seasonal_win_pct'] = team_games['team_seasonal_win_pct']
#     team_games.loc[team_games['team_abbreviation_away'] == team, 'away_seasonal_win_pct'] = team_games['team_seasonal_win_pct']
#     team_games.drop(['team_wins', 'team_seasonal_win_pct'], axis=1, inplace=True)

#     # add cumulative mean of pts scored throughout each season
#     team_games['team_pts'] = team_games[team_games["team_abbreviation_home"] == team]['pts_home']
#     team_games['team_pts'] = team_games['team_pts'].fillna(team_games[team_games["team_abbreviation_away"] == team]['pts_away'])

#     team_games['opp_pts'] = team_games[team_games["team_abbreviation_home"] == team]['pts_away']
#     team_games['opp_pts'] = team_games['opp_pts'].fillna(team_games[team_games["team_abbreviation_away"] == team]['pts_home'])

#     team_games['pt_diff'] = team_games['team_pts'] - team_games['opp_pts']

#     team_games['team_seasonal_pt_diff'] = team_games.groupby('season_id')['pt_diff'].expanding().mean().reset_index(level=0, drop=True).shift(1)
#     team_games.loc[team_games.groupby('season_id')['team_pts'].head(1).index, 'team_seasonal_pt_diff'] = 0.0

#     team_games['team_seasonal_pt_diff'] = round(team_games['team_seasonal_pt_diff'], 5)

#     team_games.loc[team_games['team_abbreviation_home'] == team, 'home_seasonal_pt_diff'] = team_games['team_seasonal_pt_diff']
#     team_games.loc[team_games['team_abbreviation_away'] == team, 'away_seasonal_pt_diff'] = team_games['team_seasonal_pt_diff']
#     team_games.drop(['team_pts', 'opp_pts', 'pt_diff', 'team_seasonal_pt_diff'], axis=1, inplace=True)

#     seasonal_games = seasonal_games.combine_first(team_games)

#     count += 1
#     print("\rTeam " + str(count) + "/" + str(len(teams)), end='')

# seasonal_games = seasonal_games[['game_date', 'team_name_home', 'team_name_away', 'season_type', 'season_id',
#                                  'year', 'month', 'day', 'home_seasonal_win_pct', 'away_seasonal_win_pct',
#                                  'home_seasonal_pt_diff', 'away_seasonal_pt_diff', 'home_seasonal_home_pct',
#                                  'away_seasonal_home_pct', 'home_seasonal_away_pct', 'away_seasonal_away_pct']]

# print(seasonal_games.shape)


### Save Numpy Datasets

In [413]:
# mask[i] is 1 when game i has any NaN in its home or away last-10 window.
incomplete_last10 = (
    np.isnan(last_home_games).any(axis=(1, 2))
    | np.isnan(last_away_games).any(axis=(1, 2))
)
mask = np.zeros((games.shape[0], ))
mask[np.flatnonzero(incomplete_last10)] = 1

# mask2[i] is 1 when game i has any NaN in its last-3 matchup window.
incomplete_matchups = np.isnan(last_matchups).any(axis=(1, 2))
mask2 = np.zeros((games.shape[0], ))
mask2[np.flatnonzero(incomplete_matchups)] = 1

# 0 only where both windows are complete (values are 0, 1 or 2).
mask_combined = mask + mask2

In [None]:
# Keep only the games whose last-10 and last-3 windows contain no NaN.
complete_rows = mask_combined == 0

last_matchups_test, last_home_games_test, last_away_games_test, labels_test = (
    array[complete_rows]
    for array in (last_matchups, last_home_games, last_away_games, labels)
)

In [421]:
# Write each filtered array to its own .npy file (target path -> array).
arrays_to_save = {
    "data/data_np/home_last10_X.npy": last_home_games_test,
    "data/data_np/away_last10_X.npy": last_away_games_test,
    "data/data_np/last10_labels.npy": labels_test,
    "data/data_np/matchups_last3_X.npy": last_matchups_test,
}

for target_path, array in arrays_to_save.items():
    np.save(target_path, array)