In [107]:
import pandas as pd
import numpy as np
import pprint
import os
import datetime

## Research Question
Can we predict game outcomes using all past seasons' team statistics?
For instance, if teams A and B are playing each other, can we use both teams' statistics from all past seasons to predict who will win?

#### Loading Each Dataset

In [123]:
# Read every CSV file found in the `archive` directory into a DataFrame and
# store it in a dictionary keyed by the part of the file name before the
# first '.'.
datasets = {}
for file_name in os.listdir('archive'):
    # Anything that is not a CSV file is skipped.
    if not file_name.endswith('.csv'):
        continue
    dataset_name = file_name.split('.')[0]
    datasets[dataset_name] = pd.read_csv('archive/' + file_name, low_memory=False)

list(datasets)

['teams', 'players', 'games', 'ranking', 'games_details']

#### Cleaning and Organizing


In [124]:
# Keep only the team id, abbreviation, nickname, city, and arena capacity for
# each team. `.copy()` gives an independent frame, so the column assignment
# below does not write into a selection of the raw frame.
teams_clean = datasets['teams'][['TEAM_ID', 'ABBREVIATION', 'NICKNAME', 'CITY', 'ARENACAPACITY']].copy()

# Replace NaN and 0 capacities with the (rounded) mean capacity.
# The 0 entries are masked out *before* taking the mean so the placeholder
# zeros do not drag the imputed value down, and the fill is applied to the
# ARENACAPACITY column only so no other column is altered.
known_capacity = teams_clean['ARENACAPACITY'].mask(teams_clean['ARENACAPACITY'] == 0)
teams_clean['ARENACAPACITY'] = known_capacity.fillna(known_capacity.mean().round(0))

datasets['teams'] = teams_clean
datasets['teams'].head()

Unnamed: 0,TEAM_ID,ABBREVIATION,NICKNAME,CITY,ARENACAPACITY
0,1610612737,ATL,Hawks,Atlanta,18729.0
1,1610612738,BOS,Celtics,Boston,18624.0
2,1610612740,NOP,Pelicans,New Orleans,18553.0
3,1610612741,CHI,Bulls,Chicago,21711.0
4,1610612742,DAL,Mavericks,Dallas,19200.0


In [125]:
# Preview five random rows of the players table. The fixed seed keeps the
# displayed rows the same on every Restart & Run All.
datasets['players'].sample(5, random_state=0)

Unnamed: 0,PLAYER_NAME,TEAM_ID,PLAYER_ID,SEASON
4655,Courtney Lee,1610612738,201584,2013
1154,Mike Muscala,1610612755,203488,2018
1820,Justin Patton,1610612750,1628383,2017
1900,Kristaps Porzingis,1610612752,204001,2017
2624,Jonathan Gibson,1610612742,1626780,2016


In [126]:
def _clock_to_minutes(clock):
    """Convert an 'MM:SS' string to fractional minutes.

    A value with no ':' separator is treated as whole minutes instead of
    raising IndexError.
    """
    parts = clock.split(':')
    seconds = float(parts[1]) if len(parts) > 1 else 0.0
    return float(parts[0]) + seconds / 60


game_detail_columns = ["GAME_ID", "TEAM_ID", "MIN", "FG_PCT", "FG3_PCT", "FT_PCT",
                       "OREB", "DREB", "REB", "AST", "STL", "BLK", "TO", "PF", "PTS", "PLUS_MINUS"]

# Select the columns of interest from the raw 'games_details' frame, drop rows
# with any missing value, and convert MIN to fractional minutes rounded to
# three decimals. The result is stored under the separate key 'game_details',
# so the raw frame is left untouched and this cell can be re-run safely.
# Building the frame in one chain avoids assigning into a derived frame.
datasets['game_details'] = (
    datasets['games_details'][game_detail_columns]
    .dropna()
    .assign(MIN=lambda details: details['MIN'].map(_clock_to_minutes).round(3))
)
datasets['game_details'].head()

Unnamed: 0,GAME_ID,TEAM_ID,MIN,FG_PCT,FG3_PCT,FT_PCT,OREB,DREB,REB,AST,STL,BLK,TO,PF,PTS,PLUS_MINUS
0,22200477,1610612759,18.1,1.0,0.0,0.0,1.0,1.0,2.0,0.0,1.0,0.0,2.0,5.0,2.0,-2.0
1,22200477,1610612759,31.017,0.5,0.5,0.7,6.0,3.0,9.0,6.0,1.0,0.0,2.0,1.0,23.0,-14.0
2,22200477,1610612759,21.7,0.667,0.0,1.0,1.0,3.0,4.0,1.0,1.0,0.0,2.0,4.0,13.0,-4.0
3,22200477,1610612759,30.333,0.308,0.167,1.0,0.0,9.0,9.0,5.0,3.0,0.0,2.0,1.0,10.0,-18.0
4,22200477,1610612759,27.733,0.583,0.333,1.0,0.0,2.0,2.0,3.0,0.0,0.0,2.0,2.0,19.0,0.0


In [127]:
# Keep only the ranking columns of interest and parse STANDINGSDATE into
# datetimes. `.assign` builds a new frame, so the date column is never written
# into a column selection of the raw frame (which is what triggers pandas'
# SettingWithCopyWarning).
datasets['ranking'] = (
    datasets['ranking'][["TEAM_ID", "SEASON_ID", "STANDINGSDATE", "CONFERENCE", "G", "W_PCT"]]
    .assign(STANDINGSDATE=lambda ranking: pd.to_datetime(ranking["STANDINGSDATE"]))
)
datasets['ranking'].head(5)


Unnamed: 0,TEAM_ID,SEASON_ID,STANDINGSDATE,CONFERENCE,G,W_PCT
0,1610612743,22022,2022-12-22,West,30,0.633
1,1610612763,22022,2022-12-22,West,30,0.633
2,1610612740,22022,2022-12-22,West,31,0.613
3,1610612756,22022,2022-12-22,West,32,0.594
4,1610612746,22022,2022-12-22,West,33,0.576


In [129]:
# Drop the columns that are not used and parse GAME_DATE_EST into datetimes.
# errors='ignore' makes the cell safe to re-run: on a second run the columns
# are already gone and a plain drop would raise KeyError.
datasets['games'] = (
    datasets['games']
    .drop(columns=['GAME_STATUS_TEXT', "TEAM_ID_home", "TEAM_ID_away"], errors='ignore')
    .assign(GAME_DATE_EST=lambda games: pd.to_datetime(games["GAME_DATE_EST"]))
)
datasets['games'].head()

TypeError: unhashable type: 'list'