# NBA Game Prediction: Analysis
Written by Nicholas Cannon

In [4]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Relative locations of the processed per-season game files and per-team files.
GAMES_DIR, TEAMS_DIR = (
    os.path.join('..', 'data', 'processed', leaf) for leaf in ('games', 'teams')
)

## Individual Season Analysis

In [2]:
def season_analysis(path):
    """Print a short summary of one season CSV.

    Reads the file at ``path`` and reports its shape, whether it contains
    any missing values, the home/away win split taken from the ``HOME_WIN``
    column (1 counted as a home win, 0 as an away win) and the average of
    the ``H_PTS`` and ``A_PTS`` columns rounded to the nearest integer.

    Args:
        path: Location of the season CSV; it is also echoed in the header.
    """
    df = pd.read_csv(path)
    win_details = df['HOME_WIN'].value_counts()

    # A season in which one outcome never occurs has no entry for that label,
    # so fall back to 0 instead of raising KeyError.
    home_wins = win_details.get(1, 0)
    away_wins = win_details.get(0, 0)

    # The report labels these as "rounded": round to the nearest integer
    # rather than truncating the mean toward zero.
    avg_home_pts = int(round(float(df['H_PTS'].mean())))
    avg_away_pts = int(round(float(df['A_PTS'].mean())))

    print('-' * 60)
    print('DF Analysis: {}'.format(path))
    print('-' * 60)
    print('SHAPE =', df.shape)
    print('NaNs = ', df.isnull().values.any())
    print('Home wins = {}, Away wins = {}'.format(home_wins, away_wins))
    print('Avg home points scored (rounded) = ', avg_home_pts)
    print('Avg away points scored (rounded) = ', avg_away_pts)
    print('-' * 60, end='\n\n')

# Summarise every season file. os.listdir returns entries in arbitrary order,
# so sort them for a reproducible report, and skip anything that is not a CSV
# (a stray hidden or checkpoint entry would otherwise break read_csv).
for season in sorted(os.listdir(GAMES_DIR)):
    if not season.endswith('.csv'):
        continue
    season_analysis(os.path.join(GAMES_DIR, season))

------------------------------------------------------------
DF Analysis: ../data/processed/games/22015-full.csv
------------------------------------------------------------
SHAPE = (1230, 92)
NaNs =  False
Home wins = 724, Away wins = 506
Avg home points scored (rounded) =  104
Avg away points scored (rounded) =  101
------------------------------------------------------------

------------------------------------------------------------
DF Analysis: ../data/processed/games/22014-full.csv
------------------------------------------------------------
SHAPE = (1230, 92)
NaNs =  False
Home wins = 707, Away wins = 523
Avg home points scored (rounded) =  101
Avg away points scored (rounded) =  98
------------------------------------------------------------

------------------------------------------------------------
DF Analysis: ../data/processed/games/22018-full.csv
------------------------------------------------------------
SHAPE = (1230, 92)
NaNs =  False
Home wins = 729, Away wins = 5

## Combined Season Analysis

In [10]:
# Load every season CSV (GAME_ID kept as text) and stack them with a single
# concat. Concatenating inside the loop re-copies the accumulated frame on
# every iteration and starts from an empty DataFrame, which is unnecessary.
# Files are read in sorted order so the row order is reproducible.
season_frames = [
    pd.read_csv(os.path.join(GAMES_DIR, season), dtype={'GAME_ID': str})
    for season in sorted(os.listdir(GAMES_DIR))
    if season.endswith('.csv')
]
df = pd.concat(season_frames, ignore_index=True)

# Cell output: how many rows carry each HOME_WIN label across all seasons.
df['HOME_WIN'].value_counts()
# X_AXIS = 'H_REB'
# Y_AXIS = 'A_REB'
# groups = df.groupby('HOME_WIN')
# for name, group in groups:
#     plt.plot(group[X_AXIS], group[Y_AXIS], linestyle='', marker='o', label='Home win' if name else 'Away win')

# plt.xlabel(X_AXIS)
# plt.ylabel(Y_AXIS)
# plt.legend()
# plt.plot()

1    4304
0    3076
Name: HOME_WIN, dtype: int64

In [4]:
# Load the file for team 1610612752 from the teams directory, reading
# GAME_ID as a string column.
game = pd.read_csv(
    os.path.join(TEAMS_DIR, '1610612752.csv'),
    dtype=dict(GAME_ID=str),
)
# Cell output: (rows, columns) of the loaded frame.
game.shape

(492, 55)

In [1]:
# game.iloc[0] # home KNICKS

In [13]:
# Cell output: the SEASON value stored on the first row of the team frame.
game.iloc[0]['SEASON']

2013

In [23]:
# Validate labels: every game should appear once per participating team, and
# the two rows must disagree on WON (exactly one winner).
team_frames = [
    pd.read_csv(os.path.join(TEAMS_DIR, team_id))
    for team_id in sorted(os.listdir(TEAMS_DIR))
    if team_id.endswith('.csv')
]
# Stack all team files in one concat instead of growing a frame in a loop.
df = pd.concat(team_frames, ignore_index=True)

# idx is the position of the game among the groups; it was previously
# undefined, so the first mismatch raised NameError instead of reporting.
for idx, (game_id, data) in enumerate(df.groupby('GAME_ID')):
    # A game without exactly two rows cannot be checked pairwise, so report
    # it rather than failing on the iloc[1] lookup.
    if len(data) != 2 or data.iloc[0]['WON'] == data.iloc[1]['WON']:
        print('Errored at', idx, game_id)

# Cell output: (rows, columns) of the combined team frame.
df.shape

(14760, 55)