In [46]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


In [47]:
# Reading in the CSV files
season2018_19 = pd.read_csv('NBA_Box_Scores_2018_19.csv')
season2019_20 = pd.read_csv('NBA_Box_Scores_2019_20.csv')
season2020_21 = pd.read_csv('NBA_Box_Scores_2020_21.csv')
season2021_22 = pd.read_csv('NBA_Box_Scores_2021_22.csv')
season2022_23 = pd.read_csv('NBA_Box_Scores_2022_23.csv')

In [48]:
# Checking column datatypes
season2018_19.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2460 entries, 0 to 2459
Data columns (total 24 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TEAM       2460 non-null   object 
 1   MATCH UP   2460 non-null   object 
 2   GAME DATE  2460 non-null   object 
 3   W/L        2460 non-null   object 
 4   MIN        2460 non-null   int64  
 5   PTS        2460 non-null   int64  
 6   FGM        2460 non-null   int64  
 7   FGA        2460 non-null   int64  
 8   FG%        2460 non-null   float64
 9   3:00 PM    2460 non-null   int64  
 10  3PA        2460 non-null   int64  
 11  3P%        2460 non-null   float64
 12  FTM        2460 non-null   int64  
 13  FTA        2460 non-null   int64  
 14  FT%        2460 non-null   float64
 15  OREB       2460 non-null   int64  
 16  DREB       2460 non-null   int64  
 17  REB        2460 non-null   int64  
 18  AST        2460 non-null   int64  
 19  STL        2460 non-null   int64  
 20  BLK     

# Data Preprocessing

In [49]:
# Fixing column name
season2018_19.rename({'3:00 PM':'3PM', '"+/-"':'plus_minus'}, axis=1, inplace=True)
season2019_20.rename({'3:00 PM':'3PM', '"+/-"':'plus_minus'}, axis=1, inplace=True)
season2020_21.rename({'3:00 PM':'3PM', '"+/-"':'plus_minus'}, axis=1, inplace=True)
season2021_22.rename({'3:00 PM':'3PM', '"+/-"':'plus_minus'}, axis=1, inplace=True)
season2022_23.rename({'3:00 PM':'3PM', '"+/-"':'plus_minus'}, axis=1, inplace=True)

# Changing Date column to datetime 
season2018_19['GAME DATE'] = season2018_19['GAME DATE'].astype('datetime64[ns]')
season2019_20['GAME DATE'] = season2019_20['GAME DATE'].astype('datetime64[ns]')
season2020_21['GAME DATE'] = season2020_21['GAME DATE'].astype('datetime64[ns]')
season2021_22['GAME DATE'] = season2021_22['GAME DATE'].astype('datetime64[ns]')
season2022_23['GAME DATE'] = season2022_23['GAME DATE'].astype('datetime64[ns]')

In [50]:
season2019_20.head(25)

Unnamed: 0,TEAM,MATCH UP,GAME DATE,W/L,MIN,PTS,FGM,FGA,FG%,3PM,...,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,PF,plus_minus
0,HOU,HOU vs. PHI,2020-08-14,L,240,96,35,80,43.8,12,...,87.5,1,26,27,25,10,9,17,18,-38
1,PHI,PHI @ HOU,2020-08-14,W,240,134,49,87,56.3,18,...,81.8,8,43,51,31,8,4,16,19,38
2,IND,IND vs. MIA,2020-08-14,W,240,109,43,89,48.3,15,...,72.7,12,39,51,32,11,3,20,27,17
3,MIA,MIA @ IND,2020-08-14,L,240,92,31,87,35.6,10,...,74.1,14,34,48,14,6,5,19,15,-17
4,LAC,LAC vs. OKC,2020-08-14,W,265,107,34,85,40.0,12,...,69.2,12,49,61,23,10,2,23,22,4
5,OKC,OKC @ LAC,2020-08-14,L,265,103,38,106,35.8,14,...,59.1,11,37,48,18,5,6,16,29,-4
6,TOR,TOR vs. DEN,2020-08-14,W,240,117,45,90,50.0,18,...,64.3,13,38,51,27,8,5,19,24,8
7,DEN,DEN @ TOR,2020-08-14,L,240,109,36,87,41.4,16,...,91.3,9,32,41,27,9,11,16,15,-8
8,DAL,DAL @ PHX,2020-08-13,L,240,102,39,89,43.8,8,...,84.2,11,32,43,17,3,4,11,18,-26
9,MEM,MEM vs. MIL,2020-08-13,W,240,119,45,90,50.0,13,...,69.6,9,46,55,36,4,3,18,18,13


In [51]:
# For a single game between 2 teams, there are 2 rows that exists: one for the home team and one for the away team
# We want to make this into one row

def fixing_dataframe(df):
    # Filtering for home and away games
    away_games = df[df['MATCH UP'].str.contains('@')]
    home_games = df[~df['MATCH UP'].str.contains('@')]

    # Getting a list of the desired columns
    desired_columns = list(home_games.columns[3:])
    
    # Formatting the MATCH UP column of the away games so that it matches the home games
    teams = away_games[['MATCH UP', 'GAME DATE']]
    teams['away'] = teams['MATCH UP'].apply(lambda x: x[0:3])
    teams['home'] = teams['MATCH UP'].apply(lambda x: x[6:])
    teams['MATCH UP FIXED'] = teams['home'] + " vs. " + teams['away']

    fixed_away_games = away_games.merge(teams, on=['MATCH UP', 'GAME DATE'], how='inner')
    
    # Renaming columns that contain stats to indicate whether it's from the home or away team
    for name in desired_columns:
        fixed_away_games.rename({name:name+'_away'}, axis = 1, inplace=True)
        home_games.rename({name:name+'_home'}, axis = 1, inplace=True)

    # Creating the final dataframe
    final = fixed_away_games.merge(home_games, left_on=['MATCH UP FIXED', 'GAME DATE'], right_on=['MATCH UP', 'GAME DATE'], how='inner')
    final.rename({'MATCH_UP_x':'MATCH UP'} ,axis=1, inplace=True)
    
    # Only choosing the columns that we want
    return final[['MATCH UP_x', 'home', 'away','GAME DATE', 'W/L_away', 'MIN_away', 'PTS_away',
       'FGM_away', 'FGA_away', 'FG%_away', '3PM_away', '3PA_away', '3P%_away',
       'FTM_away', 'FTA_away', 'FT%_away', 'OREB_away', 'DREB_away',
       'REB_away', 'AST_away', 'STL_away', 'BLK_away', 'TOV_away', 'PF_away',
       'plus_minus_away', 'W/L_home', 'MIN_home', 'PTS_home', 'FGM_home',
       'FGA_home', 'FG%_home', '3PM_home', '3PA_home', '3P%_home', 'FTM_home',
       'FTA_home', 'FT%_home', 'OREB_home', 'DREB_home', 'REB_home',
       'AST_home', 'STL_home', 'BLK_home', 'TOV_home', 'PF_home',
       'plus_minus_home']]

In [52]:
season2018_19 = fixing_dataframe(season2018_19)
season2019_20 = fixing_dataframe(season2019_20)
season2020_21 = fixing_dataframe(season2020_21)
season2021_22 = fixing_dataframe(season2021_22)
season2022_23 = fixing_dataframe(season2022_23)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  teams['away'] = teams['MATCH UP'].apply(lambda x: x[0:3])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  teams['home'] = teams['MATCH UP'].apply(lambda x: x[6:])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  teams['MATCH UP FIXED'] = teams['home'] + " vs. " + teams['away']
A value is trying to be 

In [53]:
season2022_23

Unnamed: 0,MATCH UP_x,home,away,GAME DATE,W/L_away,MIN_away,PTS_away,FGM_away,FGA_away,FG%_away,...,FT%_home,OREB_home,DREB_home,REB_home,AST_home,STL_home,BLK_home,TOV_home,PF_home,plus_minus_home
0,MIL @ TOR,TOR,MIL,2023-04-09,L,240,105,38,82,46.3,...,87.5,7,38,45,29,8,2,9,19,16
1,SAC @ DEN,DEN,SAC,2023-04-09,L,240,95,37,85,43.5,...,72.0,15,36,51,25,11,2,20,15,14
2,MEM @ OKC,OKC,MEM,2023-04-09,L,240,100,40,92,43.5,...,81.3,8,41,49,30,5,2,16,17,15
3,ATL @ BOS,BOS,ATL,2023-04-09,L,240,114,44,97,45.4,...,78.6,12,35,47,33,5,3,15,13,6
4,NOP @ MIN,MIN,NOP,2023-04-09,L,240,108,40,90,44.4,...,71.0,8,34,42,25,7,8,17,22,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,HOU @ ATL,ATL,HOU,2022-10-19,L,240,107,42,98,42.9,...,83.3,4,34,38,30,12,5,9,18,10
1226,NOP @ BKN,BKN,NOP,2022-10-19,W,240,130,50,102,49.0,...,75.0,9,30,39,22,10,10,16,25,-22
1227,POR @ SAC,SAC,POR,2022-10-19,W,240,115,39,88,44.3,...,68.4,4,37,41,27,8,5,16,25,-7
1228,LAL @ GSW,GSW,LAL,2022-10-18,L,240,109,40,94,42.6,...,73.9,11,37,48,31,11,4,18,23,14
