# Do Great Players Make Their Teammates Better?

## Load Libraries and Data

In [32]:
# load libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [33]:
# Read one play-by-play CSV per season. Each frame keeps its own name because
# the concatenation cell refers to the three seasons individually.
season_csv_paths = (
    "data/2017-2018 NBA Play-By-Play Data.csv",
    "data/2018-2019 NBA Play-By-Play Data.csv",
    "data/2019-2020 NBA Play-By-Play Data.csv",
)
# low_memory=False: parse each file in one pass instead of in chunks, so a
# column's dtype is inferred from all of its values at once.
df_2017, df_2018, df_2019 = [
    pd.read_csv(csv_path, low_memory=False) for csv_path in season_csv_paths
]

In [34]:
# Stack the three season frames row-wise into one frame; ignore_index=True
# discards the per-file row labels and numbers the rows 0..n-1.
season_frames = [df_2017, df_2018, df_2019]
df = pd.concat(season_frames, axis=0, ignore_index=True)

## Inspect Data

In [35]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,game_id,data_set,date,a1,a2,a3,a4,a5,h1,...,type,shot_distance,original_x,original_y,converted_x,converted_y,description,GameFile,Unnamed: 44,Unnamed: 45
0,0,"=""0021701224""",2017-2018 Regular Season,2018-04-11,Stanley Johnson,Anthony Tolliver,Eric Moreland,Luke Kennard,Reggie Jackson,David Nwaba,...,start of period,,,,,,,[2018-04-11]-0021701224-DET@CHI.csv,,
1,1,"=""0021701224""",2017-2018 Regular Season,2018-04-11,Stanley Johnson,Anthony Tolliver,Eric Moreland,Luke Kennard,Reggie Jackson,David Nwaba,...,jump ball,,,,,,Jump Ball Felicio vs. Moreland: Tip to Markkanen,[2018-04-11]-0021701224-DET@CHI.csv,,
2,2,"=""0021701224""",2017-2018 Regular Season,2018-04-11,Stanley Johnson,Anthony Tolliver,Eric Moreland,Luke Kennard,Reggie Jackson,David Nwaba,...,unknown,6.0,9.0,58.0,25.9,83.2,Nwaba 6' Driving Floating Jump Shot (2 PTS) (M...,[2018-04-11]-0021701224-DET@CHI.csv,,
3,3,"=""0021701224""",2017-2018 Regular Season,2018-04-11,Stanley Johnson,Anthony Tolliver,Eric Moreland,Luke Kennard,Reggie Jackson,David Nwaba,...,unknown,,,,,,Jackson Out of Bounds - Bad Pass Turnover Turn...,[2018-04-11]-0021701224-DET@CHI.csv,,
4,4,"=""0021701224""",2017-2018 Regular Season,2018-04-11,Stanley Johnson,Anthony Tolliver,Eric Moreland,Luke Kennard,Reggie Jackson,David Nwaba,...,Jump Shot,27.0,123.0,243.0,37.3,64.7,MISS Markkanen 27' 3PT Jump Shot,[2018-04-11]-0021701224-DET@CHI.csv,,


In [36]:
# List every column label of the combined frame.
column_labels = df.columns
print("columns:\n", column_labels)

columns:
 Index(['Unnamed: 0', 'game_id', 'data_set', 'date', 'a1', 'a2', 'a3', 'a4',
       'a5', 'h1', 'h2', 'h3', 'h4', 'h5', 'period', 'away_score',
       'home_score', 'remaining_time', 'elapsed', 'play_length', 'play_id',
       'team', 'event_type', 'assist', 'away', 'home', 'block', 'entered',
       'left', 'num', 'opponent', 'outof', 'player', 'points', 'possession',
       'reason', 'result', 'steal', 'type', 'shot_distance', 'original_x',
       'original_y', 'converted_x', 'converted_y', 'description', 'GameFile',
       'Unnamed: 44', 'Unnamed: 45'],
      dtype='object')


In [37]:
# Summarize the combined frame: index range, column names and each column's dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1764069 entries, 0 to 1764068
Data columns (total 48 columns):
 #   Column          Dtype  
---  ------          -----  
 0   Unnamed: 0      int64  
 1   game_id         object 
 2   data_set        object 
 3   date            object 
 4   a1              object 
 5   a2              object 
 6   a3              object 
 7   a4              object 
 8   a5              object 
 9   h1              object 
 10  h2              object 
 11  h3              object 
 12  h4              object 
 13  h5              object 
 14  period          int64  
 15  away_score      int64  
 16  home_score      int64  
 17  remaining_time  object 
 18  elapsed         object 
 19  play_length     object 
 20  play_id         int64  
 21  team            object 
 22  event_type      object 
 23  assist          object 
 24  away            object 
 25  home            object 
 26  block           object 
 27  entered         object 
 28  left        

## Data Cleaning

In [38]:
# Drop columns that carry no play-by-play information. In the rows shown by
# df.head(), 'Unnamed: 0' just repeats the row number and 'Unnamed: 44' /
# 'Unnamed: 45' are blank (presumably CSV export artifacts -- confirm against
# the raw files).
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the columns have already been removed from df.
df.drop(columns=['Unnamed: 0', 'Unnamed: 44', 'Unnamed: 45'], inplace=True, errors='ignore')

In [39]:
# Replace missing values column by column: a play without points counts as 0,
# and a play without an assisting player is labelled with the string 'None'.
missing_value_defaults = {'points': 0, 'assist': 'None'}
for column_name, default_value in missing_value_defaults.items():
    df[column_name] = df[column_name].fillna(default_value)

In [40]:
# convert columns to correct data types
def _clock_to_seconds(values: pd.Series) -> pd.Series:
    """Convert game-clock values to a float number of seconds.

    Accepts plain numbers as well as "MM:SS" and "H:MM:SS" strings (surrounding
    whitespace is ignored). Missing or unparseable values become NaN.
    """
    if pd.api.types.is_numeric_dtype(values):
        # Already numeric (e.g. this cell is being re-run): nothing to parse.
        return pd.to_numeric(values, errors='coerce')
    text = values.astype(str).str.strip().where(values.notna())
    colon_count = text.str.count(":")
    # pd.to_timedelta only parses colon-separated durations in full H:MM:SS
    # form, so two-part "MM:SS" values get a zero hour prepended first.
    hms_text = text.mask(colon_count == 1, "00:" + text).where(colon_count >= 1)
    clock_seconds = pd.to_timedelta(hms_text, errors='coerce').dt.total_seconds()
    # Values without any colon are treated as an already-numeric second count.
    plain_seconds = pd.to_numeric(text.where(colon_count == 0), errors='coerce')
    return clock_seconds.fillna(plain_seconds)


# Measurement columns: anything that is not a number becomes NaN.
for numeric_column in ('shot_distance', 'converted_x', 'converted_y'):
    df[numeric_column] = pd.to_numeric(df[numeric_column], errors='coerce')

# df.info() lists elapsed, play_length and remaining_time as object dtype, so
# they are presumably clock strings rather than plain numbers (TODO confirm the
# exact format). pd.to_numeric(..., errors='coerce') would silently turn such
# strings into NaN, and a fixed "00:" prefix only suits "MM:SS"; the helper
# handles numbers, "MM:SS" and "H:MM:SS" alike.
df['elapsed'] = _clock_to_seconds(df['elapsed'])
df['play_length'] = _clock_to_seconds(df['play_length'])
# remaining_time itself is left untouched; the seconds go into a new column.
df['remaining_time_seconds'] = _clock_to_seconds(df['remaining_time'])

In [None]:
# Discard every row whose event_type is missing, modifying df in place
# (the original author's note counted 1,010 such rows).
df.dropna(axis='index', how='any', subset=['event_type'], inplace=True)

## Identify Top Players