# Do Great Players Make their Teammates Better?

## Load Libraries and Data

In [16]:
# load libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# load data
df_2017 = pd.read_csv("data/2017-2018 NBA Play-By-Play Data.csv", low_memory=False)
df_2018 = pd.read_csv("data/2018-2019 NBA Play-By-Play Data.csv", low_memory=False)
df_2019 = pd.read_csv("data/2019-2020 NBA Play-By-Play Data.csv", low_memory=False)

In [3]:
# concatenate data into one dataframe
df = pd.concat([df_2017, df_2018, df_2019], ignore_index=True)

## Inspect Data

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,game_id,data_set,date,a1,a2,a3,a4,a5,h1,...,type,shot_distance,original_x,original_y,converted_x,converted_y,description,GameFile,Unnamed: 44,Unnamed: 45
0,0,"=""0021701224""",2017-2018 Regular Season,2018-04-11,Stanley Johnson,Anthony Tolliver,Eric Moreland,Luke Kennard,Reggie Jackson,David Nwaba,...,start of period,,,,,,,[2018-04-11]-0021701224-DET@CHI.csv,,
1,1,"=""0021701224""",2017-2018 Regular Season,2018-04-11,Stanley Johnson,Anthony Tolliver,Eric Moreland,Luke Kennard,Reggie Jackson,David Nwaba,...,jump ball,,,,,,Jump Ball Felicio vs. Moreland: Tip to Markkanen,[2018-04-11]-0021701224-DET@CHI.csv,,
2,2,"=""0021701224""",2017-2018 Regular Season,2018-04-11,Stanley Johnson,Anthony Tolliver,Eric Moreland,Luke Kennard,Reggie Jackson,David Nwaba,...,unknown,6.0,9.0,58.0,25.9,83.2,Nwaba 6' Driving Floating Jump Shot (2 PTS) (M...,[2018-04-11]-0021701224-DET@CHI.csv,,
3,3,"=""0021701224""",2017-2018 Regular Season,2018-04-11,Stanley Johnson,Anthony Tolliver,Eric Moreland,Luke Kennard,Reggie Jackson,David Nwaba,...,unknown,,,,,,Jackson Out of Bounds - Bad Pass Turnover Turn...,[2018-04-11]-0021701224-DET@CHI.csv,,
4,4,"=""0021701224""",2017-2018 Regular Season,2018-04-11,Stanley Johnson,Anthony Tolliver,Eric Moreland,Luke Kennard,Reggie Jackson,David Nwaba,...,Jump Shot,27.0,123.0,243.0,37.3,64.7,MISS Markkanen 27' 3PT Jump Shot,[2018-04-11]-0021701224-DET@CHI.csv,,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1764069 entries, 0 to 1764068
Data columns (total 48 columns):
 #   Column          Dtype  
---  ------          -----  
 0   Unnamed: 0      int64  
 1   game_id         object 
 2   data_set        object 
 3   date            object 
 4   a1              object 
 5   a2              object 
 6   a3              object 
 7   a4              object 
 8   a5              object 
 9   h1              object 
 10  h2              object 
 11  h3              object 
 12  h4              object 
 13  h5              object 
 14  period          int64  
 15  away_score      int64  
 16  home_score      int64  
 17  remaining_time  object 
 18  elapsed         object 
 19  play_length     object 
 20  play_id         int64  
 21  team            object 
 22  event_type      object 
 23  assist          object 
 24  away            object 
 25  home            object 
 26  block           object 
 27  entered         object 
 28  left        

## Data Cleaning

In [6]:
# remove nonsensical columns
df.drop(columns=['Unnamed: 0', 'Unnamed: 44', 'Unnamed: 45'], inplace=True)

In [7]:
# handle missing values
df.fillna({
    'points': 0,
    'assist': 'None'
}, inplace=True)

In [8]:
# convert columns to correct data types
df['remaining_time_seconds'] = pd.to_timedelta(df['remaining_time'].str.strip(), errors='coerce').dt.total_seconds()
df['elapsed'] = pd.to_numeric(df['elapsed'], errors='coerce')
df['play_length_seconds'] = pd.to_timedelta(df['play_length'].str.strip(), errors='coerce').dt.total_seconds()
df['points'] = pd.to_numeric(df['points'], errors='coerce').astype(int)
df['shot_distance'] = pd.to_numeric(df['shot_distance'], errors='coerce')
df['converted_x'] = pd.to_numeric(df['converted_x'], errors='coerce')
df['converted_y'] = pd.to_numeric(df['converted_y'], errors='coerce')

In [9]:
# remove rows where there is no value in the event_type field (1,010 instances)
df.dropna(subset=['event_type'], inplace=True)

## Identify Top Players

In [10]:
# identify top players
n=5
top_players_df = (
    df.groupby('player', as_index=False)['points']
      .sum()
      .sort_values(by='points', ascending=False).reset_index(drop=True)
).head(n)

top_players_df.index = range(1, n+1)

top_players_df

Unnamed: 0,player,points
1,James Harden,8533
2,LeBron James,6782
3,Giannis Antetokounmpo,6668
4,Damian Lillard,6639
5,Anthony Davis,6033


In [11]:
top_players_list = top_players_df['player'].tolist()
top_players_list

['James Harden',
 'LeBron James',
 'Giannis Antetokounmpo',
 'Damian Lillard',
 'Anthony Davis']

## Top Player Effects

In [None]:
# some of the top players have played for multiple teams
for top_player in top_players_list:
    teams_scored_for = df.loc[
        (df['player'] == top_player) & (df['points'] > 0), 'team'
    ].dropna().str.strip().unique()
    
    print(f"{top_player} scored for: {teams_scored_for}")

James Harden scored for: ['HOU']
LeBron James scored for: ['CLE' 'LAL']
Giannis Antetokounmpo scored for: ['MIL']
Damian Lillard scored for: ['POR']
Anthony Davis scored for: ['NOP' 'LAL']


In [None]:
results = []
players_on_court = ['a1', 'a2', 'a3', 'a4', 'a5', 'h1', 'h2', 'h3', 'h4', 'h5']

for top_player in top_players_list:
    # determine what teams the player has played for
    teams_scored_for = df.loc[
        (df['player'] == top_player) & (df['points'] > 0), 'team'
    ].dropna().str.strip().unique()
    
    # create a mask for plays when the player is on the court
    mask_active = df[players_on_court].apply(lambda row: top_player in row.values, axis=1)
    
    for team in teams_scored_for:
        team_mask = df['team'].str.strip() == team
        
        # filter for plays that belong to the team
        team_active_plays = df[mask_active & team_mask]
        team_inactive_plays = df[(~mask_active) & team_mask]
        
        # calculate metrics
        active_points = team_active_plays['points'].sum()
        active_minutes = team_active_plays['play_length_seconds'].sum() / 60
        
        inactive_points = team_inactive_plays['points'].sum()
        inactive_minutes = team_inactive_plays['play_length_seconds'].sum() / 60
        
        active_ppm = active_points / active_minutes if active_minutes > 0 else np.nan
        inactive_ppm = inactive_points / inactive_minutes if inactive_minutes > 0 else np.nan
        
        # append the player's metrics to a list
        results.append({
            'player': top_player,
            'team': team,
            'active_ppm': active_ppm,
            'inactive_ppm': inactive_ppm,
            'ppm_difference': active_ppm - inactive_ppm,
            'active_minutes': active_minutes,
            'active_points': active_points,
            'inactive_minutes': inactive_minutes,
            'inactive_points': inactive_points
        })

# create a dataframe
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,player,team,active_ppm,inactive_ppm,ppm_difference,active_minutes,active_points,inactive_minutes,inactive_points
0,James Harden,HOU,4.852789,4.71328,0.139509,4594.883333,22298,1910.983333,9007
1,LeBron James,CLE,4.632699,4.305898,0.326802,1985.883333,9200,4094.616667,17631
2,LeBron James,LAL,4.85597,4.618728,0.237242,2488.483333,12084,3543.4,16366
3,Giannis Antetokounmpo,MIL,5.016517,4.673362,0.343156,3849.483333,19311,2414.75,11285
4,Damian Lillard,POR,4.692194,4.196565,0.49563,4556.716667,21381,1912.516667,8026
5,Anthony Davis,NOP,5.041511,4.789626,0.251885,2376.866667,11983,3335.333333,15975
6,Anthony Davis,LAL,4.793584,4.690642,0.102942,1521.2,7292,4510.683333,21158
