In [1]:
# UNCOMMENT IF RUNNING FIRST TIME: %pip install basketball_reference_web_scraper

In [2]:
# importing an external client used to scrape the basketball_reference website
from basketball_reference_web_scraper import client
from basketball_reference_web_scraper.data import OutputType

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Get advanced season statistics for all players, from the 1951-1952 season to the 2018-2019 season (68 seasons).

# UNCOMMENT IF RUNNING FIRST TIME:
# for i in list(range(1952,2020)):
#    i_year_csv = 'season_end_year_' + str(i) + '.csv'
#    client.players_advanced_season_totals(season_end_year=i, output_type= OutputType.CSV, output_file_path= str(i_year_csv))

In [5]:
# Data Cleaning
# For every season-end year from 1952 to 2019, read that season's CSV and:
# 1) keep only the essential columns, renamed to: playerID, name, position, age, PER, usage%, gp
# 2) add a 'yr' column holding the season-end year (used later to find each player's first season)
# 3) store the resulting dataframe in the season_data dictionary, keyed by season-end year

kept_columns = ['slug', 'name', 'positions', 'age','player_efficiency_rating','usage_percentage', 'games_played']
short_names = {'slug': 'playerID', 'positions': 'position', 'player_efficiency_rating': 'PER', 'usage_percentage': 'usage%', 'games_played': 'gp'}

season_data = {}
for season_end_year in range(1952, 2020):
    season_csv = 'season_end_year_' + str(season_end_year) + '.csv'
    season_data[season_end_year] = (
        pd.read_csv(season_csv)[kept_columns]
        .rename(columns=short_names)
        .assign(yr=season_end_year)
    )


In [6]:
# Previewing the data: the first rows of the 2004 season's dataframe.
# A player can appear on more than one row at this point (see the sample output).
season_data[2004].head()

Unnamed: 0,playerID,name,position,age,PER,usage%,gp,yr
0,abdursh01,Shareef Abdur-Rahim,POWER FORWARD,27,21.2,24.8,53,2004
1,abdursh01,Shareef Abdur-Rahim,POWER FORWARD,27,16.5,23.3,32,2004
2,allenma01,Malik Allen,POWER FORWARD,25,10.5,18.3,45,2004
3,allenra02,Ray Allen,SHOOTING GUARD,28,21.7,27.8,56,2004
4,alstora01,Rafer Alston,POINT GUARD,27,13.7,17.3,82,2004


In [7]:
# Data Cleaning
# Collapsing each season to one row per playerID
# (a player who changed teams mid-season has one row per team).
# Descriptive fields take the first row's value, PER and usage% are the plain mean
# of the player's rows, and games played are summed.
# NOTE(review): the mean is unweighted, so a short stint counts as much as a long one
# - confirm this is the intended way to combine PER / usage%.
stint_rollup = {
    'name': 'first',
    'position': 'first',
    'age': 'first',
    'PER': 'mean',
    'usage%': 'mean',
    'gp': 'sum',
    'yr': 'first',
}
for season_end_year in range(1952, 2020):
    one_row_per_player = season_data[season_end_year].groupby('playerID').agg(stint_rollup)
    season_data[season_end_year] = one_row_per_player.reset_index()

season_data[2004].head()

Unnamed: 0,playerID,name,position,age,PER,usage%,gp,yr
0,abdursh01,Shareef Abdur-Rahim,POWER FORWARD,27,18.85,24.05,85,2004
1,allenma01,Malik Allen,POWER FORWARD,25,10.5,18.3,45,2004
2,allenra02,Ray Allen,SHOOTING GUARD,28,21.7,27.8,56,2004
3,alstora01,Rafer Alston,POINT GUARD,27,13.7,17.3,82,2004
4,anderch01,Chris Andersen,POWER FORWARD,25,14.4,12.7,71,2004


In [8]:
# Methodology - Identifying each player's peak using PER
# First, creating an empty dataframe for every player name that appears in any season.
# The dictionary is keyed by player name; a name seen in several seasons simply
# gets its (still empty) dataframe recreated.
player_data = {
    player_name: pd.DataFrame(columns=['playerID', 'name', 'position', 'age', 'PER', 'usage%', 'gp', 'yr'])
    for season_end_year in range(1952, 2020)
    for player_name in season_data[season_end_year]['name'].unique()
}

In [9]:
# Methodology
# Second, filling each player's dataframe with one row per season, in season order,
# based on player name.
# This is done with a single concat + groupby rather than appending row by row:
# DataFrame.append was removed in pandas 2.0, growing a frame one row at a time is
# quadratic, and re-running this cell now rebuilds each frame instead of duplicating rows.
# NOTE(review): players are keyed by name, so two different players sharing a name end
# up in the same dataframe; playerID looks like the unique key - confirm before
# switching, because later cells look players up by name.
all_seasons = pd.concat(
    [season_data[season_end_year] for season_end_year in range(1952, 2020)],
    ignore_index=True,
)
# sort=False keeps names in order of first appearance; rows inside each group keep
# their original (chronological) order, and the index is reset to 0..n-1 per player.
for player_name, seasons in all_seasons.groupby('name', sort=False):
    player_data[player_name] = seasons.reset_index(drop=True)

In [10]:
# Previewing the data: the number of entries in player_data (one per player name),
# followed by one player's season-by-season dataframe as a sample.
print(len(player_data))
player_data['Kareem Abdul-Jabbar']

3996


Unnamed: 0,playerID,name,position,age,PER,usage%,gp,yr
0,abdulka01,Kareem Abdul-Jabbar,CENTER,22,22.5,0.0,82,1970
1,abdulka01,Kareem Abdul-Jabbar,CENTER,23,29.0,0.0,82,1971
2,abdulka01,Kareem Abdul-Jabbar,CENTER,24,29.9,0.0,81,1972
3,abdulka01,Kareem Abdul-Jabbar,CENTER,25,28.5,0.0,76,1973
4,abdulka01,Kareem Abdul-Jabbar,CENTER,26,24.4,0.0,81,1974
5,abdulka01,Kareem Abdul-Jabbar,CENTER,27,26.4,0.0,65,1975
6,abdulka01,Kareem Abdul-Jabbar,CENTER,28,27.2,0.0,82,1976
7,abdulka01,Kareem Abdul-Jabbar,CENTER,29,27.8,0.0,82,1977
8,abdulka01,Kareem Abdul-Jabbar,CENTER,30,29.2,27.0,62,1978
9,abdulka01,Kareem Abdul-Jabbar,CENTER,31,25.5,23.3,80,1979


In [17]:
# Data Cleaning
# Two filters, applied in this order for every player:
# 1) To ensure proper identification of a player's peak, keep only players with more
#    than 5 season rows (counted before any season is dropped).
# 2) To maintain fairness and reliability of results, keep only the seasons in which
#    the player played at least 41 games (half of a full season).
# NOTE(review): 41 presumes the same season length for every year - confirm for
# seasons with a different number of scheduled games.
player_data1 = {
    player_name: seasons[seasons['gp']>=41]
    for player_name, seasons in player_data.items()
    if len(seasons.index) > 5
}

In [18]:
# Previewing the data: number of players kept in player_data1
# (1506 in the recorded run, down from the 3996 entries in player_data).
print(len(player_data1))

1506


In [19]:
# Methodology
# Third, for each player, sorting the qualifying seasons by PER (descending) and
# keeping the single season with the highest PER.
# The player's first year in the league is saved alongside it as 'draftyr'.
# It is taken from the player's full, unfiltered history in player_data: the seasons
# in player_data1 have already lost every year with fewer than 41 games, so using them
# would push 'draftyr' later for anyone whose first season was a short one.
# NOTE(review): the data starts with the 1952 season-end year, so a player who was
# already in the league before then will show 1952 here.
# .assign() returns a new one-row dataframe instead of writing a column into the
# slice produced by .head(1), which avoids pandas' SettingWithCopyWarning.
# A player with no qualifying season keeps an empty dataframe (with the extra column).
player_data_topPER = {}
for player_name, qualifying_seasons in player_data1.items():
    first_season = player_data[player_name]['yr'].min()
    player_data_topPER[player_name] = (
        qualifying_seasons
        .sort_values(['PER'], ascending = False)
        .head(1)
        .assign(draftyr=first_season)
    )

# Previewing the data
player_data_topPER['Kareem Abdul-Jabbar']

Unnamed: 0,playerID,name,position,age,PER,usage%,gp,yr,draftyr
2,abdulka01,Kareem Abdul-Jabbar,CENTER,24,29.9,0.0,81,1972,1970


In [20]:
# Methodology
# Fourth, concatenating all the peak years into one dataframe for further analysis.
# All one-row frames are concatenated in a single call: calling pd.concat inside the
# loop copies the growing result on every iteration (quadratic in the number of players).
# Players whose frame is empty (no qualifying season) contribute no row, so they are
# left out of the concat; original row indices are kept, as before.

peak_rows = [peak for peak in player_data_topPER.values() if not peak.empty]
if peak_rows:
    peak_year_players = pd.concat(peak_rows)
else:
    # Nothing qualified: fall back to an empty frame with the expected columns.
    peak_year_players = pd.DataFrame(columns=['playerID', 'name', 'position', 'age', 'PER', 'usage%', 'gp', 'yr', 'draftyr'])

In [21]:
# Data Cleaning
# To maintain fairness and reliability of results, the peak seasons are restricted
# to those in which the player played at least half of the full season (41 games).
played_enough = peak_year_players['gp']>=41
peak_year_players = peak_year_players.loc[played_enough]

# Previewing the data: (number of players, number of columns)
# The recorded run shows 1503 players and 9 columns.
peak_year_players.shape

(1503, 9)

In [22]:
# Saving the peak-year table to 'Peak Year Players.csv'; index = False leaves the
# dataframe's row index out of the file.
peak_year_players.to_csv('Peak Year Players.csv', index = False)