# Data preparation
This notebook prepares the data for subsequent machine learning applications. 

It writes one CSV file of clean, machine-learning-ready data per award, each containing merged player, team, and vote-share data.

In [1]:
# Import packages
import time
import warnings
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import clear_output

warnings.filterwarnings('ignore')

# Set working path
# Every input and output file in this notebook is located by string concatenation
# (path + 'Scrapping/...', path + 'Algorithm/...'), so this value must end with a slash.
# NOTE(review): absolute, user-specific location; edit before running on another machine.
path = '/Users/martinbogaert/Desktop/NBA Data Analysis/2022-2023 Awards Project clean/'

## Merge player and team data

In [3]:
# Load the scraped player and team tables and join them on team and season
player_stats = pd.read_csv(path + 'Scrapping/player_data.csv')
team_stats = pd.read_csv(path + 'Scrapping/team_data.csv')
final_data = player_stats.merge(team_stats, how='outer', on=['Tm', 'Year'])
# Writing the merged table to disk is currently disabled
#final_data.to_csv(path + 'final_data.csv', index = None)

## MVP

In [3]:
# Merge final data with MVP shares data.
# A left join keeps every player-season row of final_data and attaches vote shares where
# a (Player, Year) match exists. Vote rows without a matching player-season are dropped
# by the join itself: per the original note this is George Johnson (SAS, 1981, 0.004 shares).
# This replaces an outer join followed by a positional `mvp[:-1]`, which removed whichever
# row happened to be last and therefore depended on the row order produced by the merge.
mvp_shares = pd.read_csv(path + 'Scrapping/mvp_history.csv')
mvp = final_data.merge(mvp_shares, how='left', on=['Player', 'Year'])

In [4]:
# Minimum games and minutes requirements; players who received votes are kept either way
received_votes = mvp['Share'] > 0
enough_games = mvp['G'] >= 20      # Minimum 20 games played
enough_minutes = mvp['MP'] >= 10   # Minimum 10 minutes per game
mvp = mvp[(enough_games | received_votes) & (enough_minutes | received_votes)]

# Replace missing values with 0, column by column
for column in list(mvp.columns):
    mvp[column] = mvp[column].replace(np.nan, 0)

mvp.to_csv(path + 'Algorithm/mvp/mvp_data.csv', index=False)

## ROY

In [5]:
# Function to scrape rookie player and respective year
def rookie_history(yr):
    """Scrape the rookie names listed for one season on basketball-reference.

    Parameters
    ----------
    yr : int
        Season year inserted into the page URL (``NBA_{yr}_rookies.html``).

    Returns
    -------
    pandas.DataFrame
        The ``Player`` column of the first table on the page, plus a ``Year``
        column holding ``yr`` on every row.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the downloaded page contains no table.
    """
    url = f'https://www.basketball-reference.com/leagues/NBA_{yr}_rookies.html'
    page = requests.get(url, timeout=30)
    # Fail loudly on an error response instead of trying to parse an error page
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html')

    # Remove header rows so they are not parsed as data rows
    for overhead in ['over_header', 'over_header thead', 'thead']:
        for header_row in soup.find_all('tr', class_=overhead):
            header_row.decompose()

    table = soup.find('table')
    if table is None:
        raise ValueError(f'No table found at {url}')

    # Assemble rookie history DataFrame.
    # read_html is given a file-like object; passing literal HTML text is deprecated in recent pandas.
    rookies = pd.read_html(StringIO(str(table)))[0][['Player']]
    return rookies.assign(Year=yr)

In [6]:
# Scrape the rookie list of every season and stack the results
years = list(range(1978, 2023))
request_pause = 3.5  # seconds to wait between page requests
dfs = []
for yr in years:
    dfs.append(rookie_history(yr))
    clear_output(wait = True)
    print(f'{yr} / {years[-1]}')  # progress: current season / last season
    if yr != years[-1]:
        # Space out requests; NOTE(review): the site is believed to throttle rapid
        # scraping, confirm the current limit and adjust request_pause accordingly
        time.sleep(request_pause)

rookie_data = pd.concat(dfs)
# Strip the '*' marker from player names so they can be matched on 'Player' later
rookie_data['Player'] = rookie_data['Player'].str.replace('*', '', regex = False)

2022 / 2022


In [7]:
# Attach ROY vote shares to the player/team data, then keep only the
# (Player, Year) pairs that appear in the scraped rookie lists
roy_shares = pd.read_csv(path + 'Scrapping/roy_history.csv')
roy = (
    final_data
    .merge(roy_shares, how='outer', on=['Player', 'Year'])
    .merge(rookie_data, how='inner', on=['Player', 'Year'])
)

In [8]:
# Minimum games and minutes requirements; players who received votes are kept either way
received_votes = roy['Share'] > 0
enough_games = roy['G'] >= 15     # Minimum 15 games played
enough_minutes = roy['MP'] >= 8   # Minimum 8 minutes per game
roy = roy[(enough_games | received_votes) & (enough_minutes | received_votes)]

# Replace missing values with 0, column by column
for column in list(roy.columns):
    roy[column] = roy[column].replace(np.nan, 0)

roy.to_csv(path + 'Algorithm/roy/roy_data.csv', index=False)

## DPOY

In [9]:
# Attach DPOY vote shares to the merged player/team data (outer join on player and season)
dpoy_shares = pd.read_csv(path + 'Scrapping/dpoy_history.csv')
dpoy = final_data.merge(dpoy_shares, how='outer', on=['Player', 'Year'])

In [10]:
# Minimum games and minutes requirements; players who received votes are kept either way
dpoy = dpoy[(dpoy['G'] >= 20) | (dpoy['Share'] > 0)] # Minimum 20 games played
# The minutes filter must be assigned back to `dpoy`: storing it under another name
# would leave the DPOY data unfiltered and overwrite that other frame.
dpoy = dpoy[(dpoy['MP'] >= 8) | (dpoy['Share'] > 0)] # Minimum 8 minutes per game

# Replace missing values with 0, column by column
for stat in list(dpoy):
    dpoy[stat] = dpoy[stat].replace(np.nan, 0)

dpoy.to_csv(path + 'Algorithm/dpoy/dpoy_data.csv', index = False)

## SMOY

In [4]:
# Attach SMOY vote shares to the merged player/team data (outer join on player and season)
smoy_shares = pd.read_csv(path + 'Scrapping/smoy_history.csv')
smoy = final_data.merge(smoy_shares, how='outer', on=['Player', 'Year'])

In [5]:
# SMOY first awarded in 1984; copy so the new column is added to an independent frame
smoy = smoy[smoy['Year'] > 1983].copy()
# Proportion of games started. Vectorized division: rows with G == 0 yield inf/NaN
# (and then fail the <= 0.5 test below) instead of raising ZeroDivisionError.
smoy['%GS'] = smoy['GS'] / smoy['G']
smoy = smoy[(smoy['%GS'] <= 0.5) | (smoy['Share'] > 0)] # 6th man of the year requirement, at most 50% of games started

# Minimum games and minutes requirements; players who received votes are kept either way
smoy = smoy[(smoy['G'] >= 15) | (smoy['Share'] > 0)] # Minimum 15 games played
smoy = smoy[(smoy['MP'] >= 8) | (smoy['Share'] > 0)] # Minimum 8 minutes per game

# Replace missing values with 0, column by column
for stat in list(smoy):
    smoy[stat] = smoy[stat].replace(np.nan, 0)

# NOTE(review): the save is disabled here, unlike the other awards; confirm whether
# smoy_data.csv should be (re)written by this notebook.
#smoy.to_csv(path + 'Algorithm/smoy/smoy_data.csv', index = False)

## MIP

In [13]:
# Attach MIP vote shares to the merged player/team data (outer join on player and season)
mip_shares = pd.read_csv(path + 'Scrapping/mip_history.csv')
mip = final_data.merge(mip_shares, how='outer', on=['Player', 'Year'])

In [14]:
# For each season row of a player, compute the number of seasons played so far (lifetime) and the player's career highs over earlier seasons in points, assists, rebounds and minutes played
def years_in_league(df):
    """Add career-to-date columns to one player's season rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single player; must contain 'Year', 'PTS', 'AST', 'TRB' and 'MP'.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by 'Year' with five extra columns: 'lifetime' (number of
        rows whose 'Year' is less than or equal to the row's own) and 'PTS_high',
        'AST_high', 'TRB_high', 'MP_high' (maximum of the stat over rows with a
        strictly earlier 'Year'; NaN when there is no earlier row).
    """
    career = df.sort_values('Year')
    seasons = career['Year']

    # Count the rows up to and including each season
    career['lifetime'] = [int((seasons <= season).sum()) for season in seasons]

    # Best value of each stat over strictly earlier seasons
    for stat in ('PTS', 'AST', 'TRB', 'MP'):
        career[f'{stat}_high'] = [
            career.loc[seasons < season, stat].max() for season in seasons
        ]
    return career

# Compute the career columns player by player.
# group_keys=False keeps 'Player' out of the result index: recent pandas otherwise
# prepends the group key, leaving 'Player' as both an index level and a column,
# which makes a later groupby('Player') ambiguous.
mip = mip.groupby('Player', group_keys=False).apply(years_in_league)

In [15]:
# Keep only seasons after 1986, then replace missing values with 0 column by column
mip = mip.loc[mip['Year'] > 1986]
for column in list(mip.columns):
    mip[column] = mip[column].replace(np.nan, 0)

In [16]:
# Determine player previous playing year
def prev_year(df):
    """Add a 'prev_year' column to one player's season rows and return the same frame.

    'prev_year' is the minimum 'Year' over a centered rolling window of two rows.
    With rows in ascending 'Year' order this is the year of the preceding row;
    the first row has no complete window and gets NaN.
    The column is written onto ``df`` itself.
    """
    paired_seasons = df['Year'].rolling(window=2, center=True)
    df['prev_year'] = paired_seasons.min()
    return df
# group_keys=False keeps 'Player' out of the result index: recent pandas otherwise
# prepends the group key, leaving 'Player' as both an index level and a column,
# which makes the later merge on 'Player' ambiguous.
mip = mip.groupby('Player', group_keys=False).apply(prev_year)

mip['prev_year'] = mip['prev_year'].replace(np.nan, 0) # If NaN (rookie season), assign 0 (easy to cut)
mip = mip[mip['Year'] - mip['prev_year'] == 1] # Only keep eligible players (with a previous year playing season)

In [17]:
# Join each eligible season with the same player's row from the previous season;
# columns coming from the previous season get the '_prev' suffix
previous_season = final_data.rename(columns={'Year': 'prev_year'})
mip = mip.merge(
    previous_season,
    how='inner',
    on=['Player', 'prev_year'],
    suffixes=('', '_prev'),
)

In [18]:
# Season-over-season change for every numerical feature: current value minus '_prev' value
features = [
    'G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%',
    'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
    'PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
    'USG%', 'OWS', 'DWS', 'WS', 'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP', 'W', 'L', 'W/L%',
    'PS/G', 'PA/G', 'SRS', 'Seed', 'MOV', 'ORtg', 'DRtg', 'NRtg', 'MOV/A', 'ORtg/A',
    'DRtg/A', 'NRtg/A', 'ORtg/100', 'DRtg/100',
]
mip = mip.assign(**{f'{f}_diff': mip[f] - mip[f'{f}_prev'] for f in features})

In [19]:
# Relative change in points versus the previous season (0 when the previous value is not positive)
current_pts, previous_pts = mip['PTS'], mip['PTS_prev']
mip['PTS_rel'] = [
    (now - before) / before if before > 0 else 0
    for now, before in zip(current_pts, previous_pts)
]

# Relative change in PER versus the previous season
# NOTE(review): unlike PTS_rel there is no guard here for PER_prev == 0
per_change = mip['PER'] - mip['PER_prev']
mip['PER_rel'] = per_change / mip['PER_prev']

# Gap between this season's points and the earlier career high
mip['PTS_high_diff'] = current_pts - mip['PTS_high']

In [20]:
# Players who received votes are always kept
received_votes = mip['Share'] > 0

# Requirements (minutes and games), current and previous season
meets_requirements = (
    (mip['MP'] >= 10)         # Minimum 10 minutes played
    & (mip['MP_prev'] >= 5)   # Minimum 5 minutes played in previous season
    & (mip['G'] >= 20)        # Minimum 20 games played
    & (mip['G_prev'] >= 10)   # Minimum 10 games played in previous season
)

# Eligibility
new_points_high = mip['PTS'] > mip['PTS_high']  # Career high in points
other_career_high = (                           # At least a career high in minutes, assists or rebounds
    (mip['AST'] >= mip['AST_high'])
    | (mip['TRB'] >= mip['TRB_high'])
    | (mip['MP'] >= mip['MP_high'])
)

mip = mip[(meets_requirements & new_points_high & other_career_high) | received_votes]

In [21]:
# Write the MIP dataset to disk without the index column
mip.to_csv(path + 'Algorithm/mip/mip_data.csv', index = False)