In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import re
from scrape_player_data import request_stats_data, export_salary_data, export_stats_data, request_income_data, check_player_duplicates
from setup_database import MySQLConnector
from model import RegressionModel
from collections import Counter

In [2]:
# Export salary and stats data for every season from 2001 through 2024, then
# re-read the season's income CSV and pass it to the duplicate check.
# NOTE(review): the duplicate names in the cell output (e.g. ['Tony Mitchell']
# for 2014) presumably come from check_player_duplicates - confirm in
# scrape_player_data.
for season in range(2001, 2025):
    print(f'### EXPORTING FOR SEASON {season} ###')
    export_salary_data(season)
    export_stats_data(season)

    income_csv = f'player_season_income/{season}_income_data.csv'
    dat = pd.read_csv(income_csv)
    check_player_duplicates(df = dat)


### EXPORTING FOR SEASON 2001 ###
### EXPORTING FOR SEASON 2002 ###
### EXPORTING FOR SEASON 2003 ###
### EXPORTING FOR SEASON 2004 ###
### EXPORTING FOR SEASON 2005 ###
### EXPORTING FOR SEASON 2006 ###
### EXPORTING FOR SEASON 2007 ###
### EXPORTING FOR SEASON 2008 ###
### EXPORTING FOR SEASON 2009 ###
### EXPORTING FOR SEASON 2010 ###
### EXPORTING FOR SEASON 2011 ###
### EXPORTING FOR SEASON 2012 ###
### EXPORTING FOR SEASON 2013 ###
### EXPORTING FOR SEASON 2014 ###
['Tony Mitchell']
### EXPORTING FOR SEASON 2015 ###
### EXPORTING FOR SEASON 2016 ###
### EXPORTING FOR SEASON 2017 ###
### EXPORTING FOR SEASON 2018 ###
### EXPORTING FOR SEASON 2019 ###
### EXPORTING FOR SEASON 2020 ###
### EXPORTING FOR SEASON 2021 ###
### EXPORTING FOR SEASON 2022 ###
### EXPORTING FOR SEASON 2023 ###
### EXPORTING FOR SEASON 2024 ###


In [3]:
# Dropping only duplicate that comes from export
income_2014_csv = 'player_season_income/2014_income_data.csv'
dat = pd.read_csv(income_2014_csv)
TM_index = dat.index[dat['Player'] == "Tony Mitchell"].tolist()

# Only act when the duplicate is really there: with a single (legitimate) row,
# dropping max(TM_index) would delete the player entirely, and with no rows
# max() would raise. This also makes the cell safe to re-run.
if len(TM_index) > 1:
    dat = dat.drop(max(TM_index))
    # Persist the fix. The pre-processing step reloads this CSV from disk, so
    # an in-memory drop alone never reaches the combined income table.
    dat.to_csv(income_2014_csv, index = False)

check_player_duplicates(dat)

### Data pre-processing

In [4]:
# Read one income CSV per season (2001 through 2024) and stack them into a
# single frame. The per-season row labels are kept as-is by the concat.
dfs = [
    pd.read_csv(f'player_season_income/{year}_income_data.csv')
    for year in range(2001, 2025)
]

all_seasons_income = pd.concat(dfs)

# Matches any single character that is neither a word character (\w: letters,
# digits, underscore) nor whitespace (\s).
_SPECIAL_CHARACTER = re.compile(r'[^\w\s]')


def remove_special_characters(text):
    """Return ``text`` with punctuation and other special characters removed.

    Letters (including accented/Unicode letters), digits, underscores and all
    whitespace are kept; every other character is deleted without inserting a
    replacement.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.

    Raises:
        TypeError: If ``text`` is not a string (for example a missing value).
    """
    return _SPECIAL_CHARACTER.sub('', text)

# Strip punctuation from every player name in the income data; the cleaned
# name is what the rest of the notebook uses to identify a player.
all_seasons_income['Player'] = all_seasons_income['Player'].map(remove_special_characters)

In [5]:
# Assigning a player_id to each unique player in all the data.
# Ids start at 1 and follow the order in which names first appear in
# all_seasons_income; players are identified purely by their cleaned name.
all_players = pd.unique(all_seasons_income['Player'])
players_dict = {name: player_id for player_id, name in enumerate(all_players, start=1)}

# Maps unique id
all_seasons_income['player_id'] = all_seasons_income['Player'].map(players_dict)

# The per-season frames are no longer needed once they are concatenated.
del dfs

In [6]:
# Stats data: read one per-game stats CSV per season (2001 through 2024) and
# stack them into a single frame.
dfs = [
    pd.read_csv(f'per_game_stats_data/{year}_player_data.csv')
    for year in range(2001, 2025)
]

all_stats = pd.concat(dfs)
all_stats['Player'] = all_stats['Player'].apply(remove_special_characters)

# Names present in the stats data but absent from the income data.
# NOTE(review): the printed list contains mis-decoded names (e.g.
# 'Dennis SchrÃder'), which suggests an encoding problem in the stats CSVs.
# Those players never match an income name and are dropped below - confirm
# against the scraper.
print('Left out players: ', sorted(set(all_stats['Player']) - set(all_seasons_income['Player'])))

# Maps unique id and drops rows that didn't get assigned a player_id.
# map() leaves NaN for unmatched names, which turns the whole column into
# floats; after dropping those rows, cast back to int64 so the ids have the
# same dtype as all_seasons_income['player_id'] (and export as 12, not 12.0).
all_stats['player_id'] = all_stats['Player'].map(players_dict)
all_stats = all_stats.dropna(subset=['player_id']).astype({'player_id': 'int64'})

Left out players:  ['Adonis Thomas', 'Aleksandar RadojeviÄ', 'Alen SmailagiÄ', 'Alexis AjinÃa', 'Anderson VarejÃo', 'Andris BiedriÅÅ', 'AndrÃs Nocioni', 'Andy Panko', 'Ante Å½iÅ¾iÄ', 'Anthony Grundy', 'Antoine Rigaudeau', 'AnÅ¾ejs PaseÄÅiks', 'Boban MarjanoviÄ', 'Bogdan BogdanoviÄ', 'Bojan BogdanoviÄ', 'BoÅtjan Nachbar', 'Brandin Knight', 'Brandon Boston Jr', 'Bruno Å\xa0undov', 'Byron Mullens', 'Cam Whitmore', 'Cameron Reynolds', 'Cat Barber', 'Cezary TrybaÅski', 'Chance Comanche', 'Charlie Brown Jr', 'Chaundee Brown Jr', 'Chris Garner', 'Chucky Brown', 'Craig Porter Jr', 'Cristiano FelÃcio', 'DJ Mbenga', 'Dairis BertÄns', 'Dalibor BagariÄ', 'Damjan RudeÅ¾', 'Dan McClintock', 'Danuel House Jr', 'Dario Å\xa0ariÄ', 'Darko MiliÄiÄ', 'David Duke Jr', 'David Vanterpool', 'Dennis SchrÃder', 'Dennis Smith Jr', 'Dereck Lively II', 'Derrick Byars', 'Derrick Jones Jr', 'Derrick Zimmerman', 'Desmond Ferguson', 'Desmond Penigar', 'Devon Hall', 'Dmytro Skapintsev', 'Donatas MotiejÅnas', 'Donny Mar

In [7]:
# Keeps track of (season, player_id) pairs that played for multiple teams
# (ie players that have more than one stats row in a single season).

# keep=False flags every row of a repeated (Player, season), not just the extras.
has_several_rows = all_stats.duplicated(subset = ['Player', 'season'], keep = False)
multi_team_rows = all_stats[has_several_rows][['season', 'player_id']]

# The group keys are the distinct (season, player_id) tuples.
multiple_teams_CPK = list(multi_team_rows.groupby(['season', 'player_id']).groups)

In [8]:
# Build the column -> aggregation mapping used to collapse a player's rows
# within one season into a single row.
# Starting from every column (value None) keeps the mapping in all_stats'
# column order. Player and season are removed because they are the grouping
# keys rather than aggregated columns.
# NOTE(review): any column not listed below keeps the value None - confirm
# every remaining stats column is covered.
agg_functions = dict.fromkeys(all_stats.columns)
for grouping_key in ('Player', 'season'):
    del agg_functions[grouping_key]

aggregation_by_column_group = (
    # Descriptive fields: take the value from the player's first row.
    ('first', ['Pos', 'player_id', 'Age', 'Tm']),
    # Game counts add up across rows.
    ('sum', ['G', 'GS']),
    # Per-game figures and percentages: plain (unweighted) mean of the rows.
    ('mean', ['MP', 'FG', 'FGA', 'FG%', '3P',
              '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB',
              'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']),
)
for how, column_names in aggregation_by_column_group:
    agg_functions.update(dict.fromkeys(column_names, how))

In [9]:
# Collapse the stats to one row per (Player, season) using the per-column
# aggregations in agg_functions.
all_stats = all_stats.groupby(['Player', 'season']).aggregate(agg_functions).reset_index()

# Label every multi-team season with the team code 'TOT'.
# A set gives constant-time membership, and the whole column is updated with
# one boolean-mask assignment, instead of scanning the list of pairs once per
# row inside iterrows().
multi_team_pairs = set(multiple_teams_CPK)
is_multi_team = np.fromiter(
    (pair in multi_team_pairs for pair in zip(all_stats['season'], all_stats['player_id'])),
    dtype=bool,
    count=len(all_stats),
)
all_stats.loc[is_multi_team, 'Tm'] = 'TOT'

In [11]:
# Exporting data: write both processed tables to CSV, leaving out the row index.
for frame, csv_name in (
    (all_stats, 'all_player_stats.csv'),
    (all_seasons_income, 'all_season_income.csv'),
):
    frame.to_csv(csv_name, index = False)