In [1]:
# !pip install basketball_reference_web_scraper
from basketball_reference_web_scraper import client
import unicodedata
import pandas as pd
import numpy as np
from scipy.spatial import distance

# Show up to 100 columns when a wide stats table is displayed.
pd.set_option('display.max_columns', 100)

In [2]:
# Input CSV locations -- edit these two paths so they point at your local copies
# of old_bio_data.csv and new_bio_data.csv.
OLD_BIO_CSV = '/Users/maxwalker/Desktop/Python Projects/old_bio_data.csv'
NEW_BIO_CSV = 'new_bio_data.csv'

# Older biometric table; a later cell keeps its Player / height / weight columns.
bio_old = pd.read_csv(OLD_BIO_CSV)

# Newer biometric table collapsed to one row per player_name, holding the mean of
# each numeric column. numeric_only=True keeps any text columns out of the average
# (pandas >= 2.0 raises a TypeError when a non-numeric column reaches mean()).
new_data = (
    pd.read_csv(NEW_BIO_CSV)
    .groupby('player_name', as_index=False)
    .mean(numeric_only=True)
)

#these files will be used later on

In [3]:
#scrapes the web for the data
# NOTE(review): this stores a plain string attribute on the client module; the calls
# below never pass an output type and consume their return values directly --
# confirm whether this line has any effect.
client.output_type = "OutputType.CSV"

FIRST_SEASON_END_YEAR = 1980
LAST_SEASON_END_YEAR = 2020
MIN_GAMES_PLAYED = 30  # a season is kept only when games_played is strictly greater

season_frames = []
for num in range(FIRST_SEASON_END_YEAR, LAST_SEASON_END_YEAR + 1):
    advData = pd.DataFrame(client.players_advanced_season_totals(season_end_year=num))
    advData['year'] = num

    seasonTotal = pd.DataFrame(client.players_season_totals(season_end_year=num))
    seasonTotal['year'] = num

    # Side-by-side concat pairs rows by index label, not by player identity.
    # NOTE(review): assumes both calls return the same rows in the same order -- TODO confirm.
    all_stats = pd.concat([advData, seasonTotal], axis=1)
    # columns that appear in both tables are kept once (first occurrence wins)
    all_stats = all_stats.loc[:, ~all_stats.columns.duplicated()]
    all_stats = all_stats[all_stats['games_played'] > MIN_GAMES_PLAYED]

    season_frames.append(all_stats)

# One concat at the end replaces DataFrame.append inside the loop: append was removed
# in pandas 2.0 and re-copied the whole accumulated table on every iteration.
# Row labels are left as they come from each season frame, exactly as append did.
total = pd.concat(season_frames)

In [5]:
# Turn season totals into per-game averages.
to_per_game = [
    'made_field_goals',
    'attempted_field_goals',
    'made_three_point_field_goals',
    'attempted_three_point_field_goals',
    'made_free_throws',
    'attempted_free_throws',
    'offensive_rebounds',
    'defensive_rebounds',
    'assists',
    'steals',
    'blocks',
    'turnovers',
    'personal_fouls',
    'points',
]

# divide every counting stat row-wise by that row's games_played, keep 3 decimals
total[to_per_game] = (
    total[to_per_game]
    .div(total['games_played'], axis=0)
    .round(3)
)

In [6]:
#drops unnecessary columns or columns that may provide duplicate data
# (each label is listed once; the previous list repeated 'value_over_replacement_player')
columns_to_drop = [
    'personal_fouls', 'total_rebound_percentage', 'slug', 'age', 'games_played', 'team',
    'minutes_played', 'player_efficiency_rating', 'value_over_replacement_player', 'year',
    'is_combined_totals', 'offensive_box_plus_minus', 'defensive_box_plus_minus',
    'games_started', 'box_plus_minus', 'win_shares', 'win_shares_per_48_minutes',
]
total = total.drop(columns=columns_to_drop)

In [7]:
#clean the 'name' column
def clean_names(df, col):
    """Normalise the text in ``df[col]`` to plain ASCII names.

    Accented characters are decomposed (NFD) and their combining marks dropped,
    so e.g. a 'c' with a caron becomes 'c'. Single and double quote characters
    are then removed from the name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to clean. The column is overwritten in place.
    col : str
        Label of the string column to clean.

    Returns
    -------
    pandas.Series
        The cleaned column, i.e. ``df[col]`` after the update.
    """
    def _to_ascii(name):
        decomposed = unicodedata.normalize('NFD', name)
        # decode() turns the ASCII bytes back into real text, so there is no
        # b'...' repr wrapper to strip afterwards and no letters of the name
        # (such as a leading or trailing 'b') can be lost while stripping it.
        return decomposed.encode('ascii', 'ignore').decode('ascii')

    cleaned = df[col].apply(_to_ascii)
    # quotes are removed as literal characters
    for quote_char in ("'", '"'):
        cleaned = cleaned.str.replace(quote_char, '', regex=False)

    df[col] = cleaned
    return df[col]

# Rewrite the 'name' column of total (clean_names also assigns into the frame it is given).
total['name'] = clean_names(total, 'name')   

In [8]:
#encode each player's first-listed position as an ordinal number from 1 to 5
position_mapping = {
    "Position.POINT_GUARD": 1,
    'Position.SHOOTING_GUARD': 2,
    'Position.SMALL_FORWARD': 3,
    'Position.POWER_FORWARD': 4,
    'Position.CENTER': 5,
}

# take the first entry of each positions value, stringify it, then look it up in the mapping
first_listed = total['positions'].apply(lambda listed: str(listed[0]))
total['positions'] = first_listed.map(position_mapping)


In [9]:
#Keep, for every player, the maximum of each stat across all of their rows, to approximate the player at their peak. We aren't interested in comparing players' rookie or final seasons.
total = total.groupby('name').max()



---
Next we will add biometric data, specifically height and weight 

In [10]:
#Reading and cleaning a dataset with biometric data from players prior to 2016...will use another dataset below for the more recent players
# .copy() gives an independent frame, so the column assignment below does not write into a slice of the raw table
bio_old = bio_old[['Player', 'height', 'weight']].copy()
# regex=False: '*' and "'" are removed as literal characters ('*' is a regex metacharacter)
bio_old['Player'] = (
    bio_old['Player']
    .str.replace('*', '', regex=False)
    .str.replace("'", '', regex=False)
)
bio_old = bio_old.set_index('Player')
bio_old

Unnamed: 0_level_0,height,weight
Player,Unnamed: 1_level_1,Unnamed: 2_level_1
Curly Armstrong,180.0,77.0
Cliff Barker,188.0,83.0
Leo Barnhorst,193.0,86.0
Ed Bartels,196.0,88.0
Ralph Beard,178.0,79.0
...,...,...
Troy Williams,198.0,97.0
Kyle Wiltjer,208.0,108.0
Stephen Zimmerman,213.0,108.0
Paul Zipser,203.0,97.0


In [11]:
# Left-join height and weight from bio_old onto the per-player stats, matching on 'name',
# then turn the 'name' index back into an ordinary column.
combined = (
    total
    .join(bio_old, on='name', how='left')
    .reset_index()
)

# players found in bio_old now have height/weight; the others (newer players) are still NaN
combined

Unnamed: 0,name,positions,true_shooting_percentage,three_point_attempt_rate,free_throw_attempt_rate,offensive_rebound_percentage,defensive_rebound_percentage,assist_percentage,steal_percentage,block_percentage,turnover_percentage,usage_percentage,offensive_win_shares,defensive_win_shares,made_field_goals,attempted_field_goals,made_three_point_field_goals,attempted_three_point_field_goals,made_free_throws,attempted_free_throws,offensive_rebounds,defensive_rebounds,assists,steals,blocks,turnovers,points,height,weight
0,A.C. Green,4,0.603,0.206,0.592,12.4,20.9,7.3,1.8,2.0,17.7,17.6,6.7,3.5,5.671,11.293,0.524,1.549,4.146,5.573,3.732,5.875,1.671,1.146,1.013,1.463,14.683,203.0,106.0
1,A.J. English,2,0.480,0.054,0.274,5.0,6.3,18.0,0.9,0.6,15.1,25.6,0.1,0.7,4.519,10.444,0.074,0.443,1.827,2.243,0.943,1.160,2.529,0.395,0.214,1.629,10.938,196.0,95.0
2,A.J. Guyton,1,0.495,0.504,0.111,2.3,6.3,23.6,0.9,0.9,12.6,22.2,0.4,0.0,2.364,5.818,1.022,2.733,0.489,0.600,0.303,0.788,1.939,0.273,0.156,0.822,6.000,208.0,99.0
3,A.J. Price,1,0.530,0.506,0.253,2.6,9.7,26.4,2.0,0.3,14.5,22.7,1.0,1.2,2.825,7.246,1.228,3.509,1.080,1.620,0.351,1.649,3.596,0.625,0.054,1.123,7.737,213.0,124.0
4,Aaron Brooks,1,0.562,0.502,0.224,4.2,8.1,26.0,1.7,0.9,17.2,26.1,3.9,2.2,7.012,16.232,2.549,6.402,2.988,3.634,0.659,1.963,5.293,0.841,0.183,2.829,19.561,183.0,73.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2306,Zaza Pachulia,5,0.612,0.021,0.772,13.8,25.7,16.3,2.4,2.2,19.6,21.3,3.4,2.9,4.042,8.731,0.000,0.059,4.069,5.181,3.385,6.171,2.566,1.141,0.500,2.308,12.153,211.0,122.0
2307,Zeljko Rebraca,5,0.625,0.000,0.468,8.4,20.5,5.7,1.3,4.4,15.7,21.4,1.4,1.9,2.554,5.054,0.000,0.000,1.824,2.365,1.135,2.784,0.514,0.378,0.986,1.135,6.932,213.0,116.0
2308,Zendon Hamilton,5,0.616,0.000,1.011,14.1,23.9,4.8,1.3,1.6,16.4,20.3,1.1,0.6,1.907,4.537,0.000,0.000,2.185,3.352,2.037,2.648,0.283,0.389,0.333,1.111,6.000,211.0,113.0
2309,Zoran Planinic,2,0.534,0.306,0.465,4.4,12.0,23.9,2.6,0.5,18.8,22.2,0.1,0.8,1.814,4.047,0.349,1.000,1.070,1.535,0.442,1.163,1.388,0.581,0.071,0.953,5.047,201.0,88.0


In [12]:
#Back-fill players whose row still has a missing value somewhere
has_missing = combined.isnull().any(axis=1)
lacking_data = combined[has_missing].reset_index(drop=True)
players_needed = lacking_data['name'].tolist()

# keep only those players from the newer table, with columns renamed to match `combined`
new_data = (
    new_data.loc[
        new_data['player_name'].isin(players_needed),
        ['player_name', 'player_height', 'player_weight'],
    ]
    .rename(columns={
        'player_name': 'name',
        'player_height': 'height',
        'player_weight': 'weight',
    })
    .set_index('name')
)
new_data

Unnamed: 0_level_0,height,weight
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Aaron Holiday,184.150000,83.914520
Abdel Nader,197.273333,102.814187
Alex Caruso,195.580000,84.368112
Alfonzo McKinnie,202.353333,97.522280
Allonzo Trier,194.310000,90.718400
...,...,...
Tyler Herro,195.580000,88.450440
Tyrone Wallace,195.580000,89.811216
Vinny Del Negro,193.040000,88.450440
Wendell Carter Jr.,207.010000,119.067900


In [13]:
# Displays the same filtered height/weight table as the previous cell.
new_data

Unnamed: 0_level_0,height,weight
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Aaron Holiday,184.150000,83.914520
Abdel Nader,197.273333,102.814187
Alex Caruso,195.580000,84.368112
Alfonzo McKinnie,202.353333,97.522280
Allonzo Trier,194.310000,90.718400
...,...,...
Tyler Herro,195.580000,88.450440
Tyrone Wallace,195.580000,89.811216
Vinny Del Negro,193.040000,88.450440
Wendell Carter Jr.,207.010000,119.067900


In [14]:
# Spot-check: look up one player's height/weight row by name.
new_data.loc['Ja Morant']

height    190.500000
weight     78.925008
Name: Ja Morant, dtype: float64

In [15]:
#Replace the existing height/weight columns with the values found in new_data
# (default inner merge: players with no match in new_data drop out of lacking_data)
lacking_data = (
    lacking_data
    .drop(columns=['height', 'weight'])
    .merge(new_data, on='name')
)
lacking_data.head(5)

Unnamed: 0,name,positions,true_shooting_percentage,three_point_attempt_rate,free_throw_attempt_rate,offensive_rebound_percentage,defensive_rebound_percentage,assist_percentage,steal_percentage,block_percentage,turnover_percentage,usage_percentage,offensive_win_shares,defensive_win_shares,made_field_goals,attempted_field_goals,made_three_point_field_goals,attempted_three_point_field_goals,made_free_throws,attempted_free_throws,offensive_rebounds,defensive_rebounds,assists,steals,blocks,turnovers,points,height,weight
0,Aaron Holiday,1,0.518,0.485,0.191,1.7,10.4,19.5,1.6,1.8,12.4,21.9,0.4,1.4,3.483,8.552,1.379,3.5,1.069,1.241,0.362,1.914,3.328,0.776,0.26,1.293,9.414,184.15,83.91452
1,Abdel Nader,3,0.58,0.477,0.282,2.8,15.7,7.1,1.4,2.3,17.0,17.1,0.4,0.9,2.083,4.583,0.812,2.188,1.0,1.292,0.292,1.672,0.729,0.417,0.417,0.75,5.979,197.273333,102.814187
2,Alex Caruso,1,0.538,0.412,0.292,2.9,10.1,17.8,2.8,1.4,24.5,13.9,0.6,1.6,1.897,4.483,0.655,1.845,1.0,1.31,0.405,1.69,2.027,1.034,0.293,1.135,5.448,195.58,84.368112
3,Alfonzo McKinnie,3,0.569,0.429,0.181,9.0,16.9,4.0,1.9,1.3,12.3,15.2,1.2,0.9,1.861,4.275,0.583,1.639,0.55,0.775,1.125,2.306,0.431,0.575,0.208,0.65,4.681,202.353333,97.52228
4,Allonzo Trier,2,0.564,0.255,0.431,2.2,12.2,12.9,0.9,0.8,15.8,21.5,0.4,0.5,3.625,8.094,0.812,2.062,2.797,3.484,0.484,2.594,1.859,0.438,0.219,1.812,10.859,194.31,90.7184


In [16]:
# Spot-check: show the same player's row in the back-filled table.
lacking_data[lacking_data['name']=='Ja Morant']

Unnamed: 0,name,positions,true_shooting_percentage,three_point_attempt_rate,free_throw_attempt_rate,offensive_rebound_percentage,defensive_rebound_percentage,assist_percentage,steal_percentage,block_percentage,turnover_percentage,usage_percentage,offensive_win_shares,defensive_win_shares,made_field_goals,attempted_field_goals,made_three_point_field_goals,attempted_three_point_field_goals,made_free_throws,attempted_free_throws,offensive_rebounds,defensive_rebounds,assists,steals,blocks,turnovers,points,height,weight
65,Ja Morant,1,0.568,0.174,0.331,2.7,9.5,34.6,1.4,0.7,17.2,26.0,2.2,1.2,6.661,13.559,0.864,2.356,3.458,4.492,0.746,2.78,6.932,0.915,0.254,3.237,17.644,190.5,78.925008


In [17]:
# Stack the back-filled rows underneath the existing table. The original rows of those
# players are not removed here, so both versions are present in the result.
combined = pd.concat([combined, lacking_data], axis=0)
combined

Unnamed: 0,name,positions,true_shooting_percentage,three_point_attempt_rate,free_throw_attempt_rate,offensive_rebound_percentage,defensive_rebound_percentage,assist_percentage,steal_percentage,block_percentage,turnover_percentage,usage_percentage,offensive_win_shares,defensive_win_shares,made_field_goals,attempted_field_goals,made_three_point_field_goals,attempted_three_point_field_goals,made_free_throws,attempted_free_throws,offensive_rebounds,defensive_rebounds,assists,steals,blocks,turnovers,points,height,weight
0,A.C. Green,4,0.603,0.206,0.592,12.4,20.9,7.3,1.8,2.0,17.7,17.6,6.7,3.5,5.671,11.293,0.524,1.549,4.146,5.573,3.732,5.875,1.671,1.146,1.013,1.463,14.683,203.000000,106.000000
1,A.J. English,2,0.480,0.054,0.274,5.0,6.3,18.0,0.9,0.6,15.1,25.6,0.1,0.7,4.519,10.444,0.074,0.443,1.827,2.243,0.943,1.160,2.529,0.395,0.214,1.629,10.938,196.000000,95.000000
2,A.J. Guyton,1,0.495,0.504,0.111,2.3,6.3,23.6,0.9,0.9,12.6,22.2,0.4,0.0,2.364,5.818,1.022,2.733,0.489,0.600,0.303,0.788,1.939,0.273,0.156,0.822,6.000,208.000000,99.000000
3,A.J. Price,1,0.530,0.506,0.253,2.6,9.7,26.4,2.0,0.3,14.5,22.7,1.0,1.2,2.825,7.246,1.228,3.509,1.080,1.620,0.351,1.649,3.596,0.625,0.054,1.123,7.737,213.000000,124.000000
4,Aaron Brooks,1,0.562,0.502,0.224,4.2,8.1,26.0,1.7,0.9,17.2,26.1,3.9,2.2,7.012,16.232,2.549,6.402,2.988,3.634,0.659,1.963,5.293,0.841,0.183,2.829,19.561,183.000000,73.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,Tyler Herro,2,0.534,0.476,0.148,1.3,14.6,11.1,1.1,0.5,11.3,22.0,-0.3,1.1,4.681,11.319,2.106,5.383,1.404,1.681,0.298,3.681,1.936,0.638,0.149,1.532,12.872,195.580000,88.450440
162,Tyrone Wallace,1,0.450,0.088,0.263,3.5,13.2,9.5,1.6,0.9,12.9,18.7,-0.8,0.5,1.484,3.500,0.065,0.306,0.484,0.919,0.323,1.306,0.677,0.339,0.113,0.581,3.516,195.580000,89.811216
163,Vinny Del Negro,2,0.566,0.211,0.272,3.3,10.3,27.6,2.0,0.5,16.0,19.3,5.2,3.2,5.829,11.732,0.880,2.160,2.171,2.610,0.600,2.878,4.156,1.037,0.187,1.461,14.524,193.040000,88.450440
164,Wendell Carter Jr.,5,0.590,0.086,0.443,11.6,24.6,10.7,1.3,4.5,14.9,19.1,1.7,1.6,4.256,8.432,0.140,0.727,2.605,3.535,3.233,6.186,1.773,0.767,1.318,1.674,11.256,207.010000,119.067900


In [18]:
# Drop every row that still contains a missing value. The explicit copy makes `combined`
# an independent frame, so later column assignments into it do not raise
# SettingWithCopyWarning.
combined = combined.dropna(axis=0).copy()
combined

Unnamed: 0,name,positions,true_shooting_percentage,three_point_attempt_rate,free_throw_attempt_rate,offensive_rebound_percentage,defensive_rebound_percentage,assist_percentage,steal_percentage,block_percentage,turnover_percentage,usage_percentage,offensive_win_shares,defensive_win_shares,made_field_goals,attempted_field_goals,made_three_point_field_goals,attempted_three_point_field_goals,made_free_throws,attempted_free_throws,offensive_rebounds,defensive_rebounds,assists,steals,blocks,turnovers,points,height,weight
0,A.C. Green,4,0.603,0.206,0.592,12.4,20.9,7.3,1.8,2.0,17.7,17.6,6.7,3.5,5.671,11.293,0.524,1.549,4.146,5.573,3.732,5.875,1.671,1.146,1.013,1.463,14.683,203.000000,106.000000
1,A.J. English,2,0.480,0.054,0.274,5.0,6.3,18.0,0.9,0.6,15.1,25.6,0.1,0.7,4.519,10.444,0.074,0.443,1.827,2.243,0.943,1.160,2.529,0.395,0.214,1.629,10.938,196.000000,95.000000
2,A.J. Guyton,1,0.495,0.504,0.111,2.3,6.3,23.6,0.9,0.9,12.6,22.2,0.4,0.0,2.364,5.818,1.022,2.733,0.489,0.600,0.303,0.788,1.939,0.273,0.156,0.822,6.000,208.000000,99.000000
3,A.J. Price,1,0.530,0.506,0.253,2.6,9.7,26.4,2.0,0.3,14.5,22.7,1.0,1.2,2.825,7.246,1.228,3.509,1.080,1.620,0.351,1.649,3.596,0.625,0.054,1.123,7.737,213.000000,124.000000
4,Aaron Brooks,1,0.562,0.502,0.224,4.2,8.1,26.0,1.7,0.9,17.2,26.1,3.9,2.2,7.012,16.232,2.549,6.402,2.988,3.634,0.659,1.963,5.293,0.841,0.183,2.829,19.561,183.000000,73.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,Tyler Herro,2,0.534,0.476,0.148,1.3,14.6,11.1,1.1,0.5,11.3,22.0,-0.3,1.1,4.681,11.319,2.106,5.383,1.404,1.681,0.298,3.681,1.936,0.638,0.149,1.532,12.872,195.580000,88.450440
162,Tyrone Wallace,1,0.450,0.088,0.263,3.5,13.2,9.5,1.6,0.9,12.9,18.7,-0.8,0.5,1.484,3.500,0.065,0.306,0.484,0.919,0.323,1.306,0.677,0.339,0.113,0.581,3.516,195.580000,89.811216
163,Vinny Del Negro,2,0.566,0.211,0.272,3.3,10.3,27.6,2.0,0.5,16.0,19.3,5.2,3.2,5.829,11.732,0.880,2.160,2.171,2.610,0.600,2.878,4.156,1.037,0.187,1.461,14.524,193.040000,88.450440
164,Wendell Carter Jr.,5,0.590,0.086,0.443,11.6,24.6,10.7,1.3,4.5,14.9,19.1,1.7,1.6,4.256,8.432,0.140,0.727,2.605,3.535,3.233,6.186,1.773,0.767,1.318,1.674,11.256,207.010000,119.067900


In [19]:
# Spot-check: show this player's row(s) in the table after incomplete rows were dropped.
combined[combined['name']=='Ja Morant']

Unnamed: 0,name,positions,true_shooting_percentage,three_point_attempt_rate,free_throw_attempt_rate,offensive_rebound_percentage,defensive_rebound_percentage,assist_percentage,steal_percentage,block_percentage,turnover_percentage,usage_percentage,offensive_win_shares,defensive_win_shares,made_field_goals,attempted_field_goals,made_three_point_field_goals,attempted_three_point_field_goals,made_free_throws,attempted_free_throws,offensive_rebounds,defensive_rebounds,assists,steals,blocks,turnovers,points,height,weight
65,Ja Morant,1,0.568,0.174,0.331,2.7,9.5,34.6,1.4,0.7,17.2,26.0,2.2,1.2,6.661,13.559,0.864,2.356,3.458,4.492,0.746,2.78,6.932,0.915,0.254,3.237,17.644,190.5,78.925008


In [21]:
#Every column except the player name is used as a comparison feature
inputs = combined.columns.drop(['name'])

In [22]:
#z-score every feature (subtract the column mean, divide by the column standard deviation) so the comparison isn't dominated by traits on larger scales
feature_values = combined[inputs]
combined[inputs] = (feature_values - feature_values.mean()) / feature_values.std()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [23]:
#physical traits get extra weight because there are only two of them
PHYSICAL_TRAIT_WEIGHT = 5
for trait in ('height', 'weight'):
    combined[trait] = combined[trait] * PHYSICAL_TRAIT_WEIGHT

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [24]:
#determine actual comparisons

def player_comp(players, data=None, feature_cols=None): #takes a list of players
    """Print the three most similar players for each name in ``players``.

    Similarity is the Euclidean distance between feature rows; the player's own
    row is excluded explicitly, and the three smallest remaining distances are
    reported in ascending order (ties keep table order).

    Parameters
    ----------
    players : iterable of str
        Names to look up in the ``'name'`` column.
    data : pandas.DataFrame, optional
        Table with a ``'name'`` column plus the feature columns. Defaults to the
        notebook-level ``combined`` frame.
    feature_cols : sequence of str, optional
        Columns used for the distance. Defaults to the notebook-level ``inputs``.

    Returns
    -------
    None
        One line per player is printed.

    Raises
    ------
    ValueError
        If a name is not present in the table, or the table has fewer than four
        rows (three comparisons cannot be formed).
    """
    frame = combined if data is None else data
    cols = list(inputs if feature_cols is None else feature_cols)

    # Work on plain arrays by position, so the result does not depend on the
    # frame's index labels being unique or numeric.
    names = frame['name'].to_numpy()
    features = frame[cols].to_numpy(dtype=float)

    for player in players:
        if len(names) < 4:
            raise ValueError('need at least four players to report three comparisons')

        matches = np.flatnonzero(names == player)
        if matches.size == 0:
            raise ValueError('player not found: {!r}'.format(player))
        own_row = matches[0]  # if a name occurs more than once, the first row is the reference

        # one vectorised distance computation against every row of the table
        dists = np.linalg.norm(features - features[own_row], axis=1)
        dists[own_row] = np.inf  # never report the player as their own comparison

        nearest = np.argsort(dists, kind='stable')[:3]
        most_similar, second_similar, third_similar = names[nearest]
        print(player + '\'s comparisons are ' +most_similar +', ' + second_similar+ ', and ' + third_similar + '.')

In [25]:
# Order rows by name, drop rows that are exact duplicates, and renumber the index from 0.
combined = (
    combined
    .sort_values('name')
    .drop_duplicates()
    .reset_index(drop=True)
)


In [26]:
# Every name in the table, in table order.
players = combined['name'].tolist()

In [27]:
# Print the three closest comparisons for every player in the table.
player_comp(players)

A.C. Green's comparisons are Kermit Washington, Grant Long, and Kenneth Faried.
A.J. English's comparisons are Harold Miner, Howard Carter, and Austin Carr.
A.J. Guyton's comparisons are Mindaugas Kuzminskas, Dzanan Musa, and P.J. Hairston.
A.J. Price's comparisons are Sasha Pavlovic, C.J. Watson, and J.R. Bremer.
Aaron Brooks's comparisons are Chucky Atkins, Troy Hudson, and Brandon Jennings.
Aaron Gordon's comparisons are Luol Deng, Dario Saric, and Kyle Kuzma.
Aaron Gray's comparisons are Eric Leckner, Kosta Koufos, and Eric Montross.
Aaron Holiday's comparisons are Yogi Ferrell, Ky Bowman, and Salim Stoudamire.
Aaron McKie's comparisons are Kent Bazemore, Anthony Johnson, and Shandon Anderson.
Aaron Williams's comparisons are Joe Meriweather, Nazr Mohammed, and Bill Garnett.
Abdel Nader's comparisons are Sterling Brown, Justin Anderson, and Orlando Johnson.
Abdul Jeelani's comparisons are Steve Mix, Duane Ferrell, and Scott May.
Acie Earl's comparisons are Alaa Abdelnaby, Jerrod Mu

In [28]:
# Print comparisons for a hand-picked set of players.
player_comp(['Luka Doncic', 'Trae Young', 'Kobe Bryant'])

Luka Doncic's comparisons are DeMar DeRozan, Tyreke Evans, and Carmelo Anthony.
Trae Young's comparisons are Isaiah Thomas, Stephon Marbury, and Damian Lillard.
Kobe Bryant's comparisons are Jerry Stackhouse, Dwyane Wade, and Vince Carter.
