In [1]:
import pandas as pd
import math
from sklearn.metrics import DistanceMetric

In [2]:
df = pd.read_csv("data/21-22.csv", delimiter=";")

In [3]:
df.columns

Index(['Rk', 'Player', 'Pos', 'Age', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS'],
      dtype='object')

In [4]:
nba_df = df.loc[:,['Player', 'Tm', 'G', 'GS', 'MP', 'FG', 'FGA', 'FG%',
       '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%',
       'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']]

nba_df

Unnamed: 0,Player,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,Precious Achiuwa,TOR,73,28,23.6,3.6,8.3,0.439,0.8,2.1,...,0.595,2.0,4.5,6.5,1.1,0.5,0.6,1.2,2.1,9.1
1,Steven Adams,MEM,76,75,26.3,2.8,5.1,0.547,0.0,0.0,...,0.543,4.6,5.4,10.0,3.4,0.9,0.8,1.5,2.0,6.9
2,Bam Adebayo,MIA,56,56,32.6,7.3,13.0,0.557,0.0,0.1,...,0.753,2.4,7.6,10.1,3.4,1.4,0.8,2.6,3.1,19.1
3,Santi Aldama,MEM,32,0,11.3,1.7,4.1,0.402,0.2,1.5,...,0.625,1.0,1.7,2.7,0.7,0.2,0.3,0.5,1.1,4.1
4,LaMarcus Aldridge,BRK,47,12,22.3,5.4,9.7,0.550,0.3,1.0,...,0.873,1.6,3.9,5.5,0.9,0.3,1.0,0.9,1.7,12.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
807,Thaddeus Young,TOR,26,0,18.3,2.6,5.5,0.465,0.7,1.7,...,0.481,1.5,2.9,4.4,1.7,1.2,0.4,0.8,1.7,6.3
808,Trae Young,ATL,76,76,34.9,9.4,20.3,0.460,3.1,8.0,...,0.904,0.7,3.1,3.7,9.7,0.9,0.1,4.0,1.7,28.4
809,Omer Yurtseven,MIA,56,12,12.6,2.3,4.4,0.526,0.0,0.2,...,0.623,1.5,3.7,5.3,0.9,0.3,0.4,0.7,1.5,5.3
810,Cody Zeller,POR,27,0,13.1,1.9,3.3,0.567,0.0,0.1,...,0.776,1.9,2.8,4.6,0.8,0.3,0.2,0.7,2.1,5.2


In [5]:
check = nba_df[nba_df['Player'] == 'Thaddeus Young']

check

Unnamed: 0,Player,Tm,G,GS,MP,FG,FGA,FG%,3P,3PA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
805,Thaddeus Young,TOT,52,1,16.3,2.7,5.2,0.518,0.3,0.9,...,0.469,1.5,2.5,4.0,2.0,1.0,0.3,1.0,1.6,6.2
806,Thaddeus Young,SAS,26,1,14.2,2.8,4.9,0.578,0.0,0.2,...,0.455,1.5,2.0,3.6,2.3,0.9,0.3,1.2,1.5,6.1
807,Thaddeus Young,TOR,26,0,18.3,2.6,5.5,0.465,0.7,1.7,...,0.481,1.5,2.9,4.4,1.7,1.2,0.4,0.8,1.7,6.3


In [6]:
# Group by player name and count number of rows
player_counts = df.groupby('Player').size()

# Filter out players with more than one row
single_team_players = player_counts[player_counts == 1].index

# Filter the dataframe to only keep rows with team name "TOT"
tot_players = df[df['Tm'] == 'TOT']

# Concatenate the filtered dataframe with the original dataframe
final_df = pd.concat([df[df['Player'].isin(single_team_players)], tot_players])


In [7]:
final_df[final_df['Player'] == 'Thaddeus Young']

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
805,601,Thaddeus Young,PF,33,TOT,52,1,16.3,2.7,5.2,...,0.469,1.5,2.5,4.0,2.0,1.0,0.3,1.0,1.6,6.2


In [8]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 605 entries, 0 to 805
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rk      605 non-null    int64  
 1   Player  605 non-null    object 
 2   Pos     605 non-null    object 
 3   Age     605 non-null    int64  
 4   Tm      605 non-null    object 
 5   G       605 non-null    int64  
 6   GS      605 non-null    int64  
 7   MP      605 non-null    float64
 8   FG      605 non-null    float64
 9   FGA     605 non-null    float64
 10  FG%     605 non-null    float64
 11  3P      605 non-null    float64
 12  3PA     605 non-null    float64
 13  3P%     605 non-null    float64
 14  2P      605 non-null    float64
 15  2PA     605 non-null    float64
 16  2P%     605 non-null    float64
 17  eFG%    605 non-null    float64
 18  FT      605 non-null    float64
 19  FTA     605 non-null    float64
 20  FT%     605 non-null    float64
 21  ORB     605 non-null    float64
 22  DR

In [9]:
cols = list(final_df.columns)
cols.remove('Player')
cols.remove('Tm')
cols.remove("Rk")
cols.remove("Pos")
cols.remove("Age")

print(cols)

for col in cols:
    final_df[col] = final_df[col].astype(float)
    min_val = final_df[col].min()
    max_val = final_df[col].max()

    final_df[col] = (final_df[col] - min_val) / (max_val - min_val)
    
    

['G', 'GS', 'MP', 'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', '2P', '2PA', '2P%', 'eFG%', 'FT', 'FTA', 'FT%', 'ORB', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS']


In [10]:
final_df.drop(['Rk', 'Pos', 'Age', 'Tm'], axis=1, inplace=True)

In [11]:
metric = 'euclidean'
dist = DistanceMetric.get_metric(metric)

target = "Steven Adams"

targetList = final_df.loc[final_df['Player'] == target].iloc[:, 1:].values.tolist()[0]

distanceDict = {}

for line in final_df.iterrows():
    player = line[1]['Player'] 
    
    playerListRaw = final_df.loc[final_df['Player'] == player].iloc[:, 1:].values.tolist()[0]
    
    playerList = []
        
    for value in playerListRaw:
        if math.isnan(value):
            playerList.append(0)
        else:
            playerList.append(value)

    
    
    X = [targetList, playerList]
 

    ans = dist.pairwise(X)

    value = ans[0][1]
    
    distanceDict[player] = value

In [12]:
sorted_list = sorted(distanceDict.items(), key=lambda x: x[1])

top_10 = sorted_list[:12]


print(f"Closests Players To: {target}\n")
print(f"Player Name   {metric.capitalize()} Distance")
print("----------------------------")
for index, item in enumerate(top_10):
    if index !=0:
        name = item[0]
        similarity = item[1]
        print(f"{name}: {similarity}")
        
       

Closests Players To: Steven Adams

Player Name   Euclidean Distance
----------------------------
Jarred Vanderbilt: 0.5489400462417386
Ivica Zubac: 0.6016324578707128
Kevon Looney: 0.6160436694909361
Mason Plumlee: 0.6329356506125128
Clint Capela: 0.6413053634499167
Andre Drummond: 0.6611066915693204
Mitchell Robinson: 0.6861489267984793
Isaiah Stewart: 0.6901576199274677
Robert Williams: 0.7670070981964667
Daniel Gafford: 0.8861972025522517
Dwight Powell: 0.9415887792134121
