### Data

In [38]:
import pandas as pd

adShoot = pd.read_csv("AdShooting.csv")
advanced = pd.read_csv("advanced.csv")
per100 = pd.read_csv("Per100.csv")
perGame = pd.read_csv("PerGame.csv")
PlaybyPlay = pd.read_csv("PlaybyPlay.csv")
shotType = pd.read_csv("ShotType.csv")
hustle = pd.read_csv("Hustle.csv", encoding='ISO-8859-1')


perGame['MP'] = perGame['MP'] * perGame['G']

perGame = perGame.drop(columns=['Rk'], errors='ignore') 
perGame = perGame.merge(per100[['Player', 'Rk']], on='Player', how='left')
perGame = perGame.sort_values(by='Rk', ascending=True)

player_ids = per100['Player']
player_idsH = hustle['Player']

#### Dropping Columns

In [39]:
adShoot = adShoot.drop(columns=["Awards" ,"-9999", "Rk"], axis=1)
advanced = advanced.drop(columns=["Awards" ,"Player-additional", "Rk"], axis=1)
per100 = per100.drop(columns=["Awards" ,"Player-additional", "Rk"], axis=1)
perGame = perGame.drop(columns=["Awards" ,"Player-additional", "Rk"], axis=1)
PlaybyPlay = PlaybyPlay.drop(columns=["Awards" ,"-9999", "Rk"], axis=1)
shotType = shotType.drop(columns=["Awards" ,"-9999", "Rk"], axis=1)

#### Data Cleaning

In [40]:
players_1 = set(adShoot['Player'].str.strip())
players_2 = set(hustle['Player'].str.strip())


only_in_df1 = players_1 - players_2
only_in_df2 = players_2 - players_1

print("Players only in df1:", only_in_df1)
print("Players only in df2:", only_in_df2)

Players only in df1: {'Kevin Knox', 'Tidjane Salaün', 'Nikola Jokić', 'Jimmy Butler', 'Armel Traoré', 'A.J. Green', 'GG Jackson II', 'Terry Taylor', 'Trey Jemison', 'Bogdan Bogdanović', 'Alondes Williams', 'Alperen Şengün', 'Lester Quiñones', 'Tristan Da Silva', 'Jonas Valančiūnas', 'Nikola Jović', 'Skal Labissière', 'Kristaps Porziņģis', 'Brandon Boston Jr.', 'Dario Šarić', 'Xavier Tillman Sr.', 'Vasilije Micić', 'Jeff Dowtin', 'Luka Dončić', 'Moussa Diabaté', 'Dennis Schröder', 'Jusuf Nurkić', 'Jeenathan Williams', 'Vlatko Čančar', 'Robert Williams', 'Karlo Matković', 'Nikola Vučević', 'Ron Holland'}
Players only in df2: {'Nikola Vu?evi?', 'Vasilije MiciÄ\x87', 'Luka DonÄ\x8diÄ\x87', 'Alperen Sengun', 'Karlo MatkoviÄ\x87', 'Jeff Dowtin Jr.', 'Lester Quinones', 'Bogdan BogdanoviÄ\x87', 'Tristan da Silva', 'Jimmy Butler III', 'Armel Traorï¿½', 'Moussa Diabatï¿½', 'AJ Green', 'Trey Jemison III', 'Vlatko Ä\x8canÄ\x8dar', 'Jonas ValanÄ\x8diÅ«nas', 'Ronald Holland II', 'Nikola Jovi?', 'Kri

In [41]:
# Correcting Player Names so they are consistent throughout 
corrections = {
    "Jeff Dowtin Jr.": "Jeff Dowtin",
    "Nikola JokiÄ\x87": "Nikola Jokić", 
    "Jonas ValanÄ\x8diÅ«nas": "Jonas Valančiūnas", 
    "Xavier Tillman": "Xavier Tillman Sr.", 
    "Alperen Sengun": "Alperen Şengün", 
    "Nikola Jovi?": "Nikola Jović", \
    "Tristan da Silva": "Tristan Da Silva", 
    "Ronald Holland II": "Ron Holland", 
    "Jimmy Butler III": "Jimmy Butler", 
    "Luka DonÄ\x8diÄ\x87": "Luka Dončić", 
    "Nikola Vu?evi?": "Nikola Vučević", 
    "Bogdan BogdanoviÄ\x87": "Bogdan Bogdanović", 
    "Kevin Knox II": "Kevin Knox", 
    "GG Jackson": "GG Jackson II", 
    "Robert Williams III": "Robert Williams", 
    "Trey Jemison III": "Trey Jemison", 
    "Kristaps PorziÅ\x86Ä£is": "Kristaps Porziņģis", 
    "Lester Quinones": "Lester Quiñones", 
    "Vasilije MiciÄ\x87": "Vasilije Micić",  
    "Karlo MatkoviÄ\x87": "Karlo Matković",
    "Jusuf NurkiÄ\x87": "Jusuf Nurkić", 
    "Dario Å\xa0ariÄ\x87": "Dario Šarić", 
    "AJ Green": "A.J. Green", 
    "Brandon Boston": "Brandon Boston Jr.", 
    "Skal Labissiere": "Skal Labissière", 
    "Vlatko Ä\x8canÄ\x8dar": "Vlatko Čančar"
}

hustle['Player'] = hustle['Player'].replace(corrections)

# Filter out unqualified players
hustle = hustle.loc[hustle['MP'] > 200]
hustle = hustle.fillna(0)

#print(hustle)


In [42]:
# Filter out players and fixing NA values
dataframes = ['adShoot', 'advanced', 'per100', 'perGame', 'PlaybyPlay', 'shotType']

for name in dataframes:
    df = globals()[name]
    globals()[name] = df.loc[df['MP'] > 200]

for name in dataframes:
    df = globals()[name]
    globals()[name] = df.fillna(0)

#### Standardizing Data

In [43]:
# Standardizing numeric values
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

for name in dataframes:
    df = globals()[name]
    globals()['stand_' + name] = df.select_dtypes(include='number')

stand_hustle = hustle.select_dtypes(include='number')
    
dataframesStand = ['stand_adShoot', 'stand_advanced', 'stand_per100',
               'stand_perGame', 'stand_PlaybyPlay', 'stand_shotType']

for name in dataframesStand:
    df = globals()[name]
    globals()[name] = df.drop(columns=["Age","G","GS","MP"])

stand_hustle = stand_hustle.drop(columns=["Age","GP","MP","Min"])

for name in dataframesStand:
    df = globals()[name]
    globals()[name] = ss.fit_transform(df)

stand_hustle = ss.fit_transform(stand_hustle)

In [44]:
# Turning Standardized NumpyArray Data back to dataframe with original column names 
# Retrieving column names
adShoot_drop = adShoot.drop(columns=["Age", "G", "GS", "MP", "Player", "Team", "Pos"])
PlaybyPlay_drop = PlaybyPlay.drop(columns=["Age", "G", "GS", "MP", "Player", "Team", "Pos"])
advanced_drop = advanced.drop(columns=["Age", "G", "GS", "MP", "Player", "Team", "Pos"])
per100_drop = per100.drop(columns=["Age", "G", "GS", "MP", "Player", "Team", "Pos"])
shotType_drop = shotType.drop(columns=["Age", "G", "GS", "MP", "Player", "Team", "Pos"])
perGame_drop = perGame.drop(columns=["Age", "G", "GS", "MP", "Player", "Team", "Pos"])
hustle_drop = hustle.drop(columns=["Age","GP","MP","Min","Player","Team"])

# Make sure the shapes match
assert stand_adShoot.shape[1] == len(adShoot_drop.columns)
assert stand_PlaybyPlay.shape[1] == len(PlaybyPlay_drop.columns)
assert stand_advanced.shape[1] == len(advanced_drop.columns)
assert stand_per100.shape[1] == len(per100_drop.columns)
assert stand_shotType.shape[1] == len(shotType_drop.columns)
assert stand_perGame.shape[1] == len(perGame_drop.columns)
assert stand_hustle.shape[1] == len(hustle_drop.columns)

# Adding the column names
df_adShoot = pd.DataFrame(stand_adShoot, columns=adShoot_drop.columns)
df_PlaybyPlay = pd.DataFrame(stand_PlaybyPlay, columns=PlaybyPlay_drop.columns)
df_advanced = pd.DataFrame(stand_advanced, columns=advanced_drop.columns)
df_per100 = pd.DataFrame(stand_per100, columns=per100_drop.columns)
df_shotType = pd.DataFrame(stand_shotType, columns=shotType_drop.columns)
df_perGame = pd.DataFrame(stand_perGame, columns=perGame_drop.columns)
df_hustle = pd.DataFrame(stand_hustle, columns=hustle_drop.columns)

In [45]:
dfNames = ['df_adShoot', 'df_PlaybyPlay', 'df_advanced', 
           'df_per100', 'df_shotType', 'df_perGame']

for name in dfNames:
    df = globals()[name]
    df.insert(0, 'Player', player_ids)

df_hustle.insert(0, 'Player', player_idsH)

#### Similarity Calculation

In [96]:
from sklearn.metrics.pairwise import cosine_similarity
from functools import reduce

dfNames = {'df_adShoot' : df_adShoot,
           'df_PlaybyPlay': df_PlaybyPlay,
           'df_hustle' : df_hustle,
           'df_advanced' : df_advanced, 
           'df_per100' : df_per100,
           'df_shotType' : df_shotType,
           'df_perGame' : df_perGame}

# Define weights for each dataset
weights = {
    'df_adShoot': 1.281,
    'df_PlaybyPlay': 1.437,
    'df_hustle': 1.280,
    'df_advanced': 1.585,
    'df_per100': 1.559,
    'df_shotType': 1.324,
    'df_perGame': 1.534,
}

# target_player = "Jalen Brunson"  # Example
target_player = input("Player Name: ").strip()

# Computing individual Scores
def compute_similarity(df: pd.DataFrame, target_player: str, dataset_name: str) -> pd.DataFrame:
    features = df.drop(columns='Player')
    players = df['Player']
    
    if target_player not in players.values:
        raise ValueError(f"{target_player} not found in dataset: {dataset_name}")
    
    target_index = players[players == target_player].index[0]
    target_vector = features.iloc[target_index].values.reshape(1, -1)
    
    similarities = cosine_similarity(target_vector, features.values)[0]
    return pd.DataFrame({'Player': players, f'Sim_{dataset_name}': similarities})

# Calculating Initial Score similarity
similarity_dfs = []

for name, df in dfNames.items():
    sim_df = compute_similarity(df, target_player, name)
    similarity_dfs.append(sim_df)

final_df = reduce(lambda left, right: pd.merge(left, right, on='Player'), similarity_dfs)
similarity_cols = [col for col in final_df.columns if col.startswith('Sim_')]

# Apply weights to calculate weighted total score
final_df['Total_Score'] = 0
for col in similarity_cols:
    # Extract dataset name from column (e.g., 'Sim_df_adShoot' -> 'df_adShoot')
    dataset_name = col.replace('Sim_', '')
    weight = weights.get(dataset_name, 1.0)  # Default weight of 1.0 if not found
    final_df['Total_Score'] += final_df[col] * weight

top_similar_players = (
    final_df[final_df['Player'] != target_player]
    .sort_values('Total_Score', ascending=False)
    .head(10)
)

print("\nTop 10 most similar players to", target_player)
print(top_similar_players[['Player', 'Total_Score']])


Top 10 most similar players to Anthony Edwards
               Player  Total_Score
50   Donovan Mitchell     8.492488
13       Jayson Tatum     8.126997
7         Tyler Herro     8.108710
72     Damian Lillard     8.063273
47      Stephen Curry     7.601089
29         Coby White     7.519779
4        James Harden     7.491936
122      Kyrie Irving     7.455849
91      Norman Powell     7.345042
21      Austin Reaves     7.294163
