In [75]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from unidecode import unidecode

In [76]:
#read in the csv
player_df = pd.read_csv('module3playerdata.csv', low_memory=False)

In [77]:
#drop the columns that arent needed
final_df = player_df.drop(columns=['full_name', 'version','description', 'image', 'potential', 'wage', 'preferred_foot', 'international_reputation', 'work_rate',
                                    'body_type', 'real_face', 'release_clause', 'specialities', 'club_id', 'club_logo', 'club_rating', 
                                    'club_kit_number', 'club_joined', 'club_contract_valid_until', 'country_id', 'country_league_id', 'country_league_name',
                                    'country_flag', 'country_rating', 'country_position', 'country_kit_number', 'play_styles', 'gk_diving', 'gk_handling', 'gk_kicking', 'club_position'])

#make the player names lowercase to make it easier to search
final_df['name'] = final_df['name'].str.lower()

#player_df = player_df.drop_duplicates()
final_df.drop_duplicates(subset=['name'])


#remove accented and other special characters from the players names. These were some of the most common ones I saw. 
mapping = {'á': 'a',
           'é': 'e',
           'í': 'i',
           'ó': 'o',
           'ú': 'u',
           'ñ': 'n',
           'ã': 'a',
           'ë': 'e',
           'ş': 's',
           'ă': 'a'}

final_df['name'] = final_df.name.replace(mapping, regex=True)
final_df['club_name'] = final_df.club_name.replace(mapping, regex=True)

In [78]:
#drop the goalkeepers
final_df = final_df[pd.to_numeric(final_df['gk_positioning'], errors='coerce').notnull()]
final_df = final_df[pd.to_numeric(final_df['gk_reflexes'], errors='coerce').notnull()]

In [79]:
final_df

Unnamed: 0,player_id,name,height_cm,weight_kg,dob,positions,overall_rating,value,weak_foot,skill_moves,...,interceptions,positioning,vision,penalties,composure,defensive_awareness,standing_tackle,sliding_tackle,gk_positioning,gk_reflexes
0,239085,erling haaland,195,94,7/21/00,ST,91,€185M,3,3,...,43,96,74,86,87,38,47,29,11,7.0
1,231747,kylian mbappe,182,75,12/20/98,"ST,LW",91,€181.5M,4,5,...,38,93,83,84,88,26,34,32,11,6.0
2,192985,kevin de bruyne,181,75,6/28/91,"CM,CAM",91,€103M,5,4,...,66,88,95,83,88,66,70,53,10,13.0
3,231866,rodri,191,82,6/22/96,"CDM,CM",90,€122.5M,4,3,...,84,76,84,62,87,92,87,78,14,8.0
4,202126,harry kane,188,85,7/28/93,ST,90,€119.5M,5,3,...,42,94,87,92,92,46,46,38,14,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18326,272761,lyu jiaqiang,180,65,4/11/05,CB,47,€100K,3,2,...,50,28,30,34,37,51,50,53,7,15.0
18327,275601,callum warren,172,67,3/16/05,CAM,47,€110K,3,2,...,30,49,49,40,51,29,38,34,7,10.0
18328,71064,ishaan shishodia,177,70,8/31/05,"CM,CAM",47,€110K,3,2,...,42,48,49,37,44,33,38,42,13,13.0
18329,269541,wu yuhang,182,75,2/16/01,CDM,47,€70K,3,2,...,43,37,35,41,39,40,45,51,13,8.0


In [80]:
scaler = StandardScaler()
numerical_features = ['crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys', 'dribbling', 'curve', 'fk_accuracy', 'long_passing', 'ball_control', 'acceleration',
                     'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots', 'aggression', 'interceptions', 'positioning', 
                     'vision', 'penalties', 'composure', 'defensive_awareness', 'standing_tackle', 'sliding_tackle']

#scale the numerical features
final_df[numerical_features] = scaler.fit_transform(final_df[numerical_features])

In [81]:
#This is used to calculate the cosine similarity based on player name. 
def find_similar_players(player_name, final_df):
    #this gets the row of the player you want
    player_row = final_df[final_df['name'] == player_name].index[0]
    
    #this gets that rows numerical features, which were defined above
    player_features = final_df.iloc[player_row][numerical_features].values.reshape(1,-1)
    
    #this takes the players features we defined and compares them to all of the other players in the dataset.
    similarities = cosine_similarity(player_features, final_df[numerical_features])
    
    return similarities

In [87]:
def top_similar_players(similarity_scores, final_df, top_n=10):
    #this sorts the players to give you the top ten
    similar_players_indices = similarity_scores.argsort()[0][::-1][1:top_n+1]
    #this finds those players based on the indices we created above
    similar_players = final_df.iloc[similar_players_indices]
    
    return similar_players

In [88]:
player_name = 'harry maguire'
similarity_scores = find_similar_players(player_name, final_df)
top_similar = top_similar_players(similarity_scores, final_df)
print('Top 10 Players Similar to', player_name, 'are:')
print(top_similar[['name', 'club_name', 'overall_rating', 'positions']])

Top 10 Players Similar to harry maguire are:
                  name            club_name  overall_rating positions
584   joachim andersen       Crystal Palace              79        CB
1852     steven nzonzi            Konyaspor              74    CDM,CM
2128  ibrahima sissoko           Strasbourg              74    CM,CDM
353     danilo pereira  Paris Saint Germain              81    CB,CDM
467        david lopez               Girona              80    CB,CDM
2715  tiemoue bakayoko              Lorient              73    CDM,CM
471        oriol romeu         FC Barcelona              80    CDM,CM
823          eric dier    FC Bayern München              78        CB
158    guido rodriguez           Real Betis              83       CDM
340   william carvalho           Real Betis              81   CDM,CAM


In [89]:
player_name = 'joshua kimmich'
similarity_scores = find_similar_players(player_name, final_df)
top_similar = top_similar_players(similarity_scores, final_df)
print('Top 10 Players Similar to', player_name, 'are:')
print(top_similar[['name', 'club_name', 'overall_rating', 'positions']])

Top 10 Players Similar to joshua kimmich are:
                       name        club_name  overall_rating   positions
1889            pavel bucha       Cincinnati              74  CDM,CM,CAM
702       stephen eustaquio            Porto              78      CDM,CM
316     oleksandr zinchenko          Arsenal              81          LB
1698         nicolas raskin          Rangers              75      CM,CDM
119                    koke  Atletico Madrid              84      CDM,CM
165        marcelo brozović         Al Nassr              83      CDM,CM
944   kiernan dewsbury-hall   Leicester City              77          CM
139     alexis mac allister        Liverpool              83  CM,CAM,CDM
105         ismael bennacer            Milan              84  CDM,CM,CAM
110         rodrigo de paul  Atletico Madrid              84          CM


In [90]:
player_name = 'robert lewandowski'
similarity_scores = find_similar_players(player_name, final_df)
top_similar = top_similar_players(similarity_scores, final_df)
print('Top 10 Players Similar to', player_name, 'are:')
print(top_similar[['name', 'club_name', 'overall_rating', 'positions']])

Top 10 Players Similar to robert lewandowski are:
                     name           club_name  overall_rating positions
361   alexandre lacazette  Olympique Lyonnais              81        ST
423           andre silva       Real Sociedad              80        ST
487      marko arnautović               Inter              80        ST
322             enes ünal     AFC Bournemouth              81        ST
1373           danny ings     West Ham United              76        ST
1060        raul de tomas      Rayo Vallecano              77        ST
15          karim benzema          Al Ittihad              89     CF,ST
4              harry kane   FC Bayern München              90        ST
722        mergim berisha      TSG Hoffenheim              78     ST,CF
331       serhou guirassy       VfB Stuttgart              81        ST


In [78]:
#https://github.com/prashantghimire/sofifa-web-scraper?tab=readme-ov-file