In [1]:
import re
import unicodedata

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_rows', None)

In [2]:
data_dir = './Leagues'
files = [
    'all_players_stats',
    'all_players_values',
    'all_players_stats_details',
    
    'all_teams_values',
    'all_teams_stats'
]

In [3]:
aliases = {
    'Alisson Becker' : 'Alisson',
    'Thiago Alcântara' : 'Thiago',
    'Kepa' : 'Kepa Arrizabalaga',
    'Hamed Traorè' : 'Hamed Junior Traorè',
    'Andy Robertson' : 'Andrew Robertson',
    'Maximilian Kilman' : 'Max Kilman',
    'Toti Gomes' : 'Toti', 
    'Willy-Arnaud Boly' : 'Willy Boly',
    'Dan Bentley' : 'Daniel Bentley',
    'Vitalii Mykolenko' : 'Vitaliy Mykolenko',
    'Sam Amo-Ameyaw' : 'Samuel Amo-Ameyaw',
    'Kostas Tsimikas' : 'Konstantinos Tsimikas',
    'Alex Oxlade Chamberlain' : 'Alex Oxlade-Chamberlain',
    'Mykhailo Mudryk' : 'Mykhaylo Mudryk',
    'Illia Zabarnyi' : 'Ilya Zabarnyi',
    'Pape Sarr' : 'Pape Matar Sarr',
    'Mads Bech' : 'Mads Bech Sörensen',
    "Odeluga Offiah" : "Odel Offiah",
    "Karl Toko-Ekambi" : "Karl Toko Ekambi",
    "Alexsandro Ribeiro" : "Alexsandro",
    "Karl Toko-Ekambi" : "Karl Toko Ekambi",
    "Cheick Keita" : "Check Keita",
    "Yusuf Yazici" : "Yusuf Yazıcı",
    "Henrique" : "Henrique Silva",
    "Mickaël Barreto" : "Michaël Barreto",
    "Ruan" : "Ruan Levine",
    "Ben Touré" : "Ben Hamed Touré",
    "Noah Holm" : "Noah Jean Holm",
    "Eli Kroupi" : "Eli Junior Kroupi",
    "Rémy Labeau-Lascary" : "Rémy Labeau Lascary",
    "Valentin Atangana Edoa" : "Valentin Atangana",
    "Eric Choupo-Moting" : "Eric-Maxim Choupo-Moting",
    "Frederik Rønnow" : "Frederik Rönnow",
    "John Anthony Brooks" : "John Brooks",
    "Silas Katompa Mvumpa" : "Silas",
    "Jordan Siebatcheu" : "Jordan",
    "Mehmet Aydin" : "Mehmet Aydın",
    "Finn Becker" : "Finn Ole Becker",
    "Juan José Perea" : "Juan-José Perea",
    "Dikeni-Rafid Salifou" : "Dikeni Salifou",
    "Jean-Manuel Mbom" : "Jean Manuel Mbom",
    "Lucas Martínez" : "Lucas Martínez Quarta",
    "Ibañez" : "Roger Ibañez",
    "Giorgos Kyriakopoulos" : "Georgios Kyriakopoulos",
    "Giorgos Kyriakopoulos" : "",
    "José Palomino" : "José Luis Palomino",
    "Pawel Dawidowicz" : "Paweł Dawidowicz",
    "Przemyslaw Wisniewski" : "Przemysław Wiśniewski",
    "Simon Kjær" : "Simon Kjaer",
    "Joakim Mæhle" : "Joakim Maehle",
    "Igor Julio" : "Igor",
    "Hamed Traorè" : "Hamed Junior Traorè",
    "Nicolò Cocetta" : "Niccolò Cocetta",
    "Alberto Basso" : "Alberto Basso Ricci",
    "Emil Ceïde" : "Emil Konradsen Ceide",
    "Salvador Ferrer" : "Salva Ferrer",
    "Antonio Iervolino" : "Antonio Pio Iervolino",
    "Thórir Helgason" : "Thórir Jóhann Helgason",
    "Marios Oikonomou" : "Marios Ikonomou",
    "Christian Gytkjær" : "Christian Gytkjaer",
    "Mikael Ellertsson" : "Mikael Egill Ellertsson",
    "José Giménez" : "José María Giménez",
    "Reinildo" : "Reinildo Mandava",
    "Adri Embarba" : "Adrián Embarba",
    "Anuar Tuhami" : "Anuar",
    "Copete" : "José Copete",
    "Adri Embarba" : "Adrián Embarba",
    "Valentín Castellanos" : "Taty Castellanos",
    "Vinicius Souza" : "Vini Souza",
    "Nacho" : "Nacho Fernández",
    "Djené Dakonam" : "Djené",
    "Álex Pozo" : "Alejandro Pozo",
    "Dani Carvajal" : "Daniel Carvajal",
    "Yassine Bounou" : "Bono",
    "Jon Olasagasti" : "Jon Ander Olasagasti",
    "Rober" : "Rober González",
    "Anthony Lozano" : "Choco Lozano",
    "Malcom Adu" : "Adu Ares",
    "Mamadou Fall" : "Mamadou Mbacke",
    "José Mari Martín" : "José Mari",
    "Nabili Zoubdi Touaizi" : "Nabil Touaizi",
    "Simo" : "Simo Keddari",
    "Marezi" : "Marko Milovanovic",
}

In [4]:
players_stats = pd.read_csv(f'{data_dir}/all_players_stats.csv', encoding='utf-8')
players_values = pd.read_csv(f'{data_dir}/all_players_values.csv', encoding='utf-8')
players_details = pd.read_csv(f'{data_dir}/all_players_stats_details.csv', encoding='utf-8')

teams_stats = pd.read_csv(f'{data_dir}/all_teams_stats.csv', encoding='utf-8')
teams_values = pd.read_csv(f'{data_dir}/all_teams_values.csv', encoding='utf-8')

In [5]:
def normalize_names(name):
    """이름을 정규화하여 비교가능한 형태로 만듦."""
    name = unicodedata.normalize('NFKD', name).encode('ascii', errors='ignore').decode('ascii')
    name = re.sub(r'[^\w\s]', '', name).lower()
    name_parts = name.split()
    name_parts.sort()
    return ' '.join(name_parts)

def apply_aliases(name):
    """별칭을 적용하여 데이터의 일관성을 높임."""
    normalized_name = name.strip().title()
    return aliases.get(normalized_name, normalized_name)

# 데이터에 별칭 먼저 적용
players_stats['player_name'] = players_stats['player_name'].apply(apply_aliases)
players_values['Name'] = players_values['Name'].apply(apply_aliases)

# 정규화된 이름 적용
players_stats['normalized_name'] = players_stats['player_name'].apply(normalize_names)
players_values['normalized_name'] = players_values['Name'].apply(normalize_names)

# 병합 진행
merged_data = pd.merge(players_stats, players_values[['normalized_name', 'Position']], on='normalized_name', how='left')
merged_data['position'] = merged_data['Position']

# 결과 출력
print(merged_data['position'].unique())
nan_positions = merged_data[merged_data['position'].isnull()]
print(nan_positions[['player_name', 'team_name', 'position']])

['Goalkeeper' 'Attacking Midfield' 'Centre-Forward' 'Right-Back'
 'Right Winger' 'Left Winger' 'Defensive Midfield' 'Central Midfield'
 'Centre-Back' 'Left-Back' 'Left Midfield' 'Second Striker'
 'Right Midfield' nan]
     player_name team_name position
2008              Sassuolo      NaN
2126               Bologna      NaN


In [6]:
merged_data = merged_data.dropna(subset=['position'])
position_cols = ['position_1', 'position_2', 'position_3', 'position_4', 'position_5', 'position_6']
merged_data = merged_data.drop(columns=['position_1', 'position_2', 'position_3', 'position_4', 'position_5', 'position_6', 'normalized_name', 'Position'], errors='ignore')

merged_data.head(10)

Unnamed: 0,player_name,team_name,age,position,Apps,Mins,Goals,Assists,Yel,Red,...,Unsuccessful touches,Average_per_pass,Crosses_per_game,Longpass_per_game,Through_ball_per_game,xG,xGDiff,xGPerNinety,totalShots,xGPerShot
0,Asmir Begovic,Everton,36,Goalkeeper,1,90,0,0,0,0,...,0.0,25.0,0.0,5.0,0.0,,,,,
1,Kevin De Bruyne,Manchester City,32,Attacking Midfield,28(4),2425,7,16,1,0,...,1.3,42.4,2.0,2.5,0.5,4.93,2.07,0.18,65.0,0.08
2,Stefan Ortega,Manchester City,31,Goalkeeper,3,270,0,0,0,0,...,0.0,50.3,0.0,7.3,0.0,,,,,
3,Erling Haaland,Manchester City,23,Centre-Forward,33(2),2779,36,8,5,0,...,1.5,13.7,0.0,0.1,0.1,30.88,5.12,1.0,123.0,0.25
4,Harry Kane,Tottenham,30,Centre-Forward,38,3408,30,3,6,0,...,1.9,22.4,0.2,1.9,0.3,22.49,7.51,0.59,130.0,0.17
5,Bruno Fernandes,Manchester United,29,Attacking Midfield,37,3320,8,8,6,0,...,1.1,50.1,0.9,3.1,0.6,8.81,-0.81,0.24,92.0,0.1
6,Gabriel Jesus,Arsenal,27,Centre-Forward,24(2),2075,11,6,6,0,...,3.4,25.9,0.1,0.6,0.1,14.24,-3.24,0.62,77.0,0.18
7,Kieran Trippier,Newcastle,33,Right-Back,38,3348,1,7,5,0,...,0.7,54.3,3.6,3.9,0.2,0.88,0.12,0.02,14.0,0.06
8,Bukayo Saka,Arsenal,22,Right Winger,37(1),3195,14,11,6,0,...,2.1,32.5,1.0,0.9,0.0,10.21,3.79,0.29,89.0,0.11
9,Solly March,Brighton,29,Right Winger,31(2),2728,7,7,2,0,...,1.7,33.4,1.1,2.1,0.1,9.31,-2.31,0.31,74.0,0.13


In [7]:
players_details['player_name'] = merged_data['player_name']
players_details['team_name'] = merged_data['team_name']
players_details = players_details.drop(columns=['normalize_names', 'normalized_name'], errors='ignore')
players_details.head(10)

Unnamed: 0,player_name,team_name,TotalTackles,DribbledPast,TotalAttemptedTackles,Total_Interception,Fouled,Fouls,Yellow,Red,...,Total_Key passes,Long,Short,Cross,Corner,Throughball,Freekick,Throwin,Other,Total_Assists
0,Asmir Begovic,Everton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Kevin De Bruyne,Manchester City,0.9,0.8,1.7,0.3,0.9,0.6,0.0,0.0,...,3.1,0.8,2.2,0.2,0.0,0.1,0.0,0.0,0.2,0.5
2,Stefan Ortega,Manchester City,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Erling Haaland,Manchester City,0.1,0.1,0.2,0.1,0.7,0.9,0.1,0.0,...,0.9,0.0,0.9,0.0,0.0,0.1,0.0,0.0,0.2,0.2
4,Harry Kane,Tottenham,0.4,0.4,0.8,0.1,1.5,0.8,0.2,0.0,...,1.5,0.3,1.2,0.0,0.0,0.0,0.0,0.0,0.1,0.1
5,Bruno Fernandes,Manchester United,1.8,1.1,3.0,0.7,1.1,1.1,0.2,0.0,...,3.2,0.6,2.6,0.0,0.0,0.1,0.0,0.0,0.1,0.2
6,Gabriel Jesus,Arsenal,1.3,0.7,1.9,0.4,2.2,1.7,0.2,0.0,...,1.2,0.0,1.2,0.0,0.0,0.0,0.0,0.0,0.2,0.2
7,Kieran Trippier,Newcastle,1.9,1.2,3.0,1.1,0.8,1.3,0.1,0.0,...,2.9,1.7,1.2,0.1,0.0,0.0,0.1,0.0,0.0,0.2
8,Bukayo Saka,Arsenal,1.7,0.6,2.3,0.4,1.7,1.1,0.2,0.0,...,2.0,0.5,1.5,0.1,0.1,0.0,0.0,0.0,0.2,0.3
9,Solly March,Brighton,1.6,0.6,2.2,0.7,1.2,1.4,0.1,0.0,...,2.0,0.4,1.6,0.1,0.0,0.0,0.0,0.0,0.1,0.2


In [8]:
merged_data.to_csv('./Leagues/all_players_stats.csv', index=False)
players_details.to_csv('./Leagues/all_players_stats_details.csv', index=False)

  values = values.astype(str)
