In [1]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler


from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

## 1.데이터 로드

In [2]:
# Source CSV holding the player statistics.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact location exists.
file_path = '/home/pervinco/Upstage_Ai_Lab/04_EDA/project/Leagues/all_players_stats_total.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

## 2.결측치, 이상치 정리

In [3]:
## Apps 컬럼 정리
def convert_apps(apps):
    """Collapse an ``Apps`` cell into a single integer appearance count.

    String cells are scanned for every run of digits and the runs are summed,
    so ``'25(3)'`` becomes ``28`` (presumably "starts(substitute appearances)"
    -- confirm against the CSV). A string without digits yields ``0``.

    Non-string cells are handled explicitly instead of crashing inside
    ``re.findall``:

    * missing values (``None`` / ``NaN``) yield ``0``;
    * values that are already numeric are returned as ``int``, which keeps the
      conversion idempotent when the cell is executed a second time.

    Parameters
    ----------
    apps : str, int, float or None
        Raw value of one ``Apps`` cell.

    Returns
    -------
    int
        Total number of appearances encoded in ``apps``.
    """
    if isinstance(apps, str):
        return sum(int(part) for part in re.findall(r'\d+', apps))

    # Missing value: there is nothing to count.
    if pd.isna(apps):
        return 0

    # Already numeric (e.g. the column was converted by an earlier run).
    return int(apps)

# Turn the raw Apps cells into integer appearance counts.
data['Apps'] = data['Apps'].apply(convert_apps)

# Impute (not drop) missing numeric values with each column's median.
numeric_columns = data.select_dtypes(include=['number']).columns
column_medians = data[numeric_columns].median()
data[numeric_columns] = data[numeric_columns].fillna(column_medians)

# Columns used as model features: everything except the identifier columns
# and the target ('position').
selected_features = data.columns.tolist()
for non_feature in ('player_name', 'team_name', 'position'):
    selected_features.remove(non_feature)

In [4]:
## 이상치 제거
def remove_outliers(df, feature):
    """Return the rows of ``df`` whose ``feature`` value is not an IQR outlier.

    A value is kept when it lies inside the closed interval
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``, where Q1 and Q3 are the 25th and
    75th percentiles of ``df[feature]``. Rows whose value is NaN fail the
    range check and are therefore dropped as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    feature : str
        Name of the numeric column the fences are computed on.

    Returns
    -------
    pandas.DataFrame
        Subset of ``df`` with the original index labels preserved.
    """
    values = df[feature]
    q_low, q_high = values.quantile(0.25), values.quantile(0.75)
    spread = q_high - q_low
    # `between` is inclusive on both ends, matching the >= / <= comparison.
    within_fences = values.between(q_low - 1.5 * spread, q_high + 1.5 * spread)
    return df[within_fences]

# Remove IQR outliers separately within each position, one numeric column
# after another (each filter runs on what survived the previous one, so the
# fences are recomputed on the shrinking group).
# The per-position frames are collected and concatenated once at the end:
# growing a DataFrame with pd.concat inside the loop copies all accumulated
# rows on every iteration, and seeding it with an empty DataFrame relies on
# behaviour that recent pandas versions deprecate.
cleaned_groups = []
for name, group in data.groupby('position'):
    for feature in numeric_columns:
        group = remove_outliers(group, feature)
    cleaned_groups.append(group)

# pd.concat raises on an empty list, so fall back to an empty frame.
cleaned_data = pd.concat(cleaned_groups, axis=0) if cleaned_groups else pd.DataFrame()


In [5]:
# Per-position view of the feature columns. Note this groups `data`, i.e. the
# frame before outlier removal.
by_position = data.groupby('position')
grouped_data = by_position[selected_features]

# Report how many rows each position contributes.
for position, group in grouped_data:
    print(f"{position} {len(group)}")

Attacking Midfield 213
Central Midfield 394
Centre-Back 513
Centre-Forward 394
Defensive Midfield 229
Goalkeeper 210
Left Midfield 25
Left Winger 222
Left-Back 207
Right Midfield 24
Right Winger 197
Right-Back 247
Second Striker 14


In [6]:
# One independent copy of the rows for every distinct position.
positions = data['position'].unique()
position_data = {}
for pos in positions:
    position_data[pos] = data.loc[data['position'] == pos].copy()

# Standardise the numeric columns within each position: the scaler is re-fitted
# on every subset, so the scaling statistics are per position, not global.
scaler = StandardScaler()
for pos, frame in position_data.items():
    frame[numeric_columns] = scaler.fit_transform(frame[numeric_columns].to_numpy())

## 3.Correlation Matrix

In [7]:
"""
스탯간 상관관계 분석이기 때문에 포지션마다 주요한 스탯이 무엇인지 판단하기엔 애매함.
"""

# position_stats_dict = {}

# for position, group in grouped_data:
#     corr_matrix = group[selected_features].corr()
#     np.fill_diagonal(corr_matrix.values, 0)
    
#     flat_corr = corr_matrix.abs().unstack()
#     sorted_pairs = flat_corr.sort_values(ascending=False)
#     unique_pairs = sorted_pairs.drop_duplicates()
#     top_10_pairs = unique_pairs.head(10)
#     position_stats_dict[position] = top_10_pairs

#     # # 각 포지션별 상관계수 행렬 시각화
#     # plt.figure(figsize=(10, 8))
#     # sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
#     # plt.title(f'Correlation Matrix for {position}')
#     # plt.show()

# for position, top_pairs in position_stats_dict.items():
#     print(f"Position: {position}")
#     print("Top 10 Correlated Stat Pairs:")
#     print(top_pairs)
#     print("\n")

'\n스탯간 상관관계 분석이기 때문에 포지션마다 주요한 스탯이 무엇인지 판단하기엔 애매함.\n'

## 4.PCA

In [8]:
'''
주성분 역시 값이 낮아서 데이터를 대표하는 주성분이라 보기 어려움.
'''

# pca_results = {}
# for pos, df in position_data.items():
#     # 포지션별 수치 데이터 선택
#     features = df.select_dtypes(include=['number'])
    
#     # PCA 모델 생성 및 적용
#     pca = PCA(n_components=2)  # 주성분을 2개로 설정
#     principalComponents = pca.fit_transform(features)
    
#     # 결과 저장
#     pca_results[pos] = {
#         'explained_variance_ratio': pca.explained_variance_ratio_,
#         'components': pca.components_
#     }

#     # 주성분에 대한 설명력과 주요 변수의 계수 출력
#     print(f"Position: {pos}")
#     print("Explained Variance Ratio:", pca.explained_variance_ratio_)
#     print("PCA Components:\n", pca.components_)
#     print()


'\n주성분 역시 값이 낮아서 데이터를 대표하는 주성분이라 보기 어려움.\n'

## 5.Random Forest & Gradient Boosting

In [9]:
# Feature matrix and target taken from the outlier-filtered frame.
X, y = cleaned_data[selected_features], cleaned_data['position']

# Encode the position labels as integers for the classifiers.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Fit a random forest on the training split (`fit` returns the estimator).
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
rf_importances = rf_model.feature_importances_

# List every feature with its importance, in column order (not sorted).
print("Random Forest Feature Importances:")
for feature, importance in zip(selected_features, rf_importances):
    importance_line = f"{feature}: {importance:.3f}"
    print(importance_line)

Random Forest Feature Importances:
age: 0.022
Apps: 0.020
Mins: 0.024
Goals: 0.004
Assists: 0.006
Yellow: 0.015
Red: 0.000
SpG: 0.022
PS%: 0.041
AerialsWon: 0.026
MoM: 0.002
Rating: 0.032
Tackles: 0.031
Inter: 0.018
Offsides: 0.013
Clear: 0.055
Dribbles allowed: 0.024
Blocks: 0.028
OwnGoal: 0.000
Key_pass_per_game: 0.025
Dribble: 0.025
Foul_given_game: 0.023
Offside_given_per_game: 0.028
Dispossessed_per_game: 0.026
Unsuccessful touches: 0.026
Passes_per_game: 0.029
Crosses_per_game: 0.032
Longpass_per_game: 0.039
Through_ball_per_game: 0.006
xG: 0.026
xGDiff: 0.022
xGPerNinety: 0.070
totalShots: 0.017
xGPerShot: 0.031
TotalTackles: 0.004
DribbledPast: 0.009
TotalAttemptedTackles: 0.005
Total_Interception: 0.005
Fouled: 0.006
Fouls: 0.007
CaughtOffside: 0.002
Total_Clearances: 0.008
ShotsBlocked: 0.005
CrossesBlocked: 0.001
PassesBlocked: 0.005
Total_Saves: 0.000
SixYardBox_Saves: 0.000
PenaltyArea_Saves: 0.000
OutOfBox_Saves: 0.000
Total_Shots: 0.008
OutOfBox_Shots: 0.004
SixYardBox_S

In [11]:
# Fit gradient boosting on the same training split (`fit` returns the estimator).
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
gb_importances = gb_model.feature_importances_

# List every feature with its importance, in column order (not sorted).
print("\nGradient Boosting Feature Importances:")
for feature, importance in zip(selected_features, gb_importances):
    importance_line = f"{feature}: {importance:.3f}"
    print(importance_line)


Gradient Boosting Feature Importances:
age: 0.010
Apps: 0.004
Mins: 0.057
Goals: 0.000
Assists: 0.002
Yellow: 0.021
Red: 0.000
SpG: 0.016
PS%: 0.072
AerialsWon: 0.021
MoM: 0.003
Rating: 0.032
Tackles: 0.010
Inter: 0.001
Offsides: 0.027
Clear: 0.101
Dribbles allowed: 0.025
Blocks: 0.005
OwnGoal: 0.000
Key_pass_per_game: 0.026
Dribble: 0.016
Foul_given_game: 0.063
Offside_given_per_game: 0.019
Dispossessed_per_game: 0.001
Unsuccessful touches: 0.006
Passes_per_game: 0.015
Crosses_per_game: 0.021
Longpass_per_game: 0.012
Through_ball_per_game: 0.059
xG: 0.010
xGDiff: 0.041
xGPerNinety: 0.145
totalShots: 0.000
xGPerShot: 0.041
TotalTackles: 0.002
DribbledPast: 0.006
TotalAttemptedTackles: 0.000
Total_Interception: 0.000
Fouled: 0.000
Fouls: 0.033
CaughtOffside: 0.000
Total_Clearances: 0.003
ShotsBlocked: 0.012
CrossesBlocked: 0.004
PassesBlocked: 0.000
Total_Saves: 0.000
SixYardBox_Saves: 0.000
PenaltyArea_Saves: 0.000
OutOfBox_Saves: 0.000
Total_Shots: 0.000
OutOfBox_Shots: 0.002
SixYard