In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
file_dir = './Leagues/all_players_stats_total.csv'
df = pd.read_csv(file_dir)

In [3]:
position_descriptive_stats = df.groupby('position')['Market Value'].describe() ## 포지션별 시장가치의 평균 계산.
position_skewness = df.groupby('position')['Market Value'].skew().rename("Skewness") ## 포지션별 시장가치의 왜도 계산

position_analysis = pd.concat([position_descriptive_stats, position_skewness], axis=1)
position_analysis['Recommended Statistic'] = np.where(abs(position_analysis['Skewness']) <= 0, 'Mean', 'Median') ## 왜도가 0보다 월등히 크면 중앙값이나 최빈값을 사용.

position_analysis[['mean', '50%', 'std', 'Skewness', 'Recommended Statistic']]

Unnamed: 0_level_0,mean,50%,std,Skewness,Recommended Statistic
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Attacking Midfield,13465950.0,5000000.0,19972780.0,2.745622,Median
Central Midfield,10517990.0,5000000.0,14831640.0,3.133665,Median
Centre-Back,9743534.0,4000000.0,13172700.0,2.448681,Median
Centre-Forward,10367350.0,5000000.0,18857260.0,5.409691,Median
Defensive Midfield,11519190.0,6000000.0,16453710.0,2.824203,Median
Goalkeeper,6393467.0,2500000.0,9428695.0,2.396206,Median
Left Midfield,5736364.0,4000000.0,6063752.0,1.233649,Median
Left Winger,13410270.0,6000000.0,17886860.0,2.152588,Median
Left-Back,8481500.0,3500000.0,11277120.0,2.631339,Median
Right Midfield,7589130.0,5000000.0,9220900.0,2.592295,Median


In [8]:
columns = df.columns
print(len(columns))
print(columns)

84
Index(['player_name', 'team_name', 'age', 'position', 'Apps', 'Mins', 'Goals',
       'Assists', 'Yellow', 'Red', 'SpG', 'PS%', 'AerialsWon', 'MoM', 'Rating',
       'Tackles', 'Inter', 'Offsides', 'Clear', 'Dribbles allowed', 'Blocks',
       'OwnGoal', 'Key_pass_per_game', 'Dribble', 'Foul_given_game',
       'Offside_given_per_game', 'Dispossessed_per_game',
       'Unsuccessful touches', 'Passes_per_game', 'Crosses_per_game',
       'Longpass_per_game', 'Through_ball_per_game', 'xG', 'xGDiff',
       'xGPerNinety', 'totalShots', 'xGPerShot', 'TotalTackles',
       'DribbledPast', 'TotalAttemptedTackles', 'Total_Interception', 'Fouled',
       'Fouls', 'CaughtOffside', 'Total_Clearances', 'ShotsBlocked',
       'CrossesBlocked', 'PassesBlocked', 'Total_Saves', 'SixYardBox_Saves',
       'PenaltyArea_Saves', 'OutOfBox_Saves', 'Total_Shots', 'OutOfBox_Shots',
       'SixYardBox_Shots', 'PenaltyArea_Shots', 'Total_Goals',
       'SixYardBox_Goals', 'PenaltyArea_Goals', 'OutOfBox_Goa

In [None]:
columns_to_exclude = [
    'player_name', 'team_name', 
    #'age',
    'position', 'Apps', 
    # 'Mins',
    'Goals',
    'Assists', 'Yellow', 'Red', 'SpG', 
    # 'PS%', 
    'AerialsWon', 'MoM', 'Rating',
    'Tackles', 'Inter', 'Offsides', 'Clear', 'Dribbles allowed', 'Blocks',
    'OwnGoal', 'Key_pass_per_game', 'Dribble', 'Foul_given_game',
    'Offside_given_per_game', 'Dispossessed_per_game',
    'Unsuccessful touches', 'Passes_per_game', 'Crosses_per_game',
    'Longpass_per_game', 'Through_ball_per_game', 
    'Market Value'
]

In [13]:
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

numeric_features = df.select_dtypes(include=[np.number]).columns.tolist()

for col in numeric_features:
    if np.all(df[col] >= 0):
        df[col + '_log'] = np.log1p(df[col])

scaler = MinMaxScaler()
scaled_columns = [col + '_log' for col in numeric_features if col + '_log' in df.columns]
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])

results = {}
positions = df['position'].unique()
for position in positions:
    position_data = df[df['position'] == position]
    high_value_data = position_data[position_data['Market Value'] >= position_data['Market Value'].median()]
    features = high_value_data.drop(columns=columns_to_exclude)
    features_filled = features.fillna(features.median())
    target = high_value_data['Market Value']
    
    X_train, X_test, y_train, y_test = train_test_split(features_filled, target, test_size=0.2, random_state=42)
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    predictions = model.predict(X_test)
    r2 = r2_score(y_test, predictions)
    
    results[position] = {'R2': r2, 'Feature Importances': model.feature_importances_}

for position, data in results.items():
    print(f"Position: {position}, R-squared: {data['R2']}")

Position: Goalkeeper, R-squared: 0.18796107551075814
Position: Attacking Midfield, R-squared: 0.5399817994889309
Position: Centre-Forward, R-squared: -0.6787988049549809
Position: Right-Back, R-squared: -0.3034043715566901
Position: Right Winger, R-squared: 0.0869675056712037
Position: Left Winger, R-squared: 0.20741578032672925
Position: Defensive Midfield, R-squared: 0.0057273245595176325
Position: Central Midfield, R-squared: 0.29815366615847305
Position: Centre-Back, R-squared: 0.3468401099176506
Position: Left-Back, R-squared: 0.12304475562312922
Position: Left Midfield, R-squared: -16.940632142857176
Position: Second Striker, R-squared: -0.4384730034722226
Position: Right Midfield, R-squared: -61.241046153846156
