In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [9]:
# Load the modelling table from data_encoded.csv (path is relative to the
# notebook's working directory).
# NOTE(review): the file name suggests categorical columns were already
# encoded upstream — confirm.
df = pd.read_csv('data_encoded.csv')

In [10]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, X, y):
    """Score a fitted model on one dataset.

    Calls ``model.predict(X)`` once and compares the predictions with ``y``.

    Args:
        model: Any fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y: True target values for the rows of ``X``.

    Returns:
        A ``(mse, mae, r2)`` tuple: mean squared error, mean absolute error
        and R^2 score, in that order.
    """
    predictions = model.predict(X)
    return (
        mean_squared_error(y, predictions),
        mean_absolute_error(y, predictions),
        r2_score(y, predictions),
    )

def print_metrics(mse, mae, r2):
    """Print the three regression metrics, one per line, with 3 decimals.

    Args:
        mse: Mean squared error.
        mae: Mean absolute error.
        r2: R^2 score.
    """
    for label, value in (('MSE', mse), ('MAE', mae), ('R2', r2)):
        print(f'{label}: {value:.3f}')

In [11]:
# Remove the monetary columns plus 'potential' and 'composure' from the table.
# NOTE(review): presumably dropped to avoid leaking the target — confirm.
# Rebinding `df` instead of using inplace=True, together with errors='ignore',
# keeps this cell re-runnable: a second execution is a no-op rather than a
# KeyError on the already-missing columns.
df = df.drop(
    columns=['value_euro', 'wage_euro', 'release_clause_euro',
             'potential', 'composure'],
    errors='ignore',
)

In [12]:
# Target is `overall_rating`; every remaining column is used as a feature.
y = df['overall_rating']
X = df.drop(columns='overall_rating')

In [13]:
from sklearn.model_selection import train_test_split

# First carve off 30% of the rows as a hold-out pool, then cut that pool in
# half: the result is a 70% / 15% / 15% train / validation / test split.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

# Hold-out evaluation

In [19]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from category_encoders import TargetEncoder

# Standardise every feature, then fit an ordinary least-squares model.
# The scaler is fitted on the training split only, inside the pipeline.
pipe = make_pipeline(StandardScaler(), LinearRegression())
pipe.fit(X_train, y_train)

# Report the same three metrics on each split; the header strings carry the
# blank line that separates the sections in the printed output.
for header, features, target in (
    ('Training set', X_train, y_train),
    ('\nValidation set', X_val, y_val),
    ('\nTest set', X_test, y_test),
):
    print(header)
    mse, mae, r2 = evaluate_model(pipe, features, target)
    print_metrics(mse, mae, r2)

Training set
MSE: 6.225
MAE: 1.914
R2: 0.872

Validation set
MSE: 6.083
MAE: 1.926
R2: 0.872

Test set
MSE: 5.879
MAE: 1.891
R2: 0.876


In [30]:
# Put true and predicted validation ratings side by side, keeping the
# original row index of y_val so rows can be traced back to the source data.
y_val_pred = pipe.predict(X_val)

difference_df = y_val.to_frame('Actual').assign(Predicted=y_val_pred)
difference_df.head()

Unnamed: 0,Actual,Predicted
420,72,72.366179
10036,68,68.843064
1372,64,66.552082
3315,64,68.636326
8481,65,62.736555


In [32]:
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import cross_validate

# Cross-validate on train + validation rows only; the test split is not used.
X_cv = pd.concat([X_train, X_val])
y_cv = pd.concat([y_train, y_val])

cv = 10

# A single cross_validate call scores all three metrics on the same fitted
# folds, instead of refitting the pipeline once per metric per fold.
cv_results = cross_validate(
    pipe,
    X_cv,
    y_cv,
    cv=cv,
    scoring={
        'r2': 'r2',
        'mse': 'neg_mean_squared_error',
        'mae': 'neg_mean_absolute_error',
    },
    n_jobs=-1,
)

r2_scores = cv_results['test_r2']
# scikit-learn reports error metrics negated (higher is better); flip the
# sign back so the printed values are ordinary MSE / MAE.
mse_scores = -cv_results['test_mse']
mae_scores = -cv_results['test_mae']

print(f"R^2 scores cho {cv}-fold cross-validation: {r2_scores}")
print(f"R^2 trung bình: {r2_scores.mean():.3f} ± {r2_scores.std():.3f}")
print()

print(f"MSE cho {cv}-fold cross-validation: {mse_scores}")
# Same "mean ± std" layout as the R^2 and MAE lines (a space on both sides of ±).
print(f"MSE trung bình: {mse_scores.mean():.3f} ± {mse_scores.std():.3f}")
print()

print(f"MAE cho {cv}-fold cross-validation: {mae_scores}")
print(f"MAE trung bình: {mae_scores.mean():.3f} ± {mae_scores.std():.3f}")

R^2 scores cho 10-fold cross-validation: [0.8802915  0.86926951 0.87779882 0.85702945 0.85967123 0.87462044
 0.88025181 0.87173749 0.86939814 0.870879  ]
R^2 trung bình: 0.871 ± 0.007

MSE cho 10-fold cross-validation: [5.92078498 6.49139394 5.97534712 6.77996986 6.82312792 5.99039591
 5.8149739  6.36572211 6.03441237 6.23034493]
MSE trung bình: 6.243± 0.342

MAE cho 10-fold cross-validation: [1.8689107  1.94242839 1.87767146 1.99640923 2.00921622 1.87690571
 1.86048637 1.92814355 1.92078558 1.93929255]
MAE trung bình: 1.922 ± 0.049


# One-hot encoding + PCA to handle nationality

In [35]:
import pandas as pd
# Start the second experiment from data.csv. This rebinds `df`, so the frame
# loaded from data_encoded.csv earlier in the notebook is no longer reachable
# under that name.
df = pd.read_csv('data.csv')

In [36]:
# Remove the monetary columns plus 'potential' and 'composure' from the table.
# NOTE(review): presumably dropped to avoid leaking the target — confirm.
# Rebinding `df` instead of using inplace=True, together with errors='ignore',
# keeps this cell re-runnable: a second execution is a no-op rather than a
# KeyError on the already-missing columns.
df = df.drop(
    columns=['value_euro', 'wage_euro', 'release_clause_euro',
             'potential', 'composure'],
    errors='ignore',
)

In [37]:
from sklearn.preprocessing import OneHotEncoder

# Keep the 20 most frequent nationalities and bucket everything else as
# 'Other' so the one-hot expansion stays small.
# NOTE(review): both the top-20 list and the encoder are fitted on the full
# table, before the train/test split, so they see test rows — consider moving
# this step into a pipeline fitted on the training split only.
top_20_nationalities = df['nationality'].value_counts().nlargest(20).index
df['nationality_top_20'] = df['nationality'].where(
    df['nationality'].isin(top_20_nationalities), 'Other'
)

# drop='first' omits the first category's indicator column.
one_hot_encoder = OneHotEncoder(sparse_output=False, drop='first')
nationality_encoded = one_hot_encoder.fit_transform(df[['nationality_top_20']])

# Reuse df's index so the concat below aligns row-for-row even when df does
# not carry a default 0..n-1 index (otherwise mismatched labels create NaN rows).
nationality_encoded_df = pd.DataFrame(
    nationality_encoded,
    columns=one_hot_encoder.get_feature_names_out(['nationality_top_20']),
    index=df.index,
)
df = pd.concat([df, nationality_encoded_df], axis=1)

# The raw and bucketed string columns are now represented by the indicators.
df = df.drop(columns=['nationality', 'nationality_top_20'])
df.head()


Unnamed: 0,age,height_cm,weight_kgs,overall_rating,preferred_foot,weak_foot(1-5),skill_moves(1-5),crossing,finishing,heading_accuracy,...,nationality_top_20_Mexico,nationality_top_20_Netherlands,nationality_top_20_Norway,nationality_top_20_Other,nationality_top_20_Poland,nationality_top_20_Portugal,nationality_top_20_Republic of Ireland,nationality_top_20_Spain,nationality_top_20_Sweden,nationality_top_20_United States
0,27,177.606642,76.2,88,1,5,4,88,81,52,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,25,190.5,83.9,88,1,4,5,80,75,75,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,27,162.56,59.0,88,1,4,4,86,77,56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,27,187.96,88.9,88,1,3,2,30,22,83,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,27,193.04,92.1,88,1,3,2,53,52,83,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# Target is `overall_rating`; every remaining column is used as a feature.
y = df['overall_rating']
X = df.drop(columns='overall_rating')

In [39]:
from sklearn.model_selection import train_test_split
# 70% / 30% train / test split; no separate validation split is made here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [40]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [41]:
from sklearn.decomposition import PCA

# A float n_components in (0, 1) asks PCA to keep as many components as are
# needed to explain that fraction of the variance (here 95%).
# The projection is fitted on the training split and reused for the test split.
pca = PCA(n_components=0.95)
X_train_scaled_pca, X_test_scaled_pca = (
    pca.fit_transform(X_train_scaled),
    pca.transform(X_test_scaled),
)

In [42]:
from sklearn.linear_model import LinearRegression
# Fit ordinary least squares on the PCA-projected training features and
# predict the test split (fit() returns the estimator itself).
linear_regression = LinearRegression().fit(X_train_scaled_pca, y_train)
y_pred = linear_regression.predict(X_test_scaled_pca)

In [43]:
# Report the same three metrics on each split; the header strings carry the
# blank line that separates the sections in the printed output.
for header, features, target in (
    ('Training set', X_train_scaled_pca, y_train),
    ('\nTest set', X_test_scaled_pca, y_test),
):
    print(header)
    mse, mae, r2 = evaluate_model(linear_regression, features, target)
    print_metrics(mse, mae, r2)

Training set
MSE: 8.149
MAE: 2.232
R2: 0.832

Test set
MSE: 8.214
MAE: 2.247
R2: 0.829
