In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the raw table, then separate the regression target from the features.
df = pd.read_csv('data.csv')

# Target: the 'overall_rating' column; features: every remaining column.
y = df['overall_rating']
X = df.drop(columns='overall_rating')

In [3]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, X, y):
    """Score a fitted model on one dataset.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)``; it is called exactly once.
    X : features handed to ``model.predict``.
    y : ground-truth targets the predictions are compared against.

    Returns
    -------
    tuple
        ``(mse, mae, r2)`` computed with scikit-learn's
        ``mean_squared_error``, ``mean_absolute_error`` and ``r2_score``.
    """
    predictions = model.predict(X)
    metric_functions = (mean_squared_error, mean_absolute_error, r2_score)
    return tuple(metric(y, predictions) for metric in metric_functions)

def print_metrics(mse, mae, r2):
    """Print the three regression metrics, one per line, rounded to 3 decimals.

    Lines are emitted in the fixed order MSE, MAE, R2.
    """
    labelled_values = (('MSE', mse), ('MAE', mae), ('R2', r2))
    for label, value in labelled_values:
        print(f'{label}: {value:.3f}')

In [40]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold

def cv_evaluate(model, X, y):
    """Cross-validate ``model`` on ``(X, y)`` and print per-fold scores.

    Uses a shuffled 5-fold split with a fixed seed (42) and reports R2, MAE
    and MSE on the held-out part of every fold. For each metric the raw
    per-fold array is printed, followed by its mean and standard deviation.
    Timing entries returned by ``cross_validate`` are not printed.

    Returns ``None``; the function only prints.
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)

    # Insertion order (r2, mae, mse) determines the order of the printed blocks.
    metric_functions = {
        'r2': r2_score,
        'mae': mean_absolute_error,
        'mse': mean_squared_error,
    }
    scoring = {name: make_scorer(func) for name, func in metric_functions.items()}

    cv_results = cross_validate(
        model, X, y, cv=splitter, scoring=scoring, return_train_score=False
    )

    timing_keys = ('fit_time', 'score_time')
    for key, scores in cv_results.items():
        if key not in timing_keys:
            print(f"{key}: {scores}")
            print(f"{key} trung bình: {scores.mean():.3f} ± {scores.std():.3f}")
            print()

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows as a test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

## Linear Regression

In [24]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from category_encoders import TargetEncoder

# Linear-regression baseline built as one pipeline: target-encode the
# 'nationality' column, standardise the features, then fit LinearRegression.
# Because the encoder and scaler live inside the pipeline, they are fitted
# on X_train / y_train only when fit() is called below.
pipeline = Pipeline([
    ('encoder', TargetEncoder(cols=['nationality'])),
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])
pipeline.fit(X_train, y_train)

In [25]:
# Score the fitted linear-regression pipeline on the data it was trained on...
mse_train, mae_train, r2_train = evaluate_model(pipeline, X_train, y_train)
print('Training set:')
print_metrics(mse_train, mae_train, r2_train)

# ...and on the held-out split, so the two sets of metrics can be compared.
mse_test, mae_test, r2_test = evaluate_model(pipeline, X_test, y_test)
print('\nTest set:')
print_metrics(mse_test, mae_test, r2_test)

Training set:
MSE: 8.831
MAE: 2.236
R2: 0.817

Test set:
MSE: 9.181
MAE: 2.280
R2: 0.813


In [26]:
# Coefficients of the LinearRegression step inside the fitted pipeline.
# The model sits behind StandardScaler, so they apply to standardised features.
theta = pipeline.named_steps['model'].coef_
# Labels come from the raw feature frame.
# NOTE(review): pairing them with theta assumes the encoder and scaler keep
# the number and order of columns unchanged — confirm.
columns = X.columns

# One row per feature with its coefficient; displayed as the cell output.
temp = pd.DataFrame({'feature': columns, 'theta': theta})
temp

Unnamed: 0,feature,theta
0,age,0.8407189
1,height_cm,-0.1282523
2,weight_kgs,0.35146
3,nationality,1.020533
4,preferred_foot,-0.1103418
5,weak_foot(1-5),0.1359875
6,skill_moves(1-5),0.9636842
7,crossing,-0.3176509
8,finishing,0.5694134
9,heading_accuracy,1.423147


In [42]:
# 5-fold cross-validation of the linear-regression pipeline.
# Note: the full X, y is passed here (not only the training split), so the
# rows held out in X_test also take part in the folds.
cv_evaluate(pipeline, X, y)

test_r2: [0.81395458 0.81714044 0.81936425 0.80816089 0.81402568]
test_r2 trung bình: 0.815 ± 0.004

test_mae: [2.26690808 2.23418077 2.22680253 2.27782133 2.25002711]
test_mae trung bình: 2.251 ± 0.019

test_mse: [9.04657671 8.79418854 8.74462255 9.06076576 9.13128972]
test_mse trung bình: 8.955 ± 0.155



## Ridge Regression

In [43]:
from sklearn.model_selection import train_test_split

# Same arguments and seed as the split used in the Linear Regression section,
# so this should reproduce the same 85/15 partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [44]:
from sklearn.model_selection import KFold

# Shuffled 5-fold splitter with a fixed seed; passed as `cv` to the grid
# search further down.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [45]:
from sklearn.linear_model import Ridge

# Ridge pipeline with a hand-picked alpha of 0.3. The encoder and scaler
# steps are the same as in the linear-regression pipeline; only the final
# estimator differs.
model = Pipeline([
    ('encoder', TargetEncoder(cols=['nationality'])),
    ('scaler', StandardScaler()),
    ('model', Ridge(alpha=0.3))
])

# Fit on the training split only.
model.fit(X_train, y_train)

In [46]:
# Evaluate the Ridge pipeline fitted in the previous cell. It is bound to
# `model`; `pipeline` is the LinearRegression pipeline from the earlier
# section, so scoring `pipeline` here would only repeat the Linear
# Regression metrics instead of reporting Ridge's.
mse_train, mae_train, r2_train = evaluate_model(model, X_train, y_train)
print('Training set:')
print_metrics(mse_train, mae_train, r2_train)

# Same metrics on the held-out split.
mse_test, mae_test, r2_test = evaluate_model(model, X_test, y_test)
print('\nTest set:')
print_metrics(mse_test, mae_test, r2_test)

Training set:
MSE: 8.831
MAE: 2.236
R2: 0.817

Test set:
MSE: 9.181
MAE: 2.280
R2: 0.813


In [47]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

# Ridge pipeline with alpha left at its default; the grid search below
# supplies the candidate values.
model = Pipeline([
    ('encoder', TargetEncoder(cols=['nationality'])),
    ('scaler', StandardScaler()),
    ('model', Ridge())
])

# Candidate alphas spaced by powers of ten. The 'model__' prefix routes the
# parameter to the pipeline step named 'model'.
param_grid = {
    'model__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10]
}

# Candidates are compared by R2 using the shared `cv` splitter; only the
# training split is given to the search.
grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [48]:
# Show the winning alpha together with the corresponding pipeline.
# NOTE(review): if the reported best alpha is the smallest (or largest) value
# in param_grid, the grid may not bracket the optimum — consider widening it.
grid_search.best_params_, grid_search.best_estimator_

({'model__alpha': 0.0001},
 Pipeline(steps=[('encoder', TargetEncoder(cols=['nationality'])),
                 ('scaler', StandardScaler()), ('model', Ridge(alpha=0.0001))]))

In [49]:
# Rebind `model` to the best pipeline found by the grid search.
model = grid_search.best_estimator_

# Metrics on the training split.
mse_train, mae_train, r2_train = evaluate_model(model, X_train, y_train)
print('Training set:')
print_metrics(mse_train, mae_train, r2_train)

# Metrics on the held-out split.
mse_test, mae_test, r2_test = evaluate_model(model, X_test, y_test)
print('\nTest set:')
print_metrics(mse_test, mae_test, r2_test)

Training set:
MSE: 8.831
MAE: 2.236
R2: 0.817

Test set:
MSE: 9.181
MAE: 2.280
R2: 0.813


In [50]:
# 5-fold cross-validation of the tuned Ridge pipeline.
# Note: the full X, y is passed here (not only the training split), so the
# rows held out in X_test also take part in the folds.
cv_evaluate(model, X, y)

test_r2: [0.81395446 0.81714485 0.81935888 0.80816785 0.81402326]
test_r2 trung bình: 0.815 ± 0.004

test_mae: [2.26690767 2.23422442 2.22684367 2.27772422 2.25001923]
test_mae trung bình: 2.251 ± 0.019

test_mse: [9.04658268 8.79397625 8.74488248 9.06043727 9.13140828]
test_mse trung bình: 8.955 ± 0.155



In [53]:
import joblib

# Persist the estimator currently bound to `model` with joblib.
# NOTE(review): `model` was last assigned from grid_search.best_estimator_
# (a Ridge pipeline), while the file name and the message below say
# "Linear Regression" — confirm which model is meant to be exported.
joblib.dump(model, 'linear_reg_model.pkl')
print("Mô hình Linear Regression đã được lưu thành công dưới tên 'linear_reg_model.pkl'")

Mô hình Linear Regression đã được lưu thành công dưới tên 'linear_reg_model.pkl'
