In [686]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [687]:
df = pd.read_csv('data.csv')
body_type_columns = ['body_type_Lean', 'body_type_Normal', 'body_type_Stocky']
df['body_type'] = np.argmax(df[body_type_columns].values, axis=1)

X = df.drop('overall_rating', axis=1)
X = X.drop(columns=["body_type_Lean", "body_type_Normal", "body_type_Stocky"], errors='ignore')
y = df['overall_rating']

In [688]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    return mse, mae, r2

def print_metrics(mse, mae, r2):
    print(f'MSE: {mse:.3f}')
    print(f'MAE: {mae:.3f}')
    print(f'R2: {r2:.3f}')

In [689]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold

def cv_evaluate(model, X, y):
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    scoring = {
        'r2': make_scorer(r2_score),
        'mae': make_scorer(mean_absolute_error),
        'mse': make_scorer(mean_squared_error)
    }
    cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring, return_train_score=False)

    for key in cv_results.keys():
        if key in ['fit_time', 'score_time']:
            continue
        print(f"{key}: {cv_results[key]}")
        print(f"{key} trung bình: {cv_results[key].mean():.3f} ± {cv_results[key].std():.3f}")
        print()

In [690]:
from sklearn.model_selection import train_test_split

#print(X.columns)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

## Linear Regression

In [691]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from category_encoders import TargetEncoder

pipeline = Pipeline([
    ('encoder', TargetEncoder(cols=['nationality'])),
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])
pipeline.fit(X_train, y_train)

In [692]:
mse_train, mae_train, r2_train = evaluate_model(pipeline, X_train, y_train)
print('Training set:')
print_metrics(mse_train, mae_train, r2_train)

mse_test, mae_test, r2_test = evaluate_model(pipeline, X_test, y_test)
print('\nTest set:')
print_metrics(mse_test, mae_test, r2_test)

Training set:
MSE: 8.836
MAE: 2.236
R2: 0.817

Test set:
MSE: 9.190
MAE: 2.281
R2: 0.812


In [693]:
theta = pipeline.named_steps['model'].coef_
columns = X.columns

temp = pd.DataFrame({'feature': columns, 'theta': theta})
temp
#print(theta)

Unnamed: 0,feature,theta
0,age,0.837695
1,height_cm,-0.133549
2,weight_kgs,0.354859
3,nationality,1.019034
4,preferred_foot,-0.111613
5,weak_foot(1-5),0.134735
6,skill_moves(1-5),0.964402
7,crossing,-0.317058
8,finishing,0.567776
9,heading_accuracy,1.421332


In [694]:
cv_evaluate(pipeline, X, y)

test_r2: [0.813751   0.81710197 0.8192116  0.80802253 0.81406947]
test_r2 trung bình: 0.814 ± 0.004

test_mae: [2.26732246 2.23514115 2.22841681 2.2786534  2.24886649]
test_mae trung bình: 2.252 ± 0.019

test_mse: [9.05647634 8.79603868 8.75201261 9.06730068 9.12913954]
test_mse trung bình: 8.960 ± 0.155



# Ridge Regression

In [695]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [696]:
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [697]:
from sklearn.linear_model import Ridge

model = Pipeline([
    ('encoder', TargetEncoder(cols=['nationality'])),
    ('scaler', StandardScaler()),
    ('model', Ridge(alpha=0.3))
])

model.fit(X_train, y_train)

In [698]:
mse_train, mae_train, r2_train = evaluate_model(pipeline, X_train, y_train)
print('Training set:')
print_metrics(mse_train, mae_train, r2_train)

mse_test, mae_test, r2_test = evaluate_model(pipeline, X_test, y_test)
print('\nTest set:')
print_metrics(mse_test, mae_test, r2_test)

Training set:
MSE: 8.836
MAE: 2.236
R2: 0.817

Test set:
MSE: 9.190
MAE: 2.281
R2: 0.812


In [699]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

model = Pipeline([
    ('encoder', TargetEncoder(cols=['nationality'])),
    ('scaler', StandardScaler()),
    ('model', Ridge())
])

param_grid = {
    'model__alpha': [10]
}

grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [700]:
grid_search.best_params_, grid_search.best_estimator_
print(grid_search.best_params_['model__alpha'])

10


In [701]:
model = grid_search.best_estimator_

mse_train, mae_train, r2_train = evaluate_model(model, X_train, y_train)
print('Training set:')
print_metrics(mse_train, mae_train, r2_train)
y_pred = model.predict(X_train)

n = len(y_train)
mse_part = (1 / (2 * n)) * np.sum((y_train - y_pred) ** 2)
alpha = grid_search.best_params_['model__alpha']
l2_penalty_part = alpha * np.sum(model.named_steps['model'].coef_ ** 2)

loss = mse_part + l2_penalty_part

print("1/(2n) * ||y - Xw||_2^2 (MSE part):", mse_part)
print("α * ||w||_2^2 (Regularization part):", l2_penalty_part)
print("Total Loss:", loss)

mse_test, mae_test, r2_test = evaluate_model(model, X_test, y_test)
print('\nTest set:')
print_metrics(mse_test, mae_test, r2_test)

Training set:
MSE: 8.837
MAE: 2.237
R2: 0.817
1/(2n) * ||y - Xw||_2^2 (MSE part): 4.418329669388893
α * ||w||_2^2 (Regularization part): 1013.4118233096639
Total Loss: 1017.8301529790527

Test set:
MSE: 9.188
MAE: 2.281
R2: 0.812


In [702]:
print(model.named_steps['model'].coef_)

[ 8.42496017e-01 -1.29582123e-01  3.57218999e-01  1.02493382e+00
 -1.10941823e-01  1.35825003e-01  9.66811341e-01 -3.14458992e-01
  5.64195693e-01  1.40353649e+00  1.94661705e+00  1.01387091e-01
  3.28297256e-01  3.57883363e-02 -5.66135336e-02 -3.93849801e-01
  3.05407331e+00  5.95117113e-01  4.91953116e-01  3.31906580e-01
  1.74222629e-02 -4.05566625e-03  1.49453607e-01  6.47365306e-01
  6.59633519e-01 -9.86547806e-02  3.71938546e-01  9.20918668e-01
  1.21379124e-01  9.95135224e-01  2.04740475e-01  1.12865333e+00
  5.47985513e-01 -3.03532696e-01  2.45860713e-02  1.03323492e+00
 -4.35657960e-01  1.73499203e-02 -8.98057711e-01  8.65471549e+00
 -4.69297840e-01 -4.32066398e-02  3.98864258e-02 -9.00885699e-02
 -4.08974270e-01 -2.20748998e-02  3.71021334e-02 -6.06600143e-02
  5.58431163e-02 -1.42650616e-01]


In [703]:
theta = model.named_steps['model'].coef_
sum = 0
for i in range(len(theta)):
    sum += theta[i] * theta[i]
print(sum)

101.34118233096636


In [704]:
cv_evaluate(model, X, y)

test_r2: [0.81379728 0.81703595 0.81916372 0.80804413 0.81403887]
test_r2 trung bình: 0.814 ± 0.004

test_mae: [2.26784574 2.23612344 2.22974851 2.27917046 2.24993326]
test_mae trung bình: 2.253 ± 0.019

test_mse: [9.05422581 8.79921353 8.75433049 9.06628048 9.13064201]
test_mse trung bình: 8.961 ± 0.153



In [705]:
import joblib

joblib.dump(model, 'linear_reg_model.pkl')
print("Mô hình Linear Regression đã được lưu thành công dưới tên 'linear_reg_model.pkl'")

Mô hình Linear Regression đã được lưu thành công dưới tên 'linear_reg_model.pkl'
