In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('data.csv')

X = df.drop('overall_rating', axis=1)
y = df['overall_rating']

In [3]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    return mse, mae, r2

def print_metrics(mse, mae, r2):
    print(f'MSE: {mse:.3f}')
    print(f'MAE: {mae:.3f}')
    print(f'R2: {r2:.3f}')

In [4]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold

def cv_evaluate(model, X, y):
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    scoring = {
        'r2': make_scorer(r2_score),
        'mae': make_scorer(mean_absolute_error),
        'mse': make_scorer(mean_squared_error)
    }
    cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring, return_train_score=False)

    for key in cv_results.keys():
        if key in ['fit_time', 'score_time']:
            continue
        print(f"{key}: {cv_results[key]}")
        print(f"{key} trung bình: {cv_results[key].mean():.3f} ± {cv_results[key].std():.3f}")
        print()

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

## Linear Regression

In [6]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from category_encoders import TargetEncoder

pipeline = Pipeline([
    ('encoder', TargetEncoder(cols=['nationality'])),
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])
pipeline.fit(X_train, y_train)

In [7]:
mse_train, mae_train, r2_train = evaluate_model(pipeline, X_train, y_train)
print('Training set:')
print_metrics(mse_train, mae_train, r2_train)

mse_test, mae_test, r2_test = evaluate_model(pipeline, X_test, y_test)
print('\nTest set:')
print_metrics(mse_test, mae_test, r2_test)

Training set:
MSE: 9.079
MAE: 2.266
R2: 0.812

Test set:
MSE: 9.349
MAE: 2.308
R2: 0.809


In [8]:
theta = pipeline.named_steps['model'].coef_
columns = X.columns

temp = pd.DataFrame({'feature': columns, 'theta': theta})
temp

Unnamed: 0,feature,theta
0,age,0.8803261
1,height_cm,-0.06508491
2,weight_kgs,0.3615221
3,nationality,1.023738
4,preferred_foot,-0.1294101
5,weak_foot(1-5),0.1518344
6,skill_moves(1-5),0.9166298
7,crossing,-0.3602333
8,finishing,0.6105369
9,heading_accuracy,1.335366


In [9]:
theta

array([ 8.80326113e-01, -6.50849110e-02,  3.61522086e-01,  1.02373776e+00,
       -1.29410103e-01,  1.51834358e-01,  9.16629819e-01, -3.60233315e-01,
        6.10536856e-01,  1.33536595e+00,  1.91337923e+00,  1.03015556e-01,
        3.54405623e-02,  3.11224009e-03, -1.08027129e-01, -2.85039778e-01,
        2.47607305e+00,  6.98495984e-01,  4.40867421e-01,  4.32161434e-01,
        1.21639898e-01, -2.18483403e-02,  1.84182789e-01,  5.08584926e-01,
        7.35255374e-01, -8.74040119e-02,  3.43968850e-01,  1.02069986e+00,
        1.93101747e-01,  9.62865238e-01,  3.16726898e-01,  1.10728338e+00,
        5.15569248e-01, -3.11069355e-01,  6.36720894e+11,  6.54886920e+11,
        3.17801231e+11,  2.14979420e-02,  1.00917449e+00, -4.53002982e-01,
        1.50359753e-02, -9.13116605e-01,  7.66116599e+00, -4.61570711e-01,
       -4.89653955e-02,  4.22309624e-02, -8.96530258e-02, -3.95066433e-01,
       -2.41174645e-02,  3.92449049e-02, -6.00078814e-02,  3.51667593e-02])

In [10]:
sum = 0
for i in range(len(theta)):
    sum += theta[i] * theta[i]
print(sum)

9.352879977381671e+23


In [11]:
cv_evaluate(pipeline, X, y)

test_r2: [0.81092911 0.80917166 0.81567524 0.80135524 0.81027756]
test_r2 trung bình: 0.809 ± 0.005

test_mae: [2.29074675 2.27327367 2.2482197  2.31870906 2.27745167]
test_mae trung bình: 2.282 ± 0.023

test_mse: [9.19369242 9.17742781 8.92320872 9.38220398 9.31532153]
test_mse trung bình: 9.198 ± 0.157



# Ridge Regression

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [13]:
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [14]:
from sklearn.linear_model import Ridge

model = Pipeline([
    ('encoder', TargetEncoder(cols=['nationality'])),
    ('scaler', StandardScaler()),
    ('ridge', Ridge(alpha=0.3))
])

model.fit(X_train, y_train)

In [15]:
mse_train, mae_train, r2_train = evaluate_model(pipeline, X_train, y_train)
print('Training set:')
print_metrics(mse_train, mae_train, r2_train)

mse_test, mae_test, r2_test = evaluate_model(pipeline, X_test, y_test)
print('\nTest set:')
print_metrics(mse_test, mae_test, r2_test)

Training set:
MSE: 9.079
MAE: 2.266
R2: 0.812

Test set:
MSE: 9.349
MAE: 2.308
R2: 0.809


In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

model = Pipeline([
    ('encoder', TargetEncoder(cols=['nationality'])),
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])

param_grid = {
    'ridge__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10]
}

grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [17]:
grid_search.best_params_, grid_search.best_estimator_

({'ridge__alpha': 0.0001},
 Pipeline(steps=[('encoder', TargetEncoder(cols=['nationality'])),
                 ('scaler', StandardScaler()), ('ridge', Ridge(alpha=0.0001))]))

In [18]:
model = grid_search.best_estimator_

mse_train, mae_train, r2_train = evaluate_model(model, X_train, y_train)
print('Training set:')
print_metrics(mse_train, mae_train, r2_train)

mse_test, mae_test, r2_test = evaluate_model(model, X_test, y_test)
print('\nTest set:')
print_metrics(mse_test, mae_test, r2_test)

Training set:
MSE: 9.079
MAE: 2.266
R2: 0.812

Test set:
MSE: 9.349
MAE: 2.308
R2: 0.809


In [19]:
cv_evaluate(model, X, y)

test_r2: [0.81092661 0.80914516 0.81567738 0.80137169 0.8109129 ]
test_r2 trung bình: 0.810 ± 0.005

test_mae: [2.29069925 2.2736317  2.24819636 2.31862866 2.27001986]
test_mae trung bình: 2.280 ± 0.023

test_mse: [9.19381399 9.17870223 8.92310474 9.38142716 9.28412609]
test_mse trung bình: 9.192 ± 0.153



In [20]:
theta = pipeline.named_steps['model'].coef_
columns = X.columns

temp = pd.DataFrame({'feature': columns, 'theta': theta})
temp

Unnamed: 0,feature,theta
0,age,0.8803261
1,height_cm,-0.06508491
2,weight_kgs,0.3615221
3,nationality,1.023738
4,preferred_foot,-0.1294101
5,weak_foot(1-5),0.1518344
6,skill_moves(1-5),0.9166298
7,crossing,-0.3602333
8,finishing,0.6105369
9,heading_accuracy,1.335366


In [21]:
import joblib

joblib.dump(model, 'linear_reg_model.pkl')
print("Mô hình Linear Regression đã được lưu thành công dưới tên 'linear_reg_model.pkl'")

Mô hình Linear Regression đã được lưu thành công dưới tên 'linear_reg_model.pkl'
