In [646]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [647]:
df = pd.read_csv('data.csv')

X = df.drop('overall_rating', axis=1)
y = df['overall_rating']

In [648]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, X, y):
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    mae = mean_absolute_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    return mse, mae, r2

def print_metrics(mse, mae, r2):
    print(f'MSE: {mse:.3f}')
    print(f'MAE: {mae:.3f}')
    print(f'R2: {r2:.3f}')

In [649]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold

def cv_evaluate(model, X, y):
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    scoring = {
        'r2': make_scorer(r2_score),
        'mae': make_scorer(mean_absolute_error),
        'mse': make_scorer(mean_squared_error)
    }
    cv_results = cross_validate(model, X, y, cv=cv, scoring=scoring, return_train_score=False)

    for key in cv_results.keys():
        if key in ['fit_time', 'score_time']:
            continue
        print(f"{key}: {cv_results[key]}")
        print(f"{key} trung bình: {cv_results[key].mean():.3f} ± {cv_results[key].std():.3f}")
        print()

In [650]:
from sklearn.model_selection import train_test_split

#print(X.columns)

X = X.drop(columns=["body_type_Lean", "body_type_Normal", "body_type_Stocky"], errors='ignore')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

## Linear Regression

In [651]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from category_encoders import TargetEncoder

pipeline = Pipeline([
    ('encoder', TargetEncoder(cols=['nationality'])),
    ('scaler', StandardScaler()),
    ('model', LinearRegression())
])
pipeline.fit(X_train, y_train)

In [652]:
mse_train, mae_train, r2_train = evaluate_model(pipeline, X_train, y_train)
print('Training set:')
print_metrics(mse_train, mae_train, r2_train)

mse_test, mae_test, r2_test = evaluate_model(pipeline, X_test, y_test)
print('\nTest set:')
print_metrics(mse_test, mae_test, r2_test)

Training set:
MSE: 8.852
MAE: 2.237
R2: 0.816

Test set:
MSE: 9.208
MAE: 2.280
R2: 0.812


In [653]:
theta = pipeline.named_steps['model'].coef_
columns = X.columns

#temp = pd.DataFrame({'feature': columns, 'theta': theta})
#temp
print(theta)

[ 8.29627351e-01 -7.62715776e-02  2.97902486e-01  1.01470047e+00
 -1.12130420e-01  1.33987031e-01  9.64542287e-01 -3.21448687e-01
  5.65491408e-01  1.43028465e+00  1.96585635e+00  1.05849541e-01
  3.29320364e-01  3.87210697e-02 -6.17148548e-02 -4.09694011e-01
  3.09802330e+00  6.01291316e-01  4.97292257e-01  3.48204983e-01
 -5.48611791e-03 -1.46181898e-02  1.41467300e-01  6.57149519e-01
  6.18538765e-01 -1.02326517e-01  3.53546097e-01  9.22064105e-01
  1.27557858e-01  9.91168862e-01  2.10912180e-01  1.13699872e+00
  5.73841407e-01 -3.12571255e-01  2.64807407e-02  1.04502888e+00
 -4.34333844e-01  2.14775649e-02 -8.94089102e-01  8.71789931e+00
 -4.68453994e-01 -3.98385223e-02  4.04729147e-02 -9.07846850e-02
 -4.11413133e-01 -1.96865279e-02  3.90018893e-02 -5.85170939e-02
  5.37962859e-02]


In [654]:
cv_evaluate(pipeline, X, y)

test_r2: [0.81342371 0.81659877 0.81926601 0.80738416 0.81399566]
test_r2 trung bình: 0.814 ± 0.004

test_mae: [2.26648852 2.23626116 2.22557808 2.28498719 2.24713556]
test_mae trung bình: 2.252 ± 0.021

test_mse: [9.07239081 8.82023878 8.74937859 9.09745143 9.13276338]
test_mse trung bình: 8.974 ± 0.158



# Ridge Regression

In [655]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [656]:
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [657]:
from sklearn.linear_model import Ridge

model = Pipeline([
    ('encoder', TargetEncoder(cols=['nationality'])),
    ('scaler', StandardScaler()),
    ('model', Ridge(alpha=0.3))
])

model.fit(X_train, y_train)

In [658]:
mse_train, mae_train, r2_train = evaluate_model(pipeline, X_train, y_train)
print('Training set:')
print_metrics(mse_train, mae_train, r2_train)

mse_test, mae_test, r2_test = evaluate_model(pipeline, X_test, y_test)
print('\nTest set:')
print_metrics(mse_test, mae_test, r2_test)

Training set:
MSE: 8.852
MAE: 2.237
R2: 0.816

Test set:
MSE: 9.208
MAE: 2.280
R2: 0.812


In [659]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

model = Pipeline([
    ('encoder', TargetEncoder(cols=['nationality'])),
    ('scaler', StandardScaler()),
    ('model', Ridge())
])

param_grid = {
    'model__alpha': [10]
}

grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

In [660]:
grid_search.best_params_, grid_search.best_estimator_
print(grid_search.best_params_['model__alpha'])

10


In [661]:
model = grid_search.best_estimator_

mse_train, mae_train, r2_train = evaluate_model(model, X_train, y_train)
print('Training set:')
print_metrics(mse_train, mae_train, r2_train)
y_pred = model.predict(X_train)

n = len(y_train)
mse_part = (1 / (2 * n)) * np.sum((y_train - y_pred) ** 2)
alpha = grid_search.best_params_['model__alpha']
l2_penalty_part = alpha * np.sum(model.named_steps['model'].coef_ ** 2)

loss = mse_part + l2_penalty_part

print("1/(2n) * ||y - Xw||_2^2 (MSE part):", mse_part)
print("α * ||w||_2^2 (Regularization part):", l2_penalty_part)
print("Total Loss:", loss)

mse_test, mae_test, r2_test = evaluate_model(model, X_test, y_test)
print('\nTest set:')
print_metrics(mse_test, mae_test, r2_test)

Training set:
MSE: 8.852
MAE: 2.238
R2: 0.816
1/(2n) * ||y - Xw||_2^2 (MSE part): 4.426158895806824
α * ||w||_2^2 (Regularization part): 1013.8862137569616
Total Loss: 1018.3123726527684

Test set:
MSE: 9.205
MAE: 2.281
R2: 0.812


In [662]:
print(model.named_steps['model'].coef_)

[ 8.34407208e-01 -7.24211914e-02  3.00361844e-01  1.02060634e+00
 -1.11457359e-01  1.35077285e-01  9.66968309e-01 -3.18816068e-01
  5.61955002e-01  1.41246049e+00  1.96010457e+00  1.07455500e-01
  3.30030538e-01  3.99969591e-02 -6.38022369e-02 -3.96381646e-01
  3.06074133e+00  5.96517116e-01  4.95456663e-01  3.49889789e-01
 -6.90270967e-03 -1.36750985e-02  1.46376691e-01  6.53703779e-01
  6.21713998e-01 -9.89941497e-02  3.52200073e-01  9.24250666e-01
  1.23162932e-01  9.97871164e-01  2.07090874e-01  1.13091472e+00
  5.59636726e-01 -3.06545143e-01  2.43393931e-02  1.03583599e+00
 -4.34696353e-01  2.14598470e-02 -8.95864526e-01  8.65582476e+00
 -4.68030878e-01 -4.14906034e-02  4.03268016e-02 -9.04504632e-02
 -4.11090529e-01 -2.12568512e-02  3.85145770e-02 -5.83202600e-02
  5.17456393e-02]


In [663]:
theta = model.named_steps['model'].coef_
sum = 0
for i in range(len(theta)):
    sum += theta[i] * theta[i]
print(sum)

101.38862137569616


In [664]:
cv_evaluate(model, X, y)

test_r2: [0.81347171 0.81653202 0.81921792 0.80740658 0.81396285]
test_r2 trung bình: 0.814 ± 0.004

test_mae: [2.26704461 2.2373819  2.22684157 2.2855219  2.24826175]
test_mae trung bình: 2.253 ± 0.021

test_mse: [9.07005689 8.82344902 8.75170639 9.09639269 9.13437466]
test_mse trung bình: 8.975 ± 0.156



In [665]:
import joblib

joblib.dump(model, 'linear_reg_model.pkl')
print("Mô hình Linear Regression đã được lưu thành công dưới tên 'linear_reg_model.pkl'")

Mô hình Linear Regression đã được lưu thành công dưới tên 'linear_reg_model.pkl'
