In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, RandomizedSearchCV, train_test_split

# Load the dataset and separate the regression target from the feature columns.
df = pd.read_csv('data.csv')
y = df['overall_rating']
X = df.drop(columns=['overall_rating'])

# Hold out 15% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [2]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, X, y):
    """Score ``model`` on the samples ``X`` against the true targets ``y``.

    Predictions are obtained with ``model.predict(X)``.

    Returns:
        A ``(mse, mae, r2)`` tuple: mean squared error, mean absolute error
        and R² score, in that order.
    """
    predictions = model.predict(X)
    return (
        mean_squared_error(y, predictions),
        mean_absolute_error(y, predictions),
        r2_score(y, predictions),
    )

def print_metrics(mse, mae, r2):
    """Print the three regression metrics, one per line, with three decimals.

    Args:
        mse: Mean squared error.
        mae: Mean absolute error.
        r2: R² score.
    """
    labelled_values = (('MSE', mse), ('MAE', mae), ('R2', r2))
    for label, value in labelled_values:
        print(f'{label}: {value:.3f}')

In [3]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold

def cv_evaluate(model, X, y):
    """Cross-validate ``model`` on ``(X, y)`` and print the per-fold test scores.

    Uses 5 shuffled folds with a fixed seed and reports R², MAE and MSE. For
    each metric the raw per-fold array is printed, followed by its mean and
    standard deviation. Timing entries returned by ``cross_validate`` are
    skipped. Nothing is returned.
    """
    # Keep the r2 / mae / mse order: it determines the order of the printed report.
    scorers = {
        'r2': make_scorer(r2_score),
        'mae': make_scorer(mean_absolute_error),
        'mse': make_scorer(mean_squared_error),
    }
    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    results = cross_validate(model, X, y, cv=folds, scoring=scorers, return_train_score=False)

    timing_keys = ('fit_time', 'score_time')
    for name, values in results.items():
        if name not in timing_keys:
            print(f"{name}: {values}")
            print(f"{name} trung bình: {values.mean():.3f} ± {values.std():.3f}")
            print()

In [4]:
# Baseline model: target-encode `nationality`, standardise the features, then
# fit an SVR with its default hyperparameters on the training split.
pipeline = Pipeline(steps=[
    ('te', TargetEncoder(cols=['nationality'])),
    ('scaler', StandardScaler()),
    ('svr', SVR()),
])
pipeline.fit(X=X_train, y=y_train)

In [5]:
# Baseline metrics on the training split (the data the pipeline was fitted on).
print_metrics(*evaluate_model(model=pipeline, X=X_train, y=y_train))

In [6]:
# Baseline metrics on the held-out test split.
print_metrics(*evaluate_model(model=pipeline, X=X_test, y=y_test))

In [None]:
# 5-fold cross-validation of the baseline pipeline over the full dataset.
cv_evaluate(model=pipeline, X=X, y=y)

## Use cross-validation on the training set to optimize hyperparameters

In [7]:
# Recreate the same 85/15 split (identical seed) so this section can run on its own.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)
# A separate validation split is currently disabled:
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# Fresh, unfitted pipeline to be tuned by the search below.
pipeline = Pipeline(steps=[
    ('te', TargetEncoder(cols=['nationality'])),
    ('scaler', StandardScaler()),
    ('svr', SVR()),
])

In [8]:
# Hyperparameter grid for the SVR step.
# `gamma` only affects the rbf and poly kernels; the linear kernel ignores it.
# Crossing gamma with 'linear' would therefore fit every linear candidate twice
# with identical results, so the grid is split into two sub-grids. This cuts
# the search from 18 to 15 candidates without dropping any distinct model.
# Note: when a linear candidate wins, `best_params_` has no 'svr__gamma' key.
param_grid = [
    {
        'svr__kernel': ['rbf', 'poly'],
        'svr__C': [0.1, 1, 10],
        'svr__gamma': ['scale', 'auto'],
    },
    {
        'svr__kernel': ['linear'],
        'svr__C': [0.1, 1, 10],
    },
]

# Shuffled 5-fold CV with a fixed seed so every candidate sees the same folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Scores are negative MSE (higher is better); all CPU cores are used.
grid_search = GridSearchCV(pipeline, param_grid, cv=cv, n_jobs=-1, verbose=1, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [9]:
# The search maximises negative MSE, so the sign is flipped to report the MSE itself.
print(f'Best params: {grid_search.best_params_}')
print(f'Best score: {-grid_search.best_score_}')

In [10]:
# GridSearchCV is created with the default refit=True, so `best_estimator_` has
# already been refitted on all of X_train with the winning parameters. Fitting
# it again on the same data would only repeat that (expensive) work.
final_model = grid_search.best_estimator_

# Report the tuned pipeline's metrics on the training and held-out test splits.
print("=== Đánh Giá Mô Hình SVR Sau Khi GridSearch ===")
print("Training Set:")
print_metrics(*evaluate_model(final_model, X_train, y_train))
print("\nTest Set:")
print_metrics(*evaluate_model(final_model, X_test, y_test))

In [18]:
# 5-fold cross-validation of the tuned pipeline over the full dataset.
cv_evaluate(model=final_model, X=X, y=y)

In [11]:
# Pull the SVR step out of the tuned pipeline and display it.
svr = final_model.named_steps.svr
svr

In [12]:
import joblib

# Persist the SVR step on its own.
# NOTE(review): the fitted target encoder and scaler are not part of this file,
# so it cannot score raw rows by itself — confirm that this is intended.
joblib.dump(svr, filename='svr_model.pkl')
print("Mô hình SVR đã được lưu thành công dưới tên 'svr_model.pkl'")

In [13]:
import joblib

# Read the SVR estimator back from the file written above.
svr = joblib.load(filename='svr_model.pkl')
print("Mô hình SVR đã được tải lại thành công.")

In [14]:
# Wrap the reloaded SVR in a new preprocessing pipeline. Fitting the pipeline
# fits the encoder and scaler and refits the SVR step on the training split.
pipeline = Pipeline(steps=[
    ('te', TargetEncoder(cols=['nationality'])),
    ('scaler', StandardScaler()),
    ('svr', svr),
])
pipeline.fit(X=X_train, y=y_train)

In [15]:
# Predict the held-out rows and place the predictions next to the true ratings.
y_pred = pipeline.predict(X_test)

predictions_df = pd.DataFrame(data=dict(Actual=y_test, Predicted=y_pred))

In [16]:
# Replace the original row labels with a fresh 0..n-1 index (no sorting is
# applied, despite the variable name) and show the first 20 rows.
sorted_predictions_df = predictions_df.reset_index(drop=True)
print(sorted_predictions_df.iloc[:20])

# Bayesian optimization with Optuna

In [6]:
import optuna

def objective(trial):
    """Optuna objective: mean 5-fold CV R² of the SVR pipeline on the training split.

    Each trial suggests a float ``C`` in [0.1, 15] and a ``gamma`` from
    {'scale', 'auto'}; the kernel is not tuned and stays at the SVR default.
    ``X_train`` and ``y_train`` are read from the notebook's global scope.

    Returns:
        The mean R² over the five folds (to be maximised by the study).
    """
    # C is suggested before gamma, matching the order trials record them in.
    candidate = SVR(
        C=trial.suggest_float('C', 0.1, 15),
        gamma=trial.suggest_categorical('gamma', ['scale', 'auto']),
    )
    model = Pipeline(steps=[
        ('te', TargetEncoder(cols=['nationality'])),
        ('scaler', StandardScaler()),
        ('svr', candidate),
    ])

    folds = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = cross_val_score(model, X_train, y_train, cv=folds, scoring='r2')
    return fold_scores.mean()

# Seed the sampler so that re-running the notebook explores the same sequence
# of trials, in line with the fixed random_state=42 used for the data split and
# the CV folds. TPESampler is the sampler Optuna uses by default for a
# single-objective study, so only the seeding changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Trials run sequentially (default n_jobs=1), which keeps the seeded search deterministic.
study.optimize(objective, n_trials=100)

print(f'Best params: {study.best_params}')

In [None]:
# Names of the numeric-dtype feature columns of X.
numerical_features = X.select_dtypes(include='number').columns

In [7]:
# Build an SVR from the study's best parameters and fit the full pipeline
# (encoder -> scaler -> SVR) on the training split.
best_svr = SVR(**study.best_params)

pipeline = Pipeline(steps=[
    ('te', TargetEncoder(cols=['nationality'])),
    ('scaler', StandardScaler()),
    ('svr', best_svr),
])
pipeline.fit(X=X_train, y=y_train)

In [4]:

# Same pipeline with hard-coded SVR hyperparameters.
# NOTE(review): C presumably comes from an earlier Optuna run — confirm.
pipeline = Pipeline(steps=[
    ('te', TargetEncoder(cols=['nationality'])),
    ('scaler', StandardScaler()),
    ('svr', SVR(C=12.926006289958659, gamma='scale')),
])
pipeline.fit(X=X_train, y=y_train)

In [5]:
# Metrics of the current pipeline on the data it was fitted on ...
print('Training Set:')
print_metrics(*evaluate_model(model=pipeline, X=X_train, y=y_train))

# ... and on the held-out test split.
print('\nTest Set:')
print_metrics(*evaluate_model(model=pipeline, X=X_test, y=y_test))

In [11]:
import pickle

# Look up the two preprocessing steps of the pipeline by their step names.
target_encoder, scaler = (pipeline.named_steps[name] for name in ('te', 'scaler'))

# Bundle them as an (encoder, scaler) tuple and pickle the pair to disk.
preprocessor = (target_encoder, scaler)
with open('preprocessor.pkl', mode='wb') as f:
    pickle.dump(preprocessor, file=f)

print("Preprocessor đã được lưu thành công dưới tên 'preprocessor.pkl'")

In [9]:
import pickle
# Pickle the whole pipeline (encoder, scaler and SVR) into a single file.
with open('pipeline.pkl', mode='wb') as f:
    pickle.dump(pipeline, file=f)

print("Mô hình SVR đã được lưu thành công dưới tên 'pipeline.pkl'")

In [8]:
# Metrics of the current pipeline on the data it was fitted on ...
print('Training Set:')
print_metrics(*evaluate_model(model=pipeline, X=X_train, y=y_train))

# ... and on the held-out test split.
print('\nTest Set:')
print_metrics(*evaluate_model(model=pipeline, X=X_test, y=y_test))

In [9]:
# 5-fold cross-validation of the current pipeline over the full dataset.
cv_evaluate(model=pipeline, X=X, y=y)