# Hyperparameter Tuning

Tối ưu siêu tham số cho các mô hình baseline bằng GridSearchCV và RandomizedSearchCV.


In [1]:
# Import thư viện
import pandas as pd, numpy as np, pickle
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


## 1. Load & Tiền xử lý dữ liệu


In [2]:
train = pd.read_csv("train.csv")

# Xử lý thiếu
train["Age"].fillna(train["Age"].median(), inplace=True)
train["Embarked"].fillna(train["Embarked"].mode()[0], inplace=True)

# Mã hóa biến phân loại
train["Sex"] = LabelEncoder().fit_transform(train["Sex"])
train["Embarked"] = LabelEncoder().fit_transform(train["Embarked"])

# Chia đặc trưng và nhãn
X = train.drop(["Survived", "Name", "Ticket", "Cabin", "PassengerId"], axis=1)
y = train["Survived"]

# Chia train/validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Chuẩn hóa dữ liệu
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["Age"].fillna(train["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["Embarked"].fillna(train["Embarked"].mode()[0], inplace=True)


## 2. Định nghĩa mô hình & tham số tuning


In [3]:
models = {
    "LogReg": (LogisticRegression(max_iter=1000), {"C": [0.1, 1, 10]}),
    "RF": (RandomForestClassifier(), {"n_estimators": [100, 200], "max_depth": [5, 10, None]}),
    "SVM": (SVC(), {"C": [0.5, 1, 10], "kernel": ["rbf", "linear"]}),
    "XGB": (XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
            {"n_estimators": [100, 200], "max_depth": [3, 5], "learning_rate": [0.05, 0.1]})
}


## 3. Tuning mô hình bằng RandomizedSearchCV


In [4]:
results = {}
best_model, best_score = None, 0

for name, (model, params) in models.items():
    print(f"Tuning {name} ...")
    gs = RandomizedSearchCV(model, params, n_iter=5, cv=3, scoring="accuracy", n_jobs=-1)
    gs.fit(X_train, y_train)
    pred = gs.predict(X_val)
    acc = accuracy_score(y_val, pred)
    results[name] = acc
    print(f"→ {name} accuracy: {acc:.4f}")
    
    if acc > best_score:
        best_score, best_model = acc, gs.best_estimator_

print("\n📊 Tổng hợp kết quả:")
for k, v in results.items():
    print(f"{k}: {v:.4f}")

print("\n✅ Best model:", best_model.__class__.__name__, "→", round(best_score, 4))


Tuning LogReg ...




→ LogReg accuracy: 0.7989
Tuning RF ...
→ RF accuracy: 0.8045
Tuning SVM ...
→ SVM accuracy: 0.8156
Tuning XGB ...
→ XGB accuracy: 0.8156

📊 Tổng hợp kết quả:
LogReg: 0.7989
RF: 0.8045
SVM: 0.8156
XGB: 0.8156

✅ Best model: SVC → 0.8156


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


## 4. Lưu mô hình tốt nhất


In [5]:
with open("best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)

print("✅ Saved best model → best_model.pkl")


✅ Saved best model → best_model.pkl
