# Ensemble & Evaluation

Thực hiện Voting, Stacking và tạo file `submission.csv`


In [1]:
# Import thư viện
import pandas as pd, numpy as np, pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import VotingClassifier, StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


## 1. Load dữ liệu và mô hình tốt nhất


In [2]:
# Load train/test
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Tiền xử lý tương tự như phần trước
train["Age"].fillna(train["Age"].median(), inplace=True)
train["Embarked"].fillna(train["Embarked"].mode()[0], inplace=True)
test["Age"].fillna(train["Age"].median(), inplace=True)
test["Fare"].fillna(train["Fare"].median(), inplace=True)
test["Embarked"].fillna(train["Embarked"].mode()[0], inplace=True)

le_sex, le_embarked = LabelEncoder(), LabelEncoder()
train["Sex"] = le_sex.fit_transform(train["Sex"])
test["Sex"] = le_sex.transform(test["Sex"])
train["Embarked"] = le_embarked.fit_transform(train["Embarked"])
test["Embarked"] = le_embarked.transform(test["Embarked"])

X = train.drop(["Survived", "Name", "Ticket", "Cabin", "PassengerId"], axis=1)
y = train["Survived"]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Chuẩn hóa test
X_test = test.drop(["Name", "Ticket", "Cabin", "PassengerId"], axis=1)
X_test = scaler.transform(X_test)

# Load best_model.pkl
with open("best_model.pkl", "rb") as f:
    best_model = pickle.load(f)
print("✅ Loaded best model:", best_model.__class__.__name__)


✅ Loaded best model: SVC


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["Age"].fillna(train["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train["Embarked"].fillna(train["Embarked"].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object

## 2. Ensemble models (Voting + Stacking)


In [3]:
# Base models
rf = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)
xgb = XGBClassifier(use_label_encoder=False, eval_metric="logloss", n_estimators=200)
svm = SVC(probability=True)
logreg = LogisticRegression(max_iter=1000)

# Voting (soft)
voting = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb), ('svm', svm), ('lr', logreg)],
    voting='soft'
)

# Stacking
stacking = StackingClassifier(
    estimators=[('rf', rf), ('xgb', xgb), ('svm', svm)],
    final_estimator=LogisticRegression(max_iter=1000)
)


## 3. Huấn luyện và đánh giá mô hình


In [4]:
models = {
    "Best_Model": best_model,
    "Voting": voting,
    "Stacking": stacking
}

results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    acc = accuracy_score(y_val, preds)
    results[name] = acc
    print(f"{name}: {acc:.4f}")

best_ens_model = max(results, key=results.get)
print(f"\n🏆 Best ensemble model: {best_ens_model} → {results[best_ens_model]:.4f}")


Best_Model: 0.8156


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Voting: 0.8268


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Stacking: 0.8101

🏆 Best ensemble model: Voting → 0.8268


## 4. Xuất submission.csv


In [5]:
final_model = models[best_ens_model]
final_preds = final_model.predict(X_test)

submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": final_preds
})
submission.to_csv("submission.csv", index=False)

print("✅ Saved submission.csv")
submission.head()


✅ Saved submission.csv


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
