In [1]:
from tools import *

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[bool_cols] = X_train[bool_cols].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = le.fit_transform(X_train[col])


In [2]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Assume train_raw and test_raw are pre-loaded DataFrames
# (this cell does not read them from disk itself; presumably they arrive via
# `from tools import *` -- TODO confirm).

# Define features and target: inputs are the columns x1 .. x13, the label is 'y'.
features = ["x" + str(idx) for idx in range(1, 14)]
y_train = train_raw['y'].copy()
# Work on copies so the column rewrites below never touch the raw frames.
# X_test is the Kaggle test set (10,000 datapoints per the original note).
X_train, X_test = (raw_frame[features].copy() for raw_frame in (train_raw, test_raw))

# Encode target variable as integer class ids. The fitted encoder is kept so
# predicted ids can later be translated back to the original labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Preprocessing: Convert booleans to integers
# Boolean columns are detected on the training frame only; the same column list
# is then applied to the test frame so both frames get identical treatment.
bool_cols = X_train.select_dtypes(include=['bool']).columns
X_train[bool_cols] = X_train[bool_cols].astype(int)
X_test[bool_cols] = X_test[bool_cols].astype(int)

# Preprocessing: Encode categorical features (if any)
# Each object-dtype column gets its own encoder, fitted on the training data.
cat_cols = X_train.select_dtypes(include=['object']).columns
for col in cat_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    # LabelEncoder.transform raises ValueError on any label it did not see
    # during fit, which would abort the whole run if the test set contains a
    # category absent from training. Look the codes up in the fitted classes
    # instead: seen categories get exactly the code transform() would give,
    # unseen ones are sent to the sentinel -1.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    X_test[col] = X_test[col].map(code_by_label).fillna(-1).astype(int)

# Define individual models
# XGBoost hyperparameters are gathered in one mapping and unpacked into the
# constructor; 'multi:softprob' makes the booster output per-class probabilities.
xgb_params = dict(
    objective='multi:softprob',
    learning_rate=0.1,
    max_depth=6,
    n_estimators=300,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
xgb_model = XGBClassifier(**xgb_params)
gnb_model = GaussianNB()

# Create ensemble VotingClassifier with equal weights
# Soft voting averages the members' predicted probabilities; with weights
# [1, 1] both members contribute equally.
base_estimators = [('xgb', xgb_model), ('gnb', gnb_model)]
ensemble_model = VotingClassifier(
    estimators=base_estimators,
    voting='soft',
    weights=[1, 1],
)

# Evaluate the ensemble with 5-fold Stratified Cross-Validation
# Folds are shuffled with a fixed seed; one accuracy value is collected per fold.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for fit_rows, holdout_rows in kf.split(X_train, y_train_encoded):
    # X_train is a DataFrame (positional .iloc), y_train_encoded is indexed
    # directly with the same positional row numbers.
    ensemble_model.fit(X_train.iloc[fit_rows], y_train_encoded[fit_rows])
    holdout_pred = ensemble_model.predict(X_train.iloc[holdout_rows])
    cv_scores.append(accuracy_score(y_train_encoded[holdout_rows], holdout_pred))

# Report mean and standard deviation of the per-fold accuracies.
print(f"Ensemble Model CV Accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

# Train the final ensemble model on the full training set
# (this refits the same ensemble object that was used inside the CV loop).
ensemble_model.fit(X_train, y_train_encoded)

# Predict on the test set and convert predictions back to original labels
# via the encoder that was fitted on the training target.
y_test_pred = ensemble_model.predict(X_test)
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)

# Save predictions for Kaggle submission
# The "id" column is taken from X_test's index -- NOTE(review): confirm this
# matches the id format the competition expects.
submission_columns = {"id": X_test.index, "y": y_test_pred_labels}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission_ensemble.csv", index=False)
print("Submission file saved as submission_ensemble.csv")


Ensemble Model CV Accuracy: 0.6848 ± 0.0139
Submission file saved as submission_ensemble.csv
