In [3]:
from tools import *

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score


# Hyperparameters for the XGBoost classifier. The dict is unpacked as keyword
# arguments wherever an xgb.XGBClassifier(**params) is constructed.
params = dict(
    objective="multi:softmax",  # multiclass objective
    num_class=len(np.unique(y_train)),  # count of distinct labels in y_train
    eval_metric="mlogloss",  # multiclass log loss
    learning_rate=0.1,
    max_depth=6,
    n_estimators=300,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

# K-Fold Cross Validation (Stratified)
# Five shuffled, stratified folds; the fixed random_state keeps the split
# reproducible between runs.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# kf.split yields *positional* row indices. X_train is sliced with .iloc, so
# the targets must be sliced positionally as well: if y_train is a pandas
# Series, y_train[idx] is a label-based lookup and selects the wrong rows (or
# raises KeyError) whenever its index is not 0..n-1. Going through a plain
# array makes the lookup positional for both ndarray and Series targets.
y_train_values = np.asarray(y_train)

# One accuracy value per fold.
cv_scores = []
for train_idx, val_idx in kf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train_values[train_idx], y_train_values[val_idx]

    # Train a fresh XGBoost model on this fold's training rows
    model = xgb.XGBClassifier(**params)
    model.fit(X_train_fold, y_train_fold)

    # Score the model on the held-out rows of this fold
    y_val_pred = model.predict(X_val_fold)
    acc = accuracy_score(y_val_fold, y_val_pred)
    cv_scores.append(acc)

# Report mean and standard deviation of the per-fold accuracies.
print(f"Cross-validation accuracy: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}")

# Fit one more classifier, this time on all of X_train / y_train
# (fit returns the estimator itself, so construction and fitting are chained).
final_model = xgb.XGBClassifier(**params).fit(X_train, y_train)

# Predicted class codes for the test features, then mapped back through
# label_encoder to the original label values.
y_test_pred = final_model.predict(X_test)
y_test_pred_labels = label_encoder.inverse_transform(y_test_pred)

# Two-column submission table: "id" taken from test_raw's index, "y" holding
# the decoded predictions. Written without the DataFrame's own index column.
submission = pd.DataFrame(
    {
        "id": test_raw.index,
        "y": y_test_pred_labels,
    }
)
submission.to_csv("submission.csv", index=False)

print("Submission file saved as submission.csv 🚀")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[bool_cols] = X_train[bool_cols].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = le.fit_transform(X_train[col])


Cross-validation accuracy: 0.6752 ± 0.0157
Submission file saved as submission.csv 🚀
