In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# =============================================
# ‚úÖ 1. Load data
# =============================================
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Preserve test IDs for final submission
test_ids = test['id'].copy()

# =============================================
# ‚úÖ 2. Encode target labels
# =============================================
target_col = 'WeightCategory'
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(train[target_col])

# =============================================
# ‚úÖ 3. Prepare features
# =============================================
# Drop target and ID from training set
X = train.drop(columns=[target_col, 'id'])

# Drop ID from test set
test_features = test.drop(columns=['id'])

# =============================================
# ‚úÖ 4. One-hot encode categorical features
# =============================================
X = pd.get_dummies(X)
test_features = pd.get_dummies(test_features)

# Align columns between train and test
X, test_features = X.align(test_features, join='left', axis=1, fill_value=0)

# =============================================
# ‚úÖ 5. Scale features
# =============================================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test_features)

# =============================================
# ‚úÖ 6. Train XGBoost Classifier
# =============================================
from xgboost import XGBClassifier
import numpy as np

# Define number of classes from your label set
num_classes = len(np.unique(y))  # Ensure 'y' is defined earlier

model = XGBClassifier(
    n_estimators=582,
    learning_rate=0.035,  # 'eta' in param dict
    max_depth=9,
    subsample=0.476,
    colsample_bytree=0.55,
    gamma=0.591,
    min_child_weight=2,
    reg_alpha=0.449,
    reg_lambda=2.0,
    random_state=42,
    eval_metric="mlogloss",
    use_label_encoder=False,
    n_jobs=-1
)

print("üöÄ Training XGBoost model...")
model.fit(X_scaled, y)

# =============================================
# ‚úÖ 7. Evaluate Training Accuracy
# =============================================
train_pred = model.predict(X_scaled)
train_accuracy = accuracy_score(y, train_pred)
print(f"‚úÖ Training Accuracy: {train_accuracy * 100:.2f}%")

# =============================================
# ‚úÖ 8. Predict on Test Set
# =============================================
test_pred_numeric = model.predict(test_scaled)
test_pred_labels = label_encoder.inverse_transform(test_pred_numeric)

# =============================================
# ‚úÖ 9. Save Predictions
# =============================================
submission = pd.DataFrame({
    'id': test_ids,
    'WeightCategory': test_pred_labels
})

submission.to_csv('submission.csv', index=False)
print("üìÅ submission.csv saved successfully with ID and WeightCategory columns!")

üöÄ Training XGBoost model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


‚úÖ Training Accuracy: 94.93%
üìÅ submission.csv saved successfully with ID and WeightCategory columns!


In [3]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np # Import numpy

# Split training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

params = {
    "objective": "multi:softprob",
    "num_class": len(np.unique(y_train)),
    "eval_metric": "mlogloss",
    "learning_rate": 0.025,
    "max_depth": 7,
    "subsample": 0.75,
    "colsample_bytree": 0.8,
    "gamma": 0.5,
    "min_child_weight": 3,
    "reg_alpha": 0.4493,
    "reg_lambda": 1.0,
    "random_state": 42
}


model = xgb.train(
    params,
    dtrain,
    num_boost_round=600,
    evals=[(dval, "validation")],
    early_stopping_rounds=20,
    verbose_eval=True
)

# Evaluate validation performance
val_pred_proba = model.predict(dval) # Predict probabilities on DMatrix
val_pred = np.argmax(val_pred_proba, axis=1) # Get predicted class labels from probabilities

val_accuracy = accuracy_score(y_val, val_pred)
print(f"üéØ Validation Accuracy: {val_accuracy * 100:.4f}%")
print("\nüìä Classification Report:\n", classification_report(y_val, val_pred))

[0]	validation-mlogloss:1.88597
[1]	validation-mlogloss:1.82740
[2]	validation-mlogloss:1.77202
[3]	validation-mlogloss:1.72361
[4]	validation-mlogloss:1.67729
[5]	validation-mlogloss:1.63198
[6]	validation-mlogloss:1.59079
[7]	validation-mlogloss:1.55142
[8]	validation-mlogloss:1.51458
[9]	validation-mlogloss:1.47666
[10]	validation-mlogloss:1.44122
[11]	validation-mlogloss:1.40730
[12]	validation-mlogloss:1.37906
[13]	validation-mlogloss:1.35172
[14]	validation-mlogloss:1.32293
[15]	validation-mlogloss:1.29564
[16]	validation-mlogloss:1.27059
[17]	validation-mlogloss:1.24550
[18]	validation-mlogloss:1.22101
[19]	validation-mlogloss:1.19840
[20]	validation-mlogloss:1.17560
[21]	validation-mlogloss:1.15219
[22]	validation-mlogloss:1.12915
[23]	validation-mlogloss:1.10844
[24]	validation-mlogloss:1.08980
[25]	validation-mlogloss:1.06939
[26]	validation-mlogloss:1.05041
[27]	validation-mlogloss:1.03145
[28]	validation-mlogloss:1.01202
[29]	validation-mlogloss:0.99506
[30]	validation-mlog

In [None]:
param_sets = {
    "Tuned C": {
        "n_estimators": 650,
        "learning_rate": 0.02,
        "max_depth": 8,
        "subsample": 0.75,
        "colsample_bytree": 0.8,
        "gamma": 0.4,
        "min_child_weight": 2,
        "reg_alpha": 0.6,
        "reg_lambda": 1.2,
    },
    "Tuned D": {
        "n_estimators": 700,
        "learning_rate": 0.018,
        "max_depth": 7,
        "subsample": 0.7,
        "colsample_bytree": 0.75,
        "gamma": 0.3,
        "min_child_weight": 3,
        "reg_alpha": 0.5,
        "reg_lambda": 1.0,
    },
    "Tuned E": {
        "n_estimators": 600,
        "learning_rate": 0.015,
        "max_depth": 6,
        "subsample": 0.8,
        "colsample_bytree": 0.85,
        "gamma": 0.2,
        "min_child_weight": 1,
        "reg_alpha": 0.4,
        "reg_lambda": 0.8,
    }
}


In [None]:
for name, params in param_sets.items():
    model = XGBClassifier(
        objective="multi:softprob",
        num_class=len(np.unique(y_train)),
        eval_metric="mlogloss",
        use_label_encoder=False,
        random_state=42,
        n_jobs=-1,
        **params
    )
    model.fit(X_train, y_train)
    val_pred = model.predict(X_val)
    acc = accuracy_score(y_val, val_pred)
    print(f"{name} Validation Accuracy: {acc * 100:.4f}%")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Tuned C Validation Accuracy: 90.4409%


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Tuned D Validation Accuracy: 90.6019%


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Tuned E Validation Accuracy: 90.4088%


In [2]:
# =============================================
# ‚úÖ Imports and Setup
# =============================================
!pip install xgboost scikit-learn pandas --quiet

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_sample_weight
import pandas as pd
import numpy as np
import warnings
import os

warnings.filterwarnings("ignore")

print("üìú Final model script initialized.")

# =============================================
# ‚úÖ Load Data
# =============================================
print("Loading data...")
try:
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
except FileNotFoundError:
    print("Error: train.csv or test.csv not found.")
    # In a real script, you might exit here
    # exit()

X = train_df.drop(columns=["WeightCategory", "id"])
y_labels = train_df["WeightCategory"]
X_test = test_df.drop(columns=["id"])
test_ids = test_df["id"].copy()

# =============================================
# ‚úÖ Feature Engineering & Preprocessing
# =============================================
print("‚öôÔ∏è Processing features...")

# --- 1. DEFINE FEATURE ENGINEERING FUNCTION ---
def create_features(df):
    df['BMI'] = df['Weight'] / (df['Height'] ** 2)
    df['Activity_Ratio'] = df['FAF'] / (df['TUE'] + 1e-6)
    df['Veg_per_Meal'] = df['FCVC'] / (df['NCP'] + 1e-6)
    df['Age_sq'] = df['Age'] ** 2
    return df

# --- 2. APPLY FEATURE ENGINEERING ---
print("Creating new features...")
X = create_features(X)
X_test = create_features(X_test)
print("New features created: ['BMI', 'Activity_Ratio', 'Veg_per_Meal', 'Age_sq']")

# --- 3. PREPROCESSING ---
# 3a. Encode Target
le_target = LabelEncoder()
y = le_target.fit_transform(y_labels)
num_classes = len(le_target.classes_)

# 3b. Combine train/test for consistent dummy encoding
combined_df = pd.concat([X, X_test], axis=0)
combined_df = pd.get_dummies(combined_df, drop_first=True)

# 3c. Separate back into train/test
X = combined_df.iloc[:len(X)]
X_test = combined_df.iloc[len(X):]

# 3d. Scale numerical features
numeric_cols = X.select_dtypes(include=np.number).columns
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

# 3e. Ensure column names are strings (for XGBoost)
X.columns = X.columns.astype(str)
X_test.columns = X_test.columns.astype(str)
print("‚úÖ Features processed and scaled.")

# =============================================
# ‚úÖ Train Final Model & Create Submission
# =============================================
print("\nüèÅ Starting final model training...")

# --- 1. Hard-code your BEST parameters from Trial 203 ---
best_params = {
    "objective": "multi:softmax",
    "num_class": num_classes,
    "eval_metric": "mlogloss",
    "verbosity": 1, # Set to 1 to see it train
    "seed": 42,
    "tree_method": "hist",

    # These are from your 'Best is trial 203' log
    "eta": 0.06794898540089923,
    "max_depth": 5,
    "min_child_weight": 1,
    "subsample": 0.7953386026614067,
    "colsample_bytree": 0.6417557307124112,
    "gamma": 0.6758030233248484,
    "alpha": 0.11467350881393687,
    "lambda": 1.407102951104466
}

# This was the 'n_estimators' *budget* for your best trial.
# We will use this as the number of trees for the final model.
best_n_estimators = 783

print("\nFinal Model Parameters:")
for k, v in best_params.items():
    print(f"  {k}: {v}")
print(f"  n_estimators (num_boost_round): {best_n_estimators}")

# --- 2. Train on FULL Data ---
print("\nüèãÔ∏è Training final model on ALL data...")
# Create DMatrix on the *entire* training set
dtrain_full = xgb.DMatrix(X, label=y, weight=compute_sample_weight(class_weight='balanced', y=y))
dtest = xgb.DMatrix(X_test)

final_model_bst = xgb.train(
    best_params,
    dtrain_full,
    num_boost_round=best_n_estimators, # Use the number of estimators from your best trial
    verbose_eval=50 # Print progress every 50 trees
)

# --- 3. Generate Predictions ---
print("\nüìä Generating predictions...")
predictions_numeric = final_model_bst.predict(dtest).astype(int)

# Decode predictions back to original labels
predictions_labels = le_target.inverse_transform(predictions_numeric)

submission = pd.DataFrame({
    "id": test_ids,
    "WeightCategory": predictions_labels
})
submission_filename = "submission_FINAL_v1.csv"
submission.to_csv(submission_filename, index=False)

print(f"üìÅ {submission_filename} saved successfully! Submit this file.")

üìú Final model script initialized.
Loading data...
‚öôÔ∏è Processing features...
Creating new features...
New features created: ['BMI', 'Activity_Ratio', 'Veg_per_Meal', 'Age_sq']
‚úÖ Features processed and scaled.

üèÅ Starting final model training...

Final Model Parameters:
  objective: multi:softmax
  num_class: 7
  eval_metric: mlogloss
  verbosity: 1
  seed: 42
  tree_method: hist
  eta: 0.06794898540089923
  max_depth: 5
  min_child_weight: 1
  subsample: 0.7953386026614067
  colsample_bytree: 0.6417557307124112
  gamma: 0.6758030233248484
  alpha: 0.11467350881393687
  lambda: 1.407102951104466
  n_estimators (num_boost_round): 783

üèãÔ∏è Training final model on ALL data...

üìä Generating predictions...
üìÅ submission_FINAL_v1.csv saved successfully! Submit this file.
