In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

In [None]:
# Target encoding: 'Adult' -> 0, 'Senior' -> 1
label_codes = {'Adult': 0, 'Senior': 1}

# Read the training CSV, keep only rows that have a target value,
# and replace the string target with its numeric code
train_data = (
    pd.read_csv('/content/Train_Data.csv')
    .dropna(subset=['age_group'])
    .assign(age_group=lambda frame: frame['age_group'].map(label_codes))
)

# Read the test CSV as-is
test_data = pd.read_csv('/content/Test_Data.csv')

# Model inputs, in the column order used for training and prediction
features = ['RIAGENDR', 'PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT', 'LBXIN']
categorical_features = ['RIAGENDR', 'PAQ605', 'DIQ010']
# Every feature that is not listed as categorical is handled as continuous
continuous_features = [name for name in features if name not in categorical_features]

In [None]:
# Impute missing feature values WITHOUT looking at the target.
#
# Filling training rows with per-class statistics (keyed on `age_group`) writes
# the label into the features: a model can learn "value == Senior median" as a
# class marker. The test set cannot be filled that way because its class is
# unknown, so train and test would end up with different feature distributions,
# and any validation rows drawn from `train_data` would look easier than the
# real test rows. Using one label-free fill value per column for both frames
# avoids all of that.
#
# Fill values are computed once, from the observed (non-missing) training
# values, before anything is filled in.
fill_values = {}

for col in categorical_features:
    # mode() ignores NaN; it is empty when the column has no observed values
    observed_modes = train_data[col].mode()
    if not observed_modes.empty:
        fill_values[col] = observed_modes.iloc[0]

for col in continuous_features:
    # median() ignores NaN and returns NaN when nothing is observed
    median_val = train_data[col].median()
    if pd.notna(median_val):
        fill_values[col] = median_val

# Apply the same training-derived fill value to both frames.
# Columns with no usable statistic are left untouched rather than raising.
for col, fill_val in fill_values.items():
    train_data[col] = train_data[col].fillna(fill_val)
    test_data[col] = test_data[col].fillna(fill_val)

In [None]:
# Hold out 30% of the labelled rows for validation, stratified on the target
# so both splits keep the same class proportions
X, y = train_data[features], train_data['age_group']
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Standardise the features: the scaler is fitted on the training split only,
# then the same fitted transform is applied to validation and test features
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(test_data[features])

# Two-stage rebalancing, applied to the scaled training split only
# (validation and test data are never resampled).
#   1. Randomly undersample the majority class (adults) to a 0.8 sampling ratio.
#   2. Oversample the minority class (seniors) with SMOTE up to a 1.0 ratio.
rus = RandomUnderSampler(random_state=42, sampling_strategy=0.8)
smote = SMOTE(random_state=42, sampling_strategy=1.0)

X_res, y_res = rus.fit_resample(X_train_scaled, y_train)
X_bal, y_bal = smote.fit_resample(X_res, y_res)

# Train XGBoost on the rebalanced training data (X_bal, y_bal).
# scale_pos_weight is deliberately left at its default: the training data was
# already resampled to a 1:1 class ratio, so no extra class weighting is applied.
# `use_label_encoder` is no longer passed -- the installed XGBoost ignores it and
# warns: Parameters: { "use_label_encoder" } are not used.
xgb_clf = XGBClassifier(eval_metric='auc', random_state=42)
xgb_clf.fit(X_bal, y_bal)

# Decision threshold on the predicted Senior probability (column 1, since
# 'Senior' is encoded as 1). A row is labelled Senior only above this value.
threshold = 0.8

# Evaluate on the validation split, which was scaled but never resampled
probs = xgb_clf.predict_proba(X_val_scaled)[:, 1]
val_preds_custom = (probs > threshold).astype(int)
# classification_report is already imported in the notebook's import cell
print(classification_report(y_val, val_preds_custom, target_names=['Adult', 'Senior']))



              precision    recall  f1-score   support

       Adult       0.88      0.83      0.85       492
      Senior       0.32      0.43      0.36        94

    accuracy                           0.76       586
   macro avg       0.60      0.63      0.61       586
weighted avg       0.79      0.76      0.77       586



Parameters: { "use_label_encoder" } are not used.



In [None]:
# Predict the Senior (class 1) probability for every test row
test_probs = xgb_clf.predict_proba(X_test_scaled)[:, 1]

# Same 0.8 decision threshold that was applied to the validation predictions
threshold = 0.8
test_preds = (test_probs > threshold).astype(int)

# The submission holds a single `age_group` column of 0/1 predictions, in
# test-row order. The previous `if 'SEQN' in test_data.columns` check built this
# exact same frame in both branches, so the dead conditional has been removed.
# NOTE(review): if the required submission format also needs the row ID, add
# 'SEQN': test_data['SEQN'] to this dict -- confirm against the expected format.
submission = pd.DataFrame({'age_group': test_preds})

# Write without the pandas index so the file contains only the prediction column
submission.to_csv('submission_xgb.csv', index=False)
print("Submission file 'submission_xgb.csv' created!")

Submission file 'submission_xgb.csv' created!
