In [1]:
# Step 1: Load and clean data
import pandas as pd

# Load files
train = pd.read_csv("Train_Data.csv")
test = pd.read_csv("Test_Data.csv")

# Drop ID column
train.drop(columns=["SEQN"], inplace=True)
test.drop(columns=["SEQN"], inplace=True)

# Drop rows with missing target
train = train[train['age_group'].notna()]

# Encode target: Adult = 0, Senior = 1
train['age_group'] = train['age_group'].map({'Adult': 0, 'Senior': 1})

# Display basic info
print("Train shape:", train.shape)
print("Test shape:", test.shape)

# Show % missing in each column
print("\nMissing values in Train:\n", train.isnull().mean() * 100)
print("\nMissing values in Test:\n", test.isnull().mean() * 100)

# View first few rows
print("\nSample of training data:\n", train.head())


Train shape: (1952, 8)
Test shape: (312, 7)

Missing values in Train:
 RIAGENDR     0.922131
PAQ605       0.665984
BMXBMI       0.922131
LBXGLU       0.665984
DIQ010       0.922131
LBXGLT       0.563525
LBXIN        0.461066
age_group    0.000000
dtype: float64

Missing values in Test:
 RIAGENDR    0.641026
PAQ605      0.320513
BMXBMI      0.320513
LBXGLU      0.320513
DIQ010      0.320513
LBXGLT      0.641026
LBXIN       0.320513
dtype: float64

Sample of training data:
    RIAGENDR  PAQ605  BMXBMI  LBXGLU  DIQ010  LBXGLT  LBXIN  age_group
0       2.0     2.0    35.7   110.0     2.0   150.0  14.91          0
1       2.0     2.0    20.3    89.0     2.0    80.0   3.85          0
2       1.0     2.0    23.2    89.0     2.0    68.0   6.14          0
3       1.0     2.0    28.9   104.0     NaN    84.0  16.15          0
4       2.0     1.0    35.9   103.0     2.0    81.0  10.92          0


In [3]:
# Step 2: Fill missing values using mode or median

# Categorical columns → use MODE
cat_cols = ['RIAGENDR', 'PAQ605', 'DIQ010']
for col in cat_cols:
    mode_val = train[col].mode()[0]
    train[col].fillna(mode_val, inplace=True)
    test[col].fillna(mode_val, inplace=True)

# Numerical columns → use MEDIAN
num_cols = ['BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']
for col in num_cols:
    median_val = train[col].median()
    train[col].fillna(median_val, inplace=True)
    test[col].fillna(median_val, inplace=True)

# Confirm
print("✅ Missing values filled.")
print("Train missing:\n", train.isnull().sum())



✅ Missing values filled.
Train missing:
 RIAGENDR     0
PAQ605       0
BMXBMI       0
LBXGLU       0
DIQ010       0
LBXGLT       0
LBXIN        0
age_group    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(mode_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always beh

In [5]:
# Step 3: Train-test split, model building, evaluation
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

# Separate input and target
X = train.drop(columns=["age_group"])
y = train["age_group"]

# Train-test split (for validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Initialize models
logreg = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Train & predict
models = {
    "Logistic Regression": logreg,
    "Random Forest": rf,
    "XGBoost": xgb
}

# Evaluate models using F1 Score
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_pred)
    print(f"{name} F1 Score: {f1:.4f}")


Logistic Regression F1 Score: 0.1067
Random Forest F1 Score: 0.1882
XGBoost F1 Score: 0.2626


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [6]:
def engineer_features(df):
    # Base features
    df['GLU_BMI_RATIO'] = df['LBXGLU'] / (df['BMXBMI'] + 1e-3)
    df['INSULIN_FLAG'] = (df['LBXIN'] > 20).astype(int)
    df['ACTIVE_FLAG'] = (df['PAQ605'] == 1).astype(int)
    df['BMI_GLUC_INTERACT'] = df['BMXBMI'] * df['LBXGLU']

    # Advanced features
    df['BMI_GLUC_DIFF'] = df['BMXBMI'] - df['LBXGLU']
    df['BMI_X_GLU'] = df['BMXBMI'] * df['LBXGLU']
    df['BMI_X_INSULIN'] = df['BMXBMI'] * df['LBXIN']
    df['AGE_LIKE_SCORE'] = df['LBXGLU'] + df['LBXIN'] - df['BMXBMI']
    df['DIABETIC_FLAG'] = (df['DIQ010'] == 1).astype(int)
    df['INACTIVE_RISK'] = ((df['PAQ605'] != 1) & (df['BMXBMI'] > 28)).astype(int)

    # Binning
    df['BMI_BIN'] = pd.cut(df['BMXBMI'], bins=[0, 18.5, 25, 30, 100], labels=[0, 1, 2, 3]).astype(float)
    df['GLU_BIN'] = pd.cut(df['LBXGLU'], bins=[0, 90, 110, 126, 300], labels=[0, 1, 2, 3]).astype(float)
    
    return df

X_train = engineer_features(X_train)
X_val = engineer_features(X_val)
test = engineer_features(test)



In [7]:
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, precision_recall_curve
from sklearn.model_selection import train_test_split
import numpy as np

# Step 1: Split your dataset if not already done
# X, y = ... (your original data before splitting)
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Impute missing values in training and validation sets
imputer = SimpleImputer(strategy='median')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_val_imputed = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)

# Step 3: Apply SMOTE on imputed data
sm = SMOTE(random_state=42)
X_train_bal, y_train_bal = sm.fit_resample(X_train_imputed, y_train)

# Step 4: Train XGBoost
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
xgb_model.fit(X_train_bal, y_train_bal)

# Step 5: Get validation probabilities
val_probs = xgb_model.predict_proba(X_val_imputed)[:, 1]

# Step 6: Tune threshold
prec, rec, thresh = precision_recall_curve(y_val, val_probs)
f1_scores = 2 * (prec * rec) / (prec + rec + 1e-8)
best_thresh = thresh[np.argmax(f1_scores)]

# Step 7: Final F1 on val set
val_preds_thresh = (val_probs >= best_thresh).astype(int)
f1 = f1_score(y_val, val_preds_thresh)

print("✅ Best threshold:", best_thresh)
print("✅ F1 with tuned XGBoost:", round(f1 * 100, 2))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Best threshold: 0.17183354
✅ F1 with tuned XGBoost: 36.87


In [8]:
from sklearn.model_selection import GridSearchCV

# Grid of hyperparameters
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Base model
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# Grid search with F1 scoring
grid_search = GridSearchCV(
    estimator=xgb_clf,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=1,
    n_jobs=-1
)

# Fit
grid_search.fit(X_train, y_train)

# Best model
best_xgb = grid_search.best_estimator_

# Evaluate on validation set
y_val_pred = best_xgb.predict(X_val)
f1 = f1_score(y_val, y_val_pred)

print("✅ Best XGBoost Params:", grid_search.best_params_)
print("✅ Tuned XGBoost F1 Score:", f1)


Fitting 5 folds for each of 108 candidates, totalling 540 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


✅ Best XGBoost Params: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 50, 'subsample': 0.8}
✅ Tuned XGBoost F1 Score: 0.21951219512195122


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [11]:
# 1. Combine train + val
X_full = pd.concat([X_train, X_val], axis=0)
y_full = pd.concat([y_train, y_val], axis=0)

# 2. Feature engineering
X_full = engineer_features(X_full)

# 3. Handle missing values using median imputation
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
X_full_imputed = pd.DataFrame(imputer.fit_transform(X_full), columns=X_full.columns)

# 4. Apply SMOTE on the imputed version
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=42)
X_full_bal, y_full_bal = sm.fit_resample(X_full_imputed, y_full)

# 5. Train XGBoost
from xgboost import XGBClassifier

xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
xgb_model.fit(X_full_bal, y_full_bal)

# 6. Predict on test set
# ✅ Make sure test also has engineered features + imputed
test = engineer_features(test)
test_imputed = pd.DataFrame(imputer.transform(test), columns=test.columns)

# 7. Predict and apply threshold
test_probs = xgb_model.predict_proba(test_imputed)[:, 1]
test_preds = (test_probs >= 0.5).astype(int)

# 8. Submission
submission = pd.DataFrame({'age_group': test_preds})
submission.to_csv('final_submission7.csv', index=False)
print("✅ Submission saved: final_submission.csv")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Submission saved: final_submission.csv


In [12]:
y = y.fillna(0).astype(int)  # or use mode if 0 isn't safe


In [14]:
print(y.value_counts())  # Confirm only 0 and 1


age_group
0    1638
1     314
Name: count, dtype: int64
