<a href="https://colab.research.google.com/github/lucasmark07/age-prediction/blob/main/Age_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
#!/usr/bin/env python3
"""
NHANES Age Prediction: Senior vs. Adult
This script builds a machine learning model to predict whether an individual
from the NHANES dataset is a 'Senior' (65+) or 'Adult' (under 65).
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                           f1_score, roc_auc_score, confusion_matrix,
                           ConfusionMatrixDisplay, RocCurveDisplay)
import joblib
import sys
import os

def load_data():
    """Load the training data and print an exploratory summary.

    Reads ``Train_Data.csv`` from the current working directory and prints
    its shape, first five rows, ``DataFrame.info()`` summary, descriptive
    statistics and per-column missing-value counts.

    Returns:
        pandas.DataFrame: The training data exactly as read from the CSV.

    Raises:
        SystemExit: With exit code 1 when ``Train_Data.csv`` is not found.
    """
    print("=" * 60)
    print("1. DATA LOADING AND INITIAL EXPLORATION")
    print("=" * 60)

    try:
        train_df = pd.read_csv("Train_Data.csv")
    except FileNotFoundError:
        print("Error: Train_Data.csv not found. Please ensure it's in the same directory.")
        sys.exit(1)
    print("✓ Training data loaded successfully")

    print(f'Shape of the training dataset: {train_df.shape}')
    print('\nFirst 5 rows of the training dataset:')
    print(train_df.head())
    print('\nInformation about the training dataset:')
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    train_df.info()
    print('\nDescriptive statistics of the training dataset:')
    print(train_df.describe())
    print('\nMissing values in each column of the training dataset:')
    print(train_df.isnull().sum())

    return train_df

def preprocess_data(df, is_train=True):
    """Impute, encode and clean an NHANES frame for modelling.

    Processing steps, in order:

    1. (training only) drop rows with a missing ``RIDAGEYR`` and derive the
       binary target ``is_senior`` (1 when ``RIDAGEYR >= 65``, else 0).
    2. Drop ``RIDAGEYR`` and ``SEQN`` (and ``age_group`` for training data).
       ``RIDAGEYR`` must not remain a feature: the target is computed
       directly from it, so keeping it leaks the label into the model.
    3. Mean-impute the remaining float64/int64 columns, excluding the
       categorical codes and the target.
    4. Mode-impute the categorical codes (``RIAGENDR``, ``PAQ605``,
       ``DIQ010``) that are present and one-hot encode them, dropping the
       first level of each.

    Args:
        df: Raw data frame. It is never modified; a copy is processed.
        is_train: When True, build the ``is_senior`` target and return only
            the processed frame. When False, also return the ``SEQN``
            identifiers captured before that column is removed.

    Returns:
        The processed ``DataFrame`` when ``is_train`` is True; otherwise a
        ``(DataFrame, seqn)`` tuple where ``seqn`` is the original ``SEQN``
        Series, or ``None`` if the input had no ``SEQN`` column.

    Raises:
        KeyError: If ``is_train`` is True and ``RIDAGEYR`` is missing.
        ValueError: If a categorical column has no observed value to
            impute from.
    """
    categorical_features = ["RIAGENDR", "PAQ605", "DIQ010"]
    age_col = "RIDAGEYR"
    target_col = "is_senior"

    # Work on a private copy so the caller's frame is left untouched.
    df = df.copy()

    original_seq_numbers = None
    if not is_train and "SEQN" in df.columns:
        original_seq_numbers = df["SEQN"].copy()

    if is_train:
        # A row without an age has no known label; labelling it from an
        # imputed mean age would inject noise, so such rows are discarded.
        df = df.loc[df[age_col].notna()].copy()
        df[target_col] = (df[age_col] >= 65).astype(int)

    # Remove the identifier and the column the target was derived from.
    unwanted = [age_col, "SEQN"]
    if is_train:
        unwanted.append("age_group")
    df = df.drop(columns=unwanted, errors="ignore")

    present_categoricals = [c for c in categorical_features if c in df.columns]

    # The categorical codes are stored as numbers, so they must be excluded
    # explicitly here; otherwise they would be mean-imputed (yielding
    # fractional codes) before the mode imputation ever sees a NaN.
    excluded = set(present_categoricals) | {target_col}
    numerical_cols = [
        col
        for col in df.select_dtypes(include=["float64", "int64"]).columns
        if col not in excluded
    ]

    if numerical_cols:
        numeric = df[numerical_cols].astype("float64")
        # A column with no observed values has a NaN mean and stays NaN.
        df[numerical_cols] = numeric.fillna(numeric.mean())

    # NOTE(review): statistics are computed on whichever frame is passed in,
    # so test data is imputed with its own means/modes rather than the
    # training ones — confirm this is intended.
    for col in present_categoricals:
        observed = df[col].mode(dropna=True)
        if observed.empty:
            raise ValueError(
                f"Column {col!r} has no observed values to impute from."
            )
        # mode() is sorted, so ties resolve to the smallest code.
        df[col] = df[col].fillna(observed.iloc[0]).astype(int).astype(str)

    if present_categoricals:
        df = pd.get_dummies(df, columns=present_categoricals, drop_first=True)

    if not is_train:
        return df, original_seq_numbers
    return df

def train_model(processed_train_df):
    """Tune, evaluate and persist an XGBoost classifier.

    The frame is split into features (everything except ``is_senior``) and
    the ``is_senior`` target, then divided 80/20 into train and hold-out
    sets with stratification. A 5-fold grid search scored on accuracy picks
    the hyperparameters; the refitted best estimator is evaluated on the
    hold-out set, its metrics are printed, and it is saved to
    ``xgboost_model.pkl``.

    Args:
        processed_train_df: Preprocessed training frame containing an
            ``is_senior`` column.

    Returns:
        tuple: ``(best_model, X_test, y_test, y_pred)`` — the tuned
        estimator, the hold-out features and labels, and the estimator's
        class predictions for the hold-out set.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("3. MODEL TRAINING AND EVALUATION")
    print(rule)

    target = processed_train_df["is_senior"]
    features = processed_train_df.drop(columns=["is_senior"])

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )

    # Hyperparameter search space explored exhaustively by the grid search.
    search_space = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.7, 0.8, 0.9]
    }
    base_estimator = XGBClassifier(random_state=42, eval_metric='logloss')

    print("Starting hyperparameter tuning...")
    grid_search = GridSearchCV(
        estimator=base_estimator,
        param_grid=search_space,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Hold-out predictions: hard labels plus positive-class probabilities.
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # Compute every metric first, then report them in a fixed order.
    report = [
        ("Accuracy", accuracy_score(y_test, y_pred)),
        ("Precision", precision_score(y_test, y_pred)),
        ("Recall", recall_score(y_test, y_pred)),
        ("F1-Score", f1_score(y_test, y_pred)),
        ("ROC AUC Score", roc_auc_score(y_test, y_proba)),
    ]

    print(f"\nBest Hyperparameters: {grid_search.best_params_}")
    for label, value in report:
        print(f"{label}: {value:.4f}")

    # Persist the tuned estimator for later reuse.
    joblib.dump(best_model, 'xgboost_model.pkl')
    print("Model saved as xgboost_model.pkl")

    return best_model, X_test, y_test, y_pred

def visualize_results(best_model, X_test, y_test, y_pred):
    """Save confusion-matrix and ROC-curve plots for the hold-out set.

    Writes ``confusion_matrix_xgb.png`` and ``roc_curve_xgb.png`` to the
    current working directory, closing each figure after it is saved.

    Args:
        best_model: Fitted estimator passed to ``RocCurveDisplay``.
        X_test: Hold-out features.
        y_test: Hold-out true labels.
        y_pred: Predicted labels for ``X_test``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("4. RESULTS ANALYSIS AND VISUALIZATION")
    print(rule)

    # --- Confusion matrix -------------------------------------------------
    ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_test, y_pred),
        display_labels=["Not Senior", "Senior"],
    ).plot()
    plt.title("Confusion Matrix - XGBoost")
    plt.savefig("confusion_matrix_xgb.png")
    plt.close()
    print("Saved confusion_matrix_xgb.png")

    # --- ROC curve --------------------------------------------------------
    RocCurveDisplay.from_estimator(best_model, X_test, y_test)
    plt.title("ROC Curve - XGBoost")
    plt.savefig("roc_curve_xgb.png")
    plt.close()
    print("Saved roc_curve_xgb.png")

def generate_submission(best_model, processed_train_df):
    """Predict on ``Test_Data.csv`` and write ``submission.csv``.

    The test file is preprocessed with ``preprocess_data(is_train=False)``
    and its columns are reindexed to the training feature columns (columns
    absent from the test frame are filled with 0) before prediction.

    Args:
        best_model: Fitted estimator exposing ``predict``.
        processed_train_df: Preprocessed training frame; its columns other
            than ``is_senior`` define the feature layout.

    Returns:
        pandas.DataFrame: The submission frame with ``SEQN`` and
        ``age_group`` columns, as written to disk.

    Raises:
        SystemExit: With exit code 1 when ``Test_Data.csv`` is not found.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("5. GENERATE SUBMISSION FILE")
    print(rule)

    try:
        test_df = pd.read_csv("Test_Data.csv")
    except FileNotFoundError:
        print("Error: Test_Data.csv not found. Please ensure it's in the same directory.")
        sys.exit(1)
    else:
        print("✓ Test data loaded successfully")

    test_features, seqn = preprocess_data(test_df.copy(), is_train=False)

    # Align the test frame with the training feature layout.
    feature_names = processed_train_df.drop(columns=["is_senior"]).columns
    aligned_features = test_features.reindex(columns=feature_names, fill_value=0)

    submission_df = pd.DataFrame({
        "SEQN": seqn,
        "age_group": best_model.predict(aligned_features),
    })
    submission_df.to_csv("submission.csv", index=False)
    print("✓ Submission file generated as submission.csv")

    return submission_df

def main():
    """Run the full pipeline: load, preprocess, train, plot, submit.

    Besides the artifacts written by the individual stages, the
    preprocessed training frame is saved to ``processed_train_data.csv``.
    """
    for header_line in (
        "NHANES Age Classification Project",
        "Predicting Senior (65+) vs Adult (<65) categories",
        "="*50,
    ):
        print(header_line)

    # Stage 1: load and explore.
    train_df = load_data()

    # Stage 2: preprocess.
    section_rule = "=" * 60
    print("\n" + section_rule)
    print("2. DATA PREPROCESSING AND FEATURE ENGINEERING")
    print(section_rule)

    processed_train_df = preprocess_data(train_df.copy(), is_train=True)
    processed_train_df.to_csv("processed_train_data.csv", index=False)
    print("✓ Data preprocessing completed")
    print(f"Shape of processed training dataset: {processed_train_df.shape}")
    print("\nFirst 5 rows of processed training dataset:")
    print(processed_train_df.head())

    # Stage 3: train and evaluate.
    best_model, X_test, y_test, y_pred = train_model(processed_train_df)

    # Stage 4: plots.
    visualize_results(best_model, X_test, y_test, y_pred)

    # Stage 5: submission file (the returned frame is not needed here).
    generate_submission(best_model, processed_train_df)

    summary_rule = "=" * 50
    print("\n" + summary_rule)
    print("RESULTS")
    print(summary_rule)
    for summary_line in (
        "- Model training completed",
        "- Used XGBoost with grid search for hyperparameters",
        "- Handled missing data with mean/mode imputation",
        "- Generated confusion matrix and ROC curve plots",
        "- Created submission.csv file",
        "\nDone! Check the output files.",
    ):
        print(summary_line)

# Run the pipeline only when this file is executed as a script, so that
# importing it (e.g. to reuse the functions above) has no side effects.
if __name__ == "__main__":
    main()

NHANES Age Classification Project
Predicting Senior (65+) vs Adult (<65) categories
1. DATA LOADING AND INITIAL EXPLORATION
✓ Training data loaded successfully
Shape of the training dataset: (1966, 10)

First 5 rows of the training dataset:
      SEQN  RIDAGEYR  RIAGENDR  PAQ605  BMXBMI  LBXGLU  DIQ010  LBXGLT  LBXIN  \
0  73564.0      61.0       2.0     2.0    35.7   110.0     2.0   150.0  14.91   
1  73568.0      26.0       2.0     2.0    20.3    89.0     2.0    80.0   3.85   
2  73576.0      16.0       1.0     2.0    23.2    89.0     2.0    68.0   6.14   
3  73577.0      32.0       1.0     2.0    28.9   104.0     NaN    84.0  16.15   
4  73580.0      38.0       2.0     1.0    35.9   103.0     2.0    81.0  10.92   

  age_group  
0     Adult  
1     Adult  
2     Adult  
3     Adult  
4     Adult  

Information about the training dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1966 entries, 0 to 1965
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
