In [230]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

In [231]:
# Read the training CSV, discard the SEQN column and preview the first rows.
# NOTE(review): the path is an absolute location on one machine; the cell
# only runs where that file exists.
df = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\Train_Data.csv")
df = df.drop(columns=['SEQN'])
df.head(5)

Unnamed: 0,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN,age_group
0,2.0,2.0,35.7,110.0,2.0,150.0,14.91,Adult
1,2.0,2.0,20.3,89.0,2.0,80.0,3.85,Adult
2,1.0,2.0,23.2,89.0,2.0,68.0,6.14,Adult
3,1.0,2.0,28.9,104.0,,84.0,16.15,Adult
4,2.0,1.0,35.9,103.0,2.0,81.0,10.92,Adult


In [232]:
# Show the distinct values (including NaN) of the code-like columns.
for column in ('PAQ605', 'DIQ010', 'age_group'):
    print(df[column].unique())

[ 2.  1. nan  7.]
[ 2. nan  1.  3.]
['Adult' 'Senior' nan]


In [233]:
# Number of missing values in each column of the training frame.
df.isna().sum()

RIAGENDR     18
PAQ605       13
BMXBMI       18
LBXGLU       13
DIQ010       18
LBXGLT       11
LBXIN         9
age_group    14
dtype: int64

In [234]:
# Discard rows whose age_group is missing (age_group is the column the
# classifier is trained to predict later on), then recount the NaNs that
# remain in each column.
df = df.dropna(subset=['age_group'])
df.isnull().sum()

RIAGENDR     18
PAQ605       13
BMXBMI       18
LBXGLU       13
DIQ010       18
LBXGLT       11
LBXIN         9
age_group     0
dtype: int64

In [235]:
# Fill the gaps in PAQ605 and BMXBMI with the mean of the row's age group.
# NOTE(review): PAQ605 only takes the values 1, 2 and 7 in this data, so a
# mean fill introduces fractional values into it -- confirm this is intended.
for column in ('PAQ605', 'BMXBMI'):
    # Compute both group means before writing anything back.
    group_means = {
        group: df.loc[df['age_group'] == group, column].mean()
        for group in ('Adult', 'Senior')
    }
    for group, fill_value in group_means.items():
        gaps = (df['age_group'] == group) & (df[column].isna())
        df.loc[gaps, column] = fill_value


df.isnull().sum()

RIAGENDR     18
PAQ605        0
BMXBMI        0
LBXGLU       13
DIQ010       18
LBXGLT       11
LBXIN         9
age_group     0
dtype: int64

In [236]:
from sklearn.preprocessing import LabelEncoder

# Replace the age_group strings with integer codes. The fitted encoder is
# kept in `le_age` so the original class names can be recovered later.
le_age = LabelEncoder()
le_age.fit(df['age_group'])
df['age_group'] = le_age.transform(df['age_group'])

df['age_group'].unique()

array([0, 1])

In [237]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import pandas as pd

def model_impute(df, target_col, feature_cols):
    """Fill missing values of a categorical column with random-forest predictions.

    A ``RandomForestClassifier`` is fitted on the rows where ``target_col`` is
    present and every column in ``feature_cols`` is non-null. It then predicts
    ``target_col`` for the rows where the target is missing and every feature
    is non-null. Rows that lack both the target and at least one feature are
    left as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.
    target_col : str
        Column whose missing values are filled.
    feature_cols : list of str
        Columns used as predictors.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    known = df[df[target_col].notna()]
    unknown = df[df[target_col].isna()]

    # Rows we can actually predict for: target missing, all features present.
    X_pred = unknown[feature_cols].dropna()
    if X_pred.empty:
        # Nothing to impute. Fitting would be wasted work, and scikit-learn
        # raises ValueError when asked to predict on zero rows.
        return df

    X_train = known[feature_cols].dropna()
    y_train = known.loc[X_train.index, target_col]

    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)

    df.loc[X_pred.index, target_col] = rf.predict(X_pred)
    return df

In [238]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import pandas as pd

def model_impute_regression(df, target_col, feature_cols):
    """Fill missing values of a numeric column with random-forest predictions.

    A ``RandomForestRegressor`` is fitted on the rows where ``target_col`` is
    present and every column in ``feature_cols`` is non-null. It then predicts
    ``target_col`` for the rows where the target is missing and every feature
    is non-null. Rows that lack both the target and at least one feature are
    left as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.
    target_col : str
        Column whose missing values are filled.
    feature_cols : list of str
        Columns used as predictors.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    known = df[df[target_col].notna()]
    unknown = df[df[target_col].isna()]

    # Rows we can actually predict for: target missing, all features present.
    X_pred = unknown[feature_cols].dropna()
    if X_pred.empty:
        # Nothing to impute. Fitting would be wasted work, and scikit-learn
        # raises ValueError when asked to predict on zero rows.
        return df

    X_train = known[feature_cols].dropna()
    y_train = known.loc[X_train.index, target_col]

    rf = RandomForestRegressor(random_state=42)
    rf.fit(X_train, y_train)

    df.loc[X_pred.index, target_col] = rf.predict(X_pred)

    return df

In [239]:
# Impute the remaining columns one after another. Each step adds the column
# filled by the previous step to its predictor list, so the order matters.
train_imputation_steps = [
    (model_impute, 'RIAGENDR', ['BMXBMI', 'age_group', 'PAQ605']),
    (model_impute_regression, 'LBXIN', ['BMXBMI', 'age_group', 'PAQ605', 'RIAGENDR']),
    (model_impute_regression, 'LBXGLU', ['BMXBMI', 'age_group', 'PAQ605', 'RIAGENDR', 'LBXIN']),
    (model_impute_regression, 'LBXGLT', ['BMXBMI', 'age_group', 'PAQ605', 'RIAGENDR', 'LBXIN', 'LBXGLU']),
    (model_impute, 'DIQ010', ['BMXBMI', 'age_group', 'PAQ605', 'RIAGENDR', 'LBXIN', 'LBXGLU', 'LBXGLT']),
]
for impute_fn, column, predictors in train_imputation_steps:
    df = impute_fn(df, column, predictors)
df.isnull().sum()

RIAGENDR     0
PAQ605       0
BMXBMI       0
LBXGLU       0
DIQ010       0
LBXGLT       0
LBXIN        0
age_group    0
dtype: int64

In [240]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Baseline: hold out 20% of the rows, fit a gradient-boosting classifier
# with fixed hyper-parameters and predict the held-out age groups.
y = df['age_group']
X = df.drop(columns=['age_group'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

In [241]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Tune the gradient-boosting classifier with a 5-fold grid search on the
# training split, then predict the hold-out set with the best estimator.
y = df['age_group']
X = df.drop(columns=['age_group'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Every combination of these values is evaluated.
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.05, 0.1],
    max_depth=[2, 3, 4],
    subsample=[0.8, 1.0],
)

model = GradientBoostingClassifier(random_state=42)

# scoring: 'f1_weighted' here; 'f1_macro' is the alternative if both classes
# should count equally.
search_options = dict(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
)
grid_search = GridSearchCV(**search_options)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [242]:
# Per-class metrics on the hold-out set, labelled with the original age-group
# names. `le_age` is the encoder fitted on age_group earlier in the notebook;
# no variable named `label_encoder` is defined anywhere, so referring to it
# raises NameError on a fresh kernel.
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(le_age.classes_))),
    target_names=le_age.classes_
))

              precision    recall  f1-score   support

       Adult       0.89      0.97      0.93       340
      Senior       0.52      0.24      0.32        51

    accuracy                           0.87       391
   macro avg       0.71      0.60      0.63       391
weighted avg       0.85      0.87      0.85       391



In [243]:
# Read the test CSV and preview its first five rows (head() defaults to 5).
# NOTE(review): the path is an absolute location on one machine.
test_df = pd.read_csv("C:\\Users\\LENOVO\\Desktop\\Test_Data.csv")
test_df.head()

Unnamed: 0,SEQN,RIAGENDR,PAQ605,BMXBMI,LBXGLU,DIQ010,LBXGLT,LBXIN
0,77017.0,1.0,1.0,32.2,96.0,2.0,135.0,15.11
1,75580.0,2.0,2.0,26.3,100.0,2.0,141.0,15.26
2,73820.0,1.0,2.0,28.6,107.0,2.0,136.0,8.82
3,80489.0,2.0,1.0,22.1,93.0,2.0,111.0,12.13
4,82047.0,1.0,1.0,24.7,91.0,2.0,105.0,3.12


In [244]:
# Number of missing values in each column of the test frame.
test_df.isna().sum()

SEQN        2
RIAGENDR    2
PAQ605      1
BMXBMI      1
LBXGLU      1
DIQ010      1
LBXGLT      2
LBXIN       1
dtype: int64

In [245]:
# Fill the gaps in BMXBMI and PAQ605 with each column's mean over the test
# frame (there is no age_group column here to group by).
for column in ('BMXBMI', 'PAQ605'):
    column_mean = test_df[column].mean()
    test_df[column] = test_df[column].fillna(column_mean)
test_df.isnull().sum()

SEQN        2
RIAGENDR    2
PAQ605      0
BMXBMI      0
LBXGLU      1
DIQ010      1
LBXGLT      2
LBXIN       1
dtype: int64

In [246]:
# Same chained imputation as for the training frame, minus the age_group
# predictor. Each step may use the columns filled by the steps before it.
test_imputation_steps = [
    (model_impute, 'RIAGENDR', ['BMXBMI', 'PAQ605']),
    (model_impute_regression, 'LBXIN', ['BMXBMI', 'PAQ605', 'RIAGENDR']),
    (model_impute_regression, 'LBXGLU', ['BMXBMI', 'PAQ605', 'RIAGENDR', 'LBXIN']),
    (model_impute_regression, 'LBXGLT', ['BMXBMI', 'PAQ605', 'RIAGENDR', 'LBXIN', 'LBXGLU']),
    (model_impute, 'DIQ010', ['BMXBMI', 'PAQ605', 'RIAGENDR', 'LBXIN', 'LBXGLU', 'LBXGLT']),
]
for impute_fn, column, predictors in test_imputation_steps:
    test_df = impute_fn(test_df, column, predictors)
test_df.isnull().sum()

SEQN        2
RIAGENDR    0
PAQ605      0
BMXBMI      0
LBXGLU      0
DIQ010      0
LBXGLT      0
LBXIN       0
dtype: int64

In [247]:
# Keep the SEQN values aside and remove that column from the feature frame.
SEQN = test_df.pop('SEQN')

In [248]:
# Predict the encoded age group for every test row with the tuned model.
# Note that this rebinds y_pred, which previously held the hold-out predictions.
y_pred = best_model.predict(test_df)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [251]:
# One-column frame holding the predicted (integer-encoded) age group per test row.
result = pd.DataFrame({'age_group': y_pred})

In [252]:
# Write the predictions to a CSV in the working directory, without the index column.
result.to_csv("NidhiSummerAnalyticsWeek4Predictions.csv", index=False)