In [18]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')



In [19]:
pd.set_option('display.max_columns', None)

In [20]:
train_df = pd.read_csv('Train.csv')
test_df = pd.read_csv('Test.csv')

In [21]:
train_df.head()

Unnamed: 0,Gender,Height (cm),Weight (kg),Blood Pressure (s/d),Cholesterol Level (mg/dL),BMI,Blood Glucose Level (mg/dL),Bone Density (g/cm²),Vision Sharpness,Hearing Ability (dB),Physical Activity Level,Smoking Status,Alcohol Consumption,Diet,Chronic Diseases,Medication Use,Family History,Cognitive Function,Mental Health Status,Sleep Patterns,Stress Levels,Pollution Exposure,Sun Exposure,Education Level,Income Level,Age (years)
0,Male,171.148359,86.185197,151/109,259.465814,29.423017,157.652848,0.132868,0.2,58.786198,Moderate,Former,,Low-carb,,,,44.059172,Good,Insomnia,2.797064,5.142344,7.108975,,Medium,89
1,Male,172.946206,79.641937,134/112,263.630292,26.626847,118.507805,0.629534,0.267312,54.63527,Low,Current,Occasional,Balanced,Hypertension,,Heart Disease,45.312298,Good,Normal,9.33993,7.27272,3.918489,Undergraduate,Medium,77
2,Female,155.945488,49.167058,160/101,207.846206,20.217553,143.58755,0.473487,0.248667,54.564632,Moderate,Never,,Balanced,Hypertension,Regular,Hypertension,56.246991,Poor,Insomnia,9.234637,8.500386,5.393408,,Medium,70
3,Female,169.078298,56.017921,133/94,253.283779,19.59527,137.448581,1.184315,0.513818,79.722963,Moderate,Never,,Balanced,Diabetes,Occasional,Hypertension,55.196092,Poor,Insomnia,4.693446,7.555511,2.745578,,Low,52
4,Female,163.758355,73.966304,170/106,236.119899,27.582078,145.328695,0.434562,0.306864,52.479469,Low,Former,Frequent,Vegetarian,,,,53.023379,Good,Normal,4.038537,9.429097,3.878435,Undergraduate,High,79


In [22]:
train_df.isna().sum()

Gender                            0
Height (cm)                       0
Weight (kg)                       0
Blood Pressure (s/d)              0
Cholesterol Level (mg/dL)         0
BMI                               0
Blood Glucose Level (mg/dL)       0
Bone Density (g/cm²)              0
Vision Sharpness                  0
Hearing Ability (dB)              0
Physical Activity Level           0
Smoking Status                    0
Alcohol Consumption            1201
Diet                              0
Chronic Diseases               1299
Medication Use                 1198
Family History                 1451
Cognitive Function                0
Mental Health Status              0
Sleep Patterns                    0
Stress Levels                     0
Pollution Exposure                0
Sun Exposure                      0
Education Level                 627
Income Level                      0
Age (years)                       0
dtype: int64

In [23]:
test_df.isna().sum()

Gender                            0
Height (cm)                       0
Weight (kg)                       0
Blood Pressure (s/d)              0
Cholesterol Level (mg/dL)         0
BMI                               0
Blood Glucose Level (mg/dL)       0
Bone Density (g/cm²)              0
Vision Sharpness                  0
Hearing Ability (dB)              0
Physical Activity Level           0
Smoking Status                    0
Alcohol Consumption            1201
Diet                              0
Chronic Diseases               1299
Medication Use                 1198
Family History                 1451
Cognitive Function                0
Mental Health Status              0
Sleep Patterns                    0
Stress Levels                     0
Pollution Exposure                0
Sun Exposure                      0
Education Level                 627
Income Level                      0
dtype: int64

In [24]:
train_df_zeros = train_df.fillna(value='-')
test_df_zeros = test_df.fillna(value='-')

In [25]:
# Split the "Blood Pressure (s/d)" column into two separate columns
train_df_zeros[['Systolic_BP', 'Diastolic_BP']] = train_df_zeros['Blood Pressure (s/d)'].str.split('/', expand=True)
test_df_zeros[['Systolic_BP', 'Diastolic_BP']] = test_df_zeros['Blood Pressure (s/d)'].str.split('/', expand=True)
# Convert the new columns to numeric values
train_df_zeros['Systolic_BP'] = pd.to_numeric(train_df_zeros['Systolic_BP'])
train_df_zeros['Diastolic_BP'] = pd.to_numeric(train_df_zeros['Diastolic_BP'])

test_df_zeros['Systolic_BP'] = pd.to_numeric(test_df_zeros['Systolic_BP'])
test_df_zeros['Diastolic_BP'] = pd.to_numeric(test_df_zeros['Diastolic_BP'])

# Drop the original "Blood Pressure (s/d)" column if it's no longer needed
train_df_zeros.drop(columns=['Blood Pressure (s/d)'], inplace=True)
test_df_zeros.drop(columns=['Blood Pressure (s/d)'], inplace=True)

In [26]:
train_df_zeros.head()

Unnamed: 0,Gender,Height (cm),Weight (kg),Cholesterol Level (mg/dL),BMI,Blood Glucose Level (mg/dL),Bone Density (g/cm²),Vision Sharpness,Hearing Ability (dB),Physical Activity Level,Smoking Status,Alcohol Consumption,Diet,Chronic Diseases,Medication Use,Family History,Cognitive Function,Mental Health Status,Sleep Patterns,Stress Levels,Pollution Exposure,Sun Exposure,Education Level,Income Level,Age (years),Systolic_BP,Diastolic_BP
0,Male,171.148359,86.185197,259.465814,29.423017,157.652848,0.132868,0.2,58.786198,Moderate,Former,-,Low-carb,-,-,-,44.059172,Good,Insomnia,2.797064,5.142344,7.108975,-,Medium,89,151,109
1,Male,172.946206,79.641937,263.630292,26.626847,118.507805,0.629534,0.267312,54.63527,Low,Current,Occasional,Balanced,Hypertension,-,Heart Disease,45.312298,Good,Normal,9.33993,7.27272,3.918489,Undergraduate,Medium,77,134,112
2,Female,155.945488,49.167058,207.846206,20.217553,143.58755,0.473487,0.248667,54.564632,Moderate,Never,-,Balanced,Hypertension,Regular,Hypertension,56.246991,Poor,Insomnia,9.234637,8.500386,5.393408,-,Medium,70,160,101
3,Female,169.078298,56.017921,253.283779,19.59527,137.448581,1.184315,0.513818,79.722963,Moderate,Never,-,Balanced,Diabetes,Occasional,Hypertension,55.196092,Poor,Insomnia,4.693446,7.555511,2.745578,-,Low,52,133,94
4,Female,163.758355,73.966304,236.119899,27.582078,145.328695,0.434562,0.306864,52.479469,Low,Former,Frequent,Vegetarian,-,-,-,53.023379,Good,Normal,4.038537,9.429097,3.878435,Undergraduate,High,79,170,106


In [27]:
train_df_zeros['Physical Activity Level'].value_counts()

Physical Activity Level
Moderate    1407
Low          902
High         691
Name: count, dtype: int64

In [28]:
columns_to_map = ['Gender','Physical Activity Level','Smoking Status','Alcohol Consumption','Diet','Chronic Diseases','Medication Use','Family History','Mental Health Status','Sleep Patterns','Education Level','Income Level']

# Probably need to fix
mappings = {
    'Male':0,'Female':1,
    'Low':0, 'Moderate':1, 'High':2,
    'Never':0,'Former':1,'Current':2,
    '-':0,'Occasional':1,'Frequent':2,
    'Balanced':0,'Vegetarian':1,'Low-carb':2,'High-fat':2,
    '-':0,'Hypertension':1,'Diabetes':2,'Heart Disease':3,
    '-':0,'Regular':1,'Occasional':2,
    '-':0,'Hypertension':1,'Diabetes':2,'Heart Disease':3,
    'Poor':0,'Fair':1,'Good':2,'Excellent':3,
    'Normal':0,'Insomnia':1,'Excessive':2,
    '-':0,'High School':1,'Undergraduate':2,'Postgraduate':3,
    'Low':0,'Medium':1,'High':2
}

for col, mapping in zip(columns_to_map, mappings):
    train_df_zeros[col] = train_df_zeros[col].map(mappings)
    test_df_zeros[col] = test_df_zeros[col].map(mappings)

In [29]:
train_df_zeros.head()


Unnamed: 0,Gender,Height (cm),Weight (kg),Cholesterol Level (mg/dL),BMI,Blood Glucose Level (mg/dL),Bone Density (g/cm²),Vision Sharpness,Hearing Ability (dB),Physical Activity Level,Smoking Status,Alcohol Consumption,Diet,Chronic Diseases,Medication Use,Family History,Cognitive Function,Mental Health Status,Sleep Patterns,Stress Levels,Pollution Exposure,Sun Exposure,Education Level,Income Level,Age (years),Systolic_BP,Diastolic_BP
0,0,171.148359,86.185197,259.465814,29.423017,157.652848,0.132868,0.2,58.786198,1,1,0,2,0,0,0,44.059172,2,1,2.797064,5.142344,7.108975,0,1,89,151,109
1,0,172.946206,79.641937,263.630292,26.626847,118.507805,0.629534,0.267312,54.63527,0,2,2,0,1,0,3,45.312298,2,0,9.33993,7.27272,3.918489,2,1,77,134,112
2,1,155.945488,49.167058,207.846206,20.217553,143.58755,0.473487,0.248667,54.564632,1,0,0,0,1,1,1,56.246991,0,1,9.234637,8.500386,5.393408,0,1,70,160,101
3,1,169.078298,56.017921,253.283779,19.59527,137.448581,1.184315,0.513818,79.722963,1,0,0,0,2,2,1,55.196092,0,1,4.693446,7.555511,2.745578,0,0,52,133,94
4,1,163.758355,73.966304,236.119899,27.582078,145.328695,0.434562,0.306864,52.479469,0,1,2,1,0,0,0,53.023379,2,0,4.038537,9.429097,3.878435,2,2,79,170,106


In [30]:
test_df_zeros.head()

Unnamed: 0,Gender,Height (cm),Weight (kg),Cholesterol Level (mg/dL),BMI,Blood Glucose Level (mg/dL),Bone Density (g/cm²),Vision Sharpness,Hearing Ability (dB),Physical Activity Level,Smoking Status,Alcohol Consumption,Diet,Chronic Diseases,Medication Use,Family History,Cognitive Function,Mental Health Status,Sleep Patterns,Stress Levels,Pollution Exposure,Sun Exposure,Education Level,Income Level,Systolic_BP,Diastolic_BP
0,0,171.148359,86.185197,259.465814,29.423017,157.652848,0.132868,0.2,58.786198,1,1,0,2,0,0,0,44.059172,2,1,2.797064,5.142344,7.108975,0,1,151,109
1,0,172.946206,79.641937,263.630292,26.626847,118.507805,0.629534,0.267312,54.63527,0,2,2,0,1,0,3,45.312298,2,0,9.33993,7.27272,3.918489,2,1,134,112
2,1,155.945488,49.167058,207.846206,20.217553,143.58755,0.473487,0.248667,54.564632,1,0,0,0,1,1,1,56.246991,0,1,9.234637,8.500386,5.393408,0,1,160,101
3,1,169.078298,56.017921,253.283779,19.59527,137.448581,1.184315,0.513818,79.722963,1,0,0,0,2,2,1,55.196092,0,1,4.693446,7.555511,2.745578,0,0,133,94
4,1,163.758355,73.966304,236.119899,27.582078,145.328695,0.434562,0.306864,52.479469,0,1,2,1,0,0,0,53.023379,2,0,4.038537,9.429097,3.878435,2,2,170,106


In [31]:
y = train_df_zeros['Age (years)']
x = train_df_zeros.drop(columns=['Age (years)'])

In [32]:
X_train, X_test, y_train, y_test = train_test_split(x,y, train_size=0.8,random_state=10)

In [34]:
model=RandomForestClassifier(random_state=42,n_estimators=100)
model.fit(X_train, y_train)

0.07333333333333333

In [35]:
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Validation Mean Squared Error: {mse}')

Validation Mean Squared Error: 44.778333333333336


In [40]:
rmse = np.sqrt(mse)
rmse

6.691661477789603