In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [81]:
# Read the cleaned health dataset from disk
df = pd.read_csv('health.data.cleaned.csv')

# Short, identifier-friendly aliases for the verbose source headers
# (units and spaces removed so the columns are easier to reference)
column_aliases = {
    'Height (cm)': 'Height',
    'Weight (kg)': 'Weight',
    'Blood Pressure (s/d)': 'Blood_Pressure',
    'Cholesterol Level (mg/dL)': 'Cholesterol',
    'Blood Glucose Level (mg/dL)': 'Blood_Glucose',
    'Bone Density (g/cm²)': 'Bone_Density',
    'Vision Sharpness': 'Vision',
    'Hearing Ability (dB)': 'Hearing',
    'Physical Activity Level': 'Physical_Activity',
    'Smoking Status': 'Smoking',
    'Alcohol Consumption': 'Alcohol',
    'Chronic Diseases': 'Chronic_Diseases',
    'Medication Use': 'Medication',
    'Family History': 'Family_History',
    'Cognitive Function': 'Cognitive_Function',
    'Mental Health Status': 'Mental_Health',
    'Sleep Patterns': 'Sleep',
    'Stress Levels': 'Stress',
    'Pollution Exposure': 'Pollution',
    'Sun Exposure': 'Sun_Exposure',
    'Education Level': 'Education',
    'Income Level': 'Income',
    'Age (years)': 'Age',
}
df = df.rename(columns=column_aliases)

In [82]:
# Encode the Mental_Health labels as integers (Poor=0 ... Excellent=3)
mental_health_mapping = {'Poor': 0, 'Fair': 1, 'Good': 2, 'Excellent': 3}

# Only map while the column still holds text labels. Re-running this cell on
# already-encoded values would otherwise map every row to NaN and make
# `mode()[0]` below raise IndexError.
if not pd.api.types.is_numeric_dtype(df['Mental_Health']):
    df['Mental_Health'] = df['Mental_Health'].map(mental_health_mapping)

# Fill missing or unmapped labels with the most frequent code.
# The result is assigned back to the frame rather than calling
# `df['Mental_Health'].fillna(..., inplace=True)`: that form mutates an
# intermediate Series, is deprecated in pandas 2.x, and leaves `df`
# untouched under Copy-on-Write.
if df['Mental_Health'].isnull().any():
    df['Mental_Health'] = df['Mental_Health'].fillna(df['Mental_Health'].mode()[0])

# Confirm mapping worked
print("Unique values in Mental_Health after mapping:", df['Mental_Health'].unique())
print("Missing values in Mental_Health after mapping:", df['Mental_Health'].isnull().sum())

Unique values in Mental_Health after mapping: [2 0 1 3]
Missing values in Mental_Health after mapping: 0


In [83]:
# Impute missing values column by column: numeric columns get their mean,
# every other column gets its most frequent value.
# Each result is assigned back to the frame instead of calling
# `df[column].fillna(..., inplace=True)`: that form mutates an intermediate
# Series, is deprecated in pandas 2.x, and does not update `df` under
# Copy-on-Write.
for column in df.columns:
    if not df[column].isnull().any():
        continue  # nothing to impute in this column

    if pd.api.types.is_numeric_dtype(df[column]):
        fill_value = df[column].mean()
    else:
        modes = df[column].mode()
        if modes.empty:
            continue  # column is entirely missing, so there is no mode to use
        fill_value = modes.iloc[0]

    df[column] = df[column].fillna(fill_value)

In [84]:
# Total number of missing cells left in the whole frame
total_missing = df.isna().sum().sum()
print(total_missing)  # Should print 0

0


In [85]:
# Break the "systolic/diastolic" text column into two separate columns
bp_columns = ['Systolic_BP', 'Diastolic_BP']
df[bp_columns] = df['Blood_Pressure'].str.split('/', expand=True)

# The split yields strings; convert each half to a numeric dtype
for bp_column in bp_columns:
    df[bp_column] = pd.to_numeric(df[bp_column])

# The combined text column is no longer needed
df.drop(columns='Blood_Pressure', inplace=True)

In [86]:
# Display the current column labels of the frame
df.columns

Index(['Gender', 'Height', 'Weight', 'Cholesterol', 'BMI', 'Blood_Glucose',
       'Bone_Density', 'Vision', 'Hearing', 'Physical_Activity', 'Smoking',
       'Alcohol', 'Diet', 'Chronic_Diseases', 'Medication', 'Family_History',
       'Cognitive_Function', 'Mental_Health', 'Sleep', 'Stress', 'Pollution',
       'Sun_Exposure', 'Education', 'Income', 'Age', 'Systolic_BP',
       'Diastolic_BP'],
      dtype='object')

In [87]:
# Print the dtype of every column to see which ones are still non-numeric
print(df.dtypes)

Gender                 object
Height                float64
Weight                float64
Cholesterol           float64
BMI                   float64
Blood_Glucose         float64
Bone_Density          float64
Vision                float64
Hearing               float64
Physical_Activity      object
Smoking                object
Alcohol                object
Diet                   object
Chronic_Diseases       object
Medication             object
Family_History         object
Cognitive_Function    float64
Mental_Health           int64
Sleep                  object
Stress                float64
Pollution             float64
Sun_Exposure          float64
Education              object
Income                 object
Age                     int64
Systolic_BP             int64
Diastolic_BP            int64
dtype: object


In [88]:
# Encode the two Gender labels as integers: Male -> 0, Female -> 1
gender_codes = {'Male': 0, 'Female': 1}
df['Gender'] = df['Gender'].map(gender_codes)

# Expand the remaining categorical columns into indicator (dummy) columns,
# dropping the first level of each so the indicators are not redundant
categorical_columns = [
    'Physical_Activity',
    'Smoking',
    'Alcohol',
    'Diet',
    'Chronic_Diseases',
    'Medication',
    'Family_History',
    'Sleep',
    'Education',
    'Income',
]
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Print the dtypes to check that no text columns remain
print(df.dtypes)

Gender                              int64
Height                            float64
Weight                            float64
Cholesterol                       float64
BMI                               float64
Blood_Glucose                     float64
Bone_Density                      float64
Vision                            float64
Hearing                           float64
Cognitive_Function                float64
Mental_Health                       int64
Stress                            float64
Pollution                         float64
Sun_Exposure                      float64
Age                                 int64
Systolic_BP                         int64
Diastolic_BP                        int64
Physical_Activity_Low                bool
Physical_Activity_Moderate           bool
Smoking_Former                       bool
Smoking_Never                        bool
Alcohol_Occasional                   bool
Diet_High-fat                        bool
Diet_Low-carb                     

In [92]:
# Rank every other column by its correlation with Age, highest first
age_column_correlations = df.corr()['Age']
correlations = age_column_correlations.drop('Age').sort_values(ascending=False)
print(correlations)

Hearing                           0.712374
Systolic_BP                       0.646056
Diastolic_BP                      0.611071
Cholesterol                       0.432368
Blood_Glucose                     0.428573
Smoking_Former                    0.091012
Education_Postgraduate            0.038378
Alcohol_Occasional                0.032695
Stress                            0.029093
Height                            0.020322
Diet_High-fat                     0.019952
Physical_Activity_Low             0.016115
Chronic_Diseases_Hypertension     0.015748
Sleep_Normal                      0.009994
Sun_Exposure                      0.009245
Family_History_Hypertension       0.003717
Weight                            0.002521
Diet_Vegetarian                   0.000372
Medication_Regular               -0.000041
Family_History_Heart Disease     -0.000770
Chronic_Diseases_Heart Disease   -0.001419
Education_Undergraduate          -0.002912
Diet_Low-carb                    -0.003023
Mental_Heal

## Single Regression Model (Using Standardized Hearing)

In [96]:
# Target: Age. Predictor: Hearing, kept as a one-column DataFrame (2-D)
y = df['Age']
X_single = df.loc[:, ['Hearing']]

In [97]:
# Split the raw feature first (80% train / 20% test) so the scaler below
# never sees the test rows. test_size and random_state are unchanged, so
# the same rows land in each partition as before.
X_single_train_raw, X_single_test_raw, y_train, y_test = train_test_split(
    X_single, y, test_size=0.2, random_state=42
)

# Standardize Hearing using the training rows' mean/std only, then apply the
# already-fitted scaler to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_single_train_raw)
X_test = scaler.transform(X_single_test_raw)

# Full standardized column, kept available under its original name; it now
# uses the training-set statistics as well.
X_single_scaled = scaler.transform(X_single)

# Train Linear Regression model
lr_single = LinearRegression()
lr_single.fit(X_train, y_train)

# Make predictions on the held-out rows
y_pred_single = lr_single.predict(X_test)

In [98]:
# Score the test-set predictions with MAE, MSE and R²
mae_single, mse_single, r2_single = (
    metric(y_test, y_pred_single)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Build the report once, then print it
single_report = (
    f"Single Regression Model Performance (Using Standardized Hearing):\n"
    f"MAE: {mae_single:.3f}\n"
    f"MSE: {mse_single:.3f}\n"
    f"R² Score: {r2_single:.3f}"
)
print(single_report)

Single Regression Model Performance (Using Standardized Hearing):
MAE: 11.684
MSE: 205.733
R² Score: 0.498


#### The single-feature linear regression, which uses Hearing as the sole predictor of Age, has a Mean Absolute Error (MAE) of 11.684 on the test set, meaning its predictions deviate from the actual ages by ~11.7 years on average. The Mean Squared Error (MSE) of 205.733 corresponds to a root-mean-squared error of ~14.3 years, which confirms that the errors are large in absolute terms. The R² score of 0.498 indicates that Hearing explains ~49.8% of the variance in Age — a moderate relationship, which suggests that additional features could improve prediction accuracy. While hearing ability is a reasonable indicator of aging, incorporating more health-related variables may lead to a more robust model.

## Polynomial Regression Model (Features based on correlation: Hearing, Systolic_BP, Diastolic_BP, Cholesterol, Blood_Glucose, Cognitive_Function, Vision, Bone_Density)

In [99]:
# Features chosen from the correlation ranking against Age
selected_features = [
    'Hearing',
    'Systolic_BP',
    'Diastolic_BP',
    'Cholesterol',
    'Blood_Glucose',
    'Cognitive_Function',
    'Vision',
    'Bone_Density',
]

In [100]:
# Target (Age) and the feature matrix restricted to the selected columns
y_poly = df['Age']
X_poly = df.loc[:, selected_features]

In [101]:
# Standardize the selected features.
# Note: the scaler is fitted on every row of X_poly.
scaler = StandardScaler()
scaler.fit(X_poly)
X_poly_scaled = scaler.transform(X_poly)

# Expand the scaled features into degree-2 polynomial terms
# (no constant column, since include_bias=False)
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X_poly_scaled)
X_poly_transformed = poly.transform(X_poly_scaled)

In [102]:
# Split the raw selected features (not the full-data X_poly_transformed) so
# the test rows never contribute to the preprocessing statistics.
# test_size and random_state are unchanged, so the same rows land in each
# partition as before.
X_poly_train_raw, X_poly_test_raw, y_train_poly, y_test_poly = train_test_split(
    X_poly, y_poly, test_size=0.2, random_state=42
)

# Fit the scaler and the degree-2 expansion on the training rows only, then
# apply the already-fitted transforms to the test rows. Separate names are
# used so the existing `scaler` / `poly` objects are left as they were.
poly_train_scaler = StandardScaler()
poly_train_expander = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly_train_expander.fit_transform(
    poly_train_scaler.fit_transform(X_poly_train_raw)
)
X_test_poly = poly_train_expander.transform(
    poly_train_scaler.transform(X_poly_test_raw)
)

# Train the Polynomial Regression Model
poly_reg = LinearRegression()
poly_reg.fit(X_train_poly, y_train_poly)

# Predictions on the held-out rows
y_pred_poly = poly_reg.predict(X_test_poly)

In [103]:
# Score the test-set predictions with MAE, MSE and R²
mae_poly, mse_poly, r2_poly = (
    metric(y_test_poly, y_pred_poly)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

# Display the results, one metric per line
print(
    "Polynomial Regression Model Performance (Degree = 2):",
    f"MAE: {mae_poly:.3f}",
    f"MSE: {mse_poly:.3f}",
    f"R² Score: {r2_poly:.3f}",
    sep="\n",
)

Polynomial Regression Model Performance (Degree = 2):
MAE: 4.191
MSE: 28.006
R² Score: 0.932


#### The degree-2 polynomial regression on eight features performs far better than the single-feature model. The Mean Absolute Error (MAE) of 4.191 and Mean Squared Error (MSE) of 28.006 (a root-mean-squared error of ~5.3 years) indicate that the model makes much smaller errors when predicting age. More importantly, the R² score of 0.932 means the model explains 93.2% of the variance in age on the held-out test set, so it captures the relationship between the features and the target well. The quadratic and interaction terms may have helped uncover non-linear patterns in the data, but this model also uses seven more predictors than the single-feature one, so the improvement cannot be attributed to the quadratic terms alone; fitting a plain linear model on the same eight features would separate the two effects. It is still essential to check for overfitting — for example by comparing training and test scores or by cross-validating — because a single train/test split can give an optimistic estimate of how closely the model is fitted to the data.