In [82]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder

from scipy.stats import chi2_contingency
from scipy.stats.contingency import association
import scipy.stats as stats

from statsmodels.stats.outliers_influence import variance_inflation_factor


In [83]:
df= pd.read_csv("../Data/Raw/health_lifestyle_classification.csv")
df

Unnamed: 0,survey_code,age,gender,height,weight,bmi,bmi_estimated,bmi_scaled,bmi_corrected,waist_size,...,sunlight_exposure,meals_per_day,caffeine_intake,family_history,pet_owner,electrolyte_level,gene_marker_flag,environmental_risk_score,daily_supplement_dosage,target
0,1,56,Male,173.416872,56.886640,18.915925,18.915925,56.747776,18.989117,72.165130,...,High,5,Moderate,No,Yes,0,1.0,5.5,-2.275502,healthy
1,2,69,Female,163.207380,97.799859,36.716278,36.716278,110.148833,36.511417,85.598889,...,High,5,High,Yes,No,0,1.0,5.5,6.239340,healthy
2,3,46,Male,177.281966,80.687562,25.673050,25.673050,77.019151,25.587429,90.295030,...,High,4,Moderate,No,No,0,1.0,5.5,5.423737,healthy
3,4,32,Female,172.101255,63.142868,21.318480,21.318480,63.955440,21.177109,100.504211,...,High,1,,No,Yes,0,1.0,5.5,8.388611,healthy
4,5,60,Female,163.608816,40.000000,14.943302,14.943302,44.829907,14.844299,69.021150,...,High,1,High,Yes,Yes,0,1.0,5.5,0.332622,healthy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99996,53,Male,177.202253,54.303671,17.293811,17.293811,51.881433,17.227616,88.740028,...,Moderate,1,High,No,Yes,0,1.0,5.5,3.477124,healthy
99996,99997,22,Male,180.802297,40.033853,12.246712,12.246712,36.740135,12.159473,103.659560,...,Moderate,5,,Yes,No,0,1.0,5.5,1.609656,healthy
99997,99998,37,Male,185.540653,84.536847,24.556580,24.556580,73.669741,24.172944,98.920422,...,Low,5,,No,No,0,1.0,5.5,-9.736463,healthy
99998,99999,72,Female,181.796786,56.923335,17.223362,17.223362,51.670087,17.715475,54.559079,...,Low,4,High,Yes,Yes,0,1.0,5.5,-4.779376,healthy


In [84]:
columns_to_drop=['survey_code','gene_marker_flag', 'income', 'bmi_estimated', 'bmi_scaled','bmi_corrected','insurance','occupation','electrolyte_level','education_level']

In [85]:
df=df.drop(columns_to_drop,axis=1)
df=df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17577 entries, 1 to 99998
Data columns (total 38 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       17577 non-null  int64  
 1   gender                    17577 non-null  object 
 2   height                    17577 non-null  float64
 3   weight                    17577 non-null  float64
 4   bmi                       17577 non-null  float64
 5   waist_size                17577 non-null  float64
 6   blood_pressure            17577 non-null  float64
 7   heart_rate                17577 non-null  float64
 8   cholesterol               17577 non-null  float64
 9   glucose                   17577 non-null  float64
 10  insulin                   17577 non-null  float64
 11  sleep_hours               17577 non-null  float64
 12  sleep_quality             17577 non-null  object 
 13  work_hours                17577 non-null  float64
 14  physical_ac

In [86]:
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
print("Categorical columns:", categorical_cols.tolist())

Categorical columns: ['gender', 'sleep_quality', 'alcohol_consumption', 'smoking_level', 'mental_health_support', 'job_type', 'diet_type', 'exercise_type', 'device_usage', 'healthcare_access', 'sunlight_exposure', 'caffeine_intake', 'family_history', 'pet_owner', 'target']


In [87]:
df['target'].unique()

array(['healthy', 'diseased'], dtype=object)

In [88]:
#Crosstab and Chi-square tests
#gender
crosstab_result = pd.crosstab(df['target'], df['gender'])
chi2_statistic, chi2_p_value, _, expected_freq = chi2_contingency(crosstab_result)
chi2_statistic, chi2_p_value, expected_freq

print(f"Chi-square Statistic: {chi2_statistic:.4f}")
print(f"P-value: {chi2_p_value:.4f}")

if chi2_p_value < 0.05:
    print("Interpretation: Reject the null hypothesis — significant association found ✅")
else:
    print("Interpretation: Fail to reject the null hypothesis — no significant association ❌")

Chi-square Statistic: 0.4714
P-value: 0.4923
Interpretation: Fail to reject the null hypothesis — no significant association ❌


In [89]:
#sleep_quality
crosstab_result = pd.crosstab(df['target'], df['sleep_quality'])
chi2_statistic, chi2_p_value, _, expected_freq = chi2_contingency(crosstab_result)
chi2_statistic, chi2_p_value, expected_freq

print(f"Chi-square Statistic: {chi2_statistic:.4f}")
print(f"P-value: {chi2_p_value:.4f}")

if chi2_p_value < 0.05:
    print("Interpretation: Reject the null hypothesis — significant association found ✅")
else:
    print("Interpretation: Fail to reject the null hypothesis — no significant association ❌")

Chi-square Statistic: 9.4999
P-value: 0.0233
Interpretation: Reject the null hypothesis — significant association found ✅


In [90]:
#alcohol_consumption
crosstab_result = pd.crosstab(df['target'], df['alcohol_consumption'])
chi2_statistic, chi2_p_value, _, expected_freq = chi2_contingency(crosstab_result)
chi2_statistic, chi2_p_value, expected_freq

print(f"Chi-square Statistic: {chi2_statistic:.4f}")
print(f"P-value: {chi2_p_value:.4f}")

if chi2_p_value < 0.05:
    print("Interpretation: Reject the null hypothesis — significant association found ✅")
else:
    print("Interpretation: Fail to reject the null hypothesis — no significant association ❌")

Chi-square Statistic: 0.1902
P-value: 0.6627
Interpretation: Fail to reject the null hypothesis — no significant association ❌


In [91]:
df["alcohol_consumption"].unique()

array(['Regularly', 'Occasionally'], dtype=object)

In [92]:
#smoking_level
crosstab_result = pd.crosstab(df['target'], df['smoking_level'])
chi2_statistic, chi2_p_value, _, expected_freq = chi2_contingency(crosstab_result)
chi2_statistic, chi2_p_value, expected_freq

print(f"Chi-square Statistic: {chi2_statistic:.4f}")
print(f"P-value: {chi2_p_value:.4f}")

if chi2_p_value < 0.05:
    print("Interpretation: Reject the null hypothesis — significant association found ✅")
else:
    print("Interpretation: Fail to reject the null hypothesis — no significant association ❌")

Chi-square Statistic: 3.2871
P-value: 0.1933
Interpretation: Fail to reject the null hypothesis — no significant association ❌


In [93]:
#mental_health_support
crosstab_result = pd.crosstab(df['target'], df['mental_health_support'])
chi2_statistic, chi2_p_value, _, expected_freq = chi2_contingency(crosstab_result)
chi2_statistic, chi2_p_value, expected_freq

print(f"Chi-square Statistic: {chi2_statistic:.4f}")
print(f"P-value: {chi2_p_value:.4f}")

if chi2_p_value < 0.05:
    print("Interpretation: Reject the null hypothesis — significant association found ✅")
else:
    print("Interpretation: Fail to reject the null hypothesis — no significant association ❌")

Chi-square Statistic: 0.5606
P-value: 0.4540
Interpretation: Fail to reject the null hypothesis — no significant association ❌


In [94]:
#job_type
crosstab_result = pd.crosstab(df['target'], df['job_type'])
chi2_statistic, chi2_p_value, _, expected_freq = chi2_contingency(crosstab_result)
chi2_statistic, chi2_p_value, expected_freq

print(f"Chi-square Statistic: {chi2_statistic:.4f}")
print(f"P-value: {chi2_p_value:.4f}")

if chi2_p_value < 0.05:
    print("Interpretation: Reject the null hypothesis — significant association found ✅")
else:
    print("Interpretation: Fail to reject the null hypothesis — no significant association ❌")

Chi-square Statistic: 13.0694
P-value: 0.0227
Interpretation: Reject the null hypothesis — significant association found ✅


In [95]:
#diet_type
crosstab_result = pd.crosstab(df['target'], df['diet_type'])
chi2_statistic, chi2_p_value, _, expected_freq = chi2_contingency(crosstab_result)
chi2_statistic, chi2_p_value, expected_freq

print(f"Chi-square Statistic: {chi2_statistic:.4f}")
print(f"P-value: {chi2_p_value:.4f}")

if chi2_p_value < 0.05:
    print("Interpretation: Reject the null hypothesis — significant association found ✅")
else:
    print("Interpretation: Fail to reject the null hypothesis — no significant association ❌")

Chi-square Statistic: 4.3133
P-value: 0.2296
Interpretation: Fail to reject the null hypothesis — no significant association ❌


In [96]:
#exercise_type
crosstab_result = pd.crosstab(df['target'], df['exercise_type'])
chi2_statistic, chi2_p_value, _, expected_freq = chi2_contingency(crosstab_result)
chi2_statistic, chi2_p_value, expected_freq

print(f"Chi-square Statistic: {chi2_statistic:.4f}")
print(f"P-value: {chi2_p_value:.4f}")

if chi2_p_value < 0.05:
    print("Interpretation: Reject the null hypothesis — significant association found ✅")
else:
    print("Interpretation: Fail to reject the null hypothesis — no significant association ❌")

Chi-square Statistic: 0.8141
P-value: 0.6656
Interpretation: Fail to reject the null hypothesis — no significant association ❌


In [97]:
#device_usage
crosstab_result = pd.crosstab(df['target'], df['device_usage'])
chi2_statistic, chi2_p_value, _, expected_freq = chi2_contingency(crosstab_result)
chi2_statistic, chi2_p_value, expected_freq

print(f"Chi-square Statistic: {chi2_statistic:.4f}")
print(f"P-value: {chi2_p_value:.4f}")

if chi2_p_value < 0.05:
    print("Interpretation: Reject the null hypothesis — significant association found ✅")
else:
    print("Interpretation: Fail to reject the null hypothesis — no significant association ❌")

Chi-square Statistic: 5.5859
P-value: 0.0612
Interpretation: Fail to reject the null hypothesis — no significant association ❌


In [98]:
#healthcare_access
crosstab_result = pd.crosstab(df['target'], df['healthcare_access'])
chi2_statistic, chi2_p_value, _, expected_freq = chi2_contingency(crosstab_result)
chi2_statistic, chi2_p_value, expected_freq

print(f"Chi-square Statistic: {chi2_statistic:.4f}")
print(f"P-value: {chi2_p_value:.4f}")

if chi2_p_value < 0.05:
    print("Interpretation: Reject the null hypothesis — significant association found ✅")
else:
    print("Interpretation: Fail to reject the null hypothesis — no significant association ❌")

Chi-square Statistic: 1.5980
P-value: 0.4498
Interpretation: Fail to reject the null hypothesis — no significant association ❌


In [99]:
#sunlight_exposure
crosstab_result = pd.crosstab(df['target'], df['sunlight_exposure'])
chi2_statistic, chi2_p_value, _, expected_freq = chi2_contingency(crosstab_result)
chi2_statistic, chi2_p_value, expected_freq

print(f"Chi-square Statistic: {chi2_statistic:.4f}")
print(f"P-value: {chi2_p_value:.4f}")

if chi2_p_value < 0.05:
    print("Interpretation: Reject the null hypothesis — significant association found ✅")
else:
    print("Interpretation: Fail to reject the null hypothesis — no significant association ❌")

Chi-square Statistic: 0.8652
P-value: 0.6488
Interpretation: Fail to reject the null hypothesis — no significant association ❌


In [100]:
#caffeine_intake
crosstab_result = pd.crosstab(df['target'], df['caffeine_intake'])
chi2_statistic, chi2_p_value, _, expected_freq = chi2_contingency(crosstab_result)
chi2_statistic, chi2_p_value, expected_freq

print(f"Chi-square Statistic: {chi2_statistic:.4f}")
print(f"P-value: {chi2_p_value:.4f}")

if chi2_p_value < 0.05:
    print("Interpretation: Reject the null hypothesis — significant association found ✅")
else:
    print("Interpretation: Fail to reject the null hypothesis — no significant association ❌")

Chi-square Statistic: 0.0000
P-value: 1.0000
Interpretation: Fail to reject the null hypothesis — no significant association ❌


In [101]:
#family_history
crosstab_result = pd.crosstab(df['target'], df['family_history'])
chi2_statistic, chi2_p_value, _, expected_freq = chi2_contingency(crosstab_result)
chi2_statistic, chi2_p_value, expected_freq

print(f"Chi-square Statistic: {chi2_statistic:.4f}")
print(f"P-value: {chi2_p_value:.4f}")

if chi2_p_value < 0.05:
    print("Interpretation: Reject the null hypothesis — significant association found ✅")
else:
    print("Interpretation: Fail to reject the null hypothesis — no significant association ❌")

Chi-square Statistic: 0.0123
P-value: 0.9117
Interpretation: Fail to reject the null hypothesis — no significant association ❌


In [102]:
#pet_owner
crosstab_result = pd.crosstab(df['target'], df['pet_owner'])
chi2_statistic, chi2_p_value, _, expected_freq = chi2_contingency(crosstab_result)
chi2_statistic, chi2_p_value, expected_freq

print(f"Chi-square Statistic: {chi2_statistic:.4f}")
print(f"P-value: {chi2_p_value:.4f}")

if chi2_p_value < 0.05:
    print("Interpretation: Reject the null hypothesis — significant association found ✅")
else:
    print("Interpretation: Fail to reject the null hypothesis — no significant association ❌")

Chi-square Statistic: 0.3728
P-value: 0.5415
Interpretation: Fail to reject the null hypothesis — no significant association ❌


Cramér's V 

Measures the strength of the association between two variables.

In [103]:
table = pd.crosstab(df['sleep_quality'], df['job_type'])
association(table, method="cramer")

0.019302092491628568

The two variables are statistically independent in your data.
No strong correlation

In [104]:
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
print("Numerical columns:", numerical_cols.tolist())

Numerical columns: ['age', 'height', 'weight', 'bmi', 'waist_size', 'blood_pressure', 'heart_rate', 'cholesterol', 'glucose', 'insulin', 'sleep_hours', 'work_hours', 'physical_activity', 'daily_steps', 'calorie_intake', 'sugar_intake', 'water_intake', 'screen_time', 'stress_level', 'mental_health_score', 'meals_per_day', 'environmental_risk_score', 'daily_supplement_dosage']


In [105]:
ANOVA TEST_for numerical columns vs categorical
numerical_columns = ['age', 'height', 'weight', 'bmi', 'waist_size', 'blood_pressure', 'heart_rate', 'cholesterol', 'glucose', 'insulin', 'sleep_hours', 'work_hours', 'physical_activity', 'daily_steps', 'calorie_intake', 'sugar_intake', 'water_intake', 'screen_time', 'stress_level', 'mental_health_score', 'meals_per_day', 'environmental_risk_score', 'daily_supplement_dosage']
results = []

for num_col in numerical_cols:
    groups = [group[num_col].dropna().values for name, group in df.groupby('target')]
    stat, p = stats.f_oneway(*groups)
    
    results.append({
        'Numerical_Variable': num_col,
        'ANOVA_Statistic': stat,
        'p_value': p
    })

results_df = pd.DataFrame(results)
print(results_df.sort_values('p_value'))

SyntaxError: invalid syntax (226701058.py, line 1)

In [81]:
df_vif = pd.read_csv('../Data/Clean/Clean_data.csv')
df_vif

Unnamed: 0,survey_code,age,gender,height,weight,bmi,waist_size,blood_pressure,heart_rate,cholesterol,...,device_usage,healthcare_access,sunlight_exposure,meals_per_day,caffeine_intake,family_history,pet_owner,environmental_risk_score,daily_supplement_dosage,target
0,2,69,1,163.207380,97.799859,36.716278,85.598889,117.917986,66.463696,115.794002,...,,1,2,5,0,0,1,5.5,6.239340,healthy
1,11,36,0,154.018148,72.899782,30.731407,71.862592,111.870635,66.922530,179.722343,...,2.0,1,0,2,0,0,0,5.5,8.682837,healthy
2,18,57,1,163.716032,80.649500,30.089797,98.126084,126.777487,71.447455,229.831881,...,0.0,1,0,5,1,1,0,5.5,-1.745223,healthy
3,19,41,1,176.073128,66.658527,21.501539,76.524816,115.339115,72.250183,228.258535,...,0.0,0,2,4,0,1,1,5.5,4.738326,healthy
4,28,19,0,159.247214,49.841623,19.653889,75.533214,128.053824,68.422487,201.166287,...,,2,1,2,0,0,0,5.5,5.085959,healthy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17572,99987,62,0,168.934896,78.042746,27.346000,76.433630,91.196569,95.587320,148.187423,...,0.0,1,1,5,1,0,1,5.5,2.780969,diseased
17573,99988,37,1,162.510812,64.955967,24.595436,76.825530,100.501124,75.926155,249.965213,...,0.0,2,0,1,1,0,0,5.5,7.400583,healthy
17574,99990,24,1,178.190353,79.022329,24.887510,103.810718,106.953770,78.243143,235.859473,...,0.0,2,2,5,1,1,1,5.5,0.190082,healthy
17575,99995,41,0,155.398936,57.362689,23.753818,77.105189,113.376626,74.048029,175.694132,...,2.0,1,0,4,0,1,0,5.5,7.336399,healthy


In [107]:
VIF
df_numeric = df_vif.select_dtypes(include=['number']).drop(columns=['survey_code'])  # keep ID out

# Optional: Handle missing values (e.g., fill or drop)
df_numeric = df_numeric.dropna()  # or use df_numeric.fillna(df_numeric.mean())

# Standardize data (optional but helps with interpretation)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_numeric)

# Create a DataFrame for scaled data
X_scaled_df = pd.DataFrame(X_scaled, columns=df_numeric.columns)

# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data["feature"] = X_scaled_df.columns
vif_data["VIF"] = [variance_inflation_factor(X_scaled_df.values, i) for i in range(X_scaled_df.shape[1])]

# Sort by VIF descending
vif_data = vif_data.sort_values(by="VIF", ascending=False)

print(vif_data)

                     feature        VIF
4                        bmi  70.438614
3                     weight  54.089881
2                     height  17.859568
23       mental_health_score   1.004457
17              sugar_intake   1.004314
16            calorie_intake   1.004286
26                 diet_type   1.004155
12             sleep_quality   1.004071
32           caffeine_intake   1.003820
13                work_hours   1.003765
15               daily_steps   1.003754
1                     gender   1.003722
34                 pet_owner   1.003653
24     mental_health_support   1.003594
29         healthcare_access   1.003506
30         sunlight_exposure   1.003411
20              water_intake   1.003228
36   daily_supplement_dosage   1.003182
5                 waist_size   1.003180
10                   insulin   1.003165
11               sleep_hours   1.003014
8                cholesterol   1.002990
7                 heart_rate   1.002889
6             blood_pressure   1.002887


  return 1 - self.ssr/self.uncentered_tss


In [115]:
target = 'target'
results = []

for var in categorical_cols:
    try:
        crosstab_result = pd.crosstab(df['target'], df[var])
        chi2_statistic, chi2_p_value, _, expected_freq = chi2_contingency(crosstab_result)
        results.append({
            'Variable': var,
            'Chi2 Statistic': round(chi2_statistic, 4),
            'P-value': round(chi2_p_value, 4),
            'Significant': 'Yes' if chi2_p_value < 0.05 else 'No'
        })
    except:
        continue

chi_df = pd.DataFrame(results)
chi_df



Unnamed: 0,Variable,Chi2 Statistic,P-value,Significant
0,gender,0.4714,0.4923,No
1,sleep_quality,9.4999,0.0233,Yes
2,alcohol_consumption,0.1902,0.6627,No
3,smoking_level,3.2871,0.1933,No
4,mental_health_support,0.5606,0.454,No
5,job_type,13.0694,0.0227,Yes
6,diet_type,4.3133,0.2296,No
7,exercise_type,0.8141,0.6656,No
8,device_usage,5.5859,0.0612,No
9,healthcare_access,1.598,0.4498,No


In [116]:
chi_df = chi_df.drop(14)
chi_df

Unnamed: 0,Variable,Chi2 Statistic,P-value,Significant
0,gender,0.4714,0.4923,No
1,sleep_quality,9.4999,0.0233,Yes
2,alcohol_consumption,0.1902,0.6627,No
3,smoking_level,3.2871,0.1933,No
4,mental_health_support,0.5606,0.454,No
5,job_type,13.0694,0.0227,Yes
6,diet_type,4.3133,0.2296,No
7,exercise_type,0.8141,0.6656,No
8,device_usage,5.5859,0.0612,No
9,healthcare_access,1.598,0.4498,No


In [117]:
chi_df.to_csv("chi2_results.csv", index=False)