## Linear regression

In [1]:
from sklearn.datasets import  fetch_california_housing
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

In [2]:
df= pd.read_csv('../Data/Raw/health_lifestyle_classification.csv')

# Dropping columns that we consider unnecessary and then dropping NA

In [3]:
df.columns

Index(['survey_code', 'age', 'gender', 'height', 'weight', 'bmi',
       'bmi_estimated', 'bmi_scaled', 'bmi_corrected', 'waist_size',
       'blood_pressure', 'heart_rate', 'cholesterol', 'glucose', 'insulin',
       'sleep_hours', 'sleep_quality', 'work_hours', 'physical_activity',
       'daily_steps', 'calorie_intake', 'sugar_intake', 'alcohol_consumption',
       'smoking_level', 'water_intake', 'screen_time', 'stress_level',
       'mental_health_score', 'mental_health_support', 'education_level',
       'job_type', 'occupation', 'income', 'diet_type', 'exercise_type',
       'device_usage', 'healthcare_access', 'insurance', 'sunlight_exposure',
       'meals_per_day', 'caffeine_intake', 'family_history', 'pet_owner',
       'electrolyte_level', 'gene_marker_flag', 'environmental_risk_score',
       'daily_supplement_dosage', 'target'],
      dtype='object')

In [4]:
columns_to_drop=['gene_marker_flag', 'daily_supplement_dosage', 'income','survey_code', 'bmi_estimated', 'bmi_scaled',
                 'bmi_corrected','insurance','occupation','electrolyte_level','education_level', 
                 'pet_owner', 'caffeine_intake', 'mental_health_support', 'meals_per_day', 'sunlight_exposure', 'height', 'weight']

In [5]:
df_2=df.drop(columns_to_drop,axis=1)
df_2=df_2.dropna()
# df_2.info()

In [6]:
df_2.select_dtypes('number').columns

Index(['age', 'bmi', 'waist_size', 'blood_pressure', 'heart_rate',
       'cholesterol', 'glucose', 'insulin', 'sleep_hours', 'work_hours',
       'physical_activity', 'daily_steps', 'calorie_intake', 'sugar_intake',
       'water_intake', 'screen_time', 'stress_level', 'mental_health_score',
       'environmental_risk_score'],
      dtype='object')

# Eliminating outliers

In [7]:
df_2=df_2[(df_2['insulin'] >= 0) | (df_2['sugar_intake'] >= 0) | (df_2['bmi'] <= 49.71) | (df_2['cholesterol'] >= 83) 
          | (df_2['bmi'] <= 16.000) |(df_2['glucose'] <= 169.0) | ((df_2['heart_rate'] >= 42) & (df_2['heart_rate'] <= 109)) 
          | (df_2['physical_activity'] <= 9) | (df_2['physical_activity'] <= 15) | (df_2['insulin'] <= 31) 
          | (df_2['bmi'] <= 16.000) |(df_2['glucose'] <= 169.0) | ((df_2['heart_rate'] >= 42) & (df_2['heart_rate'] <= 109)) 
          | (df_2['physical_activity'] <= 9) | (df_2['physical_activity'] <= 15) | (df_2['sugar_intake'] <= 125.3)
          | ((df_2['waist_size'] >= 43) & (df_2['waist_size'] <= 123))|(df_2['water_intake'] <= 4.200)|(df_2['water_intake'] <= 119)
          |((df_2['work_hours'] >= 2) & (df_2['work_hours'] <= 14.5))]

# Identifying object columns and dividing them into ordinals and not ordinals

In [8]:
df_2.select_dtypes('object').columns

Index(['gender', 'sleep_quality', 'alcohol_consumption', 'smoking_level',
       'job_type', 'diet_type', 'exercise_type', 'device_usage',
       'healthcare_access', 'family_history', 'target'],
      dtype='object')

In [9]:
df_2['device_usage'].unique()

array(['Moderate', 'Low', 'High'], dtype=object)

In [10]:
object_col_non_ord=['gender', 'job_type', 'diet_type', 'exercise_type', 'family_history']
object_col_ord=['sleep_quality','smoking_level','device_usage','healthcare_access', 'alcohol_consumption']

In [11]:
alcohol_mapping = {
    'Regularly': 1,
    'Occasionally': 0
}
df_2['alcohol_consumption'] = df_2['alcohol_consumption'].map(alcohol_mapping)

In [12]:
device_usage_map = {'Low': 0,
    'Moderate': 1,
    'High': 2
}  
df_2['device_usage'] = df_2['device_usage'].map(device_usage_map)

In [13]:
quality_mapping = {
    'Poor': 0,
    'Fair': 1,
    'Good': 2,
    'Excellent': 3
}
df_2['sleep_quality'] = df_2['sleep_quality'].map(quality_mapping)

In [14]:
smoking_map = {
    'Non-smoker': 0,
    'Light': 1,
    'Heavy': 2
}
df_2['smoking_level'] = df_2['smoking_level'].map(smoking_map)

In [15]:
healthcare_access_map = {
    'Poor': 0,
    'Moderate': 1,
    'Good': 2
}
df_2['healthcare_access'] = df_2['healthcare_access'].map(healthcare_access_map)

# Training the encoder

In [16]:
ohe = OneHotEncoder(drop='first',sparse_output=False)
ohe.fit(df_2[object_col_non_ord])

# Doing the Split

In [17]:
features = df_2.drop(columns='target')
target = df_2["target"]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.20, random_state=0)
# X_train.info()

In [18]:
X_train_num=X_train.drop(object_col_non_ord, axis=1)
X_test_num= X_test.drop(object_col_non_ord, axis=1)
# X_train_num.info()

# We are creating the new DF after enconding and we are overwriting the X_train and X_test to avoid doing the split again

In [19]:
X_train_trans_np = ohe.transform(X_train[object_col_non_ord])
X_train_trans_df = pd.DataFrame(X_train_trans_np, columns=ohe.get_feature_names_out(), index=X_train.index)
X_train=pd.concat([X_train_trans_df,X_train_num],axis=1)


In [20]:
X_test_trans_np = ohe.transform(X_test[object_col_non_ord])
X_test_trans_df = pd.DataFrame(X_test_trans_np, columns=ohe.get_feature_names_out(), index=X_test.index)
X_test=pd.concat([X_test_trans_df,X_test_num],axis=1)
# X_test.info()

In [21]:
X_test.describe()

Unnamed: 0,gender_Male,job_type_Labor,job_type_Office,job_type_Service,job_type_Tech,job_type_Unemployed,diet_type_Omnivore,diet_type_Vegan,diet_type_Vegetarian,exercise_type_Mixed,...,sugar_intake,alcohol_consumption,smoking_level,water_intake,screen_time,stress_level,mental_health_score,device_usage,healthcare_access,environmental_risk_score
count,5282.0,5282.0,5282.0,5282.0,5282.0,5282.0,5282.0,5282.0,5282.0,5282.0,...,5282.0,5282.0,5282.0,5282.0,5282.0,5282.0,5282.0,5282.0,5282.0,5282.0
mean,0.506437,0.167739,0.162249,0.168307,0.168875,0.171147,0.246308,0.244604,0.250284,0.325634,...,60.14605,0.490155,0.996592,2.001751,6.052267,5.054903,5.053957,1.010413,1.018743,5.5
std,0.500006,0.37367,0.368714,0.374175,0.374677,0.376673,0.430901,0.429893,0.433218,0.468656,...,19.636672,0.49995,0.812926,0.696285,2.988256,3.147515,3.191532,0.819016,0.824629,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-15.852305,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,5.5
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,46.689147,0.0,0.0,1.527498,3.967581,2.0,2.0,0.0,0.0,5.5
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,60.320956,0.0,1.0,2.000563,6.034931,5.0,5.0,1.0,1.0,5.5
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,73.242882,1.0,2.0,2.449123,8.082548,8.0,8.0,2.0,2.0,5.5
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,133.281884,1.0,2.0,5.0,16.0,10.0,10.0,2.0,2.0,5.5


In [22]:
col_to_minmax=['bmi', 'waist_size', 'blood_pressure','heart_rate', 'cholesterol', 'glucose', 'insulin', 'sleep_hours','work_hours', 'physical_activity',
               'daily_steps','calorie_intake', 'sugar_intake', 'water_intake', 'screen_time', 'stress_level','mental_health_score', 'environmental_risk_score']
X_train_minmax=X_train[col_to_minmax]
X_test_minmax=X_test[col_to_minmax]
X_train_minmax_opposite=X_train.drop(col_to_minmax,axis=1)
X_test_minmax_opposite=X_test.drop(col_to_minmax,axis=1)

# Applying the minmaxscaler only to columns that still need to be normalized

In [23]:
normalizer = MinMaxScaler()
normalizer.fit(X_train_minmax)

In [24]:
X_train_norm = normalizer.transform(X_train_minmax)
X_test_norm = normalizer.transform(X_test_minmax)

X_train_norm_df = pd.DataFrame(X_train_norm, columns=X_train_minmax.columns, index=X_train_minmax.index )
X_test_norm_df = pd.DataFrame(X_test_norm, columns=X_test_minmax.columns, index=X_test_minmax.index)

# Now we concat again

In [25]:
X_train_final=pd.concat([X_train_norm_df,X_train_minmax_opposite],axis=1)
X_test_final=pd.concat([X_test_norm_df,X_test_minmax_opposite],axis=1)

# We now apply the scaler

In [26]:
scaler = StandardScaler()

scaler.fit(X_train_final)

In [30]:
X_train_standarized_np = scaler.transform(X_train_final)
X_test_standarized_np = scaler.transform(X_test_final)

X_train_standarized_df = pd.DataFrame(X_train_standarized_np, columns = X_train_final.columns, index=X_train_final.index)
X_test_standarized_df  = pd.DataFrame(X_test_standarized_np, columns = X_test_final.columns, index=X_test_final.index)

# Turning Y into boolean

In [34]:
y_train_map = {
    'healthy': 0,
    'diseased': 1
}
y_train_bool= y_train.map(y_train_map)
y_test_bool=y_test.map(y_train_map)

# Now we balance over and under!

In [35]:
over = RandomOverSampler(random_state=0)
X_train_over, y_train_over = over.fit_resample(X_train_final,y_train_bool)
under = RandomUnderSampler(random_state=0)
X_train_under, y_train_under = under.fit_resample(X_train_final,y_train_bool)

# Defining the instances

In [36]:
lin_reg = LinearRegression()
lin_reg2 = LinearRegression()

In [37]:
lin_reg.fit(X_train_final, y_train_bool)
lin_reg2.fit(X_train_standarized_df, y_train_bool)

In [38]:
import pickle

with open("linear_model.pkl", "wb") as file:
    pickle.dump(lin_reg, file)

with open("min_max_scaler.pkl", "wb") as file:
    pickle.dump(normalizer, file)

# Evaluate model's performance

In [41]:
y_pred_test = lin_reg.predict(X_test_final)

print(f"Linear regression results:")
print(f"MAE {mean_absolute_error(y_pred_test, y_test_bool): .2f}") 
print(f"MSE {mean_squared_error(y_pred_test, y_test_bool): .2f}") 
print(f"RMSE, {root_mean_squared_error(y_pred_test, y_test_bool): .2f}") 
print(f"R2 score, {lin_reg.score(X_test_final, y_test_bool): .2f}")

Linear regression results:
MAE  0.41
MSE  0.21
RMSE,  0.45
R2 score, -0.00
