In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris, fetch_california_housing

import matplotlib.pyplot as plt
import seaborn as sns

# Reading the file

In [2]:
# Load the raw survey data; the bare `df` on the last line renders a preview of the frame.
# NOTE(review): by default read_csv parses the literal string "None" as a missing value (NaN).
# If categorical columns such as caffeine_intake use "None" as a real category, those cells
# become NaN here and the rows are removed by the later dropna() - confirm against the raw file.
df= pd.read_csv('../Data/Raw/health_lifestyle_classification.csv')
df

Unnamed: 0,survey_code,age,gender,height,weight,bmi,bmi_estimated,bmi_scaled,bmi_corrected,waist_size,...,sunlight_exposure,meals_per_day,caffeine_intake,family_history,pet_owner,electrolyte_level,gene_marker_flag,environmental_risk_score,daily_supplement_dosage,target
0,1,56,Male,173.416872,56.886640,18.915925,18.915925,56.747776,18.989117,72.165130,...,High,5,Moderate,No,Yes,0,1.0,5.5,-2.275502,healthy
1,2,69,Female,163.207380,97.799859,36.716278,36.716278,110.148833,36.511417,85.598889,...,High,5,High,Yes,No,0,1.0,5.5,6.239340,healthy
2,3,46,Male,177.281966,80.687562,25.673050,25.673050,77.019151,25.587429,90.295030,...,High,4,Moderate,No,No,0,1.0,5.5,5.423737,healthy
3,4,32,Female,172.101255,63.142868,21.318480,21.318480,63.955440,21.177109,100.504211,...,High,1,,No,Yes,0,1.0,5.5,8.388611,healthy
4,5,60,Female,163.608816,40.000000,14.943302,14.943302,44.829907,14.844299,69.021150,...,High,1,High,Yes,Yes,0,1.0,5.5,0.332622,healthy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99996,53,Male,177.202253,54.303671,17.293811,17.293811,51.881433,17.227616,88.740028,...,Moderate,1,High,No,Yes,0,1.0,5.5,3.477124,healthy
99996,99997,22,Male,180.802297,40.033853,12.246712,12.246712,36.740135,12.159473,103.659560,...,Moderate,5,,Yes,No,0,1.0,5.5,1.609656,healthy
99997,99998,37,Male,185.540653,84.536847,24.556580,24.556580,73.669741,24.172944,98.920422,...,Low,5,,No,No,0,1.0,5.5,-9.736463,healthy
99998,99999,72,Female,181.796786,56.923335,17.223362,17.223362,51.670087,17.715475,54.559079,...,Low,4,High,Yes,Yes,0,1.0,5.5,-4.779376,healthy


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the physical_activity column.
df['physical_activity'].describe()

count    100000.000000
mean          3.038344
std           1.884475
min           0.000000
25%           1.633799
50%           2.971222
75%           4.326500
max          11.631898
Name: physical_activity, dtype: float64

# Dropping columns that we consider unnecessary, then dropping rows with missing values (NA)

In [4]:
# Columns removed from the raw frame before cleaning (note that `bmi` itself is not listed).
columns_to_drop = [
    'gene_marker_flag',
    'income',
    'bmi_estimated',
    'bmi_scaled',
    'bmi_corrected',
    'insurance',
    'occupation',
    'electrolyte_level',
    'education_level',
]

In [5]:
# Remove the unwanted columns, then discard every row that still contains a missing value.
# The raw frame `df` is left untouched; the result is a new frame.
df_2 = df.drop(columns=columns_to_drop).dropna()

# Column overview (dtypes and non-null counts) of the cleaned frame.
df_2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17577 entries, 1 to 99998
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   survey_code               17577 non-null  int64  
 1   age                       17577 non-null  int64  
 2   gender                    17577 non-null  object 
 3   height                    17577 non-null  float64
 4   weight                    17577 non-null  float64
 5   bmi                       17577 non-null  float64
 6   waist_size                17577 non-null  float64
 7   blood_pressure            17577 non-null  float64
 8   heart_rate                17577 non-null  float64
 9   cholesterol               17577 non-null  float64
 10  glucose                   17577 non-null  float64
 11  insulin                   17577 non-null  float64
 12  sleep_hours               17577 non-null  float64
 13  sleep_quality             17577 non-null  object 
 14  work_hours 

# Converting object (string) columns into integer codes.

In [6]:
# Encode gender as an integer code: Male -> 0, Female -> 1.
gender_mapping = {'Male': 0, 'Female': 1}

# Series.map() turns every value that is not a key of the dict into NaN, so running
# this cell a second time would push the already-encoded integers through the dict
# and wipe the column. Only apply the mapping while raw labels are still present.
if df_2['gender'].isin(list(gender_mapping)).any():
    df_2['gender'] = df_2['gender'].map(gender_mapping)

In [7]:
# Encode caffeine_intake as an integer code: High -> 0, Moderate -> 1.
# Only these two labels are mapped; any other label would become NaN via Series.map().
caffeine_intake_mapping = {'High': 0, 'Moderate': 1}

# Guard keeps the cell re-runnable: mapping the already-encoded integers a second
# time would turn the whole column into NaN.
if df_2['caffeine_intake'].isin(list(caffeine_intake_mapping)).any():
    df_2['caffeine_intake'] = df_2['caffeine_intake'].map(caffeine_intake_mapping)

In [8]:
# Encode family_history as an integer code: Yes -> 0, No -> 1.
family_history_mapping = {'Yes': 0, 'No': 1}

# Guard keeps the cell re-runnable: Series.map() yields NaN for values that are not
# dict keys, which is what the encoded integers would be on a second run.
if df_2['family_history'].isin(list(family_history_mapping)).any():
    df_2['family_history'] = df_2['family_history'].map(family_history_mapping)

In [9]:
# Encode sleep_quality as increasing integer codes from Poor (0) to Excellent (3).
quality_mapping = {
    'Poor': 0,
    'Fair': 1,
    'Good': 2,
    'Excellent': 3
}

# Series.map() returns NaN for anything that is not a dict key, so re-running this
# cell on the encoded column would erase it. Map only while raw labels are present.
if df_2['sleep_quality'].isin(list(quality_mapping)).any():
    df_2['sleep_quality'] = df_2['sleep_quality'].map(quality_mapping)

In [10]:
# Encode alcohol_consumption as an integer code: Occasionally -> 0, Regularly -> 1.
# Only these two labels are mapped; any other label would become NaN via Series.map().
alcohol_mapping = {
    'Regularly': 1,
    'Occasionally': 0
}

# Guard keeps the cell re-runnable: a second pass over the encoded integers would
# turn the whole column into NaN.
if df_2['alcohol_consumption'].isin(list(alcohol_mapping)).any():
    df_2['alcohol_consumption'] = df_2['alcohol_consumption'].map(alcohol_mapping)

In [11]:
# Encode pet_owner as an integer code: Yes -> 0, No -> 1.
pet_owner_mapping = {
    'No': 1,
    'Yes': 0
}

# Guard keeps the cell re-runnable: Series.map() yields NaN for non-key values,
# which is what the encoded integers would be on a second run.
if df_2['pet_owner'].isin(list(pet_owner_mapping)).any():
    df_2['pet_owner'] = df_2['pet_owner'].map(pet_owner_mapping)

In [12]:
# Encode smoking_level as integer codes: Non-smoker -> 1, Light -> 2, Heavy -> 3.
smoking_map = {
    'Non-smoker': 1,
    'Light': 2,
    'Heavy': 3
}

# Series.map() returns NaN for anything that is not a dict key, so re-running this
# cell on the encoded column would erase it. Map only while raw labels are present.
if df_2['smoking_level'].isin(list(smoking_map)).any():
    df_2['smoking_level'] = df_2['smoking_level'].map(smoking_map)

In [13]:
# Encode mental_health_support as an integer code: Yes -> 0, No -> 1.
mental_health_map = {
    'No': 1,
    'Yes': 0,
}

# Guard keeps the cell re-runnable: a second pass over the encoded integers would
# turn the whole column into NaN.
if df_2['mental_health_support'].isin(list(mental_health_map)).any():
    df_2['mental_health_support'] = df_2['mental_health_support'].map(mental_health_map)

In [14]:
# Encode job_type as integer codes 1-6.
# NOTE(review): the codes look like arbitrary identifiers rather than a ranking -
# confirm that downstream models do not interpret them as ordinal.
job_type_map = {
    'Office': 1,
    'Labor': 2,
    'Service': 3,
    'Unemployed': 4,
    'Healthcare': 5,
    'Tech': 6
}

# Series.map() returns NaN for anything that is not a dict key, so re-running this
# cell on the encoded column would erase it. Map only while raw labels are present.
if df_2['job_type'].isin(list(job_type_map)).any():
    df_2['job_type'] = df_2['job_type'].map(job_type_map)

In [15]:
# Encode diet_type as integer codes 1-4.
# NOTE(review): the codes look like arbitrary identifiers rather than a ranking -
# confirm that downstream models do not interpret them as ordinal.
diet_type_map = {
    'Vegan': 1,
    'Omnivore': 2,
    'Keto': 3,
    'Vegetarian': 4
}

# Guard keeps the cell re-runnable: Series.map() yields NaN for non-key values,
# which is what the encoded integers would be on a second run.
if df_2['diet_type'].isin(list(diet_type_map)).any():
    df_2['diet_type'] = df_2['diet_type'].map(diet_type_map)

In [16]:
# Encode exercise_type as integer codes: Cardio -> 0, Strength -> 1, Mixed -> 2.
# Only these three labels are mapped; any other label would become NaN via Series.map().
exercise_map = {
    'Cardio': 0,
    'Strength': 1,
    'Mixed': 2
}

# Guard keeps the cell re-runnable: a second pass over the encoded integers would
# turn the whole column into NaN.
if df_2['exercise_type'].isin(list(exercise_map)).any():
    df_2['exercise_type'] = df_2['exercise_type'].map(exercise_map)

In [17]:
# Encode healthcare_access as increasing integer codes: Poor -> 0, Moderate -> 1, Good -> 2.
healthcare_access_map = {
    'Poor': 0,
    'Moderate': 1,
    'Good': 2
}

# Series.map() returns NaN for anything that is not a dict key, so re-running this
# cell on the encoded column would erase it. Map only while raw labels are present.
if df_2['healthcare_access'].isin(list(healthcare_access_map)).any():
    df_2['healthcare_access'] = df_2['healthcare_access'].map(healthcare_access_map)

In [18]:
# Encode sunlight_exposure as increasing integer codes: Low -> 0, Moderate -> 1, High -> 2.
sunlight_exposure_map = {
    'Low': 0,
    'Moderate': 1,
    'High': 2
}

# Guard keeps the cell re-runnable: Series.map() yields NaN for non-key values,
# which is what the encoded integers would be on a second run.
if df_2['sunlight_exposure'].isin(list(sunlight_exposure_map)).any():
    df_2['sunlight_exposure'] = df_2['sunlight_exposure'].map(sunlight_exposure_map)

In [19]:
# Encode device_usage as increasing integer codes: Low -> 0, Medium -> 1, High -> 2.
device_usage_map = {
    'Low': 0,
    'Medium': 1,
    'High': 2
}

# Guard keeps the cell re-runnable: a second pass over the encoded integers would
# turn the whole column into NaN.
if df_2['device_usage'].isin(list(device_usage_map)).any():
    df_2['device_usage'] = df_2['device_usage'].map(device_usage_map)

In [20]:
# Show dtypes and non-null counts again, now that the categorical columns have been mapped.
df_2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17577 entries, 1 to 99998
Data columns (total 39 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   survey_code               17577 non-null  int64  
 1   age                       17577 non-null  int64  
 2   gender                    17577 non-null  int64  
 3   height                    17577 non-null  float64
 4   weight                    17577 non-null  float64
 5   bmi                       17577 non-null  float64
 6   waist_size                17577 non-null  float64
 7   blood_pressure            17577 non-null  float64
 8   heart_rate                17577 non-null  float64
 9   cholesterol               17577 non-null  float64
 10  glucose                   17577 non-null  float64
 11  insulin                   17577 non-null  float64
 12  sleep_hours               17577 non-null  float64
 13  sleep_quality             17577 non-null  int64  
 14  work_hours 

In [22]:
# Persist the cleaned frame as CSV; index=False leaves the row index out of the file.
df_2.to_csv(path_or_buf='../Data/Clean/Clean_data.csv', index=False)