In [12]:
#Import Libraries

import pandas as pd
import numpy as np


In [13]:
#Read and identify the dataset
# The relative path uses a forward slash so it resolves on Windows and on
# POSIX systems alike; a backslash is only treated as a separator on Windows.

raw_data = pd.read_csv('raw-data/sleep_health_and_lifestyle_dataset.csv')
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           155 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB


In [14]:
# Clean the dataset: discard fully duplicated rows, then discard rows that are
# missing a value in any column other than 'Sleep Disorder'.

columns = list(raw_data.columns)
no_null_columns = list(columns)
no_null_columns.remove('Sleep Disorder')  # nulls in this column are kept

data_edited = raw_data.drop_duplicates().dropna(subset=no_null_columns)

data_edited.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           155 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB


In [15]:
# Break the 'Blood Pressure' strings apart at '/' and store the two halves as
# numeric columns, then remove the original text column.

bp_parts = data_edited['Blood Pressure'].str.split('/', expand=True)
bp_numeric = bp_parts.apply(pd.to_numeric)  # convert each half column-wise

data_edited[['BP_Systolic', 'BP_Diastolic']] = bp_numeric
del data_edited['Blood Pressure']

data_edited.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Heart Rate               374 non-null    int64  
 10  Daily Steps              374 non-null    int64  
 11  Sleep Disorder           155 non-null    object 
 12  BP_Systolic              374 non-null    int64  
 13  BP_Diastolic             374 non-null    int64  
dtypes: float64(1), int64(9), o

In [16]:
# List the distinct values of every object-typed column so that misspelled or
# inconsistent labels can be spotted by eye.

object_column_list = list(data_edited.select_dtypes(include=object).columns)

for name in object_column_list:
    distinct_values = data_edited[name].unique()
    print(name, "\t", distinct_values, "\n")

Gender 	 ['Male' 'Female'] 

Occupation 	 ['Software Engineer' 'Doctor' 'Sales Representative' 'Teacher' 'Nurse'
 'Engineer' 'Accountant' 'Scientist' 'Lawyer' 'Salesperson' 'Manager'] 

BMI Category 	 ['Overweight' 'Normal' 'Obese' 'Normal Weight'] 

Sleep Disorder 	 [nan 'Sleep Apnea' 'Insomnia'] 



In [17]:
# Unify equivalent category labels and give missing 'Sleep Disorder' entries
# an explicit label, then re-print the distinct values to confirm the result.

label_fixes = {
    'BMI Category': {'Normal Weight': 'Normal'},
    'Occupation': {'Salesperson': 'Sales Representative'},
}
for name, mapping in label_fixes.items():
    data_edited[name] = data_edited[name].replace(mapping)

# Missing values in this column are relabelled rather than dropped.
data_edited['Sleep Disorder'] = data_edited['Sleep Disorder'].fillna('No disorder')

for name in object_column_list:
    print(name, "\t", data_edited[name].unique(), "\n")

data_edited.info()

Gender 	 ['Male' 'Female'] 

Occupation 	 ['Software Engineer' 'Doctor' 'Sales Representative' 'Teacher' 'Nurse'
 'Engineer' 'Accountant' 'Scientist' 'Lawyer' 'Manager'] 

BMI Category 	 ['Overweight' 'Normal' 'Obese'] 

Sleep Disorder 	 ['No disorder' 'Sleep Apnea' 'Insomnia'] 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Heart Rate     

In [18]:
# Normalize numerical data with min-max scaling: (x - min) / (max - min),
# which maps each numeric column onto the [0, 1] range.
# 'Person ID' is removed from the list so it is left unscaled.

numerical_input_parameters = data_edited.select_dtypes(include=np.number).columns.tolist()
numerical_input_parameters.remove('Person ID')
print(numerical_input_parameters)

for column in numerical_input_parameters:
    column_min = data_edited[column].min()
    column_range = data_edited[column].max() - column_min
    if column_range == 0:
        # A constant column would otherwise divide by zero and turn into NaN;
        # dividing by 1 maps every value of such a column to 0.0 instead.
        column_range = 1
    # Plain column arithmetic replaces the former exec() of a formatted string,
    # which would break on column names containing a quote character.
    data_edited[column] = (data_edited[column] - column_min) / column_range

data_edited.head()

['Age', 'Sleep Duration', 'Quality of Sleep', 'Physical Activity Level', 'Stress Level', 'Heart Rate', 'Daily Steps', 'BP_Systolic', 'BP_Diastolic']


Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder,BP_Systolic,BP_Diastolic
0,1,Male,0.0,Software Engineer,0.111111,0.4,0.2,0.6,Overweight,0.571429,0.171429,No disorder,0.407407,0.4
1,2,Male,0.03125,Doctor,0.148148,0.4,0.5,1.0,Normal,0.47619,1.0,No disorder,0.37037,0.25
2,3,Male,0.03125,Doctor,0.148148,0.4,0.5,1.0,Normal,0.47619,1.0,No disorder,0.37037,0.25
3,4,Male,0.03125,Sales Representative,0.037037,0.0,0.0,1.0,Obese,0.952381,0.0,Sleep Apnea,0.925926,0.75
4,5,Male,0.03125,Sales Representative,0.037037,0.0,0.0,1.0,Obese,0.952381,0.0,Sleep Apnea,0.925926,0.75


In [19]:
# Detach the target column from the feature frame; it is kept in
# `output_column` so it can be appended again later.

output_column = data_edited.pop('Sleep Disorder')

In [20]:
# One-hot encode every remaining object-typed column, then append the target
# values as the final column under the name 'Sleep_Disorder'.

object_input_list = data_edited.select_dtypes(include=object).columns.tolist()

data_encoded = (
    pd.get_dummies(data_edited, columns=object_input_list)
    .assign(Sleep_Disorder=output_column)
)

data_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 26 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Person ID                        374 non-null    int64  
 1   Age                              374 non-null    float64
 2   Sleep Duration                   374 non-null    float64
 3   Quality of Sleep                 374 non-null    float64
 4   Physical Activity Level          374 non-null    float64
 5   Stress Level                     374 non-null    float64
 6   Heart Rate                       374 non-null    float64
 7   Daily Steps                      374 non-null    float64
 8   BP_Systolic                      374 non-null    float64
 9   BP_Diastolic                     374 non-null    float64
 10  Gender_Female                    374 non-null    bool   
 11  Gender_Male                      374 non-null    bool   
 12  Occupation_Accountant 

In [21]:
#Save to a csv file
# The index is not written. The path uses a forward slash so that the
# 'edited-data' directory is honoured on POSIX systems as well as on Windows.

data_encoded.to_csv('edited-data/sleep_data.csv', index=False)