In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
# Let pandas render up to 200 columns and 200 rows before truncating output.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200
import random
# Global plot styling: seaborn dark grid, larger text and 9x5 figures on a
# fully transparent background ('#00000000' has an alpha of zero).
sns.set_style('darkgrid')
plt.rcParams.update({
    'font.size': 14,
    'figure.figsize': (9, 5),
    'figure.facecolor': '#00000000',
})

In [3]:
# Load the raw dataset.
# pandas >= 2.0 treats the literal text "None" as a missing value by default.
# This notebook relies on 'None' being a real category of 'Sleep Disorder'
# (it is compared against the string 'None' and used as a groupby key), so
# restrict NA detection to genuinely empty fields.
df = pd.read_csv(
    'Sleep_health_and_lifestyle_dataset.csv',
    keep_default_na=False,
    na_values=[''],
)
df.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [4]:
# Print each column's name, non-null count and dtype to check for missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Blood Pressure           374 non-null    object 
 10  Heart Rate               374 non-null    int64  
 11  Daily Steps              374 non-null    int64  
 12  Sleep Disorder           374 non-null    object 
dtypes: float64(1), int64(7), object(5)
memory usage: 38.1+ KB


In [5]:
# (number of rows, number of columns) of the raw frame.
df.shape

(374, 13)

Looking at the info, we can see there are no null values in the dataset.<br>
So, we can move on to the next steps.

In [6]:
# Work on an independent deep copy so the raw `df` is never modified.
copy = df.copy(deep=True)
copy.head(5)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,3,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea


In [7]:
# List the distinct occupations present in the data.
copy['Occupation'].unique()

array(['Software Engineer', 'Doctor', 'Sales Representative', 'Teacher',
       'Nurse', 'Engineer', 'Accountant', 'Scientist', 'Lawyer',
       'Salesperson', 'Manager'], dtype=object)

<b>How much does a person in each occupation sleep on average?</b>

In [12]:
# Mean sleep duration per occupation, returned as a two-column DataFrame.
OccsleepOnAvg = (
    copy.groupby('Occupation')['Sleep Duration']
    .mean()
    .reset_index()
)

In [17]:
# Round to two decimals, then order from the longest to the shortest average sleep.
OccsleepOnAvg = (
    OccsleepOnAvg
    .round(2)
    .sort_values(by='Sleep Duration', ascending=False)
)

In [18]:
# Display the per-occupation averages (already sorted in descending order).
OccsleepOnAvg

Unnamed: 0,Occupation,Sleep Duration
2,Engineer,7.99
3,Lawyer,7.41
0,Accountant,7.11
5,Nurse,7.06
1,Doctor,6.97
4,Manager,6.9
9,Software Engineer,6.75
10,Teacher,6.69
7,Salesperson,6.4
8,Scientist,6.0


In [25]:
# One colour per occupation bar. A dedicated, seeded generator keeps the
# colours identical on every run; the module-level `random` functions are
# unseeded, so the figure would otherwise change after each kernel restart.
color_rng = random.Random(42)
random_colors = [
    '#%02x%02x%02x' % tuple(color_rng.randint(0, 255) for _channel in range(3))
    for _occupation in OccsleepOnAvg.Occupation
]

fig = go.Figure(data=go.Bar(
    x=OccsleepOnAvg.Occupation,
    y=OccsleepOnAvg['Sleep Duration'],
    text=OccsleepOnAvg['Sleep Duration'],  # label each bar with its value
    marker_color=random_colors,
))

# Customize the chart layout (optional)
fig.update_layout(title='Average Sleep Distribution',
                  xaxis_title='Occupation',
                  yaxis_title='Average Sleep Duration Per day(in Hours)')

# Show the chart
fig.show()

We can see that Engineers get the most sleep on average (~8 hrs/day),<br> while Sales Representatives get the least amount of sleep on average (~6 hrs/day).

In [36]:
# Mean sleep duration for every (occupation, gender) pair present in the data.
genderOccupationSleep = (
    copy.groupby(['Occupation', 'Gender'])['Sleep Duration']
    .mean()
    .reset_index()
)

In [37]:
# Keep two decimal places for display and for the bar labels.
genderOccupationSleep = genderOccupationSleep.round(decimals=2)

In [38]:
# Display the rounded per-occupation, per-gender averages.
genderOccupationSleep

Unnamed: 0,Occupation,Gender,Sleep Duration
0,Accountant,Female,7.11
1,Accountant,Male,7.2
2,Doctor,Female,8.2
3,Doctor,Male,6.93
4,Engineer,Female,8.43
5,Engineer,Male,7.54
6,Lawyer,Female,7.15
7,Lawyer,Male,7.42
8,Manager,Female,6.9
9,Nurse,Female,7.06


In [44]:
# Reshape to one row per occupation and one column per gender. Combinations
# that do not occur in the data become NaN in the pivoted frame.
pivot_df = genderOccupationSleep.pivot(index='Occupation', columns='Gender', values='Sleep Duration')

# Create the bar chart: one trace per gender
fig = go.Figure()

for gender in pivot_df.columns:
    fig.add_trace(go.Bar(x=pivot_df.index, y=pivot_df[gender], name=gender, text=pivot_df[gender]))

# Customize the chart layout (optional)
# The plotted values are per-gender averages, so stacking them would produce a
# bar whose total height (male mean + female mean) has no meaning. Grouped bars
# put the two genders side by side so they can be compared directly.
fig.update_layout(title='Sleep Duration by Occupation and Gender',
                  xaxis_title='Occupation',
                  yaxis_title='Sleep Duration',
                  barmode='group')

# Show the chart
fig.show()

Engineers get the most sleep on average among both genders: 7.54 hrs/day for males and 8.43 hrs/day for females.

In [45]:
# List the distinct sleep-quality scores that occur in the data.
pd.unique(copy['Quality of Sleep'])

array([6, 4, 7, 5, 8, 9], dtype=int64)

We create new columns for the two Blood Pressure components, Systolic and Diastolic,<br>
and we also create a new binary column indicating whether a person has a sleep disorder.

In [48]:
# Systolic pressure: the number before the '/' in 'Blood Pressure'
# (e.g. 126 from '126/83'), stored as a 64-bit integer.
copy['Systolic:1st'] = copy['Blood Pressure'].str.split('/').str[0].astype('int64')

In [50]:
# Diastolic pressure: the number after the '/' in 'Blood Pressure'
# (e.g. 83 from '126/83'), i.e. the second value, stored as a 64-bit integer.
copy['Diastolic:2nd'] = copy['Blood Pressure'].str.split('/').str[1].astype('int64')

In [53]:
# Binary flag: 1 if the person has any type of sleep disorder, otherwise 0.
# "No disorder" can arrive either as the literal string 'None' or as NaN
# (pandas >= 2.0 parses the text "None" as missing by default). A plain
# `!= 'None'` test is True for NaN and would flag every such row as 1, so
# missing values are mapped to 'None' before comparing.
copy['Disorder:yn'] = copy['Sleep Disorder'].fillna('None').ne('None').astype('int64')

In [75]:
# Blood-pressure flags derived from the systolic reading, stored as 0/1 integers:
#   High_BP -> systolic strictly above 130
#   Low_BP  -> systolic strictly below 90
copy['High_BP'] = copy['Systolic:1st'].gt(130).astype(int)
copy['Low_BP'] = copy['Systolic:1st'].lt(90).astype(int)

# Does quality of sleep affect overall health?

In [86]:
# For each sleep-quality score, count how many people have each health flag set
# (the flags are 0/1, so summing counts the 1s).
# NOTE(review): these are raw counts, not rates; group sizes are not accounted for.
healthIssuesSleep = (
    copy.groupby('Quality of Sleep')[['High_BP', 'Low_BP', 'Disorder:yn']]
    .sum()
    .reset_index()
)

In [87]:
# Display the per-score counts of high BP, low BP and sleep-disorder cases.
healthIssuesSleep

Unnamed: 0,Quality of Sleep,High_BP,Low_BP,Disorder:yn
0,4,5,0,5
1,5,3,0,7
2,6,33,0,65
3,7,31,0,37
4,8,0,0,8
5,9,35,0,33


In [78]:
# Pairwise Pearson correlation (the pandas default) between sleep quality and
# the two blood-pressure flags.
# NOTE(review): Low_BP appears to be 0 for every row in this dataset, so its
# correlations come out as NaN.
correlation = copy.loc[:, ['Quality of Sleep', 'High_BP', 'Low_BP']].corr(method='pearson')

In [79]:
# Display the correlation matrix computed in the previous cell.
correlation

Unnamed: 0,Quality of Sleep,High_BP,Low_BP
Quality of Sleep,1.0,-0.086483,
High_BP,-0.086483,1.0,
Low_BP,,,


In [101]:
# Mean age and blood-pressure readings for each sleep-disorder category.
# Only the displayed result is rounded to whole numbers; the stored frame
# keeps full precision.
avgAgeforSleepDis = (
    copy.groupby('Sleep Disorder')[['Age', 'Systolic:1st', 'Diastolic:2nd']]
    .mean()
    .reset_index()
)
avgAgeforSleepDis.round(0)

Unnamed: 0,Sleep Disorder,Age,Systolic:1st,Diastolic:2nd
0,Insomnia,44.0,132.0,87.0
1,,39.0,124.0,81.0
2,Sleep Apnea,50.0,138.0,93.0
