In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Welcome to our sleep saga!
Let's explore some data together and discover new correlations between sleep and productivity :)

# Import data 

First, we need to load our data using the `pd.read_csv` function. 
Don't forget to check the file path!

In [None]:
# Read the sleep-cycle / productivity CSV from the Kaggle input directory into a DataFrame.
csv_path = '/kaggle/input/sleep-cycle-and-productivity/sleep_cycle_productivity.csv'
data = pd.read_csv(csv_path)

# **Data Understanding**

Before diving any deeper let's take a first look into our data.

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
data.shape

It means that we have 5000 rows and 15 columns.

In [None]:
# Preview the first rows to see the column names and typical values.
data.head()

In [None]:
# Data type of every column.
data.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
data.describe()

**Explanation of the Output:**
* count: number of non-null values
* mean: average of the values
* std: standard deviation (spread of data)
* min: minimum value
* 25%: first quartile
* 50%: median value
* 75%: third quartile
* max: maximum value

# **Data Preparation**

## Nulls check

In [None]:
# Count the missing values in each column.
data.isna().sum()

Our data has no null values.

## **Create Datetime Columns**

Our sleep start time and sleep end time columns are stored as floats, where the integer part represents the hour and the decimal part represents the fraction of the hour (for example, 22.5 means 22:30). Let's convert them to `datetime.time` values. 

In [None]:
import datetime

# Convert 'Sleep Start Time' from fractional hours to datetime.time objects.
# The numeric-dtype guard makes this cell safe to re-run: once the column holds
# datetime.time objects the conversion is skipped instead of raising TypeError.
if pd.api.types.is_numeric_dtype(data['Sleep Start Time']):
    # Work in whole minutes, rounded to the nearest minute. Truncating
    # (x % 1) * 60 loses a minute to float error (7.55 h became 07:32), and
    # taking the hour modulo 24 wraps a value of 24 to 00:xx instead of
    # raising ValueError.
    start_minutes = (data['Sleep Start Time'] * 60).round().astype(int).tolist()
    data['Sleep Start Time'] = [
        datetime.time((total // 60) % 24, total % 60) for total in start_minutes
    ]
data['Sleep Start Time'].head(5)

In [None]:
# Convert 'Sleep End Time' from fractional hours to datetime.time objects.
# The numeric-dtype guard makes this cell safe to re-run: once the column holds
# datetime.time objects the conversion is skipped instead of raising TypeError.
if pd.api.types.is_numeric_dtype(data['Sleep End Time']):
    # Work in whole minutes, rounded to the nearest minute. Truncating
    # (x % 1) * 60 loses a minute to float error (7.55 h became 07:32), and
    # taking the hour modulo 24 wraps a value of 24 to 00:xx instead of
    # raising ValueError.
    end_minutes = (data['Sleep End Time'] * 60).round().astype(int).tolist()
    data['Sleep End Time'] = [
        datetime.time((total // 60) % 24, total % 60) for total in end_minutes
    ]
data['Sleep End Time'].head(5)

## **Data Visualisation**

In [None]:
# One box per individual age value, showing the spread of sleep quality.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Age', y='Sleep Quality', data=data, ax=ax)
ax.set_title('Sleep Quality by Age')
plt.show()

For better visibility let's make a function that groups ages into age groups. 

In [None]:
def age_group(age):
    """Map an age in years to its age-group label.

    Groups:
        '18-30' -- 18 <= age <= 30
        '30-40' -- 30 <  age <= 40
        '40-59' -- 40 <  age <= 59

    Returns:
        The group label, or None when the age is missing (None/NaN) or lies
        outside 18-59. A catch-all ``else`` would silently count such values
        as '40-59'.
    """
    if pd.isna(age):
        return None
    if 18 <= age <= 30:
        return '18-30'
    if 30 < age <= 40:
        return '30-40'
    if 40 < age <= 59:
        return '40-59'
    # Below 18 or above 59: not covered by any of the defined groups.
    return None
# Label every row with its age group and preview the new column.
data['Age Groups'] = data['Age'].map(age_group)
data['Age Groups'].head(5)

In [None]:
# Pin the category order so the groups read youngest to oldest on the x-axis.
# Without `order`, string categories are laid out in the order they first
# appear in the data, which can vary with the row order.
sns.boxplot(
    data=data,
    x='Age Groups',
    y='Sleep Quality',
    order=['18-30', '30-40', '40-59'],
)
plt.title('Sleep Quality in different Age Groups')
plt.show()

We can see that sleep quality is the worst in the oldest age group (older than 40, up to 59).
Now let's see how the amount of consumed caffeine affects our sleep quality. A typical espresso contains around 60 mg of caffeine. Let's use this information to see how the number of espresso shots affects our sleep quality. 

In [None]:
def espresso_amount(caffeine):
    """Convert a caffeine intake in mg into an espresso-shot bucket label.

    One espresso shot is taken to be 60 mg. Buckets:
        '1' -- 0 <= caffeine < 60
        '2' -- 60 <= caffeine < 120
        '3' -- 120 <= caffeine < 180
        '4' -- 180 <= caffeine < 240
        '5' -- caffeine >= 240

    Returns:
        The bucket label as a string, or None when the value is missing
        (None/NaN) or negative. A catch-all ``else`` would silently put such
        values in the top bucket '5'.
    """
    mg_per_shot = 60
    if pd.isna(caffeine) or caffeine < 0:
        return None
    # Return the first bucket whose upper bound lies above the intake.
    for shots in range(1, 5):
        if caffeine < shots * mg_per_shot:
            return str(shots)
    return '5'
# Bucket every row's caffeine intake into an espresso-shot label.
data['Espresso Shots'] = data['Caffeine Intake (mg)'].map(espresso_amount)

In [None]:
# Sleep quality across the espresso-shot buckets, with a marker at each bucket.
ax = sns.lineplot(x='Espresso Shots', y='Sleep Quality', data=data, marker='o')
ax.set_title('The effect of drunk espresso on sleep quality')
plt.show()

Interestingly, sleep quality is best for people who drink 2 to 3 espresso shots per day. 

Now let's see what else we can explore by looking at the heatmap. 

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
data_numeric = data.select_dtypes(include=['number'])
corr = data_numeric.corr()
sns.heatmap(corr, cmap='coolwarm', annot=True, fmt='.2f')
plt.show()

We can see that there is a negative correlation between work hours and productivity score. 
Let's see it!

In [None]:
# Round daily work hours to the nearest whole hour so rows can be grouped by hour.
hours_worked = data['Work Hours (hrs/day)']
data['Work Hours (int)'] = hours_worked.round()
data['Work Hours (int)'].head(5)

In [None]:
# Productivity score against the rounded number of daily work hours.
sns.lineplot(
    x='Work Hours (int)',
    y='Productivity Score',
    data=data,
)
plt.show()

Our productivity score decreases as the number of work hours increases.
Now let's see how exercise affects our productivity.

In [None]:
def _minutes_to_hours(minutes):
    """Express a duration given in minutes as hours, rounded to one decimal place."""
    return round(minutes / 60, 1)


# Derive the exercise duration in hours from the minutes column and preview it.
data['Exercise (hours/day)'] = data['Exercise (mins/day)'].apply(_minutes_to_hours)
data['Exercise (hours/day)'].head(5)

In [None]:
# Spread of the productivity score for each exercise duration (in tenths of an hour).
sns.boxplot(
    x='Exercise (hours/day)',
    y='Productivity Score',
    data=data,
)
plt.show()

Now we have discovered another interesting fact!
According to this plot, working out for around 1 or 1.5 hours per day negatively impacts our productivity score. 

# Summary

Let's sum up what we discovered:
* our sleep quality varies depending on age (it is the worst for ages between 40 and 59),
* it is better to drink 2 to 3 shots of espresso per day,
* our productivity score decreases as the number of work hours increases,
* working out for around 1 or 1.5 hours per day negatively impacts our productivity score

I am still learning so I would be grateful for any comments! :)