In [None]:
# Import libraries
import csv
import sys, os
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from datetime import timedelta
from collections import deque

In [None]:
# Load the complete daily_summary table from the local Fitbit SQLite database
db_connection = sqlite3.connect('../data/fitbit.db')
daily_summary_query = "select * from daily_summary"
data = pd.read_sql(sql=daily_summary_query, con=db_connection)
# `size` is rows x columns, i.e. the total number of cells that were loaded
print(data.size)
data.head(n=3)

In [None]:
# Remove empty entries: keep only the rows that have a recorded step count
has_steps = data['Steps'].notnull()
data = data[has_steps]

In [None]:
# Show the distribution of every numerical column in the dataset,
# one histogram per column on an 11 x 5 grid
hist_options = dict(figsize=(15, 20), layout=(11, 5), xrot=30)
h = data.hist(**hist_options)

In [None]:
# DATA WRANGLING

# `data` is the result of a row filter; take an explicit copy so that the
# column assignments below do not trigger pandas' SettingWithCopyWarning.
data = data.copy()

# Create weekday/day/month labels (Weekday: Monday = 0 ... Sunday = 6).
# Each date is parsed once; missing dates propagate as missing values.
date_parsed = pd.to_datetime(data['Date'], format="%Y-%m-%d")
data['Weekday'] = date_parsed.dt.weekday
data['Day'] = date_parsed.dt.day
data['Month'] = date_parsed.dt.month

# Percentage of awake time to time in bed (related to efficiency)
data['Awake Percentage'] = data['Sleep Minutes Awake'] / data['Time in Bed'] * 100

# Sleep start as a fractional hour of the day (e.g. 23.5 for 11:30pm).
dt_format = "%Y-%m-%dT%H:%M:%S.%f"
sleep_start = pd.to_datetime(data['Sleep Start Time'], format=dt_format)
data['Sleep Start Hour'] = sleep_start.dt.hour + sleep_start.dt.minute / 60.0

# Midnight is the baseline so that hours can be either + or - from midnight:
# a start time after noon belongs to the previous evening, so shift it back
# by a full day (24 hours), e.g. 23.5 -> -0.5.
# Rows are selected by index label with .loc. Rows have been filtered out
# earlier, so labels no longer match row positions and positional .iloc
# would read the wrong rows.
ind = data.index[data['Sleep Start Hour'] > 12.0].tolist()
data.loc[ind, 'Sleep Start Hour'] = data.loc[ind, 'Sleep Start Hour'] - 24.0

# Waking up time (hours relative to midnight); 'Time in Bed' is divided by 60
# to convert it to hours
data['Wake Up Hour'] = data['Sleep Start Hour'] + data['Time in Bed'] / 60

In [None]:
# DEBUG INFO - Show all column names
#list(data)
#data.head(5)

In [None]:
# Clean up data: keep wake-up hours within [4, 12] and sleep-start hours
# within [-4, 4] (bounds inclusive)
data = data[data['Wake Up Hour'].between(4, 12)]
data = data[data['Sleep Start Hour'].between(-4, 4)]

# Looking at wake up time and sleep times.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

# Right panel: wake-up time
plt.sca(axes[1])
h = data['Wake Up Hour'].hist(alpha=0.8)
tt = plt.title('Histogram of Wake Up Time')
plt.xlabel('Wake up time AM')
plt.ylabel('Number of days')

# Left panel: sleep start, with hours relative to midnight shown as clock times
plt.sca(axes[0])
sleep_tick_hours = [-3, -2, -1, 0, 1, 2, 3]
sleep_tick_names = ['9pm', '10pm', '11pm', 'Midn', '1am', '2am', '3am']
data['Sleep Start Hour'].hist(alpha=0.8)
xt = plt.xticks(sleep_tick_hours, sleep_tick_names)
tt = plt.title('Histogram of Start Sleep')
yy = plt.ylabel('Number of days')

In [None]:
# Looking at variations based on weekday: per-weekday medians of the
# activity and sleep measures

by_weekday = data.groupby('Weekday')

weekday_stps = by_weekday['Steps'].median()
# Median time asleep, converted from minutes to hours
sleep_minutes_asleep_med = by_weekday['Sleep Minutes Asleep'].median() / 60
# Sleep inefficiency: percentage of the time in bed that was not spent asleep
sl_eff = (1 - data['Sleep Minutes Asleep'] / data['Time in Bed']) * 100
sl = sl_eff.groupby(data['Weekday']).median()
wak = by_weekday['Sleep Awake Count'].median()
sl_minutes_inbed = by_weekday['Time in Bed'].median()
awkmin_per = by_weekday['Awake Percentage'].median()

In [None]:
# Function to clean up plots
def prepare_plot_area(ax):
    """Hide the top, right and left frame lines of `ax` and put the tick
    marks on the bottom (x) and left (y) sides."""
    # Remove plot frame lines
    for side in ("top", "right", "left"):
        ax.spines[side].set_visible(False)

    # X and y ticks on bottom and left
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    
# Defining a color pattern that is pleasing: ten RGB triples written on the
# 0-255 scale, then rescaled to floats in the 0-1 range
rgb_255 = [
    (31, 119, 180), (255, 127, 14),
    (44, 160, 44), (214, 39, 40),
    (148, 103, 189), (140, 86, 75),
    (227, 119, 194), (127, 127, 127),
    (188, 189, 34), (23, 190, 207),
]

colrcode = [(r / 255., g / 255., b / 255.) for (r, g, b) in rgb_255]
    
# Weekday bar charts: two figures of three panels each, one per-weekday
# median per panel.
WEEKDAY_LABELS = ['Mon','Tue','Wed','Thur','Fri','Sat','Sun']


def _weekday_bar(series, ax, ylabel, title):
    """Draw `series` as a bar chart on `ax` with weekday tick labels.

    series -- pandas Series to plot as bars (the per-weekday medians)
    ax     -- matplotlib axes to draw on; it is made the current axes
    ylabel -- text for the y-axis label
    title  -- text for the panel title
    """
    plt.sca(ax)
    series.plot(kind='bar', color=colrcode[0], alpha=0.5)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.xticks(list(range(7)), WEEKDAY_LABELS)
    prepare_plot_area(ax)


# One inner list per figure; each entry is (series, y-label, title).
weekday_panels = [
    [
        # Median number of steps
        (weekday_stps, 'Median number of steps', 'Daily median number of steps walked'),
        # Median time asleep. The series was divided by 60 when it was
        # computed, so the axis unit is hours (not minutes).
        (sleep_minutes_asleep_med, 'Median number of hours slept', 'Daily median number of hours slept'),
        (sl, 'Median sleep inefficiency', 'sleep inefficiency %'),
    ],
    [
        (wak, 'Awakenings count', 'Number of wakenings'),
        (sl_minutes_inbed, 'Minutes in bed', 'Time in bed'),
        (awkmin_per, 'Awake min %', 'Awake minutes/time in bed x 100'),
    ],
]

for panel_row in weekday_panels:
    fig,axes = plt.subplots(figsize=(12, 4), nrows=1, ncols=3)
    for panel_ax, (panel_series, panel_ylabel, panel_title) in zip(axes, panel_row):
        _weekday_bar(panel_series, panel_ax, panel_ylabel, panel_title)


# Box plots per weekday. The tick list starts with an empty label for
# position 0 because the seven boxes are labelled at positions 1..7.
box_ticks = list(range(8))
box_labels = ['','Mon','Tue','Wed','Thur','Fri','Sat','Sun']

fig,axes = plt.subplots(figsize=(12, 4), nrows=1, ncols=2)
plt.sca(axes[0])
d = data.boxplot(column='Sleep Awake Count', by='Weekday', ax =axes[0])
d = plt.xticks(box_ticks, box_labels)

plt.sca(axes[1])
d = data.boxplot(column='Awake Percentage', by='Weekday', ax =axes[1])
d = plt.xticks(box_ticks, box_labels)
plt.title('Minutes awake/minutes in bed x 100')

fig,axes = plt.subplots(figsize=(12, 4), nrows=1, ncols=2)
plt.sca(axes[0])
h = data.boxplot(column= 'Wake Up Hour', by = 'Weekday', ax = axes[0])
h = plt.xticks(box_ticks, box_labels)
h = plt.ylabel('Wake up time AM')

# Sleep start is stored relative to midnight; adding 12 moves it onto a
# positive axis where 12 is midnight (9 -> 9pm, 16 -> 4am).
sl_hr = data['Sleep Start Hour'].groupby(data['Weekday']).median()+12
plt.sca(axes[1])
sl_hr.plot(kind = 'line',color = colrcode[2],alpha = 0.5,linewidth = 2, marker = 'o',markersize = 10)
# The line is plotted against weekday numbers 0..6, so there are exactly
# seven tick positions for the seven labels.
d = plt.xticks(list(range(7)),['Mon','Tue','Wed','Thur','Fri','Sat','Sun'])
d = plt.yticks(np.linspace(9,16,8),['9 pm','10pm','11pm','Mid','1am','2am','3am','4am'])
plt.scatter(data['Weekday'],data['Sleep Start Hour']+12, color = colrcode[1])
plt.title('Sleep start hour')

# Set the y-limits by calling ylim(); assigning a tuple to plt.ylim would
# replace the pyplot function and leave the limits unchanged.
d = plt.ylim(9, 15)

In [None]:
# Looking at variations across months

month_names = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# boxplot(by='Month') draws one box per month that occurs in the data, at
# positions 1..k in sorted order. Label only those months, so the names stay
# correct when some months have no rows.
months_present = sorted(int(m) for m in data['Month'].dropna().unique())
month_ticks = list(range(1, len(months_present) + 1))
month_labels = [month_names[m - 1] for m in months_present]

sl_st_mon = data['Sleep Start Hour'].groupby(data['Month']).median()
fig,axes = plt.subplots(figsize = (12,4), nrows = 1, ncols = 2)

# Left panel: sleep start (hours relative to midnight, shown as clock times)
plt.sca(axes[0])
d = data.boxplot(column='Sleep Start Hour', by='Month', ax =axes[0])
d = plt.xticks(month_ticks, month_labels)
d = plt.yticks(np.linspace(-3,4,8),['9 pm','10pm','11pm','Mid','1am','2am','3am','4am'])
plt.title('Sleep start over months')

# Right panel: wake-up hour
plt.sca(axes[1])
d = data.boxplot(column='Wake Up Hour', by='Month', ax =axes[1])
d = plt.xticks(month_ticks, month_labels)

months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# Months that actually occur in the data; the box plot draws one box for each
# of them at positions 1..k, so the tick labels are built from this list.
steps_months = sorted(int(m) for m in data['Month'].dropna().unique())

st_mon = data['Steps'].groupby(data['Month']).median()
fig,axes = plt.subplots(figsize = (12,4), nrows = 1, ncols = 2)
plt.sca(axes[0])
d = data.boxplot(column='Steps', by='Month', ax =axes[0])
d = plt.xticks(list(range(1, len(steps_months) + 1)), [months[m - 1] for m in steps_months])
plt.title('Step count over months')

plt.sca(axes[1])
# Median wake-up hour for every (month, weekday) pair
d = data.groupby(['Month','Weekday'])['Wake Up Hour'].median()
# Draw one line per month that is present in the grouped result; looking up
# a month without any rows would raise a KeyError.
for i in d.index.get_level_values('Month').unique():
    d.loc[i].plot(kind = 'line',label = months[int(i)-1])
h = plt.legend(loc='lower right')
# The lines are plotted against weekday numbers 0..6: seven ticks, seven labels
d = plt.xticks(list(range(7)),['Mon','Tue','Wed','Thur','Fri','Sat','Sun'])
plt.title('Wake hour over the week sorted by month')

# Median step count and elevation per month as bar charts.
# A bar chart of a Series puts bar k at x = k (0, 1, 2, ...), one bar per
# index entry, so ticks are the integers 0..k-1 and the labels are looked up
# from the month numbers in each Series' index.
month_abbr = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug', 'Sep', 'Oct', 'Nov', 'Dec']

fig = plt.figure(figsize = (12,4))
ax = fig.add_subplot(121)
st_mon.plot(kind = 'bar', alpha = 0.5, color = colrcode[0], ax = ax)
d = plt.xticks(list(range(len(st_mon))), [month_abbr[int(m) - 1] for m in st_mon.index])
plt.title('Median step count over the months')

el_mon = data['Elevation'].groupby(data['Month']).median()
ax2 = fig.add_subplot(122)
el_mon.plot(kind = 'bar', alpha = 0.5, color = colrcode[0], ax = ax2)
d = plt.xticks(list(range(len(el_mon))), [month_abbr[int(m) - 1] for m in el_mon.index])
plt.title('Median elevation climbed over the months')

In [None]:
# Checking correlations

# Columns compared against each other in the scatter matrix
df_subset = data[['Steps','Floors','Sedentary Minutes','Activity Calories','Sleep Minutes Asleep','Sleep Awake Count',\
                     'Weekday','Resting Heart Rate','Sleep Minutes Awake','Sleep Efficiency']]

axes = pd.plotting.scatter_matrix(df_subset, figsize = (15,20), alpha=0.5, diagonal='kde')

# `.values` gives the correlation matrix as a NumPy array on every pandas
# version (DataFrame.as_matrix() no longer exists in pandas 1.0 and later).
corr = df_subset.corr().values
# Write the correlation coefficient into each panel above the diagonal
for i, j in zip(*np.triu_indices_from(axes, k=1)):
    axes[i, j].annotate("%.3f" %corr[i,j], (0.8, 0.8), xycoords='axes fraction', ha='center', va='center')

In [None]:
# Correlation matrix of the selected columns, drawn as a colour-coded grid
subset_corr = df_subset.corr()
plt.matshow(subset_corr)

In [None]:
# Correlation grid over all numeric columns. Text columns such as 'Date' and
# 'Sleep Start Time' are excluded explicitly: recent pandas versions raise an
# error on non-numeric columns instead of dropping them silently.
plt.matshow(data.select_dtypes(include=['number', 'bool']).corr())

In [None]:
# Correlations among the columns at positions 7 to 24 of the table
data1_15 = data.iloc[:, slice(7, 25)]
slice_corr = data1_15.corr()
plt.matshow(slice_corr)
data1_15.head(5)

In [None]:
# Scatter matrix annotated with correlation coefficients. The plotted frame
# and the correlation matrix must be built from the same columns, otherwise
# the coefficients are written onto panels they do not belong to.
# NOTE(review): assumes the column slice `data1_15` is the frame meant for
# this cell (it supplied the correlations before) — confirm.
plot_frame = data1_15.select_dtypes(include=['number', 'bool'])
axes = pd.plotting.scatter_matrix(plot_frame, figsize = (15,20), alpha=0.5, diagonal='kde')

# `.values` gives the matrix as a NumPy array on every pandas version
# (DataFrame.as_matrix() no longer exists in pandas 1.0 and later).
corr = plot_frame.corr().values
# Write the correlation coefficient into each panel above the diagonal
for i, j in zip(*np.triu_indices_from(axes, k=1)):
    axes[i, j].annotate("%.3f" %corr[i,j], (0.8, 0.8), xycoords='axes fraction', ha='center', va='center')