## Imports

In [1]:
import numpy as np
import pandas as pd

## Data

In [2]:
# Read the raw poll responses from the Excel export into a DataFrame.
raw_data_path = './datasets/2008SleepinAmericaPollSleepPerformanceandWorkplaceRawDataExcel.xls'
df = pd.read_excel(raw_data_path)

In [3]:
# Preview the first five rows of the raw responses.
df.head(n=5)

Unnamed: 0,caseid,year,response,intvwr,market,region,tz,state,fips,dma,...,d3,d4a,d4b,d4c,d4d,d4e,d5,d6,d7,d8
0,7,2007,200709,951,ne_1,1,E,NH,33003,500,...,4,2,,,,,3,5,2,4
1,11,2007,200709,650,south_1,3,E,TN,47093,557,...,2,2,,,,,3,4,1,4
2,13,2007,200709,951,mw_1,2,E,OH,39135,542,...,1,1,1.0,2.0,1.0,2.0,3,5,1,6
3,14,2007,200709,969,mw_1,2,E,MI,26069,513,...,1,2,,,,,3,5,1,4
4,15,2007,200709,928,south_1,3,E,SC,45083,567,...,1,1,1.0,1.0,2.0,2.0,5,6,1,6


In [4]:
# Show the dtype of every column in the raw frame.
df.dtypes

caseid        int64
year          int64
response      int64
intvwr        int64
market       object
             ...   
d4e         float64
d5            int64
d6            int64
d7            int64
d8            int64
Length: 217, dtype: object

## Drop and Rename Columns

Each column in the data is labeled based on its question number from the initial questionnaire. Useful columns pertaining to sleep, work times, health conditions, education, income, marital status and age are kept and renamed appropriately.

In [5]:
# Drop unnecessary columns: keep only the questionnaire items that are renamed
# and used in the cells below.
cols_to_keep = ['q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q9',
                'q10', 'qs2', 'qs3', 'q28a', 'q28b', 'q28c',
                'q28d', 'q28e', 'q28f', 'q28g', 'd2', 'd3', 'd6', 'd7', 'd8']

# .copy() makes the subset an independent frame. Without it pandas tracks the
# result as a possible copy of the raw frame, and the in-place edits made in
# later cells (rename, fillna, column assignment) can emit SettingWithCopyWarning.
df = df[cols_to_keep].copy()

In [6]:
# Translate questionnaire item ids into descriptive column names.
column_names = {
    # sleep / work schedule
    'q1': 'awake_time',
    'q2': 'start_work',
    'q3': 'end_work',
    'q4': 'bed_time_work_tomorrow',
    'q5': 'no_work_awake_time',
    'q6': 'bed_time_no_work_tomorrow',
    'q7': 'usual_sleep_per_night',
    'q9': 'naps_per_month',
    'q10': 'length_of_average_nap',
    # employment
    'qs2': 'employment_status',
    'qs3': 'avg_weekly_hours_worked',
    # health conditions
    'q28a': 'treated_for_heart_disease',
    'q28b': 'treated_for_high_blood_pressure',
    'q28c': 'treated_for_diabetes',
    'q28d': 'treated_for_heartburn_GERD',
    'q28e': 'treated_for_arthritis',
    'q28f': 'treated_for_depression',
    'q28g': 'treated_for_anxiety',
    # demographics
    'd2': 'age',
    'd3': 'marital_status',
    'd6': 'highest_edu',
    'd7': 'single_dual_household',
    'd8': 'house_hold_income_bracket',
}

df = df.rename(columns=column_names)

## Nulls

There are no nulls except in 'length_of_average_nap'. Based on the values in the dataset, as well as the questionnaire, it appears that these were left blank because the participant does not nap. They are converted to 0s.

In [7]:
# Count the missing values in each column.
df.isna().sum()

awake_time                           0
start_work                           0
end_work                             0
bed_time_work_tomorrow               0
no_work_awake_time                   0
bed_time_no_work_tomorrow            0
usual_sleep_per_night                0
naps_per_month                       0
length_of_average_nap              461
employment_status                    0
avg_weekly_hours_worked              0
treated_for_heart_disease            0
treated_for_high_blood_pressure      0
treated_for_diabetes                 0
treated_for_heartburn_GERD           0
treated_for_arthritis                0
treated_for_depression               0
treated_for_anxiety                  0
age                                  0
marital_status                       0
highest_edu                          0
single_dual_household                0
house_hold_income_bracket            0
dtype: int64

In [8]:
# Replace every missing value with 0, then confirm that no nulls remain
# (the displayed total should be 0).
df = df.fillna(0)
df.isna().sum().sum()

0

## Time Range Columns

**Map correct values for time ranges**

'awake_time', 'start_work', 'end_work', 'bed_time_work_tomorrow', 'no_work_awake_time', and 'bed_time_no_work_tomorrow' hold integer codes that correspond to time ranges. A dictionary is created to map each code to its time-range label.

In [9]:
# Lookup from questionnaire time codes to their time-range labels.
# Codes 98 and 99 (non-answers) both become the placeholder 'NA'.
time_dict = {
    1: '12:00 AM',
    2: '12:01 AM – 4:59 AM',
    3: '5:00 AM – 5:14 AM',
    4: '5:15 AM – 5:29 AM',
    5: '5:30 AM – 5:44 AM',
    6: '5:45 AM – 5:59 AM',
    7: '6:00 AM – 6:14 AM',
    8: '6:15 AM – 6:29 AM',
    9: '6:30 AM – 6:44 AM',
    10: '6:45 AM – 6:59 AM',
    11: '7:00 AM – 7:14 AM',
    12: '7:15 AM – 7:29 AM',
    13: '7:30 AM – 7:44 AM',
    14: '7:45 AM – 7:59 AM',
    15: '8:00 AM – 8:14 AM',
    16: '8:15 AM – 8:29 AM',
    17: '8:30 AM – 8:44 AM',
    18: '8:45 AM – 8:59 AM',
    19: '9:00 AM – 9:14 AM',
    20: '9:15 AM – 9:29 AM',
    21: '9:30 AM – 9:44 AM',
    22: '9:45 AM – 9:59 AM',
    23: '10:00 AM – 10:59 AM',
    24: '11:00 AM – 11:59 AM',
    25: '12:00 PM – 5:59 PM',
    26: '6:00 PM – 11:59 PM',
    98: 'NA',
    99: 'NA',
}

# Columns that hold a time code.
time_columns = ['awake_time', 'start_work', 'end_work',
                'bed_time_work_tomorrow', 'no_work_awake_time',
                'bed_time_no_work_tomorrow']

# Swap each code for its label; a code that is not a key of time_dict becomes NaN.
df[time_columns] = df[time_columns].apply(lambda codes: codes.map(time_dict))

#https://stackoverflow.com/questions/20250771/remap-values-in-pandas-column-with-a-dict

**Time range column non-answers**

For each of these columns, there are a few 98s and 99s (refused/don't know). The mode is used instead.

In [10]:
# Replace non-answers ('NA') with the most common real answer of each column.
# The mode is computed once per column, and only over answered rows, so the
# 'NA' placeholder can never be chosen as its own replacement.
for col in ['awake_time', 'start_work', 'end_work',
            'bed_time_work_tomorrow', 'no_work_awake_time',
            'bed_time_no_work_tomorrow']:
    answered_modes = df.loc[df[col] != 'NA', col].mode()
    # If a column has no real answers there is nothing to impute with; leave it.
    if not answered_modes.empty:
        df[col] = df[col].replace('NA', answered_modes[0])

**Create column of unconventional awake times**

All awake_time time ranges outside of 5:00 AM – 10:59 AM are considered unconventional (the 10:00 AM – 10:59 AM answer is a single bucket, so it cannot be split at 10:30 AM).  

In [11]:
# Conventional wake-up labels are time_dict codes 3 through 23, i.e. every
# bucket from '5:00 AM – 5:14 AM' up to '10:00 AM – 10:59 AM'. Deriving the set
# from time_dict covers the whole span without gaps; the previous hand-typed
# list omitted 7:15–7:29, 9:15–9:29 and 9:45–9:59, so those wake-ups were
# wrongly flagged as unconventional.
conventional_awake_times = {time_dict[code] for code in range(3, 24)}

# 1 = unconventional wake-up time, 0 = conventional.
df['unconvential_awake_time'] = (
    ~df['awake_time'].isin(conventional_awake_times)
).astype('int64')

## Health Conditions

**Binarize 'treated' columns and select impute strategy**

Health conditions are currently binary 1 & 2 but should be 0 & 1. Non-answers (98s/99s) are given the mode.

In [12]:
# Binarize the 'treated_for' columns: a value of 1 stays 1, and every other
# value (including the 98/99 non-answers) becomes 0.
treated_columns = ['treated_for_heart_disease', 'treated_for_high_blood_pressure',
                   'treated_for_diabetes', 'treated_for_heartburn_GERD',
                   'treated_for_arthritis', 'treated_for_depression',
                   'treated_for_anxiety']

for condition in treated_columns:
    df[condition] = df[condition].eq(1).astype('int64')


## Other Categorical columns

**Marital**

In [13]:
# Map marital-status codes to labels.
# Non-answers use the same 'NA' placeholder as the other lookup dicts, so the
# later "fill 'NA' with the column mode" cell picks them up. The previous
# 'N/A' spelling is not matched by that cell's `'NA' in x` test, and code 99
# was unmapped (it would have become NaN).
marital_dict = {
    1: 'Married_partnered',
    2: 'Single',
    3: 'Living with someone',
    4: 'Divorced',
    5: 'Separated',
    6: 'Widowed',
    98: 'NA',
    99: 'NA'
}

df['marital_status'] = df['marital_status'].map(marital_dict)

**Highest level of education**

In [14]:
# Education labels, listed in code order (code 1 first).
edu_labels = [
    '8th grade or <',
    'Some high school',
    'Graduated high school',
    'Vocational/Tech school',
    'Some college',
    'Graduated college',
    'Advanced degree',
]

# Codes 1-7 map to the labels above; 98 and 99 map to the 'NA' placeholder.
edu_dict = dict(enumerate(edu_labels, start=1))
edu_dict.update({98: 'NA', 99: 'NA'})

df['highest_edu'] = df['highest_edu'].map(edu_dict)

**Single/dual household:**

In [15]:
# Household income-earner codes and their labels.
# NOTE(review): 'Dual income ' carries a trailing space that ends up in the
# dummy column name, and 'Refused' is kept as its own category rather than
# being mapped to 'NA' — confirm both are intended.
household_dict = {1: 'Dual income ', 2: 'Single income', 3: 'Refused'}
household_dict.update(dict.fromkeys((98, 99), 'NA'))

household_codes = df['single_dual_household']
df['single_dual_household'] = household_codes.map(household_dict)

**Income**

In [16]:
# Household income bracket codes and their labels; codes 7, 98 and 99 all
# become the 'NA' placeholder.
# NOTE(review): the labels jump from '$50,001 - $75,000' (5) straight to
# 'More than $100,000' (6), so a '$75,001 - $100,000' bracket looks to be
# missing and code 7 may be a real bracket rather than a non-answer —
# verify against the questionnaire codebook.
income_dict = {
    1: 'Under $15,000',
    2: '$15,000 - $25,000',
    3: '$25,001 - $35,000',
    4: '$35,001 - $50,000',
    5: '$50,001 - $75,000',
    6: 'More than $100,000',
    **dict.fromkeys((7, 98, 99), 'NA'),
}

income_codes = df['house_hold_income_bracket']
df['house_hold_income_bracket'] = income_codes.map(income_dict)

**Using column mode for non-answers**

In [17]:
# Fill 'NA' values with the column mode.
# The mode is taken once per column and only over answered rows, so that
# 'NA' cannot be selected as its own replacement when non-answers happen to be
# the most frequent value. Non-string cells (e.g. NaN left by an unmapped
# code) are skipped instead of raising TypeError on the `in` test.
for col in df.select_dtypes(include='O'):
    is_non_answer = df[col].map(
        lambda value: isinstance(value, str) and 'NA' in value
    ).astype(bool)
    answered_modes = df.loc[~is_non_answer, col].mode()
    # If a column has no real answers there is nothing to impute with; leave it.
    if not answered_modes.empty:
        df.loc[is_non_answer, col] = answered_modes[0]

## Other Numerical Columns

Non-answers (98s/99s) in 'usual_sleep_per_night', 'avg_weekly_hours_worked', and 'age' are replaced with the column mean.

In [18]:
# Replace non-answers (values above 97, i.e. the 98/99 codes) with the mean of
# the answered rows in the same column.
# A boolean mask marks the non-answers directly, rather than rewriting them to
# 0 first, so genuine 0 responses are kept and still count towards the mean.
# The mean is computed once per column.
for col in ['usual_sleep_per_night', 'avg_weekly_hours_worked', 'age']:
    is_non_answer = df[col] > 97
    answered_mean = df.loc[~is_non_answer, col].mean()
    df[col] = df[col].mask(is_non_answer, answered_mean)

**Naps**

Non-answers (998s/999s) in 'length_of_average_nap' are set to 0.

In [19]:
# Set the 998/999 non-answer codes to 0; every other nap length is unchanged.
nap_lengths = df['length_of_average_nap']
df['length_of_average_nap'] = nap_lengths.mask(nap_lengths.isin([998, 999]), 0)

## Feature Engineering

**poor_sleep**

poor_sleep is 1 when the participant averages fewer than 7 hours of sleep per night, and 0 when they average at least 7 hours.

In [20]:
# poor_sleep is 0 when usual sleep is at least 7 hours, and 1 otherwise.
sleeps_enough = df['usual_sleep_per_night'] >= 7
df['poor_sleep'] = (~sleeps_enough).astype('int64')

**has_condition**

This column indicates whether the participant has one of the health conditions asked for in the questionnaire.

In [21]:
# Flag participants with a 1 in at least one 'treated...' column.
treated_columns = [name for name in df.columns if name.startswith('treated')]
df['has_condition'] = df[treated_columns].sum(axis=1).gt(0).astype('int64')
df['has_condition']

0      0
1      0
2      0
3      1
4      0
      ..
995    1
996    0
997    0
998    1
999    0
Name: has_condition, Length: 1000, dtype: int64

In [22]:
# List every column name, one per line.
print(*df.columns, sep='\n')

awake_time
start_work
end_work
bed_time_work_tomorrow
no_work_awake_time
bed_time_no_work_tomorrow
usual_sleep_per_night
naps_per_month
length_of_average_nap
employment_status
avg_weekly_hours_worked
treated_for_heart_disease
treated_for_high_blood_pressure
treated_for_diabetes
treated_for_heartburn_GERD
treated_for_arthritis
treated_for_depression
treated_for_anxiety
age
marital_status
highest_edu
single_dual_household
house_hold_income_bracket
unconvential_awake_time
poor_sleep
has_condition


## Export

**Export EDA dataframe**

In [23]:
# Write the cleaned frame to disk (the index is written as the first column).
eda_csv_path = './datasets/sleep_EDA.csv'
df.to_csv(eda_csv_path)

**Export modeling dataset with dummied columns**

In [24]:
# One-hot encode every object-dtype column (dropping the first level of each)
# and append the dummy columns to df; the original string columns are kept.
# NOTE: this cell reassigns df, so re-running it appends the dummies again.
dums = df.select_dtypes('O')
dummy_frames = [
    pd.get_dummies(df[column], prefix=column, prefix_sep='_', drop_first=True)
    for column in dums
]
df = pd.concat([df] + dummy_frames, axis=1)
df.shape

(1000, 177)

In [25]:
# Write the frame with dummy columns to disk (the index is written as the first column).
modeling_csv_path = './datasets/sleep_Modeling.csv'
df.to_csv(modeling_csv_path)

In [26]:
# List every column name of the modeling frame, one per line.
print(*df.columns, sep='\n')

awake_time
start_work
end_work
bed_time_work_tomorrow
no_work_awake_time
bed_time_no_work_tomorrow
usual_sleep_per_night
naps_per_month
length_of_average_nap
employment_status
avg_weekly_hours_worked
treated_for_heart_disease
treated_for_high_blood_pressure
treated_for_diabetes
treated_for_heartburn_GERD
treated_for_arthritis
treated_for_depression
treated_for_anxiety
age
marital_status
highest_edu
single_dual_household
house_hold_income_bracket
unconvential_awake_time
poor_sleep
has_condition
awake_time_11:00 AM – 11:59 AM
awake_time_12:00 AM
awake_time_12:00 PM – 5:59 PM
awake_time_12:01 AM – 4:59 AM
awake_time_5:00 AM – 5:14 AM
awake_time_5:15 AM – 5:29 AM
awake_time_5:30 AM – 5:44 AM
awake_time_5:45 AM – 5:59 AM
awake_time_6:00 AM – 6:14 AM
awake_time_6:00 PM – 11:59 PM
awake_time_6:15 AM – 6:29 AM
awake_time_6:30 AM – 6:44 AM
awake_time_6:45 AM – 6:59 AM
awake_time_7:00 AM – 7:14 AM
awake_time_7:15 AM – 7:29 AM
awake_time_7:30 AM – 7:44 AM
awake_time_7:45 AM – 7:59 AM
awake_time_8