## Data-X Mindful Project 
### Part 1. Data cleaning and featurization

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from pathlib import Path

### Read data 

In [2]:
# Load the raw questionnaire export. The path is relative, so the CSV must sit
# in the notebook's working directory.
survey_csv = "MindFul Questionnaire.csv"
data = pd.read_csv(survey_csv)
data.head()

Unnamed: 0,Timestamp,Username,Are you a student?,Do you have an occupation?,Do you do volunteering?,How many hours a week do you work/study?,About how many hours did you sleep last night?,Sleep Time,Wake Time,Sleep Problems,...,Did you feel depressed today?,Did you feel anxious today?,Did you feel stressed today?,Any feedback for us?,Email,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40
0,2018/10/05 11:33:15 AM PDT,daveliu@berkeley.edu,Yes,No,No,,7,Slept late,Woke up early,Sleep interrupted (e.g. woke up in middle),...,,,,,Yes,,Yes,No,Yes,
1,2018/10/05 11:56:03 AM PDT,,Yes,Yes,No,10.0,7,Slept late,Woke up late,Sleep interrupted (e.g. woke up in middle),...,No,Yes,Yes,,daveliu@berkeley.edu,This is not necessary. Delete this.,,,,Yes
2,2018/10/05 12:02:26 PM PDT,,Yes,Yes,Yes,70.0,7,Slept late,Woke up on time,Decline to answer,...,No,No,Yes,,sunliwen@berkeley.edu,,,,,Yes
3,2018/10/05 6:34:07 PM PDT,,Yes,Yes,Yes,15.0,7,Slept on time,Woke up on time,Feel tired immediately after getting up,...,No,No,Yes,"Was the answer to the ""Interpersonal"" question...",tperumpail@berkeley.edu,,,,,
4,2018/10/05 7:18:41 PM PDT,,Yes,Yes,Yes,60.0,9,Slept on time,Woke up on time,Feel tired immediately after getting up,...,Not sure,Yes,Yes,muscle soreness is not directly correlated wit...,tnallen@clemson.edu,,,,,


### Drop unneeded columns and records

In [3]:
# Columns that are not used as features or labels.
unused_columns = ['Timestamp', 'Any feedback for us?', 'If yes, what kind of exercise?', 'Email']
# Index labels of the records to discard.
# NOTE(review): the reason these particular rows are dropped is not recorded here.
discarded_rows = [0, 14, 38, 51, 57]

data = data.drop(columns=unused_columns).drop(index=discarded_rows)
data.head()

Unnamed: 0,Username,Are you a student?,Do you have an occupation?,Do you do volunteering?,How many hours a week do you work/study?,About how many hours did you sleep last night?,Sleep Time,Wake Time,Sleep Problems,What do you feel the quality of your sleep?,...,In a relationship?,Recently been in a relationship?,Did you feel depressed today?,Did you feel anxious today?,Did you feel stressed today?,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40
1,,Yes,Yes,No,10.0,7,Slept late,Woke up late,Sleep interrupted (e.g. woke up in middle),Bad,...,No,No,No,Yes,Yes,This is not necessary. Delete this.,,,,Yes
2,,Yes,Yes,Yes,70.0,7,Slept late,Woke up on time,Decline to answer,Good,...,Yes,Yes,No,No,Yes,,,,,Yes
3,,Yes,Yes,Yes,15.0,7,Slept on time,Woke up on time,Feel tired immediately after getting up,Neither good nor bad (Average),...,Decline to answer,Decline to answer,No,No,Yes,,,,,
4,,Yes,Yes,Yes,60.0,9,Slept on time,Woke up on time,Feel tired immediately after getting up,Neither good nor bad (Average),...,No,No,Not sure,Yes,Yes,,,,,
5,,Yes,No,No,0.0,8,Slept on time,Woke up on time,None of the above,Neither good nor bad (Average),...,No,No,No,Yes,Yes,,,,,


### Get feature columns

In [4]:
# GET FEATURES X FROM THE DATA
# Positional slice: skip the first column and the last 8 columns.
# NOTE(review): per the preview above, column 0 is 'Username' and the last 8
# are the three "Did you feel ... today?" answers plus five 'Unnamed' columns;
# this breaks silently if the CSV layout changes.
# .copy() detaches X from `data`, so the in-place edits made to X further down
# act on an independent frame instead of on a slice of `data`.
X = data.iloc[:, 1:-8].copy()
X.head()

Unnamed: 0,Are you a student?,Do you have an occupation?,Do you do volunteering?,How many hours a week do you work/study?,About how many hours did you sleep last night?,Sleep Time,Wake Time,Sleep Problems,What do you feel the quality of your sleep?,How many times today did you have a meal?,...,"Which of these did you feel today, before taking this survey?","Which of these did you feel today, before taking this survey?.1",Did you have a mood swing today?,"On a scale of 1-10, how was your mood overall?",About how many of your friends did you see in-person over the past day?,Do you play a sport or have a regular hobby?,Are you close to your family?,"In the past day, have you...",In a relationship?,Recently been in a relationship?
1,Yes,Yes,No,10.0,7,Slept late,Woke up late,Sleep interrupted (e.g. woke up in middle),Bad,3,...,Excited;Productive;Focused;Positive,Anxious;Tired;Stressed;I also felt hungry,No,8,,,Yes,Had a conversation of at least 10 minutes,No,No
2,Yes,Yes,Yes,70.0,7,Slept late,Woke up on time,Decline to answer,Good,4,...,Energized;Confident;Thankful;Relaxed;Excited;P...,Stressed,No,10,10.0,,Yes,Had a conversation of at least 10 minutes;Talk...,Yes,Yes
3,Yes,Yes,Yes,15.0,7,Slept on time,Woke up on time,Feel tired immediately after getting up,Neither good nor bad (Average),3,...,Productive;Focused;Peaceful;Positive,Tired;Upset,,7,3.0,,Yes,Had a conversation of at least 10 minutes,Decline to answer,Decline to answer
4,Yes,Yes,Yes,60.0,9,Slept on time,Woke up on time,Feel tired immediately after getting up,Neither good nor bad (Average),2,...,None of the above,Negative;Frustrated;Tired;Stressed;Angry;Lonely,No,3,3.0,No,No,Had a conversation of at least 10 minutes with...,No,No
5,Yes,No,No,0.0,8,Slept on time,Woke up on time,None of the above,Neither good nor bad (Average),3,...,Thankful;Peaceful;Hopeful;Positive,Anxious;Stressed;Distracted,No,5,4.0,Yes,Yes,Had a conversation of at least 10 minutes with...,No,No


### Get label 

In [5]:
# The label is the 8th column from the end; keep it as a one-column frame,
# shorten its name and turn the index labels into strings.
label_names = {'Did you feel depressed today?': 'Depressed'}
Y = data.iloc[:, [-8]].rename(index=str, columns=label_names)
Y.head()

Unnamed: 0,Depressed
1,No
2,No
3,No
4,Not sure
5,No


In [6]:
# Encode the label as 0/1. 'Not sure' is mapped to 1, the same code as 'Yes'.
depressed_codes = {'Yes': 1, 'No': 0, 'Not sure': 1}
Y['Depressed'] = Y['Depressed'].replace(depressed_codes)
# Discard the old index labels and renumber the rows from 0.
Y = Y.reset_index(drop=True)

In [7]:
# Preview the first five encoded labels.
Y.head(n=5)

Unnamed: 0,Depressed
0,0
1,0
2,0
3,1
4,0


### Featurization

In [8]:
# Rename
# Short column names for the questions that are used directly as features.
short_names = {
    'Are you a student?': 'Student',
    'Do you have an occupation?': 'Occupation',
    'Do you do volunteering?': 'Volunteering',
    'Did you exercise today?': 'Exercise',
    'How many hours a week do you work/study?': 'Hour_Study',
    'About how many hours did you sleep last night?': 'Hour_Sleep',
    'How many times today did you have a meal?': 'Meal',
    'How many times today did you eat a snack?': 'Snack',
    'How many minutes did you spend exercising?': 'Exercise_Time',
    'Did you laugh today?': 'Laugh',
    'Did you have a mood swing today?': 'Mood_Swing',
    'On a scale of 1-10, how was your mood overall?': 'Mood',
    'About how many of your friends did you see in-person over the past day?': 'Friends',
    'Do you play a sport or have a regular hobby?': 'Hobby',
    'Are you close to your family?': 'Family',
    'Recently been in a relationship?': 'Previous_Relationship',
}
# Renumber the rows from 0: the featurization loops below address rows by
# their enumerate() position through X.loc.
X = X.rename(columns=short_names).reset_index(drop=True)
X.head()

Unnamed: 0,Student,Occupation,Volunteering,Hour_Study,Hour_Sleep,Sleep Time,Wake Time,Sleep Problems,What do you feel the quality of your sleep?,Meal,...,"Which of these did you feel today, before taking this survey?","Which of these did you feel today, before taking this survey?.1",Mood_Swing,Mood,Friends,Hobby,Family,"In the past day, have you...",In a relationship?,Previous_Relationship
0,Yes,Yes,No,10.0,7,Slept late,Woke up late,Sleep interrupted (e.g. woke up in middle),Bad,3,...,Excited;Productive;Focused;Positive,Anxious;Tired;Stressed;I also felt hungry,No,8,,,Yes,Had a conversation of at least 10 minutes,No,No
1,Yes,Yes,Yes,70.0,7,Slept late,Woke up on time,Decline to answer,Good,4,...,Energized;Confident;Thankful;Relaxed;Excited;P...,Stressed,No,10,10.0,,Yes,Had a conversation of at least 10 minutes;Talk...,Yes,Yes
2,Yes,Yes,Yes,15.0,7,Slept on time,Woke up on time,Feel tired immediately after getting up,Neither good nor bad (Average),3,...,Productive;Focused;Peaceful;Positive,Tired;Upset,,7,3.0,,Yes,Had a conversation of at least 10 minutes,Decline to answer,Decline to answer
3,Yes,Yes,Yes,60.0,9,Slept on time,Woke up on time,Feel tired immediately after getting up,Neither good nor bad (Average),2,...,None of the above,Negative;Frustrated;Tired;Stressed;Angry;Lonely,No,3,3.0,No,No,Had a conversation of at least 10 minutes with...,No,No
4,Yes,No,No,0.0,8,Slept on time,Woke up on time,None of the above,Neither good nor bad (Average),3,...,Thankful;Peaceful;Hopeful;Positive,Anxious;Stressed;Distracted,No,5,4.0,Yes,Yes,Had a conversation of at least 10 minutes with...,No,No


In [9]:
# yes/no questions -- convert to binary
# 'Decline to answer' and blank cells are treated as missing and filled with
# the majority answer among the respondents who actually answered. The majority
# is taken from the answered rows only: a placeholder such as -1 must not enter
# the mean, or it drags the imputed value toward 0 (or even to -1).
yes_no_columns = ['Student', 'Occupation', 'Volunteering', 'Exercise', 'Laugh',
                  'Mood_Swing', 'Hobby', 'Family', 'Previous_Relationship']
yes_no_codes = {'Yes': 1, 'No': 0, 'Decline to answer': np.nan, 'Decline to Answer': np.nan}

for col in yes_no_columns:
    # astype(float) raises on any answer that is not covered by yes_no_codes.
    coded = X[col].replace(yes_no_codes).astype(float)
    share_yes = coded.mean()  # NaN when nobody answered this question
    # Rounded share of 'Yes' == majority vote for a 0/1 column (an exact tie
    # rounds to 0). With no answers at all, fall back to 0.
    mostcommon = 0 if pd.isnull(share_yes) else int(round(share_yes))
    X[col] = coded.fillna(mostcommon).astype(int)
X.head()

Unnamed: 0,Student,Occupation,Volunteering,Hour_Study,Hour_Sleep,Sleep Time,Wake Time,Sleep Problems,What do you feel the quality of your sleep?,Meal,...,"Which of these did you feel today, before taking this survey?","Which of these did you feel today, before taking this survey?.1",Mood_Swing,Mood,Friends,Hobby,Family,"In the past day, have you...",In a relationship?,Previous_Relationship
0,1,1,0,10.0,7,Slept late,Woke up late,Sleep interrupted (e.g. woke up in middle),Bad,3,...,Excited;Productive;Focused;Positive,Anxious;Tired;Stressed;I also felt hungry,0,8,,0,1,Had a conversation of at least 10 minutes,No,0
1,1,1,1,70.0,7,Slept late,Woke up on time,Decline to answer,Good,4,...,Energized;Confident;Thankful;Relaxed;Excited;P...,Stressed,0,10,10.0,0,1,Had a conversation of at least 10 minutes;Talk...,Yes,1
2,1,1,1,15.0,7,Slept on time,Woke up on time,Feel tired immediately after getting up,Neither good nor bad (Average),3,...,Productive;Focused;Peaceful;Positive,Tired;Upset,0,7,3.0,0,1,Had a conversation of at least 10 minutes,Decline to answer,0
3,1,1,1,60.0,9,Slept on time,Woke up on time,Feel tired immediately after getting up,Neither good nor bad (Average),2,...,None of the above,Negative;Frustrated;Tired;Stressed;Angry;Lonely,0,3,3.0,0,0,Had a conversation of at least 10 minutes with...,No,0
4,1,0,0,0.0,8,Slept on time,Woke up on time,None of the above,Neither good nor bad (Average),3,...,Thankful;Peaceful;Hopeful;Positive,Anxious;Stressed;Distracted,0,5,4.0,1,1,Had a conversation of at least 10 minutes with...,No,0


In [10]:
# Hours Study
# Fill missing answers with the mean of the answered ones.
missing_study = X.Hour_Study.isnull()
mean_study = X['Hour_Study'].dropna().mean()
X.loc[missing_study, 'Hour_Study'] = mean_study

# Bucket the hours; each bucket is labelled with a representative value
# (0 for "none", then 5, 15, ..., 85). Values outside the edges become NaN.
study_edges = [-1, 0, 9, 19, 29, 39, 49, 59, 69, 79, 89]
study_labels = [0, 5, 15, 25, 35, 45, 55, 65, 75, 85]
X['Hour_Study'] = pd.cut(X['Hour_Study'], study_edges, labels=study_labels)
X.head()

Unnamed: 0,Student,Occupation,Volunteering,Hour_Study,Hour_Sleep,Sleep Time,Wake Time,Sleep Problems,What do you feel the quality of your sleep?,Meal,...,"Which of these did you feel today, before taking this survey?","Which of these did you feel today, before taking this survey?.1",Mood_Swing,Mood,Friends,Hobby,Family,"In the past day, have you...",In a relationship?,Previous_Relationship
0,1,1,0,15,7,Slept late,Woke up late,Sleep interrupted (e.g. woke up in middle),Bad,3,...,Excited;Productive;Focused;Positive,Anxious;Tired;Stressed;I also felt hungry,0,8,,0,1,Had a conversation of at least 10 minutes,No,0
1,1,1,1,75,7,Slept late,Woke up on time,Decline to answer,Good,4,...,Energized;Confident;Thankful;Relaxed;Excited;P...,Stressed,0,10,10.0,0,1,Had a conversation of at least 10 minutes;Talk...,Yes,1
2,1,1,1,15,7,Slept on time,Woke up on time,Feel tired immediately after getting up,Neither good nor bad (Average),3,...,Productive;Focused;Peaceful;Positive,Tired;Upset,0,7,3.0,0,1,Had a conversation of at least 10 minutes,Decline to answer,0
3,1,1,1,65,9,Slept on time,Woke up on time,Feel tired immediately after getting up,Neither good nor bad (Average),2,...,None of the above,Negative;Frustrated;Tired;Stressed;Angry;Lonely,0,3,3.0,0,0,Had a conversation of at least 10 minutes with...,No,0
4,1,0,0,0,8,Slept on time,Woke up on time,None of the above,Neither good nor bad (Average),3,...,Thankful;Peaceful;Hopeful;Positive,Anxious;Stressed;Distracted,0,5,4.0,1,1,Had a conversation of at least 10 minutes with...,No,0


In [11]:
# Hours Sleep
# Collapse the two open-ended answer choices onto their boundary value.
# (The earlier mid-cell value_counts() call was removed: its result was never
# displayed or stored.)
# NOTE(review): the other answers are left untouched, so the column may still
# mix text and numbers -- confirm the dtype before modelling.
open_ended_hours = {'Less than 5': 5, '10 or more': 10}
X['Hour_Sleep'] = X['Hour_Sleep'].replace(open_ended_hours)
X.head()

Unnamed: 0,Student,Occupation,Volunteering,Hour_Study,Hour_Sleep,Sleep Time,Wake Time,Sleep Problems,What do you feel the quality of your sleep?,Meal,...,"Which of these did you feel today, before taking this survey?","Which of these did you feel today, before taking this survey?.1",Mood_Swing,Mood,Friends,Hobby,Family,"In the past day, have you...",In a relationship?,Previous_Relationship
0,1,1,0,15,7,Slept late,Woke up late,Sleep interrupted (e.g. woke up in middle),Bad,3,...,Excited;Productive;Focused;Positive,Anxious;Tired;Stressed;I also felt hungry,0,8,,0,1,Had a conversation of at least 10 minutes,No,0
1,1,1,1,75,7,Slept late,Woke up on time,Decline to answer,Good,4,...,Energized;Confident;Thankful;Relaxed;Excited;P...,Stressed,0,10,10.0,0,1,Had a conversation of at least 10 minutes;Talk...,Yes,1
2,1,1,1,15,7,Slept on time,Woke up on time,Feel tired immediately after getting up,Neither good nor bad (Average),3,...,Productive;Focused;Peaceful;Positive,Tired;Upset,0,7,3.0,0,1,Had a conversation of at least 10 minutes,Decline to answer,0
3,1,1,1,65,9,Slept on time,Woke up on time,Feel tired immediately after getting up,Neither good nor bad (Average),2,...,None of the above,Negative;Frustrated;Tired;Stressed;Angry;Lonely,0,3,3.0,0,0,Had a conversation of at least 10 minutes with...,No,0
4,1,0,0,0,8,Slept on time,Woke up on time,None of the above,Neither good nor bad (Average),3,...,Thankful;Peaceful;Hopeful;Positive,Anxious;Stressed;Distracted,0,5,4.0,1,1,Had a conversation of at least 10 minutes with...,No,0


In [12]:
# Sleep Time
# One-hot encode the answer. dtype=int keeps the indicators as 0/1 on every
# pandas version (newer releases default to True/False).
sleep_time_dummies = pd.get_dummies(X['Sleep Time'], dtype=int)
X[sleep_time_dummies.columns] = sleep_time_dummies

sleep_time_names = {'Slept late': 'Sleep_Late', 'Slept on time': 'Sleep_onTime', 'Slept early': 'Sleep_Early'}
# The original question column is no longer needed once it is encoded.
X = X.rename(columns=sleep_time_names).drop(columns=['Sleep Time'])
X.head()

Unnamed: 0,Student,Occupation,Volunteering,Hour_Study,Hour_Sleep,Wake Time,Sleep Problems,What do you feel the quality of your sleep?,Meal,Snack,...,Mood,Friends,Hobby,Family,"In the past day, have you...",In a relationship?,Previous_Relationship,Sleep_Early,Sleep_Late,Sleep_onTime
0,1,1,0,15,7,Woke up late,Sleep interrupted (e.g. woke up in middle),Bad,3,2,...,8,,0,1,Had a conversation of at least 10 minutes,No,0,0,1,0
1,1,1,1,75,7,Woke up on time,Decline to answer,Good,4,1,...,10,10.0,0,1,Had a conversation of at least 10 minutes;Talk...,Yes,1,0,1,0
2,1,1,1,15,7,Woke up on time,Feel tired immediately after getting up,Neither good nor bad (Average),3,1,...,7,3.0,0,1,Had a conversation of at least 10 minutes,Decline to answer,0,0,0,1
3,1,1,1,65,9,Woke up on time,Feel tired immediately after getting up,Neither good nor bad (Average),2,1,...,3,3.0,0,0,Had a conversation of at least 10 minutes with...,No,0,0,0,1
4,1,0,0,0,8,Woke up on time,None of the above,Neither good nor bad (Average),3,1,...,5,4.0,1,1,Had a conversation of at least 10 minutes with...,No,0,0,0,1


In [13]:
# Wake Time
# One-hot encode the answer. dtype=int keeps the indicators as 0/1 on every
# pandas version (newer releases default to True/False).
wake_time_dummies = pd.get_dummies(X['Wake Time'], dtype=int)
X[wake_time_dummies.columns] = wake_time_dummies

wake_time_names = {'Woke up late': 'Woke_Late', 'Woke up on time': 'Woke_onTime', 'Woke up early': 'Woke_Early'}
# The original question column is no longer needed once it is encoded.
X = X.rename(columns=wake_time_names).drop(columns=['Wake Time'])
X.head()

Unnamed: 0,Student,Occupation,Volunteering,Hour_Study,Hour_Sleep,Sleep Problems,What do you feel the quality of your sleep?,Meal,Snack,Which of these did you have today?,...,Family,"In the past day, have you...",In a relationship?,Previous_Relationship,Sleep_Early,Sleep_Late,Sleep_onTime,Woke_Early,Woke_Late,Woke_onTime
0,1,1,0,15,7,Sleep interrupted (e.g. woke up in middle),Bad,3,2,"Caffeine (e.g. coffee, tea, energy drink);Mult...",...,1,Had a conversation of at least 10 minutes,No,0,0,1,0,0,1,0
1,1,1,1,75,7,Decline to answer,Good,4,1,Veggies (At least 1 serving);Fruit (At least 1...,...,1,Had a conversation of at least 10 minutes;Talk...,Yes,1,0,1,0,0,0,1
2,1,1,1,15,7,Feel tired immediately after getting up,Neither good nor bad (Average),3,1,Veggies (At least 1 serving);Fruit (At least 1...,...,1,Had a conversation of at least 10 minutes,Decline to answer,0,0,0,1,0,0,1
3,1,1,1,65,9,Feel tired immediately after getting up,Neither good nor bad (Average),2,1,Veggies (At least 1 serving);At least two cups...,...,0,Had a conversation of at least 10 minutes with...,No,0,0,0,1,0,0,1
4,1,0,0,0,8,None of the above,Neither good nor bad (Average),3,1,Veggies (At least 1 serving);Fruit (At least 1...,...,1,Had a conversation of at least 10 minutes with...,No,0,0,0,1,0,0,1


In [14]:
# Sleep Problems
# Multi-select question: one 0/1 indicator per option, set when the option's
# text appears in the answer.
sleep_problems = X['Sleep Problems']
# Blank answers become '' so the substring tests below cannot fail on NaN.
sleep_problem_text = sleep_problems.fillna('').astype(str)

# indicator column -> option text (insertion order == output column order)
sleep_problem_options = {
    'SP_None': 'None of the above',
    'SP_Interrupted': 'Sleep interrupted (e.g. woke up in middle)',
    'SP_Decline': 'Decline to answer',
    'SP_Tired': 'Feel tired immediately after getting up',
    'SP_Nap': 'Took a nap',
}
for flag, option in sleep_problem_options.items():
    X[flag] = sleep_problem_text.str.contains(option, regex=False).astype(int)

# A blank answer is counted as declined, matching how the other multi-select
# questions in this notebook treat nulls.
X['SP_Decline'] = (sleep_problems.isnull() | (X['SP_Decline'] == 1)).astype(int)

X = X.drop(columns=['Sleep Problems'])

X.head()

Unnamed: 0,Student,Occupation,Volunteering,Hour_Study,Hour_Sleep,What do you feel the quality of your sleep?,Meal,Snack,Which of these did you have today?,How healthy do you think your food choices are?,...,Sleep_Late,Sleep_onTime,Woke_Early,Woke_Late,Woke_onTime,SP_None,SP_Interrupted,SP_Decline,SP_Tired,SP_Nap
0,1,1,0,15,7,Bad,3,2,"Caffeine (e.g. coffee, tea, energy drink);Mult...",Neither healthy nor unhealthy (Average),...,1,0,0,1,0,0,1,0,0,0
1,1,1,1,75,7,Good,4,1,Veggies (At least 1 serving);Fruit (At least 1...,Healthy,...,1,0,0,0,1,0,0,1,0,0
2,1,1,1,15,7,Neither good nor bad (Average),3,1,Veggies (At least 1 serving);Fruit (At least 1...,Neither healthy nor unhealthy (Average),...,0,1,0,0,1,0,0,0,1,0
3,1,1,1,65,9,Neither good nor bad (Average),2,1,Veggies (At least 1 serving);At least two cups...,Healthy,...,0,1,0,0,1,0,0,0,1,0
4,1,0,0,0,8,Neither good nor bad (Average),3,1,Veggies (At least 1 serving);Fruit (At least 1...,Healthy,...,0,1,0,0,1,1,0,0,0,0


In [15]:
# Sleep quality
# One-hot encode the answer. dtype=int keeps the indicators as 0/1 on every
# pandas version (newer releases default to True/False).
sleep_quality_question = 'What do you feel the quality of your sleep?'
sleep_quality_dummies = pd.get_dummies(X[sleep_quality_question], dtype=int)
X[sleep_quality_dummies.columns] = sleep_quality_dummies

# Only the long "average" label is shortened; the other options keep their
# answer text as the column name.
X = X.rename(columns={'Neither good nor bad (Average)': 'Average'}).drop(columns=[sleep_quality_question])
X.head()

Unnamed: 0,Student,Occupation,Volunteering,Hour_Study,Hour_Sleep,Meal,Snack,Which of these did you have today?,How healthy do you think your food choices are?,Exercise,...,SP_None,SP_Interrupted,SP_Decline,SP_Tired,SP_Nap,Bad,Good,Average,Really Bad,Really Good
0,1,1,0,15,7,3,2,"Caffeine (e.g. coffee, tea, energy drink);Mult...",Neither healthy nor unhealthy (Average),1,...,0,1,0,0,0,1,0,0,0,0
1,1,1,1,75,7,4,1,Veggies (At least 1 serving);Fruit (At least 1...,Healthy,1,...,0,0,1,0,0,0,1,0,0,0
2,1,1,1,15,7,3,1,Veggies (At least 1 serving);Fruit (At least 1...,Neither healthy nor unhealthy (Average),1,...,0,0,0,1,0,0,0,1,0,0
3,1,1,1,65,9,2,1,Veggies (At least 1 serving);At least two cups...,Healthy,1,...,0,0,0,1,0,0,0,1,0,0
4,1,0,0,0,8,3,1,Veggies (At least 1 serving);Fruit (At least 1...,Healthy,0,...,1,0,0,0,0,0,0,1,0,0


In [16]:
# Food Eaten Today
# Multi-select question: one 0/1 indicator per option, set when the option's
# text appears in the answer.
food_answers = X['Which of these did you have today?']
# Blank answers become '' so the substring tests below cannot fail on NaN.
food_text = food_answers.fillna('').astype(str)

# indicator column -> text searched for (insertion order == output column order)
food_options = {
    'Veggies': 'Veggies (At least 1 serving)',
    'Fruit': 'Fruit (At least 1 piece/serving)',
    'Water': 'At least two cups of water',
    'Fried': 'Fried Food',
    'Oil': 'Oily',
    'Fast': 'Fast',
    'Alcohol': 'Alcohol',
    'Caffe': 'Caffeine (e.g. coffee, tea, energy drink)',
    'Vitamin': 'Multivitamin or nutrition supplement',
    'Sugar': 'Sugary Stuff (Soda, Candy, Pastries, etc.)',
    'Drug': 'Prescription Meds',
    'F_None': 'None of the above',
    'F_Decline': 'Decline to answer',
}
for flag, option in food_options.items():
    X[flag] = food_text.str.contains(option, regex=False).astype(int)

# A blank answer is counted as declined, matching how the other multi-select
# questions in this notebook treat nulls.
X['F_Decline'] = (food_answers.isnull() | (X['F_Decline'] == 1)).astype(int)

X = X.drop(columns=['Which of these did you have today?'])

X.head()

Unnamed: 0,Student,Occupation,Volunteering,Hour_Study,Hour_Sleep,Meal,Snack,How healthy do you think your food choices are?,Exercise,Exercise quality,...,Fried,Oil,Fast,Alcohol,Caffe,Vitamin,Sugar,Drug,F_None,F_Decline
0,1,1,0,15,7,3,2,Neither healthy nor unhealthy (Average),1,Feel sore from exercise?,...,0,0,0,0,1,1,1,0,0,0
1,1,1,1,75,7,4,1,Healthy,1,Sweat during exercise?;Feel sore from exercise?,...,1,0,1,0,1,0,0,0,0,0
2,1,1,1,15,7,3,1,Neither healthy nor unhealthy (Average),1,Sweat during exercise?;Feel sore from exercise?,...,0,0,0,0,0,0,1,0,0,0
3,1,1,1,65,9,2,1,Healthy,1,Sweat during exercise?;Feel sore from exercise?,...,1,1,0,1,0,0,0,1,0,0
4,1,0,0,0,8,3,1,Healthy,0,None of the above,...,0,1,0,0,0,1,0,0,0,0


In [17]:
# Food Health Choices
# One-hot encode the answer. dtype=int keeps the indicators as 0/1 on every
# pandas version (newer releases default to True/False).
food_health_question = 'How healthy do you think your food choices are?'
food_health_dummies = pd.get_dummies(X[food_health_question], dtype=int)
X[food_health_dummies.columns] = food_health_dummies

# Only the long "average" label is shortened; the other options keep their
# answer text as the column name.
X = X.rename(columns={'Neither healthy nor unhealthy (Average)': 'F_Average'}).drop(columns=[food_health_question])
X.head()

Unnamed: 0,Student,Occupation,Volunteering,Hour_Study,Hour_Sleep,Meal,Snack,Exercise,Exercise quality,Exercise_Time,...,Caffe,Vitamin,Sugar,Drug,F_None,F_Decline,Healthy,F_Average,Really Healthy,Unhealthy
0,1,1,0,15,7,3,2,1,Feel sore from exercise?,30,...,1,1,1,0,0,0,0,1,0,0
1,1,1,1,75,7,4,1,1,Sweat during exercise?;Feel sore from exercise?,60,...,1,0,0,0,0,0,1,0,0,0
2,1,1,1,15,7,3,1,1,Sweat during exercise?;Feel sore from exercise?,55,...,0,0,1,0,0,0,0,1,0,0
3,1,1,1,65,9,2,1,1,Sweat during exercise?;Feel sore from exercise?,45,...,0,0,0,1,0,0,1,0,0,0
4,1,0,0,0,8,3,1,0,None of the above,0,...,0,1,0,0,0,0,1,0,0,0


In [18]:
# Exercise Quality
# Multi-select question: one 0/1 indicator per option, set when the option's
# text appears in the answer.
exercise_answers = X['Exercise quality']
# Blank answers become '' so the substring tests never run against NaN (a
# per-row null check alone does not stop the later substring tests).
exercise_text = exercise_answers.fillna('').astype(str)

# indicator column -> text searched for (insertion order == output column order)
exercise_options = {
    'Sweat': "Sweat during exercise?",
    'Sore': 'Feel sore from exercise?',
    'E_None': 'None of the above',
}
for flag, option in exercise_options.items():
    X[flag] = exercise_text.str.contains(option, regex=False).astype(int)

# Declined: the answer is blank or mentions "Decline".
X['E_Decline'] = (exercise_answers.isnull() | exercise_text.str.contains("Decline", regex=False)).astype(int)

X = X.drop(columns=['Exercise quality'])
X.head()

Unnamed: 0,Student,Occupation,Volunteering,Hour_Study,Hour_Sleep,Meal,Snack,Exercise,Exercise_Time,Please check the box if you experienced it today,...,F_None,F_Decline,Healthy,F_Average,Really Healthy,Unhealthy,Sweat,Sore,E_None,E_Decline
0,1,1,0,15,7,3,2,1,30,Muscle Cramps or Soreness,...,0,0,0,1,0,0,0,1,0,0
1,1,1,1,75,7,4,1,1,60,Dry Eyes;None of the above,...,0,0,1,0,0,0,1,1,0,0
2,1,1,1,15,7,3,1,1,55,Acne;Headaches,...,0,0,0,1,0,0,1,1,0,0
3,1,1,1,65,9,2,1,1,45,Dry Skin;Dry Mouth;Dry Eyes;Acne;Muscle Cramps...,...,0,0,1,0,0,0,1,1,0,0
4,1,0,0,0,8,3,1,0,0,None of the above,...,0,0,1,0,0,0,0,0,1,0


In [19]:
# Minutes of Exercise
# Fill missing answers with the mean of the answered ones (each answer is cast
# to int before averaging).
missing_minutes = X.Exercise_Time.isnull()
mean_minutes = X['Exercise_Time'].dropna().astype(int).mean()
X.loc[missing_minutes, 'Exercise_Time'] = mean_minutes

# Bucket the minutes; values outside the edges (e.g. above 180) become NaN.
minute_edges = [-1, 0, 15, 30, 45, 60, 75, 90, 180]
minute_labels = [0, 15, 30, 45, 60, 75, 90, 120]
X['Exercise_Time'] = pd.cut(X['Exercise_Time'].astype(int), minute_edges, labels=minute_labels)
X.head()

Unnamed: 0,Student,Occupation,Volunteering,Hour_Study,Hour_Sleep,Meal,Snack,Exercise,Exercise_Time,Please check the box if you experienced it today,...,F_None,F_Decline,Healthy,F_Average,Really Healthy,Unhealthy,Sweat,Sore,E_None,E_Decline
0,1,1,0,15,7,3,2,1,30,Muscle Cramps or Soreness,...,0,0,0,1,0,0,0,1,0,0
1,1,1,1,75,7,4,1,1,60,Dry Eyes;None of the above,...,0,0,1,0,0,0,1,1,0,0
2,1,1,1,15,7,3,1,1,60,Acne;Headaches,...,0,0,0,1,0,0,1,1,0,0
3,1,1,1,65,9,2,1,1,45,Dry Skin;Dry Mouth;Dry Eyes;Acne;Muscle Cramps...,...,0,0,1,0,0,0,1,1,0,0
4,1,0,0,0,8,3,1,0,0,None of the above,...,0,0,1,0,0,0,0,0,1,0


In [20]:
# Body Physical Experience Today
# Multi-select question: one 0/1 indicator per option, set when the option's
# text appears in the answer.
physical_answers = X['Please check the box if you experienced it today']
# Blank answers become '' so the substring tests never run against NaN.
physical_text = physical_answers.fillna('').astype(str)

# indicator column -> text searched for (insertion order == output column order)
physical_options = {
    'Dry_skin': "Dry Skin",
    'Dry_mouth': "Dry Mouth",
    'Dry_eyes': "Dry Eyes",
    'Acne': "Acne",
    'Muscle': "Muscle Cramps",
    'Headache': "Headaches",
    'T_None': "None",
}
for flag, option in physical_options.items():
    X[flag] = physical_text.str.contains(option, regex=False).astype(int)

# Declined: the ANSWER is blank or mentions "Decline". The null test has to
# look at the answer itself; the row position can never be null.
X['T_Decline'] = (physical_answers.isnull() | physical_text.str.contains("Decline", regex=False)).astype(int)

X = X.drop(columns=['Please check the box if you experienced it today'])
X.head()

Unnamed: 0,Student,Occupation,Volunteering,Hour_Study,Hour_Sleep,Meal,Snack,Exercise,Exercise_Time,Laugh,...,E_None,E_Decline,Dry_skin,Dry_mouth,Dry_eyes,Acne,Muscle,Headache,T_None,T_Decline
0,1,1,0,15,7,3,2,1,30,1,...,0,0,0,0,0,0,1,0,0,0
1,1,1,1,75,7,4,1,1,60,1,...,0,0,0,0,1,0,0,0,1,0
2,1,1,1,15,7,3,1,1,60,1,...,0,0,0,0,0,1,0,1,0,0
3,1,1,1,65,9,2,1,1,45,1,...,0,0,1,1,1,1,1,0,0,0
4,1,0,0,0,8,3,1,0,0,0,...,1,0,0,0,0,0,0,0,1,0


In [21]:
# Positive Feelings
# Multi-select question: one 0/1 indicator per feeling (the column name is the
# text searched for), plus "none", "declined" and a count of feelings ticked.
positive_question = "Which of these did you feel today, before taking this survey?"
positive_answers = X[positive_question]
# Blank answers become '' so the substring tests never run against NaN (a
# per-row null check alone does not stop the later substring tests).
positive_text = positive_answers.fillna('').astype(str)

positive_feelings = ['Energized', 'Confident', 'Thankful', 'Relaxed', 'Excited', 'Productive',
                     'Focused', 'Peaceful', 'Happy', 'Hopeful', 'Positive']
for feeling in positive_feelings:
    X[feeling] = positive_text.str.contains(feeling, regex=False).astype(int)

X['Pos_None'] = positive_text.str.contains("None", regex=False).astype(int)
# Declined: the answer is blank or mentions "Decline".
X['Pos_Decline'] = (positive_answers.isnull() | positive_text.str.contains("Decline", regex=False)).astype(int)
# Number of positive feelings ticked by the respondent.
X['Pos_Total'] = X[positive_feelings].sum(axis=1)

X = X.drop(columns=[positive_question])
X.head()

Unnamed: 0,Student,Occupation,Volunteering,Hour_Study,Hour_Sleep,Meal,Snack,Exercise,Exercise_Time,Laugh,...,Excited,Productive,Focused,Peaceful,Happy,Hopeful,Positive,Pos_None,Pos_Decline,Pos_Total
0,1,1,0,15,7,3,2,1,30,1,...,1,1,1,0,0,0,1,0,0,4
1,1,1,1,75,7,4,1,1,60,1,...,1,1,1,1,1,1,1,0,0,11
2,1,1,1,15,7,3,1,1,60,1,...,0,1,1,1,0,0,1,0,0,4
3,1,1,1,65,9,2,1,1,45,1,...,0,0,0,0,0,0,0,1,0,0
4,1,0,0,0,8,3,1,0,0,0,...,0,0,0,1,0,1,1,0,0,4


In [22]:
# Negative Feelings
# Multi-select question: one 0/1 indicator per feeling (the column name is the
# text searched for), plus "none", "declined" and a count of feelings ticked.
negative_question = "Which of these did you feel today, before taking this survey?.1"
negative_answers = X[negative_question]
# Blank answers become '' so the substring tests never run against NaN (a
# per-row null check alone does not stop the later substring tests).
negative_text = negative_answers.fillna('').astype(str)

negative_feelings = ['Negative', 'Hopeless', 'Frustrated', 'Anxious', 'Scared', 'Tired',
                     'Stressed', 'Distracted', 'Upset', 'Angry', 'Jealous', 'Lonely']
for feeling in negative_feelings:
    X[feeling] = negative_text.str.contains(feeling, regex=False).astype(int)

# "Hungry" is accepted in either capitalisation (it also occurs in free-text
# answers such as "I also felt hungry").
X['Hungry'] = (negative_text.str.contains("Hungry", regex=False)
               | negative_text.str.contains('hungry', regex=False)).astype(int)

X['Neg_None'] = negative_text.str.contains("None", regex=False).astype(int)
# Declined: the answer is blank or mentions "Decline".
X['Neg_Decline'] = (negative_answers.isnull() | negative_text.str.contains("Decline", regex=False)).astype(int)
# Number of negative feelings ticked by the respondent (including Hungry).
X['Neg_Total'] = X[negative_feelings + ['Hungry']].sum(axis=1)

X = X.drop(columns=[negative_question])
X.head()

Unnamed: 0,Student,Occupation,Volunteering,Hour_Study,Hour_Sleep,Meal,Snack,Exercise,Exercise_Time,Laugh,...,Stressed,Distracted,Upset,Angry,Jealous,Lonely,Hungry,Neg_None,Neg_Decline,Neg_Total
0,1,1,0,15,7,3,2,1,30,1,...,1,0,0,0,0,0,1,0,0,4
1,1,1,1,75,7,4,1,1,60,1,...,1,0,0,0,0,0,0,0,0,1
2,1,1,1,15,7,3,1,1,60,1,...,0,0,1,0,0,0,0,0,0,2
3,1,1,1,65,9,2,1,1,45,1,...,1,0,0,1,0,1,0,0,0,6
4,1,0,0,0,8,3,1,0,0,0,...,1,1,0,0,0,0,0,0,0,3


In [23]:
# Friends
# Fill missing answers with the mean of the answered ones (each answer is cast
# to int before averaging).
missing_friends = X.Friends.isnull()
mean_friends = X['Friends'].dropna().astype(int).mean()
X.loc[missing_friends, 'Friends'] = mean_friends

# Exact counts up to 5, then two wide buckets; values above 30 become NaN.
friend_edges = [-1, 0, 1, 2, 3, 4, 5, 10, 30]
friend_labels = [0, 1, 2, 3, 4, 5, 10, 20]
X['Friends'] = pd.cut(X['Friends'].astype(int), friend_edges, labels=friend_labels)
X.head()

Unnamed: 0,Student,Occupation,Volunteering,Hour_Study,Hour_Sleep,Meal,Snack,Exercise,Exercise_Time,Laugh,...,Stressed,Distracted,Upset,Angry,Jealous,Lonely,Hungry,Neg_None,Neg_Decline,Neg_Total
0,1,1,0,15,7,3,2,1,30,1,...,1,0,0,0,0,0,1,0,0,4
1,1,1,1,75,7,4,1,1,60,1,...,1,0,0,0,0,0,0,0,0,1
2,1,1,1,15,7,3,1,1,60,1,...,0,0,1,0,0,0,0,0,0,2
3,1,1,1,65,9,2,1,1,45,1,...,1,0,0,1,0,1,0,0,0,6
4,1,0,0,0,8,3,1,0,0,0,...,1,1,0,0,0,0,0,0,0,3


In [24]:
# How Many Talk to People in the Past Day
# Start every indicator at 0, then switch on the ones found in each answer.
for talk_column in ('Talk_10', 'Talk_More', 'Talk_2F', 'Talk_None', 'Talk_Decline'):
    X[talk_column] = np.zeros(len(X.Student)).astype(int)

# indicator column -> text searched for in the answer
talk_markers = (
    ('Talk_10', 'Had a conversation of at least 10'),
    ('Talk_More', 'Talked to someone for more'),
    ('Talk_2F', 'Talked to at least 2 friends'),
    ('Talk_None', 'None'),
)

for row, answer in enumerate(X["In the past day, have you..."]):
    # A blank or declined answer sets only Talk_Decline.
    if pd.isnull(answer) or "Decline" in answer:
        X.loc[row, 'Talk_Decline'] = 1
        continue
    for talk_column, marker in talk_markers:
        if marker in answer:
            X.loc[row, talk_column] = 1

X.drop(['In the past day, have you...'], axis=1, inplace=True)
X.head()

Unnamed: 0,Student,Occupation,Volunteering,Hour_Study,Hour_Sleep,Meal,Snack,Exercise,Exercise_Time,Laugh,...,Lonely,Hungry,Neg_None,Neg_Decline,Neg_Total,Talk_10,Talk_More,Talk_2F,Talk_None,Talk_Decline
0,1,1,0,15,7,3,2,1,30,1,...,0,1,0,0,4,1,0,0,0,0
1,1,1,1,75,7,4,1,1,60,1,...,0,0,0,0,1,1,1,1,0,0
2,1,1,1,15,7,3,1,1,60,1,...,0,0,0,0,2,1,0,0,0,0
3,1,1,1,65,9,2,1,1,45,1,...,1,0,0,0,6,1,1,1,0,0
4,1,0,0,0,8,3,1,0,0,0,...,0,0,0,0,3,1,1,1,0,0


In [25]:
# Relationship
# One-hot encode the answer. dtype=int keeps the indicators as 0/1 on every
# pandas version (newer releases default to True/False).
relationship_dummies = pd.get_dummies(X['In a relationship?'], dtype=int)
X[relationship_dummies.columns] = relationship_dummies

# NOTE(review): the 'No' / 'Yes' indicators keep their bare answer text as the
# column name; they are left unchanged here because the exported CSV columns
# may be read by name elsewhere -- confirm before renaming.
relationship_names = {"It's Complicated": 'R_Complicated', 'Decline to answer': 'R_Decline'}
X = X.rename(columns=relationship_names).drop(columns=['In a relationship?'])
X.head()

Unnamed: 0,Student,Occupation,Volunteering,Hour_Study,Hour_Sleep,Meal,Snack,Exercise,Exercise_Time,Laugh,...,Neg_Total,Talk_10,Talk_More,Talk_2F,Talk_None,Talk_Decline,R_Decline,R_Complicated,No,Yes
0,1,1,0,15,7,3,2,1,30,1,...,4,1,0,0,0,0,0,0,1,0
1,1,1,1,75,7,4,1,1,60,1,...,1,1,1,1,0,0,0,0,0,1
2,1,1,1,15,7,3,1,1,60,1,...,2,1,0,0,0,0,1,0,0,0
3,1,1,1,65,9,2,1,1,45,1,...,6,1,1,1,0,0,0,0,1,0
4,1,0,0,0,8,3,1,0,0,0,...,3,1,1,1,0,0,0,0,1,0


### Export to csv files

In [26]:
# Save both frames next to the notebook: X holds the features, Y the labels.
# The index is written too (to_csv default).
for frame, csv_name in ((X, "X_df.csv"), (Y, "Y_df.csv")):
    frame.to_csv(csv_name)