In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import re
from re import match, search
from time import gmtime, strftime

In [2]:
# import csv dataset as pandas dataframe
odi_dataframe = pd.read_csv("../data/ODI-2018.csv", sep=',')

In [3]:
#check header
print(odi_dataframe.head())

           Timestamp               What programme are you in?  \
0                NaN                                      NaN   
1  4/5/2018 11:22:56  Duisenberg Quantitative Risk Management   
2  4/5/2018 11:23:04                         Computer Science   
3  4/5/2018 11:23:06                       Business Analytics   
4  4/5/2018 11:23:50                                       BA   

  Have you taken a course on machine learning?  \
0                                          NaN   
1                                           no   
2                                           no   
3                                          yes   
4                                          yes   

  Have you taken a course on information retrieval?  \
0                                               NaN   
1                                                 0   
2                                                 0   
3                                                 1   
4                                

In [4]:
#copy to working dataframe
odi_df = odi_dataframe.copy()
#We an irrelevant first row with NaN values: let's remove it
odi_df = odi_df.iloc[1:]

In [5]:
# Changing the headers
odi_df.rename(columns={'Time you went to be Yesterday':'bedtime', 'When is your birthday (date)?':'Birthday', 'What is your gender?':'gender', 'What programme are you in?':'program', 'Have you taken a course on machine learning?':'MLcourse', 'Have you taken a course on statistics?':'StatsCourse', 'Have you taken a course on information retrieval?':'IRcourse', 'Have you taken a course on databases?':'DBcourse'}, inplace=True)
odi_df.rename(columns={'Number of neighbors sitting around you?':'neighbors', 'Did you stand up?':'stand', 'Give a random number':'rndnum', 'What makes a good day for you (1)?':'goodDay1', 'What makes a good day for you (2)?':'goodDay2', 'Chocolate makes you.....':'choco_op', 'You can get £100 if you win a local DM competition, or we don’t hold any competitions and I give everyone some money (not the same amount!). How much do you think you would get then? ':'money'}, inplace=True)

In [6]:
print(odi_df.head())

           Timestamp                                        program MLcourse  \
1  4/5/2018 11:22:56        Duisenberg Quantitative Risk Management       no   
2  4/5/2018 11:23:04                               Computer Science       no   
3  4/5/2018 11:23:06                             Business Analytics      yes   
4  4/5/2018 11:23:50                                             BA      yes   
5  4/5/2018 11:23:59  Master Computer Science: Big Data Engineering       no   

  IRcourse StatsCourse DBcourse gender  \
1        0          mu      nee   male   
2        0     unknown       ja   male   
3        1       sigma       ja   male   
4        1          mu       ja   male   
5        0       sigma       ja   male   

                                    choco_op        Birthday neighbors stand  \
1                                    neither      10/12/1994     49000    no   
2  I have no idea what you are talking about      06-08-1993       100   yes   
3                         

In [7]:
# Function for standardizing free response features
def set_to_standard(series, regexs, std_value):
   
   changed_values = []
   indices = []
   std_series = series.copy()
   
   for index, value in std_series.iteritems():
      for regex in regexs:
         
         if index in indices: #prevents an entry to be counted twice
            continue
         m = match(regex, value) #find match for any of the reg. expressions
         
      if m is not None: # if there's a match
         changed_values.append(value)
         indices.append(index)
         
   changed_values = pd.Series((v for v in changed_values))

   print("The following " + str(len(changed_values)) + " values have been changed to: " + std_value)
   print("")
   
   print(changed_values.value_counts())
   
   std_series[indices] = std_value
   
   return std_series

Now we're going to start filtering the free response features using the above function. The first will be goodDay1.

In [8]:
# Filter for mention of weather
sun_regexs = [r"(.*[Ss]un).*|.*[Ww]eather.*|.*[Cc]loud.*|.*(?<![Tt])[Rr]ain.*|.*[Ss]now.*|.*[Ww]ind.*|[Ss]tor"]
standard_1 = set_to_standard(odi_df.goodDay1, sun_regexs, 'weather')

The following 51 values have been changed to: weather

Sun                          14
Sunshine                      6
sunshine                      2
nice weather                  2
Sunny weather                 2
Nice weather                  2
good weather                  2
sun                           2
sunny weather                 2
Good weather                  2
Sunny day                     2
No rain                       1
Sunshine, structure           1
If the sun shines             1
Sunny weather                 1
Nice weather                  1
Snow                          1
Sunny                         1
no rain                       1
Sun, music, friends, beer     1
Weather                       1
Cloudy skies                  1
Sun                           1
The Sun                       1
dtype: int64


In [9]:
# Filter for any mention of food.
food_regexs = [r".*([Ff]ood).*|.*[Bb]reakfast.*|.*[Pp]ancakes.*|.*[Dd]inner.*|.*[Cc]heese.*|.*[Pp]izza.*|.*[Cc]hocolate.*|[Mm]ac.*|[Ss]ush|.*[Ll]unch.*"]
standard_2 = set_to_standard(standard_1, food_regexs, 'food')

The following 43 values have been changed to: food

Food                      11
Good food                  5
Chocolate                  4
Nice food                  2
good food                  2
Pizza                      2
Receiving food             1
good dinner                1
Cheese                     1
Good fuckin breakfast      1
Good dinner                1
Good food                  1
Pancakes                   1
Good foodd                 1
Tasty food                 1
Plenty food                1
Pancakes for breakfast     1
Some food                  1
Free cheese                1
nice food                  1
food                       1
Mac Donalds                1
chocolate                  1
dtype: int64


In [10]:
# Filter for sex
sex_regexs = [r'.*[Ss]ex.*']
standard_3 = set_to_standard(standard_2, sex_regexs, 'sex')

The following 8 values have been changed to: sex

Sex            6
sex            1
Morning sex    1
dtype: int64


In [11]:
# Filter for coffee
coffee_regexs = [r'.*[Cc]offe.*']
standard_4 = set_to_standard(standard_3, coffee_regexs, 'coffee')

The following 12 values have been changed to: coffee

Coffee         10
Good coffee     1
coffee          1
dtype: int64


In [12]:
# Filter for sleep
sleep_regexs = [r'.*[Ss]leep.*|.*[Ss]lept.*|.*[Rr]est.*|.*[Ww]ak.*|[Nn]ot.*[Ee]arly.*|[Nn]o.*[Aa]larm.*|.*[Bb]ed.*']
standard_5 = set_to_standard(standard_4, sleep_regexs, 'sleep')

The following 28 values have been changed to: sleep

Good sleep                    6
Sleep                         5
having a good sleep           1
sleeping well                 1
No alarm in the morning       1
Wake up without A alarm       1
Sleeping in                   1
Rest                          1
Get some good sleep           1
sleeping                      1
Enough sleep                  1
Having slept well             1
Wake up time                  1
Waking up not tired           1
Good night rest               1
waking up well rested         1
sleep                         1
Not having to get up early    1
A good sleep                  1
dtype: int64


In [13]:
# sport
sport_regexs = [r'.*[Ss]port.*|.*[Ee]xerc.*|.*[Ff]ootball.*|.*[Dd]anc.*|.*[Gg]ym.*']
standard_6 = set_to_standard(standard_5, sport_regexs, 'sport')

The following 6 values have been changed to: sport

Sports              4
Playing football    1
Dance               1
dtype: int64


In [14]:
# productivity
prod_regexs = [r'.*[Pp]rod.*|[Ww]ork.*|.*[Aa]chiev.*|.*[Ss]ucce.*|.*[Ff]in.*|.*[Gg]rade.*|.*[Ss]atis|.*[Ll]ear|.*[Cc]hal.*|.*[Cc]ompl.*|[Aa]ss.*|.*[Dd]on.*']
standard_7= set_to_standard(standard_6, prod_regexs, 'productivity')

The following 14 values have been changed to: productivity

A productive day         2
Success                  1
Achievement              1
Finish my daily tasks    1
Nice grades              1
productivity             1
Achieving something      1
Getting a good grade     1
Work                     1
Succeed in dayly goal    1
Productivity             1
Productive               1
Good grades              1
dtype: int64


In [15]:
# Relaxing
relax_regexs = [r'.*[Rr]elax.*|.*[Ff]ree.*|.*[Tt]ime.*|.*[Hh]ol.*|.*[Nn]o.*(ass.*|scho.*|homew.*|lect.*|do.*|stud.*|stre.*|wor.*|obl.*|resp.*|col.*)|[Dd]o.*no.*']
standard_8 = set_to_standard(standard_7, relax_regexs, 'relaxing')

The following 13 values have been changed to: relaxing

Freedom                      2
Free time                    2
Me-Time                      1
No assigments                1
No school/work               1
No time series assignment    1
Holiday                      1
No lecture at 9              1
No responsibilities          1
No homework                  1
afternoon lectures           1
dtype: int64


In [16]:
# Social
social_regexs = [r'.*[Ff]riend.*|.*[Hh]ang.*|.*[Ff]un.*|.*[Cc]ompany.*|.*[Ll]augh|.*[Bb]eer.*|.*[Pp]eop.*']
standard_9 = set_to_standard(standard_8, social_regexs, 'social')

The following 16 values have been changed to: social

Fun                                          2
Beer                                         2
The foresight of a night out with friends    1
Friends                                      1
Good company                                 1
Friendly encounters                          1
Seeing friends                               1
Laughs                                       1
People miding their own business             1
beer                                         1
Laughing                                     1
Hanging out                                  1
Kind people around me                        1
Fun fun fun                                  1
dtype: int64


In [17]:
# Media
media_regexs = [r'.*[Nn]et.*|.*[Mm]us.*|.*[Mm]em.*|.*[Gg]am.*|.*[Mm]ov.*']
standard_10 = set_to_standard(standard_9, media_regexs, 'media')

The following 2 values have been changed to: media

Music    2
dtype: int64


In [18]:
other_regexs = [r'^(?!.*(productivity|food|coffee|weather|social|relaxing|sport|sex|sleep|media))']
standard_11 = set_to_standard(standard_10, other_regexs, 'other')

The following 24 values have been changed to: other

-                         2
Saturday                  2
got a 8.5                 1
Guest lecture by Tupac    1
Bunnies                   1
Things to do              1
A good book               1
Morning tea               1
Na                        1
Energy                    1
Not coming to VU          1
Cats                      1
1                         1
Getting my salary         1
cats                      1
Relief                    1
Temptation Island         1
Nothing                   1
Puppies                   1
Jezus man                 1
Fox news                  1
A                         1
dtype: int64


In [19]:
# Now here are the counts for the standardized values for goodDay1
standard_11.value_counts()

weather         51
food            43
sleep           28
other           24
social          16
productivity    14
relaxing        13
coffee          12
sex              8
sport            6
media            2
Name: goodDay1, dtype: int64

In [20]:
# And here is the total list
print(standard_11)

1      productivity
2             other
3              food
4             other
5             sleep
6              food
7              food
8           weather
9           weather
10         relaxing
11           coffee
12            other
13          weather
14     productivity
15             food
16             food
17            sleep
18            sleep
19            sleep
20             food
21             food
22             food
23              sex
24            other
25     productivity
26           coffee
27             food
28           coffee
29             food
30             food
           ...     
188            food
189           sleep
190         weather
191        relaxing
192            food
193    productivity
194         weather
195        relaxing
196             sex
197           sleep
198         weather
199           sleep
200           sleep
201           sport
202    productivity
203           sleep
204          coffee
205           sleep
206           other


Now we're going to do the same thing with goodDay2

In [21]:
# Filter for mention of weather
sun_regexs = [r"(.*[Ss]un).*|.*[Ww]eather.*|.*[Cc]loud.*|.*(?<![Tt])[Rr]ain.*|.*[Ss]now.*|.*[Ww]ind.*|[Ss]tor"]
stand_1 = set_to_standard(odi_df.iloc[:,15], sun_regexs, 'weather')

The following 26 values have been changed to: weather

Sun                                           4
Sunshine                                      4
sunshine                                      2
Sunday                                        2
sun                                           2
Good and sunny weather                        1
weather                                       1
Sunny day                                     1
Storm                                         1
nice weather                                  1
Good weather                                  1
No wind                                       1
Weather                                       1
Sun shining                                   1
Following 4 courses at the same time, rain    1
Nice weather                                  1
Rain                                          1
dtype: int64


In [22]:
# Try to filter for any mention of food.
food_regexs = [r".*([Ff]ood).*|.*[Bb]reakfast.*|.*[Pp]ancakes.*|.*[Dd]inner.*|.*[Cc]heese.*|.*[Pp]izza.*|.*[Cc]hocolate.*|[Mm]ac.*|[Ss]ush|.*[Ll]unch.*"]
stand_2 = set_to_standard(stand_1, food_regexs, 'food')

The following 37 values have been changed to: food

Good food         7
Food              7
Chocolate         3
Pizza             2
good food         2
Nice food         2
Good food         1
Goos food         1
Cheese            1
Good dinner       1
Sushi             1
pizza             1
Lot of foodd      1
Lunch             1
sushi             1
Decent dinner     1
Asian food        1
More chocolate    1
Breakfast         1
Food              1
dtype: int64


In [23]:
# Filter for sex
sex_regexs = [r'.*[Ss]ex.*']
stand_3 = set_to_standard(stand_2, sex_regexs, 'sex')

The following 4 values have been changed to: sex

Sex          2
sex          1
Good sex     1
dtype: int64


In [24]:
# Filter for coffee
coffee_regexs = [r'.*[Cc]offe.*']
stand_4 = set_to_standard(stand_3, coffee_regexs, 'coffee')

The following 11 values have been changed to: coffee

Coffee          5
Good coffee     2
coffee          2
good coffee     1
Coffe           1
dtype: int64


In [25]:
# Filter for sleep
sleep_regexs = [r'.*[Ss]leep.*|.*[Ss]lept.*|.*[Rr]est.*|.*[Ww]ak.*|[Nn]ot.*[Ee]arly.*|[Nn]o.*[Aa]larm.*|.*[Bb]ed.*']
stand_5 = set_to_standard(stand_4, sleep_regexs, 'sleep')

The following 21 values have been changed to: sleep

Good sleep                               4
Sleep                                    3
Having the house quiet ehrn i wake up    1
get fast asleep quickly                  1
Wake-up time                             1
sleep                                    1
enough sleep                             1
Well rested and structured               1
No alarm                                 1
Plenty sleep                             1
A good nice sleep                        1
Good night sleep                         1
sleeping in                              1
Sleep well                               1
Getting to bed early                     1
A good sleep                             1
dtype: int64


In [26]:
# sport
sport_regexs = [r'.*[Ss]port.*|.*[Ee]xerc.*|.*[Ff]ootball.*|.*[Dd]anc.*|.*[Gg]ym.*']
stand_6 = set_to_standard(stand_5, sport_regexs, 'sport')

The following 10 values have been changed to: sport

Sports              3
Exercise            2
Going to the gym    2
Football            1
Sport               1
Go to the gym       1
dtype: int64


In [27]:
# productivity
prod_regexs = [r'.*[Pp]rod.*|[Ww]ork.*|.*[Aa]chiev.*|.*[Ss]ucce.*|.*[Ff]in.*|.*[Gg]rade.*|.*[Ss]atis|.*[Ll]ear|.*[Cc]hal.*|.*[Cc]ompl.*|[Aa]ss.*|.*[Dd]on.*']
stand_7= set_to_standard(stand_6, prod_regexs, 'productivity')

The following 13 values have been changed to: productivity

Being productive            1
Productive                  1
Learning something          1
Get satisfaction at work    1
Completing my task          1
All grades                  1
assignmeny done             1
Achievement                 1
Productivity                1
Finishing some work         1
Challenge                   1
Good grades                 1
Getting work done           1
dtype: int64


In [28]:
# Relaxing
relax_regexs = [r'.*[Rr]elax.*|.*[Ff]ree.*|.*[Tt]ime.*|.*[Hh]ol.*|.*[Nn]o.*(ass.*|scho.*|homew.*|lect.*|do.*|stud.*|stre.*|wor.*|obl.*|resp.*|col.*)|[Dd]o.*no.*']
stand_8 = set_to_standard(stand_7, relax_regexs, 'relaxing')

The following 32 values have been changed to: relaxing

No stress                    3
Free time                    2
No lectures                  2
Relaxation                   2
Relax                        2
Freedom                      2
free time                    1
no work appointments         1
Doing nothing                1
Not too much to do           1
More free time               1
Not studying                 1
Not having Anything to do    1
some free time               1
Bike not braking down        1
fun time                     1
free day                     1
No school                    1
Noting to do                 1
Have nothing to do           1
Free time                    1
No worries                   1
No school                    1
no obligations               1
No college                   1
dtype: int64


In [29]:
# Social
social_regexs = [r'.*[Ff]riend.*|.*[Hh]ang.*|.*[Ff]un.*|.*[Cc]ompany.*|.*[Ll]augh|.*[Bb]eer.*|.*[Pp]eop.*']
stand_9 = set_to_standard(stand_8, social_regexs, 'social')

The following 22 values have been changed to: social

Beer                                                            4
Friends                                                         4
A day with company                                              2
beer                                                            2
Meeting with friends                                            1
laughter                                                        1
The taste of that first sip of cold beer on a hot summer day    1
Lovely people                                                   1
Nice people                                                     1
Being with my friends                                           1
friends                                                         1
Be around nice people                                           1
Meeting awesome people                                          1
a lot of fun                                                    1
dtype: int64


In [30]:
# Media
media_regexs = [r'.*[Nn]et.*|.*[Mm]us.*|.*[Mm]em.*|.*[Gg]am.*|.*[Mm]ov.*']
stand_10 = set_to_standard(stand_9, media_regexs, 'media')

The following 8 values have been changed to: media

Netflix          2
Music            2
Video games      1
Watch a movie    1
Gaming           1
Memes            1
dtype: int64


In [31]:
other_regexs = [r'^(?!.*(productivity|food|coffee|weather|social|relaxing|sport|sex|sleep|media))']
stand_11 = set_to_standard(stand_10, other_regexs, 'other')

The following 33 values have been changed to: other

-                                   2
Money                               2
Drinks                              1
weekend                             1
B                                   1
got 2 8.5s                          1
Seeing you *kiss*                   1
new experiences                     1
0 heart attacks                     1
no pain                             1
Na                                  1
Nothing, I hate life                1
Flying                              1
A good day                          1
Te diep                             1
If I got up with the correct leg    1
Wine                                1
Activities                          1
A blunt                             1
Smiles                              1
Staying up late                     1
Good news                           1
Drugs                               1
.                                   1
Winning the lotery                 

In [32]:
stand_11.head()

1     sleep
2     other
3     sport
4     other
5    social
Name: goodDay2, dtype: object

Put the two together to create a new dataframe!

In [33]:
# Start a new dataframe to store the stadardized values
standard = pd.DataFrame(data=standard_11)
standard['goodDay2'] = stand_11.values
print(standard.head())

       goodDay1 goodDay2
1  productivity    sleep
2         other    other
3          food    sport
4         other    other
5         sleep   social


Now we're going to standardize the program feature

In [34]:
ai_regexs = [r"(Msc)?.*A(rtificial)?.?( )?[Ii].*(?<!premaster)$"] #WARNING premaster to remove
standard_1 = set_to_standard(odi_df.iloc[:,1], ai_regexs, 'AI')

The following 50 values have been changed to: AI

AI                                                    37
Ai                                                     2
Artificial Intelligence                                2
MSc AI and MSc CLS                                     1
Artificial intelligence                                1
Msc Artificial intelligence                            1
Artificial Intelligence (Socially Aware Computing)     1
AI (VU version)                                        1
A. I.                                                  1
AI (Cognitive Sciences)                                1
MSc Artificial Intelligence                            1
AI VU                                                  1
dtype: int64


In [35]:
bio_regexs = [r".*[Bb]io"]   # Bioinformatics
standard_2 = set_to_standard(standard_1, bio_regexs, 'BI')

The following 28 values have been changed to: BI

Bioinformatics                              3
Bioinformatics and Systems Biology          3
MSc Bioinformatics and Systems Biology      2
Bioinformatics and systems biology          2
Bioinformatics                              2
Bioinformatics and Systems Biology          2
Msc. Bioinformatics and Systems Biology     1
MSc Bioinformatics                          1
Bioinformatics and Systems biology          1
Master Bionformatics and Systems Biology    1
Bioinformatics master                       1
MSC Bioinformatics                          1
Bioinformatics & Systems Biology            1
Bioinformatcis                              1
Bioinformatics and SysBio                   1
MA Bioinformatics                           1
System Biology and Bioinformatics           1
Bioinformatics & Systems biology            1
bioinformatics                              1
Bioinformatics & System Biology             1
dtype: int64


In [36]:
ba_regexs = [".*[Bb](usiness )?[Aa](nalytics)?"]
standard_3 = set_to_standard(standard_2, ba_regexs, 'BA')

The following 44 values have been changed to: BA

BA                                         26
Business Analytics                          9
Business Analytics                          3
Business analytics                          2
Business Analytics Msc                      1
business analytics                          1
Master Business Analytics                   1
Business Analytics/ operations research     1
dtype: int64


In [37]:
bde_regexs = [r".*[Ee]ngineering"] #this is: Big Data Engineering
standard_4 = set_to_standard(standard_3, bde_regexs, 'BDE')

The following 7 values have been changed to: BDE

Big data engineering                              2
Computer Science: Big Data Engineering            2
Master Computer Science: Big Data Engineering     1
Masters Computer Science(big data engineering)    1
Big Data Engineering                              1
dtype: int64


In [38]:
cls_regexs = [".*[cC]om(o)?p.*ational.*ience.*|[Cc][Ll][Ss]"]
standard_5 = set_to_standard(standard_4, cls_regexs, 'CLS') 

The following 22 values have been changed to: CLS

Computational Science                       12
CLS                                          2
Computational science                        2
Msc Computational science                    1
Comoputational science                       1
Cls                                          1
MSc Computational Science (Joint Degree)     1
Computational Science                        1
Computational Science (JD)                   1
dtype: int64


In [39]:
cs_regexs = [r".{0,4}(Computer Science.?|cs|CS)$"] # ^(?!.*(metrics))
standard_6 = set_to_standard(standard_5, cs_regexs, 'CS')

The following 14 values have been changed to: CS

CS                       9
Computer Science         3
cs                       1
MSc Computer Science     1
dtype: int64


In [40]:
ec_regexs = ["^.*(EOR|[Ee]conom(e)?trics|OR|Economics).*$"]
standard_7 = set_to_standard(standard_6, ec_regexs, 'EC')

The following 22 values have been changed to: EC

Econometrics                                   8
EOR                                            4
Econometrics                                   2
OR                                             2
Master Econometrics and operations research    1
Economics                                      1
M Financial Economtrics                        1
Master Econometrics & Operations Research      1
MSc Econometrics                               1
Econometrics & Operations Research             1
dtype: int64


In [41]:
qrm_regexs = [".*([Qq]uantit.*g[ea]ment|QRM)$"]
standard_8 = set_to_standard(standard_7, qrm_regexs, 'QRM')

The following 10 values have been changed to: QRM

QRM                                                       3
Quantitative Risk Management                              3
Finance DHP QRM                                           1
Duisenberg Quantitative Risk Management                   1
Duisenberg Honors Program Quantitative Risk Managament    1
Quantitative risk management                              1
dtype: int64


In [42]:
phd_regexs = ["^(PhD)"]
standard_9 = set_to_standard(standard_8, phd_regexs, 'PHD')

The following 4 values have been changed to: PHD

PhD                   2
PhD student at FGB    1
PhD student           1
dtype: int64


In [43]:
others_regexs = [r".*[^AI|BI|BA|BDE|CLS|CS|EC|QRM|PHD]$"]
standard_10 = set_to_standard(standard_9, others_regexs, 'others')

The following 12 values have been changed to: others

Exchange                            2
Mathematics Exchange                1
Physics                             1
AI premaster                        1
Mathematics                         1
Master Human Movement Science       1
Data Mining Techniques              1
Drug discovery and safety           1
Finance                             1
B Science, Business & Innovation    1
21-05-1995                          1
dtype: int64


In [44]:
others_regexs_2 = [r"MS|MPA|CSL"]
standard_11 = set_to_standard(standard_10, others_regexs_2, 'others')

The following 4 values have been changed to: others

MS     2
MPA    1
CSL    1
dtype: int64


In [45]:
standard_11.value_counts()

AI        50
BA        44
BI        28
CLS       22
EC        22
others    16
CS        14
QRM       10
BDE        7
PHD        4
Name: program, dtype: int64

In [47]:
standard['program'] = standard_11.values

In [48]:
print(standard.head())

       goodDay1 goodDay2 program
1  productivity    sleep     QRM
2         other    other      CS
3          food    sport      BA
4         other    other      BA
5         sleep   social     BDE


Excellent. Now standardize the binary features.

In [50]:
# Function for making all features either yes or no
def convert_yes_no(df):
    for i in range(len(df)):

        if df.values[i] == '0' or df.values[i] == 'sigma' or df.values[i] == 'nee':
            df.values[i] = 'no'

        elif df.values[i] == '1' or df.values[i] == 'mu' or df.values[i] == 'ja':
            df.values[i] = 'yes'

    return df

In [51]:
# Machine Learning course feature
mach_learn = convert_yes_no(odi_df.MLcourse)
print(mach_learn.head())

1     no
2     no
3    yes
4    yes
5     no
Name: MLcourse, dtype: object


In [52]:
standard['MLcourse'] = mach_learn.values
print(standard.head())

       goodDay1 goodDay2 program MLcourse
1  productivity    sleep     QRM       no
2         other    other      CS       no
3          food    sport      BA      yes
4         other    other      BA      yes
5         sleep   social     BDE       no


In [53]:
# information retrieval course feature
info_course = convert_yes_no(odi_df.IRcourse)
standard['IRcourse'] = info_course.values
print(standard.head())

       goodDay1 goodDay2 program MLcourse IRcourse
1  productivity    sleep     QRM       no       no
2         other    other      CS       no       no
3          food    sport      BA      yes      yes
4         other    other      BA      yes      yes
5         sleep   social     BDE       no       no


In [54]:
# stats course feature
stat = convert_yes_no(odi_df.StatsCourse)
standard['StatsCourse'] = stat.values
print(standard.head())

       goodDay1 goodDay2 program MLcourse IRcourse StatsCourse
1  productivity    sleep     QRM       no       no         yes
2         other    other      CS       no       no     unknown
3          food    sport      BA      yes      yes          no
4         other    other      BA      yes      yes         yes
5         sleep   social     BDE       no       no          no


In [55]:
# database course feature
db_course = convert_yes_no(odi_df.DBcourse)
standard['DBcourse'] = db_course.values
print(standard.head())

       goodDay1 goodDay2 program MLcourse IRcourse StatsCourse DBcourse
1  productivity    sleep     QRM       no       no         yes       no
2         other    other      CS       no       no     unknown      yes
3          food    sport      BA      yes      yes          no      yes
4         other    other      BA      yes      yes         yes      yes
5         sleep   social     BDE       no       no          no      yes


In [56]:
def convert_noIdea(df, unwanted):
    for i in range(len(df)):

        if df.values[i] == unwanted:
            df.values[i] = 'no idea'
    return df

In [57]:
choco = convert_noIdea(odi_df.choco_op, 'I have no idea what you are talking about')
standard['choco_op'] = choco.values
print(standard.head())

       goodDay1 goodDay2 program MLcourse IRcourse StatsCourse DBcourse  \
1  productivity    sleep     QRM       no       no         yes       no   
2         other    other      CS       no       no     unknown      yes   
3          food    sport      BA      yes      yes          no      yes   
4         other    other      BA      yes      yes         yes      yes   
5         sleep   social     BDE       no       no          no      yes   

  choco_op  
1  neither  
2  no idea  
3  neither  
4  no idea  
5  no idea  


In [58]:
standard['gender'] = odi_df.gender
print(standard.head())

       goodDay1 goodDay2 program MLcourse IRcourse StatsCourse DBcourse  \
1  productivity    sleep     QRM       no       no         yes       no   
2         other    other      CS       no       no     unknown      yes   
3          food    sport      BA      yes      yes          no      yes   
4         other    other      BA      yes      yes         yes      yes   
5         sleep   social     BDE       no       no          no      yes   

  choco_op gender  
1  neither   male  
2  no idea   male  
3  neither   male  
4  no idea   male  
5  no idea   male  


In [59]:
standard['stand'] = odi_df.stand
print(standard.head())

       goodDay1 goodDay2 program MLcourse IRcourse StatsCourse DBcourse  \
1  productivity    sleep     QRM       no       no         yes       no   
2         other    other      CS       no       no     unknown      yes   
3          food    sport      BA      yes      yes          no      yes   
4         other    other      BA      yes      yes         yes      yes   
5         sleep   social     BDE       no       no          no      yes   

  choco_op gender stand  
1  neither   male    no  
2  no idea   male   yes  
3  neither   male    no  
4  no idea   male    no  
5  no idea   male    no  


Now standardize the bedtime

In [60]:
def remove_AmPm(series):
    new_series = []
    for line in series:
        
        new1 = line.replace('m', '')
        new2 = new1.replace('p', '')
        new3 = new2.replace('a', '')
        new4 = new3.replace('A', '')
        new5 = new4.replace('M', '')
        new6 = new5.replace('P', '')
        new7 = new6.replace(' ', '')
        new8 = new7.replace('.', ':')
        new_series.append(new8)
        
    return new_series

# Try to get rid of any am/pm that might be present
Series1 = remove_AmPm(odi_df.bedtime)

In [61]:
def add_zeros(series):
    new_series = []
    
    justone = r'\d{1,2}(?!(:|\d))'

    
    for line in series:
        m = match(justone, line)
        
        if m is not None:
            line+=':00'
            
        new_series.append(line)
          
    return new_series

Series2 = add_zeros(Series1)

In [62]:
def finish(series):
    series_1 = []
    series_2 = []
    series_3 = []
    # This should specify for 3:0 which we need to change to 3:00
    twodigits = r'\d*:\d{1}(?!\d)'
    
    # Put a zero in front of 1-9:00
    firstdigits = r'\d{1}(?<=\d):\d{2}'

    
    for line in series:
        m = match(twodigits, line)
        
        if m is not None:
            line+='0'
            
        series_1.append(line)
        
    for line in series_1:
        m = match(firstdigits, line)
        if m is not None:
            new_value = '0'+line
            
        else:
            new_value = line
        series_2.append(new_value)
            

    return series_2

Series3 = finish(Series2)


In [63]:
def insertChar(mystring, position, chartoinsert ):
    longi = len(mystring)
    mystring   =  mystring[:position] + chartoinsert + mystring[position:] 
    return mystring   

def addcolon(series):
    new_series = []
    
    for line in series:
        
        if len(line)==3:
            newval = insertChar(line, 1, ':')
            new_series.append(newval)
        elif len(line)==4:
            newval = insertChar(line, 2, ':')
            new_series.append(newval)
        else:
            new_series.append(line)
    
    return new_series

Series4 = addcolon(Series3)

In [64]:
def nochar(series):
    new_series = []
    
    right = r'\d{2}:\d{2}'
    extra = r'\d*:{1,}'
    
    for line in series:
        r = match(right, line)
        e = match(extra, line)
        
        if r is not None:
            new_series.append(line)
            
        elif e is not None:
            newval = line.replace(':::', ':00')
            new_series.append(newval)
        else:
            new_series.append('NA')
   
    return new_series

Series5 = nochar(Series4)

In [65]:
def clock(series):
    new_series = []
    final_series = []
    reg = r'(0[6-9]|1[0-2]):\d*'
    
    for line in series:
        m = match(reg, line)
        
        if m is not None:
 
            oldtime = int(line[:2])
            newtime = str(oldtime + 12)
            new_value = newtime + line[2:]
            new_series.append(new_value)
            
        else:
            new_series.append(line)
            
    for line in new_series:
        if line[:2] == '24':
            new = line.replace('24', '00')
            final_series.append(new)
        else:
            final_series.append(line)
        
    return final_series

Series6 = clock(Series5)

In [66]:
print(Series6)

['01:00', 'NA', '23:00', '00:30', '00:00', '21:00', '23:00', '00:00', '01:00', '02:00', '00:00', '23:45', '00:30', '01:00', '20:00', '23:30', '01:30', '01:00', '23:30', 'NA', '23:59', '23:20', '2:00', '23:30', '23:30', '01:30', '23:30', '22:20', '23:30', '01:20', '23:55', '01:00', '23:00', '02:00', 'NA', '01:00', '01:00', '01:00', '01:00', '01:15', 'NA', '01:00', '02:00', '22:00', '00:40', '23:00', '23:55', '23:00', '23:00', '23:15', '01:00', '00:00', '23:50', '01:00', '23:00', '23:30', '00:30', '01:00', '00:15', '23:15', '23:00', '01:00', '00:00', '23:05', '01:00', '02:00', '01:00', '02:00', '22:00', 'NA', '23:03', '23:00', '23:59', '01:30', '22:10', '00:30', '01:00', '01:15', '23:15', '00:00', '23:58', '01:12', '01:30', '00:30', '23:30', '04:00', '01:00', '00:30', '00:30', '01:00', '23:00', '00:00', '22:30', '23:30', '00:00', '00:30', '22:00', '02:00', '23:30', '02:00', '23:00', '22:30', '02:00', '22:45', '23:00', '01:30', '00:00', '00:00', '01:00', '01:48', '03:00', '02:00', '01:30'

In [84]:
Series7 = finish(Series6)
print(Series7)

['01:00', 'NA', '23:00', '00:30', '00:00', '21:00', '23:00', '00:00', '01:00', '02:00', '00:00', '23:45', '00:30', '01:00', '20:00', '23:30', '01:30', '01:00', '23:30', 'NA', '23:59', '23:20', '02:00', '23:30', '23:30', '01:30', '23:30', '22:20', '23:30', '01:20', '23:55', '01:00', '23:00', '02:00', 'NA', '01:00', '01:00', '01:00', '01:00', '01:15', 'NA', '01:00', '02:00', '22:00', '00:40', '23:00', '23:55', '23:00', '23:00', '23:15', '01:00', '00:00', '23:50', '01:00', '23:00', '23:30', '00:30', '01:00', '00:15', '23:15', '23:00', '01:00', '00:00', '23:05', '01:00', '02:00', '01:00', '02:00', '22:00', 'NA', '23:03', '23:00', '23:59', '01:30', '22:10', '00:30', '01:00', '01:15', '23:15', '00:00', '23:58', '01:12', '01:30', '00:30', '23:30', '04:00', '01:00', '00:30', '00:30', '01:00', '23:00', '00:00', '22:30', '23:30', '00:00', '00:30', '22:00', '02:00', '23:30', '02:00', '23:00', '22:30', '02:00', '22:45', '23:00', '01:30', '00:00', '00:00', '01:00', '01:48', '03:00', '02:00', '01:30

In [85]:
from time import gmtime, strftime

In [93]:
def decimaltime(series):
    new_series = []
    
    for time in series:
        
        
        if time == 'NA':
            new_series.append(time)
        
        else:
            minute = time[-2:]
            if minute != '00':
                new_minute = 60/int(minute)
                new_time = time[:2]
                new_time = new_time+'.'+str(new_minute)
                new_series.append(new_time)

            else:
                new_time = time[:2]
                new_time = new_time+'.'+str(00)
                new_series.append(new_time)
        
        
    print(new_series)    
    return new_series

Series8 = decimaltime(Series7)

['01.0', 'NA', '23.0', '00.2', '00.0', '21.0', '23.0', '00.0', '01.0', '02.0', '00.0', '23.1', '00.2', '01.0', '20.0', '23.2', '01.2', '01.0', '23.2', 'NA', '23.1', '23.3', '02.0', '23.2', '23.2', '01.2', '23.2', '22.3', '23.2', '01.3', '23.1', '01.0', '23.0', '02.0', 'NA', '01.0', '01.0', '01.0', '01.0', '01.4', 'NA', '01.0', '02.0', '22.0', '00.1', '23.0', '23.1', '23.0', '23.0', '23.4', '01.0', '00.0', '23.1', '01.0', '23.0', '23.2', '00.2', '01.0', '00.4', '23.4', '23.0', '01.0', '00.0', '23.12', '01.0', '02.0', '01.0', '02.0', '22.0', 'NA', '23.20', '23.0', '23.1', '01.2', '22.6', '00.2', '01.0', '01.4', '23.4', '00.0', '23.1', '01.5', '01.2', '00.2', '23.2', '04.0', '01.0', '00.2', '00.2', '01.0', '23.0', '00.0', '22.2', '23.2', '00.0', '00.2', '22.0', '02.0', '23.2', '02.0', '23.0', '22.2', '02.0', '22.1', '23.0', '01.2', '00.0', '00.0', '01.0', '01.1', '03.0', '02.0', '01.2', '23.4', '00.1', '01.0', '02.0', '01.0', '23.2', '23.2', '23.3', '23.0', '00.2', '00.2', '23.1', '23.1',

In [94]:
standard['bedtime'] = Series8
print(standard.head())

       goodDay1 goodDay2 program MLcourse IRcourse StatsCourse DBcourse  \
1  productivity    sleep     QRM       no       no         yes       no   
2         other    other      CS       no       no     unknown      yes   
3          food    sport      BA      yes      yes          no      yes   
4         other    other      BA      yes      yes         yes      yes   
5         sleep   social     BDE       no       no          no      yes   

  choco_op gender stand bedtime neighbors randomNum  
1  neither   male    no    01.0       300         7  
2  no idea   male   yes      NA       100    394749  
3  neither   male    no    23.0         5         6  
4  no idea   male    no    00.2         2         8  
5  no idea   male    no    00.0         6         8  


Now # of neighbors

In [95]:
# Returns floats, cutoff is 300, NA if there were any words present.
def cutoff(series):
    new_series = []
    
    # To find characters
    reg = r'.*([a-z]|>|/).*'
    
    for line in series:
        m = match(reg, line)
        
        if m is not None:
            new_series.append('NA')
        elif float(line)>300:
            new_value = 300
            new_series.append(new_value)
        else:
            new_series.append(float(line))
            
    print(new_series)
    return new_series

neighbors = cutoff(odi_df.neighbors)

[300, 100.0, 5.0, 2.0, 6.0, 4.0, 5.0, 5.0, 2.0, 7.0, 8.0, 6.0, 8.0, 6.0, 2.0, 1.0, 1.0, 3.0, 2.0, 1.0, 7.0, 2.0, 1.0, 5.0, 8.0, 8.0, 2.0, 2.0, 5.0, 1.0, 7.0, 4.0, 0.0, 3.0, 7.0, 1.0, 0.0, 5.0, 6.5, 'NA', 1.0, 300, 2.0, 2.0, 5.0, 300, 300, 1.0, 0.0, 7.0, 8.0, 4.0, 7.0, 10.0, 'NA', 4.0, 1.0, 8.0, 2.0, 6.0, 300, 3.0, 2.0, 7.0, 2.0, 3.0, 9.0, 2.0, 2.0, 5.0, 1.0, 7.0, 4.0, 7.0, 6.0, 1.0, 1.0, 4.0, 1.0, 1.0, 8.0, 7.0, 7.0, 1.0, 1.0, 1.0, 5.0, 6.0, 5.0, 9.0, 2.0, 300.0, 0.0, 280.0, 2.0, 7.0, 6.0, 25.0, 1.0, 10.0, 1.0, 4.0, 5.0, 3.0, 10.0, 6.0, 5.0, 2.0, 6.0, 1.0, 1.0, 4.0, 4.0, 6.0, 6.0, 1.0, 4.0, 3.0, 300.0, 7.0, 2.0, 7.0, 8.0, 8.0, 8.0, 7.0, 4.0, 8.0, 9.0, 20.0, 3.0, 1.0, 2.0, 1.0, 1.0, 8.0, 10.0, 1.0, 5.0, 1.0, 2.0, 3.0, 7.0, 4.0, 1.0, 2.0, 2.0, 6.0, 6.0, 2.0, 7.0, 5.0, 8.0, 2.0, 6.0, 4.0, 'NA', 0.0, 'NA', 7.0, 0.0, 6.0, 6.0, 300, 24.0, 6.0, 6.0, 2.0, 6.0, 2.0, 2.0, 2.0, 8.0, 8.0, 1.0, 8.0, 1.0, 7.0, 2.0, 1.0, 2.0, 1.0, 4.0, 5.0, 3.0, 3.0, 2.0, 0.0, 3.0, 6.0, 3.0, 3.0, 5.0, 1.0, 4.0, 5.0, 

In [96]:
standard['neighbors'] = neighbors
print(standard.head())

       goodDay1 goodDay2 program MLcourse IRcourse StatsCourse DBcourse  \
1  productivity    sleep     QRM       no       no         yes       no   
2         other    other      CS       no       no     unknown      yes   
3          food    sport      BA      yes      yes          no      yes   
4         other    other      BA      yes      yes         yes      yes   
5         sleep   social     BDE       no       no          no      yes   

  choco_op gender stand bedtime neighbors randomNum  
1  neither   male    no    01.0       300         7  
2  no idea   male   yes      NA       100    394749  
3  neither   male    no    23.0         5         6  
4  no idea   male    no    00.2         2         8  
5  no idea   male    no    00.0         6         8  


In [97]:
# Turn the random numbers into floats and get rid of any words present.
def numbers(series):
    new_series = []
    reg = r'.*([a-z]|[A-Z]|>|/).*'
    
    for line in series:
        m = match(reg, line)
        
        if m is not None:
            new_value = 'NA'
            new_series.append(new_value)
            
        else:
            new_series.append(float(line))
            
    print(new_series)
    return new_series

randomNum = numbers(odi_df.rndnum)

[7.0, 394749.0, 6.0, 8.0, 8.0, 'NA', 6739.0, 3.0, 78952097.0, 5.0, 8.0, 42.0, 37.0, 7727528.0, 187.0, 7.0, 7.0, 6.0, 34.0, 7.0, 6.283949577437364e+36, 2.0, 2.0, 6.0, 7.0, 4.0, 'NA', 9.0, 4.0, 3.0, 9293746289201.0, 5.0, 3.0, 347.0, 19.0, 9.0, 1.0, 2.0, 11.0, 2.0, 2.0, 9.0, 5.0, 9.0, 32.0, 5.0, 4.0, 8.0, 4578994211.0, 33.0, 7.0, 7.0, 4.0, 6.0, 9.0, 3.0, 7.0, 8.0, 6.0, 8.0, 7.0, 1.0, 8.0, 8.0, 18.0, 2.0, 'NA', 1.0, 7.0, 0.0, 7.0, 3.0, 5.0, 7.0, 7.0, 1.0, 9.0, 7.0, 8.0, 8.0, 3.0, 4.0, 13.0, 8.0, 3.0, 4.0, 5.0, 9.0, 7.0, 5.0, 3.0, 1.0, 6.0, 1.0, 8.0, 3.0, 2.0, 5.0, 7.0, 7.0, 2.0, 1.0, 9.0, 8.0, 8.0, 7.0, 8.0, 33.0, 7.44, 46371646292.0, 9.0, 2.0, 8.0, 9.0, 3.0, 9.0, 8.0, 6.9, 9.0, 75389275660.0, 9.0, 8.0, 7.0, 7.0, 4.0, 9.0, 4.0, 6.0, 7.0, 1.0, 8.0, 2.0, 10.0, 22.0, 8.0, 123581321.0, 1987.0, 4.0, 10.0, 1.0, 10.0, 7.0, 4.0, 1.0, 6.0, 7.0, 7.0, 9.0, 8.0, 1.0, 2.0, 7.0, 3.0, 537.0, 6.0, 1.0, 'NA', 3.0, 3.0, 7.0, 4.0, 8.0, 4.0, 10.0, 8.0, 1.0, 8.0, 7.0, 2.0, 7.0, 6.0, 7.0, 4.0, 6.0, 10.0, 3.0, 4

It may be more useful to change to NA values to 0, depending on what it is you want to do.

In [98]:
standard['randomNum'] = randomNum
print(standard.head())

       goodDay1 goodDay2 program MLcourse IRcourse StatsCourse DBcourse  \
1  productivity    sleep     QRM       no       no         yes       no   
2         other    other      CS       no       no     unknown      yes   
3          food    sport      BA      yes      yes          no      yes   
4         other    other      BA      yes      yes         yes      yes   
5         sleep   social     BDE       no       no          no      yes   

  choco_op gender stand bedtime neighbors randomNum  
1  neither   male    no    01.0       300         7  
2  no idea   male   yes      NA       100    394749  
3  neither   male    no    23.0         5         6  
4  no idea   male    no    00.2         2         8  
5  no idea   male    no    00.0         6         8  


In [99]:
standard.to_csv('../data/standard.csv')