## Bayesian Network
In order to build the proposed probabilistic model, some preliminary operations on the data need to be performed. These operations are:
1. Integrating sensor data from GPS, time, Bluetooth and/or mood from time diaries (which has already been done in previous notebooks).
2. Grouping activities into homogeneous categories, to reduce the number of values and make the results more interpretable.
3. Establishing probabilistic relations between variables.

This notebook is dedicated to the second of these points.

In [None]:
import pandas as pd

In [None]:
# Read the integrated location / 4Ws / Bluetooth table and tidy it in one chain.
df = (
    pd.read_csv('location_4Ws_bluetooth.csv')
    # "Unnamed: 0" carries no information for the model
    .drop(columns=["Unnamed: 0"])
    .sort_values(by=['City'], ascending=True)
    # a record is discarded only when "with_who" AND "bluetooth" are both NaN
    .dropna(subset=["with_who", "bluetooth"], how='all')
    # renumber rows 0..n-1; the previous labels end up in an "index" column,
    # which the next cell removes
    .reset_index()
)

In [None]:
# Discard the "index" column left behind by reset_index().
df = df.drop(columns=["index"])

In [None]:
# Display the frame after loading, sorting and dropping incomplete records.
df

### Removing Bluetooth devices not corresponding to people

In [None]:
# removing bluetooth devices that do not correspond to people (i.e., computers and TV)
import ast

# Substrings that mark a device name as a TV or a computer rather than a
# device carried by a person.
NON_PERSON_MARKERS = ('TV', 'DESKTOP', 'LAPTOP')


def _is_person_device(name):
    """Return True when ``name`` contains none of the non-person markers."""
    return not any(marker in str(name) for marker in NON_PERSON_MARKERS)


for row in range(len(df)):
    raw = df.at[row, 'bluetooth']
    # records without any Bluetooth scan are left untouched
    if pd.isna(raw):
        continue
    # only rewrite the cell when at least one marker appears in it
    if not any(marker in raw for marker in NON_PERSON_MARKERS):
        continue
    # The cell holds the textual form of a Python list (it is parsed the same
    # way further down when the number of people is derived).  Parsing it as a
    # literal, instead of cutting it at single quotes, also handles names that
    # are written with double quotes because they contain an apostrophe.
    devices = ast.literal_eval(raw)
    kept = [name for name in devices if _is_person_device(name)]
    # store the filtered list back in the same textual form
    df.at[row, 'bluetooth'] = str(kept)

### Recoding activities:

In [None]:
# The activity ("what") is the variable to predict: keep only the records
# where it is present, then renumber the rows (old labels go to "index").
df = df[df['what'].notna()].reset_index()

In [None]:
# Discard the "index" column created by the reset_index() call above.
df = df.drop(columns=["index"])

In [None]:
# List the distinct activity labels before they are grouped.
df['what'].unique()

In [None]:
# Recode activities into broader categories.
# Each broad category lists the original diary labels it absorbs; labels that
# appear nowhere below are left exactly as they are.
activity_groups = {
    'Physical activities': [
        'Walking',
        'Sport: Walking, Trekking and hiking',
    ],
    'Others': [
        'Other',
    ],
    'Shopping and Household': [
        'Grocery Shopping',
        'Other Shopping',
        'Cooking, food preparation & management',
    ],
    'Study/Lectures': [
        'Study/work group',
        'Lecture/seminar/conference/university meeting',
    ],
    'Social activities': [
        'Phone/Video calling; Skype/Zoom/WhatsApp/Messenger or other VoIP',
        'Social life (Socialising, visiting, receiving, conversating with family, relatives, friends, classmate, visitors, neighbour, and others)',
        'Happy Hour/Drinking/Party',
        'Social media (Facebook Instagram etc.)',
    ],
    'Free time/hobbies': [
        'Listening to music',
        'Reading a book, periodicals, news, etc.',
        'Hobbies (assembling/repair apparatus/pc, gardening, etc.)',
        'Watching TV, video, YouTube, etc.',
        'Surfed or seeking, reading information via Internet',
        'In chat on Internet or reading, sending e-mail',
    ],
    'Breaks': [
        'Break (coffee, cigarette, drink, etc.)',
        'Personal care',
        'Rest/nap',
        'Did not do anything special (Just let the time pass, Lazed around, etc.)',
        'Eating',
    ],
}

# Invert the grouping into a flat "original label -> broad category" lookup.
activity_to_group = {
    original: group
    for group, originals in activity_groups.items()
    for original in originals
}

df['what'] = df['what'].map(lambda label: activity_to_group.get(label, label))
        

In [None]:
# Check which activity categories remain after the recoding.
df['what'].unique()

In [None]:
# Preview the first rows with the recoded activity column.
df.head()

In [None]:
# Shows whether "with_who" still contains missing values (True appears if so).
df['with_who'].isna().unique()

In [None]:
# Distinct companion labels recorded in the time diary.
df['with_who'].unique()

### Recoding the number of people around at the considered moment

In [None]:
# Replace every remaining NaN with the placeholder 'unknown'.
# NOTE: this applies to all columns of the frame, not only to "bluetooth".
df = df.fillna(value='unknown')

In [None]:
# Create bands for variables that we want to use in the model
import ast


def _num_people_band(bluetooth, with_who):
    """Return the 'Num_of_people' band for one record.

    Parameters
    ----------
    bluetooth : str
        Either the placeholder 'unknown' or the textual form of the list of
        Bluetooth devices seen for the record.
    with_who : str
        Companion label from the time diary.

    Returns
    -------
    str
        One of '0', '1 - 3', '>= 4' or 'unknown'.
    """
    if bluetooth != 'unknown':
        n_devices = len(ast.literal_eval(bluetooth))
        if n_devices == 0:
            return '0'
        # Exactly three devices belong to the '1 - 3' band; a strict "< 3"
        # comparison would push them into '>= 4'.
        if n_devices <= 3:
            return '1 - 3'
        return '>= 4'
    # No Bluetooth scan: fall back on the diary answer where it is unambiguous.
    if with_who == 'Alone':
        return '0'
    if with_who == 'Roommate(s)':
        return '1 - 3'
    # Other companions are resolved by hand in the following cells.
    return 'unknown'


# Build the whole column in one assignment instead of growing it cell by cell.
df['Num_of_people'] = [
    _num_people_band(bluetooth, with_who)
    for bluetooth, with_who in zip(df['bluetooth'], df['with_who'])
]

The recoding is still missing for some records whose companion is 'Friend(s)', 'Relative(s)', or 'Classmate(s)'.

Based on my own knowledge of the events that occurred during the data collection period, I can try to reconstruct these as well.

In [None]:
# Records spent with relatives whose group size is still undetermined.
relatives_unknown = (df['with_who'] == 'Relative(s)') & (df['Num_of_people'] == 'unknown')
df.loc[relatives_unknown]

In [None]:
# For the records with relatives and no Bluetooth scan, I was either on the
# phone or at my parents' house with 2/3 people, so all of them go to '1 - 3'.
relatives_without_scan = (df['bluetooth'] == 'unknown') & (df['with_who'] == 'Relative(s)')
df.loc[relatives_without_scan, 'Num_of_people'] = '1 - 3'

In [None]:
# Records spent with classmates whose group size is still undetermined.
classmates_unknown = (df['with_who'] == 'Classmate(s)') & (df['Num_of_people'] == 'unknown')
df.loc[classmates_unknown]

In [None]:
# In the case of classmates, I can assume that I was with more than 4 people
# when the activity is study/lectures, as normally in a class there are more
# than 4 people around me, while I can assume I was with between 1 and 3 people
# if the activity is free time or a physical activity.
#
# Every rule is restricted to records that have no Bluetooth scan AND were
# spent with classmates.  Without explicit grouping, "a or b and c" is read as
# "a or (b and c)", which would rewrite every Bluetooth-less
# 'Free time/hobbies' record regardless of the companion.
without_scan = df['bluetooth'] == 'unknown'
with_classmates = df['with_who'] == 'Classmate(s)'
candidates = without_scan & with_classmates

in_class = df['what'] == 'Study/Lectures'
leisure = df['what'].isin(['Free time/hobbies', 'Physical activities'])

# The two activity conditions are mutually exclusive, so the order of the
# assignments does not matter.
df.loc[candidates & in_class, 'Num_of_people'] = '>= 4'
df.loc[candidates & leisure, 'Num_of_people'] = '1 - 3'

In [None]:
# Records spent with friends whose group size is still undetermined.
friends_unknown = (df['with_who'] == 'Friend(s)') & (df['Num_of_people'] == 'unknown')
df.loc[friends_unknown]

In [None]:
# While all the others were pretty straightforward to fill, this one may be
# subject to some errors: some assumptions need to be made.


def _friends_band(place, what, with_who):
    """Return the assumed 'Num_of_people' band for a Bluetooth-less record.

    The rules are tried in order and the first match wins.  ``None`` means that
    no rule applies and the current value must be kept.
    """
    with_friends = with_who == 'Friend(s)'
    # my own room -> category 1-3 for sure
    if place == 'Home apartment /room' and with_friends:
        return '1 - 3'
    # given the activity and the corresponding place category, >= 4 seems to be
    # the most probable class for these data points
    if what == 'Social activities' and with_friends:
        return '>= 4'
    # When shopping, I was for sure with no more than other two people.
    # The activity test is grouped explicitly: "a or b and c" is read as
    # "a or (b and c)", which would also rewrite 'Shopping and Household'
    # records that were not spent with friends.
    if what in ('Shopping and Household', 'Physical activities') and with_friends:
        return '1 - 3'
    # NOTE(review): this rule is not restricted to 'Friend(s)' in the original
    # logic either; it is kept unchanged - confirm whether that is intended.
    if place == 'Restaurant, pizzeria, Street food vendor':
        return '1 - 3'
    return None


for row in range(len(df)):
    # only records without a Bluetooth scan are reconstructed by hand
    if df.at[row, 'bluetooth'] != 'unknown':
        continue
    band = _friends_band(df.at[row, 'place'], df.at[row, 'what'], df.at[row, 'with_who'])
    if band is not None:
        df.at[row, 'Num_of_people'] = band

In [None]:
# Sanity check: this selection is expected to be empty (no 'unknown' band left).
still_unknown = df['Num_of_people'] == 'unknown'
df.loc[still_unknown]

### Recode subjective place

In [None]:
# Rows whose subjective place is NaN.
df[df['place'].isna()]

In [None]:
# Distinct subjective-place labels before they are shortened.
df['place'].unique()

In [None]:
# Shorten the place labels and merge similar places into one category.
# Original label -> short label; anything not listed is left unchanged.
place_short_names = {
    # food retail
    'Supermarket...': 'Supermarket',
    'Grocery Shop': 'Supermarket',
    # open-air locations
    'Another outdoor place': 'Outdoor',
    'In the street': 'Outdoor',
    # own accommodation
    'Home apartment /room': 'Home',
    # university premises
    'Classroom / Study hall': 'University',
    'Other university place': 'University',
    # remaining one-to-one abbreviations
    'Shops, shopping centres, indoor markets, other shops': 'Shops',
    'Restaurant, pizzeria, Street food vendor': 'Restaurant',
    'Café, pub, bar': 'Bar',
}

df['place'] = df['place'].map(lambda label: place_short_names.get(label, label))


In [None]:
# Preview the first rows with the shortened place labels.
df.head()

In [None]:
# Keep only the variables used by the Bayesian network: coordinates, the raw
# device list, the city and the companion label are discarded.
df = df.drop(columns=['lat', 'long', 'bluetooth', 'City', 'with_who'])
df

### Recode time: 
Instead of raw timestamps, use the time of day: morning, afternoon, or evening/night.

In [None]:
# Recode each timestamp into a time-of-day band: Morning / Afternoon / Evening/Night.
for row in range(len(df)):
    # The hour is read from the 4th- and 3rd-to-last characters of the timestamp
    # (presumably an ...HHMM layout - TODO confirm against the raw data).
    hour = int(str(df.timestamp[row])[-4:-2])
    if hour <= 12:
        band = 'Morning'
    elif hour < 20:
        # hours 13-19; the lower bound is already guaranteed by the branch above
        band = 'Afternoon'
    else:
        band = 'Evening/Night'
    df.at[row, 'time'] = band

In [None]:
# The raw timestamp is no longer needed once the "time" band exists.
df = df.drop(columns=['timestamp'])

In [None]:
# Display the final table that is saved in the next cell.
df

### Save

In [None]:
# Write the recoded table to disk. The row index is written as well, since
# index=False is not passed.
df.to_csv('BBN_data.csv')