## Load bluetooth and location datasets

In [None]:
import pandas as pd

In [None]:
# Load the location CSV from the SHB data directory (path is relative to the
# notebook's working directory).
location = pd.read_csv(filepath_or_buffer='SHB_data_luisa/locationeventpertime.csv')

In [None]:
# Column labels applied to the bluetooth CSV. The file's first row is skipped
# (skiprows=1) and these names are used in its place.
columns_name = [
    'experimentid', 'userid', 'day', 'timestamp', 'address',
    'bondstate', 'none', 'none2', 'name', 'rssi',
]
bluetooth = pd.read_csv(
    'SHB_data_luisa/bluetoothnormalevent.csv',
    header=None,
    skiprows=1,
    names=columns_name,
)

In [None]:
# Preview the first five location rows
location.head(n=5)

In [None]:
# Preview the first five bluetooth rows
bluetooth.head(n=5)

In [None]:
# Discard identifier/placeholder columns from both tables.
# NOTE: re-running this cell without reloading the data raises KeyError,
# because the columns are already gone.
location = location.drop(columns=["experimentid", "userid", "lucene"])
bluetooth = bluetooth.drop(columns=["experimentid", "userid", "none", "none2"])

In [None]:
# Row counts: location first, then bluetooth
print(location.shape[0], bluetooth.shape[0])

## Bluetooth pre-processing

In [None]:
# Keep only the records that carry a device name (rows whose `name` is
# missing are dropped), then report how many rows remain.
bluetooth = bluetooth.dropna(subset=['name'])
print(bluetooth.shape[0])

In [None]:
# Goal for the cells below: when several rows share a timestamp, collect their
# devices into a single list per point in time instead of keeping one row each.
# bluetooth['devices_list'] = pd.Series([])
# Distinct timestamp values, sorted in place in ascending order.
i_time = bluetooth['timestamp'].unique()
i_time.sort()

In [None]:
# Number of distinct timestamps. Compare with the row count printed earlier:
# if the two are equal, no timestamp is duplicated.
i_time.shape[0]

In [None]:
# Order the records chronologically (ascending timestamp is the default).
bluetooth = bluetooth.sort_values('timestamp')

Remove the seconds and decimals from each timestamp, then merge nearby devices by timestamp, so that each reduced timestamp corresponds to a list of the devices nearby at that time.

In [None]:
# True if at least one timestamp is missing
bluetooth.timestamp.isna().to_numpy().any()

In [None]:
# Reduce the timestamp resolution: convert to text and keep only the first
# 12 characters, so rows sharing that prefix end up with the same timestamp.
bluetooth['timestamp'] = bluetooth['timestamp'].astype(str).str.slice(0, 12)
bluetooth

In [None]:
# bluetooth = bluetooth.drop(["address"], axis=1)
# bluetooth.sort_values(by=['timestamp'])
# print(len(bluetooth.timestamp.unique()))

### Considering rssi to compute distances between devices

In [None]:
# `address` and `bondstate` are not read by any later cell visible in this
# notebook, so drop them here.
bluetooth = bluetooth.drop(columns=["address", "bondstate"])
# The original cell also called `bluetooth.sort_values(by=['timestamp'])`
# without assigning the result, which had no effect; that statement is removed.
# Number of distinct reduced timestamps.
print(len(bluetooth.timestamp.unique()))
# Preview with the highest rssi values first (display only, not assigned).
bluetooth.sort_values(by=['rssi'], ascending=False)

In [None]:
# Renumber the rows 0..n-1 and discard the previous index labels.
bluetooth = bluetooth.reset_index(drop=True)

In [None]:
# Convert each rssi reading into an estimated distance with
#     distance = 10 ** ((REFERENCE_RSSI - rssi) / (10 * PATH_LOSS_FACTOR))
# With these constants an rssi of -69 maps to a distance of exactly 1.
# NOTE(review): -69 is presumably the expected rssi at one metre and 2 the
# path-loss exponent of a log-distance model -- confirm for the devices used.
REFERENCE_RSSI = -69
PATH_LOSS_FACTOR = 2

# Vectorised over the whole column: unlike a row-by-row loop with label
# lookups, this does not require the index to be exactly 0..n-1.
dist = 10 ** ((REFERENCE_RSSI - bluetooth['rssi']) / (10 * PATH_LOSS_FACTOR))
bluetooth['Distance_meters'] = dist

In [None]:
# Preview with the highest rssi values first (display only, not assigned)
bluetooth.sort_values('rssi', ascending=False)

In [None]:
# sorted(bluetooth['Distance_meters'].unique())

In [None]:
# Keep only devices within 3.5 meters: discard every row whose estimated
# distance is greater than 3.5.
too_far = bluetooth['Distance_meters'] > 3.5
bluetooth = bluetooth[~too_far]
bluetooth.sort_values('rssi', ascending=False)

### Removing bluetooth devices not identifying people

In [None]:
# List the distinct device names, to spot keywords that identify devices
# which do not belong to people.
bluetooth['name'].unique()

In [None]:
# Renumber the rows 0..n-1. The previous labels are kept as an `index` column
# (drop=False is the default, spelled out here).
bluetooth = bluetooth.reset_index(drop=False)

In [None]:
# Remove bluetooth devices that do not correspond to people (i.e., computers
# and TVs), recognised by the prefix of their name.
NON_PERSON_PREFIXES = ('TV', 'DESKTOP', 'LAPTOP')

# One boolean mask over the whole column replaces the per-row `drop` loop:
# each `drop` copied the frame, and the loop's label lookups required the
# index to be exactly 0..n-1. The surviving rows keep their index labels.
is_non_person = bluetooth['name'].map(
    lambda device_name: device_name.startswith(NON_PERSON_PREFIXES)
).astype(bool)
bluetooth = bluetooth[~is_non_person]

In [None]:
# Display the frame after the TV/DESKTOP/LAPTOP name filter
bluetooth

### Build list of devices for same timestamp

In [None]:
# Distinct reduced timestamps remaining after the filters, sorted in place in
# ascending order, followed by how many there are.
i_time = bluetooth['timestamp'].unique()
i_time.sort()
print(i_time.shape[0])

In [None]:
# Collapse the per-event rows into one row per reduced timestamp, where
# `devices` is the list of distinct device names recorded at that timestamp.
#
# This replaces a hand-written two-pointer loop that
#   * stopped one row early, so the last bluetooth row was never added,
#   * grew the frame with DataFrame.append (removed in pandas 2.0),
#   * required the rows to be pre-sorted by timestamp and failed on an
#     empty frame.
# groupby has none of these constraints. Rows come out in ascending timestamp
# order, and each device list is sorted so the result is reproducible.
df = (
    bluetooth
    .groupby('timestamp', sort=True)['name']
    .agg(lambda names: sorted(set(names)))
    .reset_index()
    .rename(columns={'name': 'devices'})
)

# One output row per distinct timestamp (the original author expected 505
# rows for their dataset).
assert len(df) == bluetooth['timestamp'].nunique()

In [None]:
# Display the per-timestamp device lists
df

## Save

In [None]:
# Write the per-timestamp device lists to disk. The row index is written too
# (to_csv default), as the first, unnamed column.
df.to_csv(path_or_buf='processed_bluetooth.csv')