In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import *
import glob
import scipy 
from datetime import datetime as dt
import sklearn
sns.style = 'darkgrid'
import ruptures as rpt

In [2]:
# read the audio file skipping three out of every 4 rows
audio = pd.read_csv('tables/audio/audio.csv', skiprows = [i if i%60 != 0 else 1 for i in range(100000)])

In [3]:
audio

Unnamed: 0,timestamp,audio inference,uid,time,date
0,1364342426,0,u18,2013-03-27 00:00:26,2013-03-27
1,1364342430,0,u18,2013-03-27 00:00:30,2013-03-27
2,1364342433,0,u18,2013-03-27 00:00:33,2013-03-27
3,1364342437,0,u18,2013-03-27 00:00:37,2013-03-27
4,1364342441,0,u18,2013-03-27 00:00:41,2013-03-27
...,...,...,...,...,...
99297468,1369169620,0,u20,2013-05-21 20:53:40,2013-05-21
99297469,1369169621,0,u20,2013-05-21 20:53:41,2013-05-21
99297470,1369169622,0,u20,2013-05-21 20:53:42,2013-05-21
99297471,1369169623,0,u20,2013-05-21 20:53:43,2013-05-21


In [4]:
def location_intervals(indoor_locations): 
    """
    given indoor_locations dataframe, returns a list of tuples [(start timestamp, end timestamp)] that represents intervals
    where the person was in those locations continuously (sensors within 30 minutes of each other)
    """
    # the following code creates a list of tuples which represent the ranges of timestamps where users were inside
    start_timestamp = None
    loc_intervals = []
    for i in indoor_locations.index:
        
        current_timestamp = indoor_locations.loc[i]['timestamp']
        
        if start_timestamp is None: 
            start_timestamp = current_timestamp
            
        # don't consider locations continuous when sensed more than 1/2 hour apart
        if current_timestamp > start_timestamp + 1800:
            loc_intervals.append((start_timestamp, current_timestamp))
            start_timestamp = None
            continue
        try: 
            # if the next index is not inside, we go to the except loop, otherwise continue
            indoor_locations.loc[i + 1]
            continue
        except: 
            # in the event the next index is not inside, this is the end of the time range so we append that to the intervals
            if current_timestamp - start_timestamp >= 1800: 
                loc_intervals.append((start_timestamp, current_timestamp))
            start_timestamp = None
            continue
    

    return loc_intervals

In [5]:
def party_intervals(audio, loc_intervals): 
    """
    given input audio dataframe and loc_intervals containing timestamp location intervals
    """
    loc_index = 0
    # this will hold the intervals to return in format (start, end, avg audio inference)
    total_audio_intervals = []
    # this will hold the intermediate steps for a single interval in the total list above
    # loop through all audio inferences
    count = 0 
    for inter in loc_intervals: 
        
        num_silent_labels = 0
        num_current_labels = 0
    
        int_start= inter[0]
        int_end = inter[1]
        
        done = False
        
        while done is False: 
            try: 
                time = audio.iloc[count]['timestamp']
            except: 
                done = True 
                continue
                
            if time < int_start: 
                count += 1
            elif time > int_end: 
                if num_current_labels > 0: 
                    
                    silent_labels = num_silent_labels/num_current_labels  
                    
                    if silent_labels < .4: 
                        total_audio_intervals.append((int_start, int_end, 1-silent_labels))
                        
                done = True
            else: 
                if audio.iloc[count][' audio inference'] == 0: 
                    num_silent_labels += 1
                num_current_labels += 1
                count += 1
                continue
    
    
    """
    for i in audio.index: 
        time = audio.loc[i]['timestamp']
        done = False
        while done is False: 
            # find the overlap between audio files and the location intervals
            try: 
                int_start = loc_intervals[loc_index][0]
                int_end = loc_intervals[loc_index][1]
            except: 
                done = True
                continue
            # if the audio time is before
            if time < int_start: 
                done = True
                continue
            elif time > int_end: 
                loc_index += 1
                if num_current_labels > 0: 
                    silent_labels = num_silent_labels/num_current_labels  
                    
                    if int_end - int_start >= 1200 and silent_labels < 0.4: 
                        total_audio_intervals.append((int_start, int_end, 1-silent_labels))
                continue
            else: 
                if audio.loc[i][' audio inference'] == 0: 
                    num_silent_labels += 1
                num_current_labels += 1
                done = True
                continue
        """
    return total_audio_intervals

In [9]:
def find_partying(audio, wifi_locations, party_locs): 
    """
    inputs: audio dataframe, wifi_locations dataframe, and potential party locations
    outputs: the intervals of wifi location where the corresponding audio inference was >= .4 where the user was also in
    a party location. 
    """
    
    party_loc_data = wifi_locations[wifi_locations['location'].isin(party_locs)]
    
    total_party_df = pd.DataFrame()
    
    for uid in wifi_locations['uid'].unique(): 
        
        print(uid)
        
        location_ints = location_intervals(party_loc_data[party_loc_data['uid'] == uid])
    
        audio_ints = np.asarray(party_intervals(audio[audio['uid'] == uid], location_ints))
    
        try: 
            party_df = pd.DataFrame({'start time': audio_ints[:, 0],
                                     'end time': audio_ints[:, 1], 
                                     'proportion loud labels': audio_ints[:, 2],
                                     'uid': uid})
        except: 
            continue
        party_df['duration'] = party_df['end time'] - party_df['start time']
        
        total_party_df = total_party_df.append(party_df, ignore_index = True)
        
    return total_party_df

In [10]:
party_locs = [
    'in[mclaughlin]', 'in[steele]', 'in[east-wheelock]', 'in[north-park]', 'in[north-main]', 'in[robinson]',
    'in[maxwell]', 'in[fahey-mclane]', 'in[fayerweather]', 'in[massrow]', 'in[ripley]', 'in[woodward]', 'in[butterfield]', 
    'in[cutter-north]', 'in[french]', 'in[hallgarten]', 'in[gile]', 'in[newhamp]', 'in[hitchcock]', 'in[smith]', 
    'in[channing-cox]', 'in[Cohen]', 'in[whittemore]', 'in[tllc-raether]', 'in[tllc]', 'in[richardson]', 'in[judge]', 
    'in[bissell]'
]
# the party_locs list includes all dorms and frat houses in the unique locations set
wifi_locs = pd.read_csv('tables/wifi_location/wifi_location.csv')

In [11]:
parties = find_partying(audio, wifi_locs, party_locs)

u45
u04
u18
u59
u05
u44
u09
u53
u22
u14
u25
u32
u03
u00
u54
u43
u33
u10
u50
u39
u01
u30
u35
u36
u42
u08
u17
u23
u47
u27
u34
u02
u46
u58
u13
u51
u24
u16
u15
u49
u52
u19
u56
u57
u20
u07
u12
u41
u31


In [12]:
parties.to_csv('dataset/sensing/partying/partying.csv', index = False)

In [17]:
parties['day of week'] = pd.to_datetime(parties['start time'], unit = 's').dt.dayofweek
parties['day of year'] = pd.to_datetime(parties['start time'], unit = 's').dt.dayofyear
parties

Unnamed: 0,start time,end time,proportion loud labels,uid,duration,day of week,day of year
0,1.364923e+09,1.364927e+09,0.938559,u45,4610.0,1,92
1,1.365018e+09,1.365021e+09,0.903483,u45,2296.0,2,93
2,1.365021e+09,1.365023e+09,0.913175,u45,1902.0,2,93
3,1.365189e+09,1.365193e+09,0.965458,u45,4070.0,4,95
4,1.365257e+09,1.365259e+09,0.713089,u45,1833.0,5,96
...,...,...,...,...,...,...,...
4048,1.367823e+09,1.367829e+09,0.731011,u31,5746.0,0,126
4049,1.367935e+09,1.367939e+09,0.735425,u31,3297.0,1,127
4050,1.368211e+09,1.368216e+09,0.941014,u31,4225.0,4,130
4051,1.368367e+09,1.368372e+09,0.942281,u31,5348.0,6,132
