In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import *
import glob
import scipy 
from datetime import datetime as dt
import sklearn
sns.style = 'darkgrid'

In [56]:
def location_variance(gps_df):
    """
    returns the location variance of the gps dataframe, which is log(variance of latitiude squared plus variance of 
    longitude squared)
    """
    num =  gps_df['lon'].var() + gps_df['lat'].var()
    return log(num)

def num_changes(wifi_locations): 
    changes = -1
    previous = None
    
    for location in wifi_locations['location'].values: 
        if location != previous:
            changes += 1
            previous = location
        else: 
            continue
            
    return changes

def time_in_range(start, end, x):
    """Return true if x is in the range [start, end]"""
    if start <= end:
        return start <= x <= end
    else:
        return start <= x or x <= end
    
def load_activity(uid): 
    # load activity data
    activity = pd.read_csv('dataset/sensing/activity/activity_' + uid + '.csv')
    activity['time'] = pd.to_datetime(activity['timestamp'], unit = 's') 
    activity['day'] = activity['time'].dt.dayofyear
    activity = activity[activity[' activity inference'] != 3]
    return activity

def load_conversation(uid): 
    # load conversation data
    conversation = pd.read_csv('dataset/sensing/conversation/conversation_' + uid + '.csv')
    conversation['convo duration'] = conversation[' end_timestamp'] - conversation['start_timestamp']
    conversation['day'] = pd.to_datetime(conversation['start_timestamp'], unit = 's').dt.dayofyear
    return conversation

def load_darkness(uid): 
    # load darkness data
    darkness = pd.read_csv('dataset/sensing/dark/dark_' + uid + '.csv')
    darkness['day'] = pd.to_datetime(darkness['start'], unit = 's').dt.dayofyear
    darkness['duration'] = darkness['end'] - darkness['start']
    return darkness

def load_bluetooth(uid):
    # load bluetooth data
    bluetooth = pd.read_csv('dataset/sensing/bluetooth/bt_' + uid + '.csv')
    bluetooth['time'] = pd.to_datetime(bluetooth['time'], unit = 's')
    bluetooth['day'] = bluetooth['time'].dt.dayofyear
    return bluetooth

def load_gps(uid):
    # gps data 
    gps = pd.read_csv('dataset/sensing/gps/gps_' + uid + '.csv')
    # data is out of order, this will reformat it. 
    gps.reset_index(inplace = True)
    gps.columns = ('timestamp', 'provider', 'network_type', 'accuracy', 'lat',
                   'lon', 'altitude', 'bearing' ,'speed', 'travelstate', 'null')
    gps = gps.drop("null", 1)
    gps['time'] = pd.to_datetime(gps['timestamp'], unit = 's')
    gps['day'] = gps['time'].dt.dayofyear
    return gps

def load_wifi_locations(uid): 
    # wifi locations data
    wifi_locations = pd.read_csv('dataset/sensing/wifi_location/wifi_location_' + uid + '.csv')
    wifi_locations.reset_index(inplace = True)
    wifi_locations.columns = ("timestamp", "location", "null")
    wifi_locations = wifi_locations.drop("null", 1)
    wifi_locations['time'] = pd.to_datetime(wifi_locations['timestamp'], unit = 's')
    wifi_locations['day'] = wifi_locations['time'].dt.dayofyear
    return wifi_locations

def conversation_in_range(time_interval, convo_df, start_name, end_name): 
    """
    inputs: 
        time_interval -- formatted as (start time, end time, start day, end day)
        convo_df -- a dataframe containing start and end timestamps for a duration measurement 
            (so this function can be used for darkness as well as conversation)
        start_name -- name of the column indicating the start timestamp
        end_name -- name of the column indicating the end timestamp. 
    outputs: 
        the total conversation duration in the time interval.
        
    Note -- I initially named this function for activity so the variable names reflect that, but it can be applied to
    multiple sensor data. 
    
    This function is is similar to the activity in range but applies to dataframes contianing durations so the approach is
    slightly different.  
    """
    # again, unpack interval. 
    start = time_interval[0]
    end = time_interval[1]
    start_day = time_interval[2]
    end_day = time_interval[3]
    
    # look at relevant days 
    if start_day == end_day: 
        conv = convo_df[convo_df['day'] == start_day]
    else: 
        conv = convo_df[convo_df['day'] == start_day].append(convo_df[convo_df['day'] == end_day])
    
    # turn the conversations into intervals. If none exist, the duration is 0. 
    try:
        conv['interval'] = list(zip(pd.to_datetime(conv[start_name], unit = 's'), 
                                    pd.to_datetime(conv[end_name], unit = 's')))
    except:
        return 0

    
    # this function returns the duration of conversation inside the desired interval for each time interval. 
    conv['desired duration'] = conv['interval'].apply(lambda x: conv_range(start, end, x))
    conv = conv.dropna()
    
    # return the sum of all desired intervals. 
    return conv['desired duration'].sum()

def conv_range(start, end, conv_interval): 
    """
    returns the amount of seconds of conversation are in the interval (start, end)
    """
    conv_start = conv_interval[0]
    conv_end = conv_interval[1]
    
    if conv_end < start: 
        return np.nan
    
    elif conv_start > end:
        return np.nan
    
    elif conv_start >= start and conv_end >= end:
        return end - conv_start 
    
    elif conv_start <= start and conv_end <= end:
        return conv_end - start
    
    elif conv_start >= start and conv_end <= end:
        return conv_end - conv_start
    
    elif conv_start <= start and conv_end >= end:
        return end - start
    
def convert_timedeltas(x): 
    """
    converts timedeltas to seconds, leaves any numbers
    """
    try:
        return x.seconds
    except:
        return x 
    
def activity_in_range(time_interval, activity_df, func = 'act'): 
    """
    inputs: 
        time_interval -- formatted as (start time, end time, start day, end day)
        activity_df -- dataframe for a single user. 
    outputs: 
        the mean activity inference in the time interval.
        
    Note: the activity dataframe and variable names imply 
    """
    
    # unpack the values from the time interval
    start = time_interval[0]
    end = time_interval[1]
    start_day = time_interval[2]
    end_day = time_interval[3]
    
    # only look at relevant days to say runtime
    if start_day == end_day: 
        activity = activity_df[activity_df['day'] == start_day]
    else: 
        activity = activity_df[activity_df['day'] == start_day].append(activity_df[activity_df['day'] == end_day])
        
    # this try except loop takes care of the case where the activity data is an empty dataframe, so we return Nan 
    try: 
        ### these cases are different for different func inputs so this function can be extensible. 
        
        # in this case, we are looking at activity and taking the mean
        if func == 'act':
            return activity[activity['time'].apply(lambda x: time_in_range(start, end, x))][' activity inference'].sum()
        elif func == 'all_act': 
            print(activity[activity['time'].apply(lambda x: time_in_range(start, end, x))][' activity inference'].values)
            return activity[activity['time'].apply(lambda x: time_in_range(start, end, x))][' activity inference'].values
        # in this case, we are looking at bluetooth and take the count
        elif func == 'count':
            return activity[activity['time'].apply(lambda x: time_in_range(start, end, x))].shape[0]
        # in this case we apply the location variance function 
        elif func == 'location variance': 
            return location_variance(activity[activity['time'].apply(lambda x: time_in_range(start, end, x))])
        elif func == 'location changes': 
            return num_changes(activity[activity['time'].apply(lambda x: time_in_range(start, end, x))])
    except:
        # if we find none in count, we return 0. If not, there is no data/average from there so return Nan. 
        if func == 'count': 
            return 0
        return np.nan

#This function returns a new dataframe with all of the activity durations for a particular student throughout the term.
#Only activities longer than 1 minute were considered.
#At the end, we dediced to use total activity duration (sum of activity durations per day) for our model

def activity_analysis(uid):
    activity = pd.read_csv('dataset/sensing/activity/activity_' + uid + '.csv')
    activity = activity[activity[' activity inference'] !=3]
    activity = activity.reset_index()
    #Change the path as needed when running the files on your computer.
    activity['day'] = pd.to_datetime(activity['timestamp'], unit = 's').dt.dayofyear
    daily_activity = activity.groupby('day').mean()
    def shift_counter_activity(data):
        shift_num = 0
        list_shift_num = []
        list_time = []
        list_day = []
        for i in range(0, len(data)):
            if data[' activity inference'][i] != 0:
                try: 
                    if data[' activity inference'][i+1] != 0 and (data.index[i]+1) == data.index[i+1]:
                        shift_num += 1
                    else:
                        list_shift_num.append(shift_num)
                        shift_num = 0
                except:
                    list_shift_num.append(shift_num)
                    shift_num = 0
        return list_shift_num
    activity_shifts = shift_counter_activity(activity)
    edited_act = activity[activity[' activity inference'] !=0]
    edited_act = edited_act.reset_index()
    def shifts_only(list1):
        shifts_only_list = []
        for i in list1:
            if i != 0:
                shifts_only_list.append(i)
        return shifts_only_list
    new_activity_shifts = shifts_only(activity_shifts)
    def get_sums(list1):
        list_sums_b = []
        for i in range(0,len(list1)+1):
            new_list = list1[:i]
            sums = sum(new_list)
            list_sums_b.append(sums)
        return list_sums_b
    list_sums_before_activity = get_sums(activity_shifts)
    def activity_dur(list_shift_num, data):
        time_deltas = []
        day = []
        start_time = []
        for i in range(0, len(list_shift_num)):
            if i == 0:
                time_deltas.append(data['timestamp'][list_shift_num[i]] - data['timestamp'][0])
                day.append(data.day[list_shift_num[i]+i+list_sums_before_activity[i]])
                start_time.append(data.timestamp[list_shift_num[i]+i+list_sums_before_activity[i]])
            elif i != 0:
                time_deltas.append(data['timestamp'][list_shift_num[i]+i+list_sums_before_activity[i]] - data['timestamp'][list_sums_before_activity[i]+i])
                day.append(data.day[list_shift_num[i]+i+list_sums_before_activity[i]])
                start_time.append(data.timestamp[list_shift_num[i]+i+list_sums_before_activity[i]])
        dataframe = pd.DataFrame({'Time Delta': time_deltas, 'day': day, 'Start Time': start_time})
        return dataframe
    activity_dur_df = activity_dur(activity_shifts, edited_act)
    activity_dur_df['end_time'] = activity_dur_df['Start Time'] + activity_dur_df['Time Delta']
    activity_dur_df['start_day'] = pd.to_datetime(activity_dur_df['Start Time'], unit='s').dt.dayofyear
    activity_dur_df['end_day'] = pd.to_datetime(activity_dur_df['end_time'], unit='s').dt.dayofyear
    activity_dur_df = activity_dur_df.rename(columns={'Start Time': 'start_time'})
    #activity_dur_df = activity_dur_df[activity_dur_df['Time Delta'] >= 60]
    activity_dur_day = activity_dur_df.groupby('day')['Time Delta'].sum()
    return activity_dur_df

def deadlines_processing():
    data = pd.read_csv('dataset/education/deadlines.csv')
    data = data.dropna(axis=1, how='all')
    data = data.T
    old_names = list(data.columns)
    new_names = data.iloc[0]
    data.rename(columns=dict(zip(old_names, new_names)), inplace=True)
    data = data.drop(['uid'])
    data['doy'] = pd.to_datetime(data.index)
    data['doy'] = data['doy'].dt.dayofyear
    return data

def epoch(hour): 
    if hour >= 18: 
        return 'evening'
    elif hour < 10: 
        return 'night'
    else:
        return 'day'
    
def midterm(day): 
    if day < 21 + 86: 
        return 'pre midterm'
    elif (21 + 86) <= day <= (35 + 86):
        return 'in midterm'
    elif (35 + 86) < day:
        return 'post midterm'
    
from sklearn.preprocessing import OneHotEncoder
def ema_intervals_data(uid, window, ema_name, desired_column, before=False): 
    """
    inputs: uid -- user id 
            window -- the frame of time (in hours) of how long the interval of sensor collection around each EMA should be. 
    
    Finds desired sensor data within that window of time before and after the EMA. 
    
    Returns: a dataframe containing stress level and desired feature information for each stress response. If the
    dataframe has less than 50 elements returns none (we assume there isn't enough data with less than 50 elements). 
    """
    data = process_ema(uid, ema_name, desired_column)
    
    # define the window of time we want to look at for each stress answer. 
    data['start_time'] = data['resp_time'] - pd.to_timedelta(window, unit = 'h')
    if before is True: 
        data['end_time'] = data['resp_time']
    else: 
        data['end_time'] = data['resp_time'] + pd.to_timedelta(window, unit = 'h')
    
    data['hour'] = data['resp_time'].dt.hour
    data['epoch'] = data['hour'].apply(epoch)
    data = data.join(pd.get_dummies(data['epoch']))
    
    # this will reduce runtime by only looking at sensor data from that day then applying our interval function to it. 
    data['start_day'] = data['start_time'].dt.dayofyear
    data['end_day'] = data['end_time'].dt.dayofyear
    data['doy'] = data['resp_time'].dt.dayofyear
    
    data['dow'] = data['resp_time'].dt.dayofweek
    data = data.join(pd.get_dummies(data['dow']))
    data = data.rename(columns={0: 'Monday', 
                                1: 'Tuesday', 
                                2: 'Wednesday', 
                                3: 'Thursday', 
                                4: 'Friday',
                                5: 'Saturday',
                                6: 'Sunday'})
    
    data['midterm'] = data['doy'].apply(midterm)
    data = data.join(pd.get_dummies(data['midterm']))
    
    # the time interval is just a tuple of (start time, end time)
    # in the future, we will apply functions to the interval using other dataframes to return desired columns inside
    # the interval
    data['interval'] = tuple(zip(data['start_time'], data['end_time'], data['start_day'], data['end_day']))
    
    # load activity data
    activity = activity_analysis(uid)
    data['activity dur'] = data['interval'].apply(lambda x: conversation_in_range(x, activity, 
                                                                           'start_time', 'end_time'))
    data['activity dur'] = data['activity dur'].apply(convert_timedeltas)
    
    # this will return the total conversation duration for each interval
    conversation = load_conversation(uid)
    data['conversation dur'] = data['interval'].apply(lambda x: conversation_in_range(x, conversation, 
                                                                           'start_timestamp', ' end_timestamp'))
    data['conversation dur'] = data['conversation dur'].apply(convert_timedeltas)
    
    # find the total darkness duration for each interval
    darkness = load_darkness(uid)
    data['darkness dur'] = data['interval'].apply(lambda x: conversation_in_range(x, darkness, 'start', 'end'))
    data['darkness dur'] = data['darkness dur'].apply(convert_timedeltas)
    
    
    # find the number of bluetooth colocations in each interval
    bluetooth = load_bluetooth(uid)
    data['bluetooth colocations'] = data['interval'].apply(lambda x: activity_in_range(x, bluetooth, 'count'))
    
    
    # find the location variance in each stress interval. 
    gps = load_gps(uid)
    data['location variance'] = data['interval'].apply(lambda x: activity_in_range(x, gps, 'location variance'))
    
    # wifi locations
    wifi_locations = load_wifi_locations(uid)
    data['location changes'] = data['interval'].apply(lambda x: activity_in_range(x, wifi_locations, 'location changes'))
    
    #load deadlines data.
    deadlines = deadlines_processing()
    #deadlines = deadlines[deadlines['doy' == data.start_day]]
    deadlines = deadlines[[uid, 'doy']]
    data = pd.merge(data, deadlines, on='doy', how='inner')
    data = data.rename(columns={uid: 'deadlines'})
  
    # drop Nan values
    data = data.dropna()
    
    # only use these features if we have over 50 datapoints
    if data.shape[0] < 20: 
        return None
    
    data.sort_values(by=['resp_time'], inplace=True)

    
    # return relevant columns. 
    return data[['resp_time', desired_column, 'location changes', 'activity dur',
                'conversation dur', 'darkness dur', 'bluetooth colocations', 'location variance', 'deadlines', 
                'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday', 
                'day', 'evening', 'night', 'pre midterm', 'in midterm', 'post midterm']]

In [162]:
def save_aggregated_data(windows, ema, desired_column, before=False):
    
    df = pd.DataFrame()
    
    ema_files = glob.glob('dataset/EMA/response/' + ema + '/' + ema + '_*.json')
    uid_start = len('dataset/EMA/response/' + ema + '/' + ema + '_')
    
    # loops through all the files and averages the feature importance lists
    for file in ema_files: 
        uid = file[uid_start:uid_start+3]
        for window in windows: 
            try: 
                data = ema_intervals_data(uid, window, ema, desired_column, before)
            except Exception as e:
                print(e)
                print(uid)
                continue
            if data is None:
                continue
            data['uid'] = uid
            data['window'] = window
            df = df.append(data)
            
    df.to_csv('ema_data\{}, before = {}.csv'.format(ema, before))
    
    return df
    
x = save_aggregated_data([2, 4, 6, 8, 10, 12], 'PAM', 'picture_idx')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


"['u00'] not in index"
u00
"['u00'] not in index"
u00
"['u00'] not in index"
u00
"['u00'] not in index"
u00
"['u00'] not in index"
u00
"['u00'] not in index"
u00
"['Monday', 'in midterm', 'Tuesday', 'post midterm'] not in index"
u05
"['Monday', 'in midterm', 'Tuesday', 'post midterm'] not in index"
u05
"['Monday', 'in midterm', 'Tuesday', 'post midterm'] not in index"
u05
"['Monday', 'in midterm', 'Tuesday', 'post midterm'] not in index"
u05
"['Monday', 'in midterm', 'Tuesday', 'post midterm'] not in index"
u05
"['Monday', 'in midterm', 'Tuesday', 'post midterm'] not in index"
u05
"['in midterm', 'post midterm'] not in index"
u20
"['in midterm', 'post midterm'] not in index"
u20
"['in midterm', 'post midterm'] not in index"
u20
"['in midterm', 'post midterm'] not in index"
u20
"['in midterm', 'post midterm'] not in index"
u20
"['in midterm', 'post midterm'] not in index"
u20
"['post midterm'] not in index"
u23
"['post midterm'] not in index"
u23
"['post midterm'] not in index"
u23
"['p

In [165]:
data = pd.read_csv('ema_data/PAM, before = False.csv')
data

Unnamed: 0.1,Unnamed: 0,resp_time,picture_idx,location changes,activity dur,conversation dur,darkness dur,bluetooth colocations,location variance,deadlines,...,Saturday,Sunday,day,evening,night,pre midterm,in midterm,post midterm,uid,window
0,10,2013-03-27 04:45:45,4,10.0,0.0,413.0,7633.0,3,-17.356564,0,...,0,0,0,0,1,1,0,0,u01,2
1,9,2013-03-27 04:45:46,2,10.0,0.0,413.0,7634.0,3,-17.356564,0,...,0,0,0,0,1,1,0,0,u01,2
2,5,2013-03-27 04:45:47,1,10.0,0.0,413.0,7635.0,3,-17.356564,0,...,0,0,0,0,1,1,0,0,u01,2
3,11,2013-03-27 07:03:27,3,10.0,0.0,0.0,14400.0,1,-17.499824,0,...,0,0,0,0,1,1,0,0,u01,2
4,13,2013-03-27 07:50:00,1,13.0,0.0,0.0,14400.0,0,-17.613747,0,...,0,0,0,0,1,1,0,0,u01,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42337,417,2013-05-31 09:55:34,4,140.0,11232.0,28811.0,25177.0,175,-1.381905,0,...,0,0,0,0,1,0,0,1,u59,12
42338,414,2013-05-31 18:07:45,4,62.0,15509.0,21340.0,30135.0,123,0.668664,0,...,0,0,0,1,0,0,0,1,u59,12
42339,415,2013-05-31 20:00:38,1,62.0,15499.0,21269.0,27664.0,112,0.708999,0,...,0,0,0,1,0,0,0,1,u59,12
42340,419,2013-06-01 00:23:29,4,62.0,15499.0,21269.0,11893.0,85,0.774796,0,...,1,0,0,0,1,0,0,1,u59,12


In [2]:
def convert_stress(level):
    """
    converts input stress level from the scale above into a more usable scale with 1 being feeling great 
    and 5 being stressed out.
    """
    # little stress = 3/5 stressed
    if level == 1: 
        return 3
    # definitely stressed = 4/5
    if level == 2:
        return 4
    # stressed out = 5/5
    if level == 3:
        return 5
    # feeling good = 2/5
    if level == 4: 
        return 2
    # feeling great = 1/5 
    if level == 5:
        return 1
    else:
        return 0

In [3]:
def convert_PAM(level):
    """
    assigns PAM picture_idx levels to four ranges (four quadrants):
    Quadrant 1: negative valence and low arousal; Quadrant 2: negative valence and high arousal; 
    Quadrant 3: positive valence and low arousal; Quadrant 4: positive valence and high arousal
    """
    quadrant_1 = list(range(1,5))
    quadrant_2 = list(range(5,9))
    quadrant_3 = list(range(9,13))
    quadrant_4 = list(range(13,17))
    
    if level in quadrant_1:
        return 1
    if level in quadrant_2: 
        return 2
    if level in quadrant_3: 
        return 3
    if level in quadrant_4: 
        return 4

In [4]:
def process_ema(uid, ema_name, desired_column): 
    """
    input: uid for which we want to process the EMA. 
           the name of the ema we want to process
           the column that represents the scoring area of interest for the particular ema. 
    output: 
        a dataframe containing the response time and score for each ema response. 
    """
    
    ema = pd.read_json('dataset/EMA/response/{}/{}_{}.json'.format(ema_name, ema_name, uid))
    
    # this takes the desired values that could be in the "null" column and puts them into the desired colum 
    try: 
        ema[desired_column] = ema[desired_column].where(np.isfinite, ema.null)
    except: 
        pass
    
    # get rid of the non-numeric answers from the null column.  
    ema[desired_column] = pd.to_numeric(ema[desired_column], errors='coerce')
    
    ema = ema[['resp_time', desired_column]]
    ema = ema.dropna()
    
    if ema_name == 'stress' or ema_name == 'Stress': 
        ema['level'] = ema['level'].apply(convert_stress)
        
    if ema_name == 'PAM':
        ema['picture_idx'] = ema['picture_idx'].apply(convert_PAM)
    
    return ema

In [5]:
def get_skewness(uid, ema, desired_column, start, stop, step = 1): 
    
    x = process_ema(uid, ema, desired_column)
    df = pd.DataFrame()
    
    for i in range(start, stop + step, step): 
        val = x[x[desired_column] == i].shape[0]
        df['response {}'.format(i)] = [val]
        
    df['uid'] = uid 
    
    df['total'] = x.shape[0]
    
    return df

In [6]:
get_skewness('u00', 'stress', 'level', 1, 5)

Unnamed: 0,response 1,response 2,response 3,response 4,response 5,uid,total
0,3,16,32,14,11,u00,76


In [7]:
def get_all_skewness(ema, desired_column, start, stop, step=1): 
    
    total_data = pd.DataFrame()
    
    ema_files = glob.glob('dataset/EMA/response/' + ema + '/' + ema + '_*.json')
    uid_start = len('dataset/EMA/response/' + ema + '/' + ema + '_')
    # loops through all the files and averages the feature importance lists
    for file in ema_files: 
        uid = file[uid_start:uid_start+3]
        try: 
            data = get_skewness(uid, ema, desired_column, start, stop, step)
        except Exception as e:
            continue
        total_data = total_data.append(data, ignore_index = True)

    
    total_data.loc['Total'] = total_data.sum(numeric_only = True)
    return total_data

In [273]:
y=get_all_skewness('stress', 'level', 1, 5)
y[y['total'] > 70]

Unnamed: 0,response 1,response 2,response 3,response 4,response 5,uid,total
0,3.0,16.0,32.0,14.0,11.0,u00,76.0
7,0.0,3.0,40.0,31.0,17.0,u08,91.0
9,3.0,42.0,50.0,12.0,1.0,u10,108.0
13,0.0,12.0,41.0,25.0,32.0,u16,110.0
16,1.0,10.0,37.0,31.0,13.0,u19,92.0
33,1.0,6.0,44.0,5.0,23.0,u43,79.0
34,2.0,23.0,52.0,7.0,6.0,u44,90.0
47,20.0,89.0,133.0,20.0,7.0,u59,269.0
Total,125.0,472.0,1013.0,369.0,309.0,,2288.0


In [198]:
x = get_all_skewness('PAM', 'picture_idx', 1, 4)
x
#x[['response 1, response 2', 'response 3', 'response 4']].hist()

Unnamed: 0,response 1,response 2,response 3,response 4,uid,total
0,41.0,192.0,83.0,74.0,u00,390.0
1,15.0,36.0,38.0,51.0,u01,140.0
2,32.0,77.0,71.0,42.0,u02,222.0
3,30.0,36.0,20.0,24.0,u03,110.0
4,38.0,65.0,65.0,50.0,u04,218.0
5,3.0,13.0,6.0,8.0,u05,30.0
6,11.0,55.0,37.0,44.0,u07,147.0
7,37.0,70.0,74.0,61.0,u08,242.0
8,3.0,6.0,4.0,8.0,u09,21.0
9,15.0,105.0,119.0,74.0,u10,313.0


In [309]:
uid_list = list(x[x['total'] > 200].uid.dropna().values)

In [173]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score, TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import confusion_matrix

In [174]:
u59_pam = process_ema('u59', 'PAM', 'picture_idx')
get_skewness('u59', 'PAM', 'picture_idx', 1, 4)

Unnamed: 0,response 1,response 2,response 3,response 4,uid,total
0,48,186,80,123,u59,437


In [235]:
data = pd.read_csv('ema_data/PAM, before = False.csv')
data = data[data['uid'] == 'u59']
data = data[data['window'] == 10]

feature_names = ['conversation dur', 'darkness dur', 'activity dur',
                 'location changes', 'location variance', 'bluetooth colocations']

accuracy_df = pd.DataFrame()


for feature_name in feature_names: 
    
    #print(feature_name)
    features = data[feature_name].values
    features = features.reshape(-1, 1)
    target = data['picture_idx'].values
    
    accuracy, conf = tscv_smote(features, target, RandomForestClassifier())
    #print(conf)
    p_value, scores_df, best_model = validate_model(features, target)
    diff = scores_df['{} score'.format(best_model)].mean() - scores_df['corrupted {} score'.format(best_model)].mean()

    """X_train, X_test, y_train, y_test = train_test_split(features, target, random_state = 0)

    sm = SMOTE(sampling_strategy='not majority', random_state = 0)
    X_new_train, y_new_train = sm.fit_resample(X_train, y_train)"""

    
    """rf.fit(X_train, y_train)
    print('accuracy without SMOTE: {:.4f}'.format(rf.score(X_test, y_test)))
    print(confusion_matrix(y_test, rf.predict(X_test), labels=[1,2,3,4]))"""
    
    """rf = RandomForestClassifier()
    rf.fit(X_new_train, y_new_train)
    accuracy = rf.score(X_test, y_test)
    print('accuracy with SMOTE: {:.4f}'.format(accuracy))
    print(confusion_matrix(y_test, rf.predict(X_test), labels=[1,2,3,4]))"""
    
    
    accuracy_df = accuracy_df.append(pd.DataFrame({'accuracy': [accuracy], 'feature': feature_name, 'p_value': p_value, 
                                                  'best_model': best_model, 'scoring difference': diff}), ignore_index = True)
    
accuracy_df

   et score  rf score  corrupted et score  corrupted rf score
0  0.220290  0.214493            0.269565            0.252174
1  0.231884  0.217391            0.234783            0.231884
2  0.217391  0.214493            0.286957            0.292754
3  0.211594  0.214493            0.257971            0.252174
4  0.214493  0.214493            0.249275            0.231884
5  0.220290  0.217391            0.231884            0.228986
6  0.223188  0.214493            0.257971            0.252174
7  0.211594  0.214493            0.255072            0.249275
8  0.214493  0.217391            0.275362            0.275362
9  0.223188  0.214493            0.263768            0.266667
   et score  rf score  corrupted et score  corrupted rf score
0  0.228986  0.255072            0.220290            0.223188
1  0.240580  0.252174            0.289855            0.292754
2  0.231884  0.249275            0.260870            0.278261
3  0.228986  0.255072            0.252174            0.246377
4  0.237

Unnamed: 0,accuracy,feature,p_value,best_model,scoring difference
0,0.214493,conversation dur,7e-06,rf,-0.037971
1,0.246377,darkness dur,0.069551,rf,-0.012174
2,0.269565,activity dur,0.332365,rf,0.003188
3,0.223188,location changes,2e-06,rf,-0.069565
4,0.255072,location variance,0.477291,rf,-0.00087
5,0.272464,bluetooth colocations,0.072685,rf,-0.011304


In [312]:
def evaluate_features(feature_list, data, target_column, confusion_matrices = False): 
    
    accuracy_df = pd.DataFrame()
    
    for feature_name in feature_list: 
    
        features = data[feature_name].values
        
        if len(features.shape) == 1: 
            features = features.reshape(-1, 1)
            
        target = data[target_column].values

        accuracy, conf = tscv_smote(features, target, RandomForestClassifier())
        
        if confusion_matrices is True: 
            print(conf)
            
        p_value, scores_df, best_model = validate_model(features, target)
        diff = scores_df['{} score'.format(best_model)].mean() - scores_df['corrupted {} score'.format(best_model)].mean()

        accuracy_df = accuracy_df.append(pd.DataFrame({'accuracy': [accuracy], 'feature': [feature_name], 'p_value': p_value, 
                                                      'best_model': best_model, 'scoring differencevs randomized': diff}), 
                                         ignore_index = True)
    
    return accuracy_df

In [287]:
feature_names = [['darkness dur', 'activity dur']]

data = pd.read_csv('ema_data/PAM, before = False.csv')
data = data[data['uid'] == 'u59']
data = data[data['window'] == 10]

evaluate_features(feature_names, data, 'picture_idx')

['darkness dur', 'activity dur']
       Unnamed: 0            resp_time  picture_idx  location changes  \
41508          42  2013-03-27 03:46:23            4              49.0   
41509          41  2013-03-27 04:11:27            2              59.0   
41510          40  2013-03-27 04:40:53            3              59.0   
41511          48  2013-03-27 09:39:42            4             105.0   
41512          45  2013-03-27 16:24:23            2             119.0   
...           ...                  ...          ...               ...   
41920         417  2013-05-31 09:55:34            4             131.0   
41921         414  2013-05-31 18:07:45            4              62.0   
41922         415  2013-05-31 20:00:38            1              62.0   
41923         419  2013-06-01 00:23:29            4              62.0   
41924         421  2013-06-01 04:03:15            2              20.0   

       activity dur  conversation dur  darkness dur  bluetooth colocations  \
41508       

Unnamed: 0,accuracy,feature,p_value,best_model,scoring differencevs randomized
0,0.292754,"[darkness dur, activity dur]",0.026966,rf,0.015362


In [224]:
def tscv_smote(features, target, model):
    
    tscv = TimeSeriesSplit(n_splits = 5)
    avg_score = 0
    
    labels = []
    predictions = []
    
        
    for train_index, test_index in tscv.split(features): 
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = target[train_index], target[test_index]
        
        sm = SMOTE(sampling_strategy='not majority', k_neighbors=2, random_state = 0)
        X_new_train, y_new_train = sm.fit_resample(X_train, y_train)
       
        model.fit(X_new_train, y_new_train)
        score = model.score(X_test, y_test)
        #print("train indices: {}, test indices: {}, score: {:.2f}".format(train_index, test_index, score))
        #print("predictions: {}, actual: {}".format(model.predict(X_test), target[test_index]))
        avg_score += score/5
        
        y_pred = model.predict(X_test)
        
        labels.extend(y_test)
        predictions.extend(y_pred)
        
        x = confusion_matrix(labels, predictions, labels=[1,2,3,4])
    
    df = pd.DataFrame()
    for i in range(x.shape[0]): 
        if i == 0: 
            df = pd.DataFrame({i+1: x[:,i]})
        else: 
            df = df.join(pd.DataFrame({i+1: x[:,i]}))
    df.index = [1, 2, 3, 4]
            
    return avg_score, df

In [265]:
def validate_model(features, target):
    
    #data = ema_intervals_data(uid, window, ema_name, desired_column)
    
    extra_trees = ExtraTreesClassifier()
    random_forest = RandomForestClassifier()
    
    et_scores = []
    rf_scores = []
    
    tscv = TimeSeriesSplit(n_splits = 5)
    
   
    for i in range(10): 
    
        et_scores.append(tscv_smote(features, target, extra_trees)[0])
        rf_scores.append(tscv_smote(features, target, random_forest)[0])
                         
    cor_et_scores = []
    cor_rf_scores = []
    
    for i in range(10): 
        
        np.random.shuffle(target)
    
        cor_et_scores.append(tscv_smote(features, target, extra_trees)[0])
        cor_rf_scores.append(tscv_smote(features, target, random_forest)[0])
    

    scores_df = pd.DataFrame({'et score': et_scores, 
                              'rf score': rf_scores, 
                              'corrupted et score': cor_et_scores, 
                              'corrupted rf score': cor_rf_scores, 
                              })
    
    rf_p_value = scipy.stats.ttest_ind(scores_df['rf score'], scores_df['corrupted rf score'])[1]
    et_p_value = scipy.stats.ttest_ind(scores_df['et score'], scores_df['corrupted et score'])[1]
    
    #return (1, 2, 3)
    
    if rf_p_value >= et_p_value: 
        return rf_p_value/2, scores_df[['rf score', 'corrupted rf score']], 'rf'
    else: 
        return et_p_value/2, scores_df[['et score', 'corrupted et score']], 'et'

In [194]:
data = pd.read_csv('ema_data/PAM, before = False.csv')
data = data[data['uid'] == 'u59']
data = data[data['window'] == 2]
data

Unnamed: 0.1,Unnamed: 0,resp_time,picture_idx,location changes,activity dur,conversation dur,darkness dur,bluetooth colocations,location variance,deadlines,...,Saturday,Sunday,day,evening,night,pre midterm,in midterm,post midterm,uid,window
39844,42,2013-03-27 03:46:23,4,8.0,13.0,192.0,0.0,5,-16.377472,0,...,0,0,0,0,1,1,0,0,u59,2
39845,41,2013-03-27 04:11:27,2,8.0,15.0,192.0,0.0,6,-16.567959,0,...,0,0,0,0,1,1,0,0,u59,2
39846,40,2013-03-27 04:40:53,3,8.0,19.0,192.0,1203.0,8,-16.854579,0,...,0,0,0,0,1,1,0,0,u59,2
39847,48,2013-03-27 09:39:42,4,0.0,0.0,0.0,14400.0,12,-21.555614,0,...,0,0,0,0,1,1,0,0,u59,2
39848,45,2013-03-27 16:24:23,2,29.0,3358.0,6419.0,9458.0,38,-11.625681,0,...,0,0,1,0,0,1,0,0,u59,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40253,418,2013-05-31 04:07:48,2,27.0,920.0,7234.0,0.0,25,-12.396650,0,...,0,0,0,0,1,0,0,1,u59,2
40254,417,2013-05-31 09:55:34,4,0.0,0.0,0.0,14400.0,25,-21.612672,0,...,0,0,0,0,1,0,0,1,u59,2
40255,414,2013-05-31 18:07:45,4,49.0,7086.0,9443.0,0.0,14,-1.809354,0,...,0,0,0,1,0,0,0,1,u59,2
40256,415,2013-05-31 20:00:38,1,20.0,5436.0,7469.0,0.0,25,-0.254406,0,...,0,0,0,1,0,0,0,1,u59,2


In [291]:
### Loneliness Scale Survey Data Treatment. Also adapted from the other student's notebook

def lonely_analysis(lonely):
    """
    Consolidates the  block of code necessary to generate the Loneliness survey visualizations for
    added modularity of notebook. Running it on the raw Loneliness data prepares the graphs related
    to this piece of the dataset.
    
    @param: lonely – raw data for Loneliness survey, obtained by using pandas' read_csv method
    
    returns: returns remodeled dataframes for the pre- and post-study halves of the original dataframe
             as a tuple for integrated visualizations with other studies. 
             Prepares graphs for Loneliness survey visualization.
             plt.show() should be run outside of function call for visualization
    """
    pre_lonely = lonely[lonely.type == 'pre'].drop('type', axis=1)
    post_lonely = lonely[lonely.type == 'post'].drop('type', axis=1)

    def remodel_columns_lonely(data):
        """
        Replaces the wordy columns for indices q1-q20. Since LonelinessScale
        is standardized, all questions follow the same order and
        can be referred to by indices for simplification.

        @param: data – dataframe containing PSS survey data

        returns: modified dataframe with q1-q20 indexed columns
        """
        index_dict = {}
        for ind in range(data.shape[1]):
            index_dict[data.columns[ind]] = f"q{ind + 1}"

        data = data.rename(columns=index_dict)
        return data


    def numerify_data_lonely(entry):
        """
        Replaces string response for corresponding value 1-4.

        @param: dataframe entry containing LonelinessScale survey answer

        returns: value 1-4 replacing str answer
        """
        if entry == 'Never':
            return 1
        if entry == 'Rarely':
            return 2
        if entry == 'Sometimes':
            return 3
        if entry == 'Often':
            return 4
        return entry

    def remodel_data_lonely(data):
        """
        Combines functionalities of remodel_columns_lonely and numerify_data_lonely
        for each entry, offering a dataframe more suitable for analysis. Also
        adds the test score for each student as a new column.

        @param: data – dataframe containing PSS survey data

        returns: modified dataframe with q1-q20 indexed columns and values 1-4
        replacing original str answers in q1-q20, with new column 'score'
        with each student's test score
        """
        data = remodel_columns_lonely(data)
        data = data.applymap(numerify_data_lonely)
        for question in {'q1', 'q5', 'q6', 'q9', 'q10',
                         'q15', 'q16', 'q19', 'q20'}:
            data[question] = data[question].apply(lambda x: 5 - x)
        data['score'] = data.sum(axis=1, numeric_only=True)
        data['id'] = data.index
        return data

    pre_lonely_m = remodel_data_lonely(pre_lonely)
    post_lonely_m = remodel_data_lonely(post_lonely)
    
    return pre_lonely_m, post_lonely_m

###All the following survey processing code is adapted from the notebook "Survey Dataset V2" 
### which was written by another student

### This function processes perceived stress scale

def pss_analysis(pss_survey):
    """
    Consolidates the  block of code necessary to generate the PSS survey visualizations for
    added modularity of notebook. Running it on the raw PSS data prepares the graphs related
    to this piece of the dataset.
    
    @param: pss_survey – raw data for PSS survey, obtained by using pandas' read_csv method
    
    returns: returns remodeled dataframes for the pre- and post-study halves of the original dataframe
             as a tuple for integrated visualizations with other studies. 
             Prepares graphs for PSS survey visualization.
             plt.show() should be run outside of function call for visualization
    """
    pre_pss = pss_survey[pss_survey.type == 'pre'].drop('type', axis=1)
    post_pss = pss_survey[pss_survey.type == 'post'].drop('type', axis=1)

    def remodel_columns_pss(data):
        """
        Replaces the wordy columns for indices q1-q10. Since PSS
        is standardized, all questions follow the same order and
        can be referred to by indices for simplification.

        @param: data – dataframe containing PSS survey data

        returns: modified dataframe with q1-q10 indexed columns
        """
        index_dict = {}
        for ind in range(data.shape[1]):
            index_dict[data.columns[ind]] = f"q{ind + 1}"

        data = data.rename(columns=index_dict)
        return data


    def numerify_data_pss(entry):
        """
        Replaces string response for corresponding value 0-4.

        @param: dataframe entry containing PSS survey answer

        returns: value 0-4 replacing str answer
        """
        if entry == 'Never':
            return 0
        if entry == 'Almost never':
            return 1
        if entry == 'Sometime':
            return 2
        if entry == 'Fairly often':
            return 3
        if entry == 'Very often':
            return 4
        return entry

    def remodel_data_pss(data):
        """
        Combines functionalities of remodel_columns_pss and numerify_data_pss
        for each entry, offering a dataframe more suitable for analysis. Also
        adds the test score for each student as a new column.

        @param: data – dataframe containing PSS survey data

        returns: modified dataframe with q1-q10 indexed columns and values 0-4
        replacing original str answers in q1-q10, with new columns 'score'
        with each student's test score
        """
        data = remodel_columns_pss(data)
        data = data.applymap(numerify_data_pss)
        # Reverse scoring for particular questions
        for question in {'q4', 'q5', 'q7', 'q8'}:
            data[question] = data[question].apply(lambda x: 4 - x)
        data['score'] = data.sum(axis=1, numeric_only=True)
        data['id'] = data.index
        return data

    pre_pss_m = remodel_data_pss(pre_pss)
    post_pss_m = remodel_data_pss(post_pss)

    return pre_pss_m, post_pss_m

### PHQ-9 Survey Data Treatment  

def phq_analysis(phq_survey):
    """
    Consolidates the  block of code necessary to generate the PHQ-9 survey visualizations for
    added modularity of notebook. Running it on the raw PHQ-9 data prepares the graphs related
    to this piece of the dataset.
    
    @param: phq_survey – raw data for PHQ-9 survey, obtained by using pandas' read_csv method
    
    returns: returns remodeled dataframes for the pre- and post-study halves of the
             original dataframe as a tuple for integrated visualizations with other studies. 
    """
    pre_phq = phq_survey[phq_survey.type == 'pre'].drop('type', axis=1)
    post_phq = phq_survey[phq_survey.type == 'post'].drop('type', axis=1)

    def remodel_columns_phq(data):
        """
        Replaces the wordy columns for indices q1-q10. Since PHQ-9
        is standardized, all questions follow the same order and
        can be referred to by indices for simplification.

        @param: data – dataframe containing PHQ-9 survey data

        returns: modified dataframe with q1-q10 indexed columns
        """
        index_dict = {}
        for ind in range(data.shape[1]):
            index_dict[data.columns[ind]] = f"q{ind + 1}"

        data = data.rename(columns=index_dict)
        return data


    def numerify_data_phq(entry):
        """
        Replaces string response for corresponding value 0-3.

        @param: entry – dataframe entry containing PHQ-9 survey answer

        returns: value 0-3 replacing str answer; for q10, simply returns same
        str entry (column q10 is not graded)
        """
        if entry == 'Not at all':
            return 0
        if entry == 'Several days':
            return 1
        if entry == 'More than half the days':
            return 2
        if entry == 'Nearly every day':
            return 3
        return entry

    def severity_analysis_phq(score):
        """
        Classifies each student's score according to the PHQ-9 classification standard
        
        @param: data.score – 'score' column of dataframe
        
        returns: new column which can be assigned to new label 'severity_level'
        """
        if score <= 4:
            return 'normal'
        if score <= 9:
            return 'mild'
        if score <= 14:
            return 'moderate'
        if score <= 19:
            return 'moderately severe'
        return 'severe'

    def remodel_data_phq(data):
        """
        Combines functionalities of remodel_columns_phq and numerify_data_phq
        for each entry, offering a dataframe more suitable for analysis. Also
        adds the test score for each student as a new column.

        @param: data – dataframe containing PHQ-9 survey data

        returns: modified dataframe with q1-q10 indexed columns and values 0-3
        replacing original str answers in q1-q9, with new columns 'score' and
        'severity_level' with each student's test score and classification.
        """
        data = remodel_columns_phq(data)
        data = data.applymap(numerify_data_phq)
        data['score'] = data.sum(axis=1, numeric_only=True)
        data['severity_level'] = data.score.apply(severity_analysis_phq)
        data['id'] = data.index
        return data

    pre_phq_m = remodel_data_phq(pre_phq)
    post_phq_m = remodel_data_phq(post_phq)

    return pre_phq_m, post_phq_m

pss = pd.read_csv("dataset/survey/PerceivedStressScale.csv", index_col=0)
prepss, postpss = pss_analysis(pss)
    
loneliness = pd.read_csv("dataset/survey/LonelinessScale.csv", index_col=0)
prelonely, postlonely = lonely_analysis(loneliness)
    
phq = pd.read_csv("dataset/survey/PHQ-9.csv", index_col=0)
pre_phq, post_phq = phq_analysis(phq)

In [299]:
from sklearn.cluster import AgglomerativeClustering

def clustering(uid_list, n_clusters): 
    """
    inputs: compiled_features containing survey scores and sensor rankings. 
    
    this function performs agglomerative clustering with 4 groups. 
    """    
    pss = pd.read_csv("dataset/survey/PerceivedStressScale.csv", index_col=0)
    prepss, postpss = pss_analysis(pss)

    loneliness = pd.read_csv("dataset/survey/LonelinessScale.csv", index_col=0)
    prelonely, postlonely = lonely_analysis(loneliness)

    phq = pd.read_csv("dataset/survey/PHQ-9.csv", index_col=0)
    pre_phq, post_phq = phq_analysis(phq)

    clustering = AgglomerativeClustering(n_clusters = n_clusters)
    
    survey_list = [(prepss, 'pss'), (prelonely, 'loneliness'), (pre_phq, 'phq')]
    
    survey_df = pd.DataFrame({'id':uid_list})
    
    for survey in survey_list:     
        survey_df = survey_df.merge(survey[0][['score', 'id']], on = 'id', how = 'inner')
        survey_df = survey_df.rename(columns = {'score': survey[1] + ' score'})
        
    clustering.fit(survey_df[['pss score', 'loneliness score', 'phq score']])
    
    survey_df['cluster'] = clustering.labels_
    
    return survey_df

In [301]:
clustering(['u10', 'u16', 'u19', 'u57', 'u58', 'u59'], 2)

Unnamed: 0,id,pss score,loneliness score,phq score,cluster
0,u10,20.0,64,0,1
1,u16,24.0,35,6,0
2,u19,20.0,55,5,1
3,u57,9.0,44,0,0
4,u58,20.0,51,5,1
5,u59,18.0,37,5,0


In [310]:
uid_list

['u00',
 'u02',
 'u04',
 'u08',
 'u10',
 'u12',
 'u16',
 'u17',
 'u19',
 'u22',
 'u32',
 'u33',
 'u35',
 'u36',
 'u43',
 'u44',
 'u49',
 'u51',
 'u52',
 'u53',
 'u57',
 'u58',
 'u59']

In [None]:
feature_names = ['conversation dur', 'darkness dur', 'activity dur',
                 'location changes', 'location variance', 'bluetooth colocations', ['darkness dur', 'activity dur'], 
                ['conversation dur', 'darkness dur', 'activity dur',
                 'location changes', 'location variance', 'bluetooth colocations']]

data = pd.read_csv('ema_data/PAM, before = False.csv')
#data = data[data['uid'].isin(uid_list)]
data = data[data['window'] == 10]

evaluate_features(feature_names, data, 'picture_idx')