# Processing Conversation Data Tutorial

In this notebook, we will go through a complete processing of the conversation data and making correlations between conversation data and phq-9 scores. 

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from numpy import *
import glob
import scipy 
from datetime import datetime
import sklearn
sns.style = 'darkgrid'

This concatenates the conversation data for all users, as previously shown in notebook 1. 

In [2]:
# this is a list of all conversation files
files = glob.glob("dataset/sensing/conversation/conversation_*.csv")

# this is the starting index of the uid in each filename
uid_start = len("dataset/sensing/conversation/conversation_")

# loop through all the files
for file in files: 
    
    # the uid is this slice of the filename
    uid = file[uid_start:uid_start+3]
        
    # we want to initialize the conversations dataframe for the first uid
    if uid == 'u00': 
        conversations = pd.read_csv(file)
        conversations['uid'] = uid
        continue
    
    # for other uid's, add to the conversations dataframe
    conv_df = pd.read_csv(file)
    conv_df['uid'] = uid
    conversations = pd.concat([conversations, conv_df], ignore_index = True)

# this is the dataframe containing the start and end time of conversation for every user. 
conversations

Unnamed: 0,start_timestamp,end_timestamp,uid
0,1364359600,1364359812,u00
1,1364382621,1364383065,u00
2,1364383516,1364384993,u00
3,1364385033,1364385094,u00
4,1364385786,1364385866,u00
...,...,...,...
79018,1370053418,1370053499,u59
79019,1370053540,1370053893,u59
79020,1370054035,1370054197,u59
79021,1370054278,1370055026,u59


Now, we'll transform the data into more useful metrics. We need the start and end hour for calculating epochs later

In [3]:
# the conversation length in seconds is the difference in timestamps
conversations['conversations length'] = conversations[' end_timestamp'] - conversations['start_timestamp']

# convert conversations start and end to standard datetime format
conversations['start_timestamp'] = pd.to_datetime(conversations['start_timestamp'], unit = 's')
conversations[' end_timestamp'] = pd.to_datetime(conversations[' end_timestamp'], unit = 's')

# make new columns organizing times by hour so we can sort by epoch
conversations['start hour'] = conversations['start_timestamp'].dt.hour
conversations['end hour'] = conversations[' end_timestamp'].dt.hour

# view the new transformed conversations dataframe
conversations

Unnamed: 0,start_timestamp,end_timestamp,uid,conversations length,start hour,end hour
0,2013-03-27 04:46:40,2013-03-27 04:50:12,u00,212,4,4
1,2013-03-27 11:10:21,2013-03-27 11:17:45,u00,444,11,11
2,2013-03-27 11:25:16,2013-03-27 11:49:53,u00,1477,11,11
3,2013-03-27 11:50:33,2013-03-27 11:51:34,u00,61,11,11
4,2013-03-27 12:03:06,2013-03-27 12:04:26,u00,80,12,12
...,...,...,...,...,...,...
79018,2013-06-01 02:23:38,2013-06-01 02:24:59,u59,81,2,2
79019,2013-06-01 02:25:40,2013-06-01 02:31:33,u59,353,2,2
79020,2013-06-01 02:33:55,2013-06-01 02:36:37,u59,162,2,2
79021,2013-06-01 02:37:58,2013-06-01 02:50:26,u59,748,2,2


We temporarily make the epoch column a tuple because we need a tuple of arguments for the epoch function we will right (start, end). we'll apply the epoch function to that tuple. 

In [8]:
# create a tuple of start and end time (in hours) in a new column so we can apply our next function to that column
conversations['epoch'] = list(zip(conversations['start hour'], conversations['end hour']))

def epoch(times_tuple):
    """
    input: tuple containing start and end times (in hours on 24 hour scale)
    output: the epoch that corresponds to the timestamps
    note: we chose to only return timestamps that had both start and end time within a single epoch. An alternative would
    be splitting conversations that span multiple epochs into two conversations, one in each epoch, but we decided not to 
    because that would double count conversations for each user. Some feedback on this step would be appreciated. 
    """
    start = times_tuple[0]
    end = times_tuple[1]
    
    # Day epoch: hours 10 am -6 pm
    if (start and end) > 10 and (start and end) <=18:
        return 'Day'
    # Night epoch: 12 am - 10 am
    elif (start and end)<=10:
        return 'Night'
    # evening epoch: 6 pm - 12 am
    elif (start and end) > 18:
        return 'Evening'

# apply the epoch function to create a new column which contains the epoch for every conversation
conversations['epoch'] = conversations['epoch'].apply(epoch)
conversations.head()

Unnamed: 0,start_timestamp,end_timestamp,uid,conversations length,start hour,end hour,epoch
0,2013-03-27 04:46:40,2013-03-27 04:50:12,u00,212,4,4,Night
1,2013-03-27 11:10:21,2013-03-27 11:17:45,u00,444,11,11,Day
2,2013-03-27 11:25:16,2013-03-27 11:49:53,u00,1477,11,11,Day
3,2013-03-27 11:50:33,2013-03-27 11:51:34,u00,61,11,11,Day
4,2013-03-27 12:03:06,2013-03-27 12:04:26,u00,80,12,12,Day


PHQ-9 processing, as shown in notebook 1. 

In [9]:
# read the phq-9 survey
phq9 = pd.read_csv("dataset/survey/PHQ-9.csv")

def score(response):
    """
    input: string representing the student's response to the phq-9 survey
    output: number representing that score on a scale from 0-3
    """
    if response == 'Not at all':
        return 0
    elif response == 'More than half the days':
        return 2
    elif response == 'Several days':
        return 1
    elif response == 'Nearly every day':
        return 3
    # if the response isn't one of these three, leave it as is
    else:
        return response

# convert each response to a score
phq9_nums = phq9.applymap(score)
# add the scores in each row to make a new column with total scores
phq9_nums['total'] = phq9_nums.sum(axis = 1, skipna = True)
phq9_nums 

Unnamed: 0,uid,type,Little interest or pleasure in doing things,"Feeling down, depressed, hopeless.","Trouble falling or staying asleep, or sleeping too much.",Feeling tired or having little energy,Poor appetite or overeating,Feeling bad about yourself or that you are a failure or have let yourself or your family down,"Trouble concentrating on things, such as reading the newspaper or watching television",Moving or speaking so slowly that other people could have noticed. Or the opposite being so figety or restless that you have been moving around a lot more than usual,"Thoughts that you would be better off dead, or of hurting yourself",Response,total
0,u00,pre,0,1,0,1,0,0,0,0,0,Not difficult at all,2
1,u01,pre,1,1,1,1,0,1,0,0,0,Very difficult,5
2,u02,pre,2,1,2,2,2,1,1,2,0,Somewhat difficult,13
3,u03,pre,0,1,0,0,0,0,0,1,0,Somewhat difficult,2
4,u04,pre,1,1,0,1,1,1,1,0,0,Somewhat difficult,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,u52,post,1,3,3,2,3,1,0,1,1,Extremely difficult,15
80,u53,post,0,1,3,2,3,0,2,0,0,Somewhat difficult,11
81,u56,post,0,0,0,2,0,0,1,0,0,Somewhat difficult,3
82,u58,post,0,1,0,2,2,1,1,1,0,Not difficult at all,8


Next, we will calculate correlations correlations between the conversation data and phq-9 results. To do this, we will use a lot of pandas function already covered along with scipy.stats.pearsonr for the correlation itself.

In [13]:
def correlate_conversations(epoch, phq_type, analysis_type):
    """
    inputs: 
        epoch: the section of the day to analyze
        phq_type: either pre or post, 
        analysis_type: either 'frequency' or 'duration', indicating a whether looking for total number of conversations or 
        total conversation duration
        bootstrap: boolean determining whether or not ot bootstrap the data before correlating
    outputs: correlation between conversations during the epoch 
    """
    # merge convos with the phq datframe
    conv_analysis = conversations.merge(phq9_nums[['total', 'uid']][phq9_nums.type == phq_type], on = 'uid', how = 'inner')
    
    # group the dataframe by id, only look at the selected epoch
    uid_groups = conv_analysis[conv_analysis['epoch'] == epoch].groupby('uid')
    
    # if we want duration, find the total conversation length
    if analysis_type == 'duration': 
        conv_lengths = uid_groups['conversations length'].sum()
        
    # if we want frequency, find the total number of conversations 
    if analysis_type == 'frequency':
        # this creates a dataframe with each row having the same count value, so i'll just take the first series
        conv_lengths = uid_groups.count()['start_timestamp']
    
    # take only one values from each student representing that student's phq score
    phq_data = uid_groups['total'].unique()
    phq_data = phq_data.apply(int)
    
    # make a dataframe with transformed conversation data and phq-data to correlate
    aggregate_df = pd.DataFrame({'conv': conv_lengths, 'phq': phq_data})
    
    # returns tuple of (correlation, p-value)
    return scipy.stats.pearsonr(conv_lengths, phq_data)

In [14]:
# display results in a dataframe
conv_dict = {'Frequency during day (pre)': correlate_conversations('Day', 'pre', 'frequency'),
             'Frequency during day (post)': correlate_conversations('Day', 'post', 'frequency'),
             'Frequency during evening (post)': correlate_conversations('Evening', 'post', 'frequency'),
             'Duration during day (post)': correlate_conversations('Day', 'post', 'duration')
}

conv_df = pd.DataFrame(conv_dict, index = ["pearson's correlation", "p-value"])
conv_df

Unnamed: 0,Frequency during day (pre),Frequency during day (post),Frequency during evening (post),Duration during day (post)
pearson's correlation,-0.207959,-0.12842,-0.276444,-0.16792
p-value,0.165489,0.442264,0.092937,0.313586


# Exercise
Now try using the above code as a template for calculating correlations + p-values for another sensor data and survey combination. Think of the best way to represent the sensor data as one value for each student (look at the papers for help). 

In [None]:
# your code here