### This notebook is used to derive metrics for quantifying the participants' speed.

Absolute speed refers to how fast the participants reply to their messages, while relative speed compares their speed to their contacts' speed. For the absolute speed assessment, the participants completed the statement "I reply to most of my messages within: < 1 min, 1-2 min, 3-5 min, 6-15 min, 16-30 min, 31-60 min, > 60 min." and could select multiple bins. For the relative speed assessment, the participants rated the statement "On average, I reply faster to my contacts than my contacts reply to me" on a 7-point Likert scale (Disagree strongly = 1 ... Agree strongly = 7). There are two assessments: one before and one after seeing the visual feedback, referred to in the code as pre and post.


In [1]:
import sys
import os
import pandas as pd
import numpy as np
import warnings
from pathlib import Path
# Suppress every warning for the rest of the session (note: this also hides
# pandas/numpy warnings that could point at real data problems).
warnings.filterwarnings("ignore")
# Extend the import search path with two ancestor directories, resolved against
# the current working directory. Both are inserted at index 1, so after these
# two calls '../../..' sits at index 1 and '../' at index 2.
# NOTE(review): presumably this is what makes the `modules` package importable
# in later cells — confirm which of the two directories actually contains it.
sys.path.insert(1, os.path.abspath('../'))
sys.path.insert(1, os.path.abspath('../../..'))

# Data locations, relative to the current working directory: inputs are read
# from the raw folder, result tables are written to the processed folder.
raw_data_path = "../../data/raw"
processed_data_path = "../../data/processed"

### Load messaging data 

In [2]:
# Read the donation info from the data table
donations_csv = Path(f'{raw_data_path}/donation_table_CHB_filtered.csv')
donation_table = pd.read_csv(donations_csv)

# Read the donated messages of the relevant donations
# (per the original note: e.g. those who filled in both surveys)
messages_csv = Path(f'{raw_data_path}/messages_table_CHB_filtered.csv')
messages_table = pd.read_csv(messages_csv)

# Parse the timestamps and snap them to whole minutes in a single step.
# NOTE(review): dt.round('min') rounds to the NEAREST minute, not down;
# dt.floor('min') would be needed if truncation was the intent.
messages_table['datetime'] = pd.to_datetime(messages_table['datetime']).dt.round('min')

### Calculate the response probabilities of the participants (egos) and their contacts (alters) within the specified bins.

In [3]:
from modules.utils import bin_probability
from modules.metrics import response_times

# Response-time bins as [lower, upper] bounds (the factors of 60 indicate seconds)
# and the matching column labels. The labels double as the survey column names
# used in later cells, so they must not be renamed here.
# NOTE(review): whether the bounds are inclusive or exclusive is decided inside
# bin_probability — confirm there.
bins = [[0, 60], [60, 3*60], [3*60, 6*60], [6*60, 16*60], [16*60, 31*60], [31*60, 60*60], [60*60, np.inf]]
bin_labels = ['speed-<1min', 'speed-1-2min', 'speed-3-5min', 'speed-6-15min', 'speed-16-30min', 'speed-31-60min', 'speed->60min']

# external_id -> dict of speed metrics for that participant
info_dict = {}

for donation_id, donor_id, external_id in zip(donation_table['donation_id'], donation_table['donor_id'], donation_table['external_id']):
    donation_messages = messages_table[messages_table['donation_id'] == donation_id]

    # Collect all ego and alter response times across the chats of this donation
    ego_speeds = []
    alter_speeds = []

    for _, chat in donation_messages.groupby('conversation_id'):
        ego_chat_speeds, alter_chat_speeds = response_times(chat, donor_id)
        ego_speeds.extend(ego_chat_speeds)
        alter_speeds.extend(alter_chat_speeds)

    # Compute bin probabilities (dicts keyed by bin label)
    ego_probs = bin_probability(ego_speeds, bins, bin_labels)
    alter_probs = bin_probability(alter_speeds, bins, bin_labels)

    # Cumulative sums, taken explicitly in bin_labels order so that the deltas
    # below line up with their labels regardless of the key order in which
    # bin_probability happens to build its result.
    ego_cumsum = np.cumsum([ego_probs[label] for label in bin_labels])
    alter_cumsum = np.cumsum([alter_probs[label] for label in bin_labels])

    # Calculate the probability difference between ego and alters for responding within N minutes
    # Deltas are calculated based on cumulative sum to be able to report "difference in response probability within N minutes"
    delta_dict = {f'{label}-delta': ego_val - alter_val for label, ego_val, alter_val in zip(bin_labels, ego_cumsum, alter_cumsum)}

    # Next, the probabilities are stored along with absolute response times.
    # The latter follow power law distribution and are messy to work with though.
    # Each median is computed once; np.median yields NaN for an empty list.
    ego_median_speed = np.median(ego_speeds)
    alter_median_speed = np.median(alter_speeds)

    info = {**ego_probs, **delta_dict,
            'alter-ego-median-speed-delta': alter_median_speed - ego_median_speed,
            'ego-median-speed': ego_median_speed}

    info_dict[external_id] = info

# Create final table: one row per external_id, one column per metric
speed_table = pd.DataFrame.from_dict(info_dict, orient='index').reset_index().rename(columns={'index': 'external_id'})

### Absolute speed saving

Combine the speed metrics with the donor self-reports and save it all in a table for later analysis

In [6]:
# From each survey export keep only the participant key and one column per speed bin
survey_columns = ['external_id'] + bin_labels
pre_survey = pd.read_excel(Path(f'{raw_data_path}/pre-survey_CHB.xlsx'))[survey_columns]
post_survey = pd.read_excel(Path(f'{raw_data_path}/post-survey_CHB.xlsx'))[survey_columns]

# Recode the answers as numbers; values missing from the mapping become NaN via Series.map
bin_response_mapping = {'Yes': 1, 'No': 0}
for survey in (pre_survey, post_survey):
    for col in bin_labels:
        survey[col] = survey[col].map(bin_response_mapping)

# Keep participants present in both surveys who have a valid answer for every bin
pre_columns = [f"{col}_pre" for col in bin_labels]
post_columns = [f"{col}_post" for col in bin_labels]
absolute_survey = pd.merge(
    pre_survey, post_survey, on='external_id', how='inner', suffixes=('_pre', '_post')
).dropna(subset=pre_columns + post_columns)

# Attach the measured per-bin response probabilities and write the combined table
measured_bins = speed_table[['external_id'] + bin_labels]
absolute_speed_data = absolute_survey.merge(measured_bins, on='external_id', how='inner')
absolute_speed_data.to_excel(Path(f'{processed_data_path}/absolute_speed_data.xlsx'), index=False)

### Relative speed saving 
Combine the speed metrics with the donor self-reports and save it all in a table for later analysis

In [7]:
from modules.utils import map_7point_likert
question_column = 'faster_response'
pre_column = f'{question_column}_pre'
post_column = f'{question_column}_post'
diff_column = f'{question_column}_diff'

# Load the question column from both surveys.
# Per the original note, map_7point_likert makes sure the Likert scale is in numerical form.
pre_survey = map_7point_likert(Path(f'{raw_data_path}/pre-survey_CHB.xlsx'), question_column)
post_survey = map_7point_likert(Path(f'{raw_data_path}/post-survey_CHB.xlsx'), question_column)

# Keep participants present in both surveys and compute the post-minus-pre change in rating
relative_survey = pd.merge(pre_survey, post_survey, on='external_id', how='inner', suffixes=('_pre', '_post'))
relative_survey[diff_column] = relative_survey[post_column] - relative_survey[pre_column]

# Pair objective data with the subjective assessments based on external_id
all_data = relative_survey.merge(speed_table, on='external_id', how='inner')
all_data.to_excel(Path(f'{processed_data_path}/relative_speed_data.xlsx'), index=False)