### This notebook derives metrics for the chat equality index and combines them with the subjective scores.

Chat equality is derived from the number of words sent by the data donor (participant) in each of their chats. Several metrics are available, but the 1-Gini index (rGini) is the default. The subjective score for this concept is assessed by asking the participant to rate the statement "I send approximately the same number of words per month to all of my contacts" on a 7-point Likert scale (Disagree strongly = 1 ... Agree strongly = 7). There are two assessments, one before and one after seeing the visual feedback, referred to in the paper and code as pre and post.

In [1]:
import os
import sys
import warnings
import numpy as np
import pandas as pd
from pathlib import Path

# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Expose the parent directory and the directory three levels up to the import system.
# Both entries are placed right after sys.path[0]; the three-levels-up directory ends up
# ahead of the parent directory, exactly as two successive insert(1, ...) calls would.
sys.path[1:1] = [os.path.abspath('../../..'), os.path.abspath('../')]

# Data locations, relative to the notebook's working directory.
raw_data_path = "../../data/raw"              # inputs: donation/messages tables and surveys
processed_data_path = "../../data/processed"  # outputs written by this notebook

### Load messaging data 

In [2]:
# Donation info table: read later for its donation_id, donor_id and external_id columns.
donation_table = pd.read_csv(Path(raw_data_path, 'donation_table_CHB_filtered.csv'))

# Donated messages from the relevant donations (e.g. those who filled in both surveys).
# The 'datetime' column is parsed right after reading so that later cells can rely on
# it being a real datetime column.
messages_table = pd.read_csv(Path(raw_data_path, 'messages_table_CHB_filtered.csv')).assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'])
)

### Load the metrics for calculating the chat equality index. In the paper, rGini is the default and is referred to as the Chat Equality Index.

In [3]:
from modules.metrics import (
    calculate_rGini,
    shannon_entropy_equality,
    hhi_equality,
    plateau_score,
)

# Registry of equality metrics: short label -> scoring function.
# The labels become part of the output column names (e.g. 'Overall gini'), and each
# function is called with a plain list of word counts, one entry per chat.
metrics = dict(
    gini=calculate_rGini,
    entropy=shannon_entropy_equality,
    hhi=hhi_equality,
    plateau_score=plateau_score,
)

### Calculate the equality index for the entire donation period and for the last month to check for recency effects!

In [4]:
# For every donor, score how evenly their sent words are spread over their chats in
# three windows:
#   'Overall ...'    - all words the donor sent during the whole donation period
#   'Median ...'     - the metric computed per calendar month, then the median over months
#   'Last month ...' - the most recent month in which the donor sent any message
donationIDs = list(donation_table['donation_id'])
donor_info = {}
donors_without_messages = []  # external_ids skipped because the donor sent no messages

# Tag every message with its calendar month once, on a new frame. Doing this inside the
# loop would repeat the work per donor and write a column into a filtered slice of
# messages_table, which raises SettingWithCopyWarning.
messages_by_month = messages_table.assign(
    year_month=pd.to_datetime(messages_table['datetime']).dt.to_period('M')
)

# One row per donation_id (first occurrence wins), read column-wise so each identifier
# keeps its own dtype.
donation_rows = donation_table.drop_duplicates(subset='donation_id')[
    ['donation_id', 'external_id', 'donor_id']
]

for donationID, external_id, egoID in donation_rows.itertuples(index=False, name=None):
    # Messages written by the donor themselves (the "ego"), across all of their chats
    ego_messages = messages_by_month[messages_by_month['sender_id'] == egoID]
    if ego_messages.empty:
        # Nothing was sent, so there is no word distribution to score and no
        # "last month" column to select; leave this donor out and report it below.
        donors_without_messages.append(external_id)
        continue

    # Words sent by donor in each of the chats
    ego_wc_per_chat = ego_messages.groupby('conversation_id')['word_count'].sum()

    # Words sent by donor per chat per month
    monthly_word_count = (
        ego_messages
        .groupby(['year_month', 'conversation_id'])['word_count']
        .sum()
        .reset_index()
    )

    # Reshape so that conversation IDs are the rows and months are the columns;
    # a chat with no words in a given month counts as 0 for that month.
    monthly_reordered = monthly_word_count.pivot(
        index='conversation_id', columns='year_month', values='word_count'
    ).fillna(0)

    # Columns are ordered by month, so the final column is the latest active month
    last_month_values = monthly_reordered.iloc[:, -1]

    # For each metric, use the corresponding function and store the values
    donor_info[external_id] = {}
    for metric_name, metric_func in metrics.items():
        # axis=0 hands the function one column at a time, i.e. one month across all chats
        monthly_scores = monthly_reordered.apply(
            lambda month_column: metric_func(month_column.values.tolist()), axis=0
        )
        donor_info[external_id][f'Overall {metric_name}'] = metric_func(list(ego_wc_per_chat))
        donor_info[external_id][f'Median {metric_name}'] = np.median(monthly_scores)
        donor_info[external_id][f'Last month {metric_name}'] = metric_func(list(last_month_values))

if donors_without_messages:
    print(f'Skipped {len(donors_without_messages)} donor(s) without any sent messages: {donors_without_messages}')

### Combine the chat equality indices with the donor self-reports and save it all in a table for later analysis

In [5]:
from modules.utils import map_7point_likert

# One row per donor: the dict keys (external_id) become a regular column so the
# objective metrics can be merged with the survey answers.
objective_table = pd.DataFrame.from_dict(donor_info, orient='index').reset_index().rename(columns={'index': 'external_id'})
question_column = 'same_wc_to_all'
analysis = 'equality'

# Load and transform the question columns relevant for this aspect.
# map_7point_likert makes sure the Likert scale is in numerical form; its result is
# merged on 'external_id' below, so it is expected to carry that column alongside the
# question column.
pre_survey = map_7point_likert(Path(f'{raw_data_path}/pre-survey_CHB.xlsx'), question_column)
post_survey = map_7point_likert(Path(f'{raw_data_path}/post-survey_CHB.xlsx'), question_column)

# Keep only participants who answered both surveys; the shared question column is
# disambiguated with the _pre / _post suffixes.
combined_survey = pd.merge(pre_survey, post_survey, on='external_id', how='inner', suffixes=('_pre', '_post'))

# Pair objective data with the subjective assessments based on external_id
all_data = pd.merge(combined_survey, objective_table, on='external_id', how='inner')

# Change in self-assessment after seeing the visual feedback (post minus pre)
all_data[f'{question_column}_diff'] = all_data[f'{question_column}_post'] - all_data[f'{question_column}_pre']

# index=False belongs to to_excel, not to Path: passed to Path it was ignored (and is
# rejected by newer Python versions), so the row index ended up in the spreadsheet.
all_data.to_excel(Path(f'{processed_data_path}/chat_equality_data.xlsx'), index=False)