In [1]:
# Some imports to get things started
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")
sys.path.insert(1, os.path.abspath('../'))
from utils import set_aspect,save_descriptive_stats, save_variables
from pathlib import Path

In [2]:
# --- Output locations -------------------------------------------------------
# File that collects the numbers produced by the analysis
data_reports_fp = Path('../reports/numbers.dat')
# Directory that receives the figures produced by the analysis
figpath = Path('../reports')

# --- Plot colours, one per messenger ----------------------------------------
WhatsApp_color, Facebook_color = '#3dc24f', '#385999'

# --- Input data -------------------------------------------------------------
# Donation info from the donation table
donation_table = pd.read_csv(Path('../data/donation_table.csv'))

# All messages, including those from non-interactive chats
messages_table = pd.read_csv(Path('../data/messages_table.csv'))

# Only the messages from interactive chats
messages_filtered_table = pd.read_csv(Path('../data/messages_filtered_table.csv'))

# Questionnaire answers
survey_info = pd.read_excel(Path('../data/survey.xlsx'))

In [4]:
# Count, for every donation of each messenger, how many of its filtered
# (interactive) chats are group chats, then save descriptive statistics, the
# total, and the total as a percentage of `total_chats`.

# Some of our earlier donations didn't mark the "System messages" from WhatsApp,
# so a chat with exactly three "senders" may really be a two-person chat plus
# WhatsApp itself. In our empirical evaluation the system "sender" is the one
# with the least messages, and 5 messages is a good threshold to filter these
# out: a three-sender chat only counts as a group chat when every sender wrote
# at least this many messages.
MIN_MESSAGES_PER_SENDER_IN_THREE_SENDER_CHAT = 5

source_group_chats = {'WhatsApp':None, 'Facebook':None}

# Denominator of the percentages saved below. It does not depend on the source,
# so it is computed once instead of once per loop iteration.
# NOTE(review): this counts the chats of the unfiltered `messages_table` across
# both messengers, whereas the group chats are counted per source from
# `messages_filtered_table` -- confirm this is the intended denominator for the
# `*_group_total_filtered_percent` values.
total_chats = messages_table['conversation_id'].nunique()

for source in ['WhatsApp','Facebook']:
    sourceIDs = donation_table[donation_table['source'] == source]['donation_id']

    # One entry per donation of this source: its number of group chats
    group_chats = []
    for donationID in sourceIDs:
        # Separate the (filtered) messages for a given donation
        donation_messages = messages_filtered_table[messages_filtered_table['donation_id']==donationID]

        n_group_chats = 0
        # Walk over the chats of this donation; only the per-chat tallies matter,
        # so the group order is irrelevant (sort=False)
        for _, chat_messages in donation_messages.groupby('conversation_id', sort=False):
            # Number of messages written by each sender in this chat
            messages_per_sender = chat_messages.groupby('sender_id').size()
            n_senders = len(messages_per_sender)

            if n_senders > 3:
                n_group_chats += 1
            elif (n_senders == 3
                  and messages_per_sender.min() >= MIN_MESSAGES_PER_SENDER_IN_THREE_SENDER_CHAT):
                # Three real senders; otherwise the least active "sender" is
                # assumed to be the WhatsApp system account (see constant above)
                n_group_chats += 1

        group_chats.append(n_group_chats)

    source_group_chats[source] = group_chats
    n_group_chats_total = np.sum(group_chats)
    save_descriptive_stats(data_reports_fp,f'{source}_group_filtered',group_chats)
    save_variables(data_reports_fp,f'{source}_group_total_filtered',n_group_chats_total)
    save_variables(data_reports_fp,f'{source}_group_total_filtered_percent',n_group_chats_total/total_chats*100)