## What is this notebook about?
This notebook summarizes general information about the donations that are illustrated in Table 3 of the paper and referenced in the text. 

The numbers calculated here and in other notebooks are stored in the file indicated by the variable `data_reports_fp` for future reference.

In [None]:
# Some imports to get things started
import sys
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
sys.path.insert(1, os.path.abspath('../'))
from utils import save_descriptive_stats, save_variables
from pathlib import Path


In [None]:
# Input tables, read in this order:
#   donation_table          - the donation info
#   messages_table          - messages, including those from non-interactive chats
#   messages_filtered_table - filtered messages, only from interactive chats
donation_table, messages_table, messages_filtered_table = (
    pd.read_csv(Path(csv_fp))
    for csv_fp in (
        '../data/donation_table.csv',
        '../data/messages_table.csv',
        '../data/messages_filtered_table.csv',
    )
)

# File that collects the numbers produced by the analysis
data_reports_fp = Path('../reports/numbers.dat')

# Per-source containers; the cells below fill them with the values for the table
WhatsApp_info, Facebook_info = {}, {}

In [None]:
# Numbers for the first part of Table 3
for source, source_dict in (('WhatsApp', WhatsApp_info), ('Facebook', Facebook_info)):
    # Donations that came from this source, and every message they contain
    from_source = donation_table['source'] == source
    sourceIDs = donation_table.loc[from_source, 'donation_id']
    source_messages = messages_table.loc[messages_table['donation_id'].isin(sourceIDs)]

    # Same selection on the table restricted to interactive chats
    source_messages_filtered = messages_filtered_table.loc[
        messages_filtered_table['donation_id'].isin(sourceIDs)
    ]

    # Number of distinct chats found in each donation
    chat_counts = source_messages.groupby('donation_id')['conversation_id'].nunique().reset_index()
    chats_per_donation = chat_counts['conversation_id']

    # Keys are inserted in the order in which they appear as rows of the table
    source_dict.update({
        # How many donations came from this source
        'N of donations': len(sourceIDs),
        # How many chats / interactive chats came from this source
        'N of chats': len(source_messages.groupby('conversation_id')),
        'N of interactive chats': len(source_messages_filtered.groupby('conversation_id')),
        # Chats per donation summarized by descriptive stats
        'Chats per person Median': float(chats_per_donation.median()),
        'Chats per person STD': float(chats_per_donation.std()),
        'Chats per person Range': f'{chats_per_donation.min()}-{chats_per_donation.max()}',
    })

In [None]:
# Numbers for the second and third parts of Table 3


def _format_for_report(value):
    """Return `value` in the compact form that is written to the numbers file.

    Strings (such as the 'Chats per person Range' entry) are returned
    unchanged. Numbers are rounded to two decimals, and numbers above 10000
    are shortened to whole thousands with a 'k' suffix (e.g. 12345 -> '12k').
    Values that cannot be rounded or compared are returned as they are.
    """
    if isinstance(value, str):
        return value
    try:
        value = np.round(value, 2)
        # Shorten the digits for large numbers
        if value > 10000:
            return f'{int(value / 1000)}k'
    except (TypeError, ValueError, OverflowError):
        # Best effort: a value that is not a plain finite number is stored
        # unformatted instead of aborting the export.
        pass
    return value


for source, source_dict in zip(['WhatsApp', 'Facebook'], [WhatsApp_info, Facebook_info]):
    sourceIDs = donation_table[donation_table['source'] == source]['donation_id']
    source_messages = messages_table[messages_table['donation_id'].isin(sourceIDs)]
    donation_message_counts = []
    donation_spans = []

    # Here we do calculations for each donation separately to derive sample statistics
    for donationID in sourceIDs:
        donation_messages = source_messages[source_messages['donation_id'] == donationID]
        donation_message_counts.append(len(donation_messages))

        # Parse into a separate Series rather than writing a new column back
        # into the filtered slice, which triggers pandas' chained-assignment
        # warning.
        message_times = pd.to_datetime(donation_messages['datetime'])

        # Whole days between the first and the last message of the donation
        donation_spans.append((message_times.max() - message_times.min()).days)

    # NOTE(review): np.std defaults to the population standard deviation
    # (ddof=0), whereas 'Chats per person STD' is computed with pandas'
    # Series.std(), which defaults to the sample standard deviation (ddof=1).
    # Confirm which convention the table is meant to report.
    source_dict['Donation timespan (Days) Median'] = np.median(donation_spans)
    source_dict['Donation timespan (Days) STD'] = np.std(donation_spans)
    source_dict['Donation timespan (Days) Min'] = np.min(donation_spans)
    source_dict['Donation timespan (Days) Max'] = np.max(donation_spans)

    source_dict['Donation message count Median'] = np.median(donation_message_counts)
    source_dict['Donation message count STD'] = np.std(donation_message_counts)
    source_dict['Donation message count Min'] = np.min(donation_message_counts)
    source_dict['Donation message count Max'] = np.max(donation_message_counts)

    # Save values to a file for later reference in the paper. Every entry of
    # the dictionary is written, including those added by earlier cells.
    for key, value in source_dict.items():
        save_variables(data_reports_fp, f'{source}-{key}', _format_for_report(value))


In [None]:
# Print the results
# Build one single-row frame per source (row label = source name,
# columns = the collected statistics) ...
WhatsApp_df, Facebook_df = (
    pd.DataFrame(info, index=[label])
    for label, info in (('WhatsApp', WhatsApp_info), ('Facebook', Facebook_info))
)

# ... then stack them and flip the result so that statistics become rows
# and sources become columns.
Table_3 = pd.concat([WhatsApp_df, Facebook_df]).transpose()
print(Table_3)