In [1]:
import sys
import os
import warnings
# Suppress every warning for the rest of the session (this also hides
# deprecation notices from pandas and other libraries).
warnings.filterwarnings("ignore")
# Add the directory three levels up and the parent directory (both resolved
# against the current working directory) to the import search path.
# Presumably this is what makes the project-local `modules` package
# importable below — TODO confirm which of the two entries is required.
sys.path.insert(1, os.path.abspath('../../..'))
sys.path.insert(1, os.path.abspath('../'))
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
# Project helpers that write analysis numbers to the reports file.
from modules.utils import save_variables,save_descriptive_stats


In [2]:
# Prefix that is prepended to every key this notebook saves
analysis = 'general'

# Destination file for the numbers produced by the analysis
data_reports_path = Path('../../reports/numbers_updated.dat')

# Prepare the destination: make sure the folder is there, then add an empty
# file only when none exists yet (an existing file is left untouched).
data_reports_path.parent.mkdir(parents=True, exist_ok=True)
if not data_reports_path.exists():
    data_reports_path.touch()


---

### Summarize the sociodemographic information of the participants

In [9]:
# Load the final donation list and the pre-survey answers.
donation_table = pd.read_csv(Path('../../data/raw/donation_table_CHB_filtered.csv'))
external_ids = donation_table['external_id']
pre_data = pd.read_excel(Path('../../data/raw/pre-survey_CHB.xlsx'))

# Filter pre_data to include only rows with external_id in final donation list
filtered_pre_data = pre_data[pre_data['external_id'].isin(external_ids)]
save_descriptive_stats(data_reports_path, f'{analysis}-age', filtered_pre_data['age'])

# Education answers that count as a higher-education degree
# (both the "comparable" and the "equivalent" wording are accepted).
higher_education_levels = (
    'Master (or comparable)', 'Master (or equivalent)',
    'Bachelor (or comparable)', 'Bachelor (or equivalent)',
    'Doctoral degree',
)

higher_education = 0
for key in ['gender', 'employed', 'education']:
    key_data = filtered_pre_data[key].dropna()
    # Tally the answers once per question instead of once per response option.
    option_counts = key_data.value_counts()

    # Iterate in order of first appearance so the numbers are written in the
    # same order as before.
    for option in key_data.unique():
        # NOTE(review): this key does not contain the question name, so an
        # option shared by several questions (e.g. 'No answer' shows up under
        # both gender and employed) is saved under the same key more than
        # once. Whether the later value replaces the earlier one depends on
        # save_variables — confirm, and consider f'{analysis}-{key}-{option}'.
        save_variables(data_reports_path, f'{analysis}-{option}', option_counts[option])
        if key == 'education' and option in higher_education_levels:
            higher_education += option_counts[option]

    # Entries that are not strings are treated as missing answers (presumably
    # NaN for unanswered questions). The count does not depend on the option,
    # so it is saved once per question, even when nobody answered.
    missing_answers = sum(type(item) is not str for item in filtered_pre_data[key])
    save_variables(data_reports_path, f'{analysis}-{key}-nan', missing_answers)

    print(option_counts)

# Save the higher-education total once, after the education column has been
# tallied, rather than writing an intermediate 0 after every question.
save_variables(data_reports_path, f'{analysis}-education_higher', higher_education)

Male         33
Female       29
No answer     2
Diverse       1
Name: gender, dtype: int64
Yes          38
No           26
No answer     1
Name: employed, dtype: int64
Finished high school        32
Bachelor (or comparable)    18
Master (or comparable)      13
Vocational training          1
Doctoral degree              1
Name: education, dtype: int64


---

### Summarize information about the WhatsApp data donations

In [17]:
# Read the filtered message table; each row is counted as one message below
messages_file = pd.read_csv(Path('../../data/raw/messages_table_CHB_filtered.csv'))

# Number of distinct donations in the table
unique_donation_ids = messages_file['donation_id'].nunique()

# Rows per donation_id, i.e. how many messages each donation contains
message_counts = messages_file['donation_id'].value_counts()

# Distinct conversation_ids within each donation_id
conversation_counts = (
    messages_file['conversation_id']
    .groupby(messages_file['donation_id'])
    .nunique()
)

# Overall totals across all donations
total_chats = conversation_counts.sum()
total_messages = message_counts.sum()
save_variables(data_reports_path, f'{analysis}-total_chats', total_chats)
save_variables(data_reports_path, f'{analysis}-total_messages', total_messages)

# Per-donation distributions of chats and messages
save_descriptive_stats(data_reports_path, f'{analysis}-chats_per_person', conversation_counts)
save_descriptive_stats(data_reports_path, f'{analysis}-messages_per_person', message_counts)


---

### Calculate the minimum number of active months per donation to check whether a recency analysis makes sense

In [24]:
# Turn the timestamp column into pandas datetimes (the column is overwritten)
messages_file['datetime'] = pd.to_datetime(messages_file['datetime'])

# Reduce every timestamp to its calendar month
messages_file['year_month'] = messages_file['datetime'].dt.to_period('M')

# For each donation_id, count the distinct months that contain a message
months_per_donation = messages_file.groupby('donation_id')['year_month'].nunique()
active_months = months_per_donation.reset_index(name='active_months')

# Smallest number of active months found for any donation
fewest_active_months = active_months.min().active_months
print(f'Each participant had at least {fewest_active_months} months of data')

Each participant had at least 5 months of data
