### This notebook is used to derive metrics for quantifying the participants' activity times.



In [1]:
import os
import sys
import warnings
import pandas as pd
import numpy as np
from pathlib import Path
# Silence every warning for the rest of the notebook (keeps cell output clean,
# but also hides deprecation and pandas warnings).
warnings.filterwarnings("ignore")
# Extend the import search path with directories above the notebook's working
# directory -- presumably so the project-local `modules` package imported in
# later cells can be resolved; these lines must run before those imports.
sys.path.insert(1, os.path.abspath('../../..'))
sys.path.insert(1, os.path.abspath('../'))

# Data directories, given relative to the notebook's working directory:
# inputs are read from the raw folder, result tables are written to the processed folder.
raw_data_path = "../../data/raw"
processed_data_path = "../../data/processed"

### Load messaging data

In [2]:
# Donation info from the data table (one row per donation).
donation_table = pd.read_csv(Path(f'{raw_data_path}/donation_table_CHB_filtered.csv'))

# Donated messages belonging to the relevant donations (e.g. donors who filled in both surveys).
# The 'datetime' column is parsed into real timestamps right away so the `.dt` accessor works later on.
messages_table = (
    pd.read_csv(Path(f'{raw_data_path}/messages_table_CHB_filtered.csv'))
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
)

### Calculate the response probabilities of the participants (egos) and their contacts (alters) within the specified bins.

In [3]:
from modules.utils import bin_probability
from modules.metrics import normalized_entropy

# Four six-hour windows covering the day; each pair is [lower, upper), so the upper hour is excluded.
bins = [[0,6],[6,12],[12,18],[18,24]]
bin_columns = ['activity-00:00-05:59','activity-06:00-11:59','activity-12:00-17:59','activity-18:00-23:59']
bin_labels = ['00:00 - 05:59','06:00 - 11:59','12:00 - 17:59','18:00 - 23:59']


def _activity_profile(sender_id):
    """Build the activity-time metrics for the messages sent by `sender_id`.

    The returned dict holds, in this order: the normalized entropy of the
    per-bin proportions, summary statistics (mean, median, std, max) of the
    number of distinct hours with a sent message per calendar date, and the
    per-bin proportions exactly as returned by `bin_probability`.
    """
    sent = messages_table[messages_table['sender_id'] == sender_id]
    # Derive hour-of-day and calendar date on a fresh frame; `messages_table` stays untouched.
    sent = sent.assign(hour=sent['datetime'].dt.hour, date=sent['datetime'].dt.date)

    proportions = bin_probability(sent['hour'].values, bins, bin_columns)
    # Number of distinct clock hours with at least one sent message, per date.
    hours_per_day = sent.groupby('date')['hour'].nunique()

    return {
        'normalized_entropy': normalized_entropy(list(proportions.values())),
        'mean_daily_hours': hours_per_day.mean(),
        'median_daily_hours': hours_per_day.median(),
        'std_daily_hours': hours_per_day.std(),
        'max_daily_hours': hours_per_day.max(),
        **proportions,
    }


# One metrics record per donation row, keyed by the donor's external_id.
info_dict = {
    external_id: _activity_profile(donor_id)
    for _donation_id, donor_id, external_id in zip(
        donation_table['donation_id'], donation_table['donor_id'], donation_table['external_id']
    )
}

# Final DataFrame: one row per external_id, with the id exposed as a regular column.
times_table = (
    pd.DataFrame.from_dict(info_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'external_id'})
)



### Absolute activity time saving

Combine the activity metrics with the donor self-reports and save it all in a table for later analysis.

In [4]:
# Keep only the participant id and the per-bin self-report columns of each survey.
survey_columns = ['external_id'] + bin_columns
pre_survey = pd.read_excel(Path(f'{raw_data_path}/pre-survey_CHB.xlsx'))[survey_columns]
post_survey = pd.read_excel(Path(f'{raw_data_path}/post-survey_CHB.xlsx'))[survey_columns]

# Turn the Yes/No answers into 1/0; any other answer becomes NaN and is dropped below.
bin_response_mapping = {'Yes': 1, 'No': 0}
for survey in (pre_survey, post_survey):
    for col in bin_columns:
        survey[col] = survey[col].map(bin_response_mapping)

# Pair pre- and post-survey answers per participant and keep only complete responses.
answer_columns = [f"{col}_pre" for col in bin_columns] + [f"{col}_post" for col in bin_columns]
absolute_survey = pd.merge(
    pre_survey, post_survey, on='external_id', how='inner', suffixes=('_pre', '_post')
).dropna(subset=answer_columns)

# Attach the measured per-bin activity proportions and store the combined table.
measured_bins = times_table[['external_id'] + bin_columns]
absolute_times_data = pd.merge(absolute_survey, measured_bins, on='external_id', how='inner')
absolute_times_data.to_excel(Path(f'{processed_data_path}/absolute_times_data.xlsx'), index=False)

### Relative activity time saving

Combine the relative activity metrics with the donor self-reports and save it all in a table for later analysis.

In [5]:
from modules.utils import map_7point_likert
question_column = 'texting_all_day'

# Load the survey question from both survey waves, transformed by `map_7point_likert`.
pre_survey = map_7point_likert(Path(f'{raw_data_path}/pre-survey_CHB.xlsx'), question_column)
post_survey = map_7point_likert(Path(f'{raw_data_path}/post-survey_CHB.xlsx'), question_column)

# Pair both waves per participant and add the post-minus-pre change as an extra column.
relative_survey = pd.merge(
    pre_survey, post_survey, on='external_id', how='inner', suffixes=('_pre', '_post')
).assign(**{
    f'{question_column}_diff': lambda paired: paired[f'{question_column}_post'] - paired[f'{question_column}_pre']
})

# Pair objective data with the subjective assessments based on external_id
objective_columns = [
    'external_id',
    'normalized_entropy',
    'mean_daily_hours',
    'median_daily_hours',
    'std_daily_hours',
    'max_daily_hours',
]
time_obj = times_table[objective_columns]
all_data = pd.merge(relative_survey, time_obj, on='external_id', how='inner')
all_data.to_excel(Path(f'{processed_data_path}/relative_times_data.xlsx'), index=False)
