In [None]:
# TODO: 
# - Group by users first
# - account for seasonal effects (?)

In [None]:
import pandas as pd
import datetime
from matplotlib import pyplot as plt
from textwrap import wrap
import numpy as np
import geopandas as gpd
from datenspende_who5.styling import hide_and_move_axis

In [None]:
def add_date_column(df):
    """Add a ``date`` column with the calendar day of each row, in place.

    ``created_at`` is floor-divided step by step (by 1000, 60, 60 and 24),
    i.e. it is treated as milliseconds since the Unix epoch, and the
    resulting whole-day count is converted to a timestamp at midnight.
    """
    seconds = df['created_at'] // 1000
    minutes = seconds // 60
    hours = minutes // 60
    days_since_epoch = hours // 24
    df['date'] = pd.to_datetime(days_since_epoch, unit='D')
    
def drop_duplicate_entries(df):
    """Remove repeated and contradictory answers from ``df`` in place.

    Rows that repeat the same user, date, question and choice are collapsed
    to their first occurrence.  If a user still has more than one row for a
    question on the same date afterwards (differing choices), every one of
    those rows is discarded.
    """
    response_key = ['user_id', 'date', 'question']

    # Pass 1: the identical answer stored more than once -> keep the first.
    df.drop_duplicates(subset=response_key + ['choice_id'], keep='first', inplace=True)

    # Pass 2: what is still duplicated differs in choice -> drop all of them.
    df.drop_duplicates(subset=response_key, keep=False, inplace=True)
    
def drop_creation_time_and_description(df):
    """Delete the ``created_at`` and ``description`` columns from ``df`` in place."""
    unused_columns = ['created_at', 'description']
    df.drop(columns=unused_columns, inplace=True)

def put_response_to_each_question_as_column(df):
    """Return a wide copy of ``df``: one row per (user_id, date), one column per question.

    The result keeps a two-level column index (original column name,
    question); combinations without an answer are filled with NaN.
    """
    indexed = df.set_index(['user_id', 'date', 'question'])
    return indexed.unstack()

def remove_incomplete_responses(df):
    """Drop, in place, every row of ``df`` that has at least one missing value."""
    df.dropna(axis=0, how='any', inplace=True)

def simplify_column_names(df):
    """Flatten the column index of ``df`` in place and label the questions ``q<id>``.

    The outermost column level is removed, the remaining level loses its
    name, and the numeric question ids 49, 50, 54, 55 and 56 are renamed to
    'q49', 'q50', 'q54', 'q55' and 'q56'.
    """
    flat_columns = df.columns.droplevel(0)
    flat_columns.names = [None]
    df.columns = flat_columns

    question_names = {number: f'q{number}' for number in (49, 50, 54, 55, 56)}
    df.rename(columns=question_names, inplace=True)
    
def set_user_and_date_as_column(df):
    """Turn the index of ``df`` into ordinary columns and rename ``user_id`` to ``userid``, in place."""
    df.reset_index(drop=False, inplace=True)
    renamed = {'user_id': 'userid'}
    df.rename(columns=renamed, inplace=True)
    
def preprocess_survey_data():
    """Load the raw survey responses and return one row per user and date.

    The long-format file is de-duplicated, pivoted so that each question
    becomes a column ('q49', 'q50', 'q54', 'q55', 'q56'), stripped of
    incomplete rows, and extended with ``total_wellbeing``, the row-wise
    mean of the five question columns.

    Returns:
        DataFrame with columns ``userid``, ``date``, the five question
        columns and ``total_wellbeing``.
    """
    responses = pd.read_feather('../data/01_raw/who5_responses.feather')

    # Long-format clean-up; each helper modifies ``responses`` in place.
    for clean in (add_date_column, drop_duplicate_entries, drop_creation_time_and_description):
        clean(responses)

    wide = put_response_to_each_question_as_column(responses)

    # Wide-format clean-up; again in place.
    for tidy in (remove_incomplete_responses, simplify_column_names, set_user_and_date_as_column):
        tidy(wide)

    question_columns = ['q49', 'q50', 'q54', 'q55', 'q56']
    wide['total_wellbeing'] = wide[question_columns].mean(axis=1)

    return wide


def preprocess_vital_data():
    """Load the raw vitals and return one row per user, date and device.

    Steps:
      1. Pivot the long file so that every ``vitalid`` becomes a column.
      2. Convert columns 52 and 53 (sleep onset / offset, read as epoch
         seconds) to hours relative to midnight of ``date``.
      3. Add one hour inside the hard-coded daylight-saving windows.
      4. Drop rows dated after 2022-12-31.
      5. Blank the sleep columns (43, 52, 53) for ``deviceid == 6``.
      6. Blank values outside the 2.5-97.5 percentile range of vitals
         9, 65, 43, 52 and 53.
      7. Add a boolean ``weekend`` column (Saturday or Sunday).

    Returns:
        DataFrame with ``userid``, ``date``, ``deviceid``, one column per
        vital id and ``weekend``.
    """
    sleep_vitals = [43, 52, 53]
    filtered_vitals = [9, 65, 43, 52, 53]

    df = pd.read_feather('../data/01_raw/vitals.feather')

    df = df.set_index(['userid', 'date', 'deviceid', 'vitalid']).unstack()
    df.columns = df.columns.droplevel(0)
    df.columns.names = [None]
    df = df.reset_index()

    # Compute onset and offset as hours since midnight of the row's date.
    for vital in (52, 53):
        df[vital] = (pd.to_datetime(df[vital], unit='s') - df['date']) / pd.Timedelta(hours=1)

    # Correct for DST.
    # NOTE(review): `date <= 2021-10-31` shifts *every* earlier date, including
    # any winter dates before DST began in 2021 - confirm the data starts late
    # enough for that to be correct.
    df.loc[(df.date <= '2021-10-31') | df.date.between('2022-03-28', '2022-10-30'), 52] += 1
    df.loc[(df.date <= '2021-11-01') | df.date.between('2022-03-28', '2022-10-31'), 53] += 1

    # Remove dates past the end of Datenspende.  The explicit copy makes the
    # assignments below write to an independent frame rather than a slice.
    df = df[df.date <= '2022-12-31'].copy()

    # Remove Apple sleep data
    df.loc[df.deviceid == 6, sleep_vitals] = np.nan

    # Remove outliers.  Quantiles are taken over the filtered vitals only, so
    # the datetime and id columns never enter the computation.
    vmin = df[filtered_vitals].quantile(.025)
    vmax = df[filtered_vitals].quantile(.975)

    for vital in filtered_vitals:
        df.loc[~df[vital].between(vmin[vital], vmax[vital]), vital] = np.nan

    df['weekend'] = df.date.dt.dayofweek >= 5

    return df


def preprocess_users():
    """Load the user table and attach the NUTS3 region of each user's postcode.

    Single quotes are stripped from the ``NUTS3`` and ``CODE`` columns of the
    postcode lookup before users are inner-joined to it on their five-digit
    postcode.  The columns ``creation_timestamp``, ``CODE`` and
    ``zip_3digit`` are removed from the result.

    Returns:
        DataFrame of the users that have a matching postcode, with the
        lookup's remaining columns appended.
    """
    users = pd.read_feather('../data/01_raw/users.feather')
    plz = pd.read_csv('../data/00_external/pc2020_DE_NUTS-2021_v3.0.csv', sep=';')

    # The lookup stores its codes wrapped in single quotes.
    for quoted_column in ('NUTS3', 'CODE'):
        plz[quoted_column] = plz[quoted_column].str.replace('\'', '')

    located = pd.merge(users, plz, left_on='zip_5digit', right_on='CODE')
    return located.drop(columns=['creation_timestamp', 'CODE', 'zip_3digit'])


def load_wording():
    """Return the display text for every survey column.

    Keys are the survey column names ('q49', 'q50', 'q54', 'q55', 'q56' and
    'total_wellbeing'); values are the English sentences used as plot
    titles.

    An earlier version also read the raw response file and built a mapping
    from its question text, but that mapping was overwritten before being
    returned.  The unused read has been removed, so this function no longer
    touches the disk.

    Returns:
        dict mapping column name to English wording.
    """
    return {
        'q49': 'For the last four weeks I have been happy and in good spirits.',
        'q50': 'In the last four weeks I have felt calm and relaxed.',
        'q54': 'For the past four weeks, I have felt energetic and active.',
        'q55': 'For the past four weeks, I have felt fresh and rested when I wake up.',
        'q56': 'For the past four weeks, my daily life has been full of things that interest me.',
        'total_wellbeing': 'Average wellbeing',
    }



def compute(vital, min_periods=14, subset='', entries=None):
    """Return 28-day rolling means of the vitals per user and device.

    Placeholder rows from ``entries`` are appended to ``vital`` so that a
    rolling value is produced for each of their dates even when no vital
    was recorded on that date.  Where a real row exists for the same
    user, date and device, the real row is kept.

    Args:
        vital: DataFrame with ``userid``, ``date``, ``deviceid``, the vital
            columns 9, 43, 65, 52, 53 and a boolean ``weekend`` column.
        min_periods: Minimum number of non-missing observations inside a
            window for a mean to be reported; otherwise the result is NaN.
        subset: ``'weekend'`` or ``'weekday'`` restricts ``vital`` to those
            days; any other value uses all days.  The value is also appended
            to the output column names.
        entries: DataFrame with ``userid``, ``date`` and ``deviceid`` rows to
            pad with.  Defaults to the notebook-level ``dummy_entries``.

    Returns:
        DataFrame with ``userid``, ``deviceid``, ``date`` and the columns
        ``v9<subset>``, ``v43<subset>``, ``v65<subset>``, ``v52<subset>``,
        ``v53<subset>``.
    """
    if entries is None:
        # Backward-compatible fallback to the global built from the survey dates.
        entries = dummy_entries

    if subset == 'weekend':
        vital = vital[vital.weekend]
    elif subset == 'weekday':
        vital = vital[~vital.weekend]

    vitalids = [9, 43, 65, 52, 53]
    columns = {i: f'v{i}{subset}' for i in vitalids}

    # Real rows come first, so keep='first' prefers them over placeholders.
    padded = pd.concat([vital, entries]).reset_index(drop=True)
    padded = padded.drop_duplicates(subset=['userid', 'date', 'deviceid'], keep='first')

    windows = (
        padded.set_index('date')
        .sort_index()
        .groupby(['userid', 'deviceid'])
        .rolling('28D', min_periods=min_periods)
    )
    # Select with a list: indexing a grouped object with a bare tuple of
    # column keys is not supported by current pandas.
    g = windows[vitalids].mean()

    return g.rename(columns=columns).reset_index()


In [None]:
# Build the cleaned input tables used by every later cell.
survey = preprocess_survey_data()
vital = preprocess_vital_data()
users = preprocess_users()

# The plotting cells look up their panel titles in `wording`; without this
# assignment a fresh "Restart & Run All" fails with a NameError.
wording = load_wording()

In [None]:
# Placeholder rows: every (user, survey date) pair repeated once per device.
# `compute` appends them to the vitals so that a rolling mean is produced
# for each survey date.
entries = survey.loc[:, ['userid', 'date']]
devices = vital['deviceid'].unique()

# The survey keys are stacked once per device; np.repeat labels each stacked
# copy with its device id.
dummy_entries = pd.concat([entries for _ in devices]).assign(
    deviceid=np.repeat(devices, len(entries))
)

In [None]:
# Attach 28-day rolling vital means to every survey response: once over all
# days, once over weekend days only and once over weekdays only.
# NOTE(review): the min_periods values 14 / 4 / 10 presumably require about
# half of the days available in a 28-day window - confirm.
g = compute(vital, 14)
m = pd.merge(survey, g, on=['userid', 'date'])

g = compute(vital, 4, 'weekend')
m = pd.merge(m, g, on=['userid', 'date', 'deviceid'])

g = compute(vital, 10, 'weekday')
m = pd.merge(m, g, on=['userid', 'date', 'deviceid'])

# Remove entries where no vital data exists, i.e. all 15 rolling columns are
# missing.  The explicit copy keeps the column assignments below from writing
# to a slice of the merged frame.
rolling_columns = [
    f'v{vitalid}{subset}'
    for subset in ('', 'weekend', 'weekday')
    for vitalid in (9, 43, 65, 52, 53)
]
m = m[m[rolling_columns].notna().any(axis=1)].copy()

# Half the summed weekend-minus-weekday shift of sleep onset (52) and offset (53).
m['social_jetlag'] = 0.5 * (m['v53weekend'] + m['v52weekend'] - m['v53weekday'] - m['v52weekday'])
# Midpoint between sleep onset and offset.
m['midsleep'] = 0.5 * (m['v53'] + m['v52'])

# Weekend-minus-weekday difference of every vital.
for vitalid in (52, 53, 9, 65, 43):
    m[f'v{vitalid}difference'] = m[f'v{vitalid}weekend'] - m[f'v{vitalid}weekday']

In [None]:
# Collapse the merged table to one row per user: the mean of every remaining
# column over that user's rows.
m_per_user = (
    m.drop(columns='date')
    .groupby('userid')
    .mean()
    .reset_index()
)

In [None]:
# Axis label for every plottable vital column of `m`.
# WE-WD abbreviates "weekend minus weekday".
label = {
    # 28-day rolling means over all days
    'v9': 'Steps',
    'v65': 'Resting heart rate',
    'v43': 'Sleep duration',
    'v52': 'Sleep onset',
    'v53': 'Sleep offset',
    'midsleep': 'Midsleep',
    # weekend-minus-weekday contrasts
    'v43difference': 'Difference in Sleep Duration WE-WD',
    'v52difference': 'Difference in Sleep Onset  WE-WD',
    'v53difference': 'Difference in Sleep Offset WE-WD',
    'v9difference': 'Difference in steps WE-WD',
    'v65difference': 'Difference in RHR WE-WD',
    'social_jetlag': 'Social Jetlag',
}

# Key selected for ad-hoc plotting; the plotting loops rebind this name.
vital_key = 'v65difference'

def plot_survey_response_per_vitals(vital_key):
    """Plot the mean answer to each survey item against a binned vital.

    For each of the five questions and ``total_wellbeing`` the rows of ``m``
    are cut into 40 equal-width bins of ``vital_key``.  Bins holding more
    than 50 rows are drawn at their midpoint with an error bar of 1.96
    standard errors of the mean.  The 2x3 figure is saved to
    ``../output/survey_response_per_<vital_key>.jpg``.

    Args:
        vital_key: Column of ``m`` to bin; must also be a key of ``label``.
    """
    fig, axes = plt.subplots(2, 3, sharex=True, figsize=(7,5))
    question_keys = ['q49', 'q50', 'q54', 'q55', 'q56', 'total_wellbeing']

    # Contrast-type vitals get a dotted reference line at zero.
    draw_zero_line = (vital_key == 'social_jetlag') or ('difference' in vital_key)

    for question_key, ax in zip(question_keys, axes.flatten()):
        pairs = m[[vital_key, question_key]].dropna()

        # Bin the vital and represent every bin by its midpoint.
        pairs['bins'] = pd.cut(pairs[vital_key], 40).apply(
            lambda x: x.left + 0.5 * (x.right - x.left)
        )

        stats = pairs.groupby(['bins'])[question_key].agg(['mean', 'count', 'std'])
        stats['err'] = 1.96 * stats['std'] / np.sqrt(stats['count'])
        stats = stats[stats['count'] > 50]

        ax.errorbar(stats.index, stats['mean'], yerr=stats['err'], fmt='o', markersize=5)
        ax.set_title('\n'.join(wrap(wording[question_key], 30)), size=10)

        if draw_zero_line:
            ax.axvline(0, c='k', ls=':')

        hide_and_move_axis(ax)

    # Only the bottom row carries the shared x-axis label.
    for ax in axes[1]:
        ax.set_xlabel(label[vital_key])

    plt.tight_layout()
    plt.savefig(f'../output/survey_response_per_{vital_key}.jpg', dpi=400)


# One figure per labelled vital.
for vital_key in label:
    plot_survey_response_per_vitals(vital_key)

In [None]:
# Attach the user attributes to every merged survey/vital row.
m_demog = m.merge(users, left_on='userid', right_on='user_id')

# NOTE(review): presumably `birth_date` is a (binned) birth year and 2.5 is a
# half-bin offset - confirm the sign of the offset and the 2020 reference year.
m_demog['age'] = 2020 - m_demog['birth_date'] + 2.5

In [None]:
def plot_survey_response_per_vitals_gender(vital_key):
    """Plot mean survey answers against a binned vital, split by salutation.

    For every question panel, the rows of ``m_demog`` with salutation 'M'
    and 'F' are binned separately into 50 equal-width bins of ``vital_key``.
    Bins holding more than 50 rows are drawn at their midpoint with an error
    bar of 1.96 standard errors of the mean.  The figure is saved to
    ``../output/gender_survey_response_per_<vital_key>.jpg``.

    Args:
        vital_key: Column of ``m_demog`` to bin; must also be a key of ``label``.
    """
    fig, axes = plt.subplots(2, 3, sharex=True, figsize=(7,5))
    question_keys = ['q49', 'q50', 'q54', 'q55', 'q56', 'total_wellbeing']

    # Contrast-type vitals get a dotted reference line at zero.
    draw_zero_line = (vital_key == 'social_jetlag') or ('difference' in vital_key)

    for question_key, ax in zip(question_keys, axes.flatten()):
        for gender in ('M', 'F'):
            is_gender = m_demog.salutation == gender
            pairs = m_demog.loc[is_gender, [vital_key, question_key]].dropna()

            # Bin the vital and represent every bin by its midpoint.
            pairs['bins'] = pd.cut(pairs[vital_key], 50).apply(
                lambda x: x.left + 0.5 * (x.right - x.left)
            )

            stats = pairs.groupby(['bins'])[question_key].agg(['mean', 'count', 'std'])
            stats['err'] = 1.96 * stats['std'] / np.sqrt(stats['count'])
            stats = stats[stats['count'] > 50]

            ax.errorbar(stats.index, stats['mean'], yerr=stats['err'], fmt='o', markersize=5, label=gender)

        ax.set_title('\n'.join(wrap(wording[question_key], 30)), size=10)

        if draw_zero_line:
            ax.axvline(0, c='k', ls=':')

        hide_and_move_axis(ax)

    # `ax` is still the last panel here, so only that panel gets the legend.
    ax.legend()
    for ax in axes[1]:
        ax.set_xlabel(label[vital_key])

    plt.tight_layout()
    plt.savefig(f'../output/gender_survey_response_per_{vital_key}.jpg', dpi=400)


# One figure per labelled vital.
for vital_key in label:
    plot_survey_response_per_vitals_gender(vital_key)

In [None]:
def plot_survey_response_per_vitals_age(vital_key):
    """Plot mean survey answers against a binned vital for selected age ranges.

    Currently a single range is drawn: rows of ``m_demog`` with
    ``30 < age <= 65``.  Within a range the rows are cut into 50 equal-width
    bins of ``vital_key``.  Bins holding more than 50 rows are drawn at their
    midpoint with an error bar of 1.96 standard errors of the mean.  The
    figure is saved to ``../output/age_survey_response_per_<vital_key>.jpg``.

    Args:
        vital_key: Column of ``m_demog`` to bin; must also be a key of ``label``.
    """
    fig, axes = plt.subplots(2, 3, sharex=True, figsize=(7,5))
    question_keys = ['q49', 'q50', 'q54', 'q55', 'q56', 'total_wellbeing']
    age_ranges = ((30, 65),)

    # Contrast-type vitals get a dotted reference line at zero.
    draw_zero_line = (vital_key == 'social_jetlag') or ('difference' in vital_key)

    for question_key, ax in zip(question_keys, axes.flatten()):
        for xmin, xmax in age_ranges:
            in_range = (m_demog.age > xmin) & (m_demog.age <= xmax)
            pairs = m_demog.loc[in_range, [vital_key, question_key]].dropna()

            # Bin the vital and represent every bin by its midpoint.
            pairs['bins'] = pd.cut(pairs[vital_key], 50).apply(
                lambda x: x.left + 0.5 * (x.right - x.left)
            )

            stats = pairs.groupby(['bins'])[question_key].agg(['mean', 'count', 'std'])
            stats['err'] = 1.96 * stats['std'] / np.sqrt(stats['count'])
            stats = stats[stats['count'] > 50]

            ax.errorbar(stats.index, stats['mean'], yerr=stats['err'], fmt='o', markersize=5, label=f'{xmin} - {xmax}')

        ax.set_title('\n'.join(wrap(wording[question_key], 30)), size=10)

        if draw_zero_line:
            ax.axvline(0, c='k', ls=':')

        hide_and_move_axis(ax)

    # `ax` is still the last panel here, so only that panel gets the legend.
    ax.legend()
    for ax in axes[1]:
        ax.set_xlabel(label[vital_key])

    plt.tight_layout()
    plt.savefig(f'../output/age_survey_response_per_{vital_key}.jpg', dpi=400)


# One figure per labelled vital.
for vital_key in label:
    plot_survey_response_per_vitals_age(vital_key)

In [None]:
# sandbox

In [None]:
import matplotlib

# Choropleth maps of the mean survey answers per region.
shapes = gpd.read_file('../data/00_external/nuts3de.json')

# One row per user, then attach the user's NUTS3 region.
m_spatial = m.groupby('userid').mean().reset_index()
m_spatial = pd.merge(m_spatial, users, left_on='userid', right_on='user_id')
m_spatial.rename(columns={'NUTS3': 'NUTS'}, inplace=True)
m_spatial['federal_states'] = m_spatial.NUTS.str[:-2]

# Coarsen both tables by dropping the last character of the region code
# (presumably NUTS3 -> NUTS2; confirm against the code scheme of the files).
m_spatial.NUTS = m_spatial.NUTS.str[:-1]
shapes['NUTS'] = shapes.id.str[:-1]

g = m_spatial.groupby('NUTS')[['q49', 'q50', 'q54', 'q55', 'q56', 'total_wellbeing']].mean()

NUTS = shapes[['NUTS', 'geometry']].dissolve(by='NUTS')

# Outlines drawn on top of every map.  `NUTS1` was used below without ever
# being defined; it is built here with the same two-character truncation that
# defines `federal_states` above.
NUTS1 = shapes[['geometry']].assign(NUTS1=shapes.id.str[:-2]).dissolve(by='NUTS1')

g = pd.merge(g.reset_index(), NUTS, left_on='NUTS', right_on='NUTS')
g = gpd.GeoDataFrame(g)

f, axarr = plt.subplots(2, 3, figsize=(12, 9))

# Four discrete shades of purple.
cmap = matplotlib.colormaps['Purples']
cmap = cmap.resampled(4)

for question, ax in zip(['q49', 'q50', 'q54', 'q55', 'q56', 'total_wellbeing'], axarr.flatten()):
    # Clip the colour scale to the 10th-90th percentile of the regional means.
    c = g.plot(question, cmap=cmap, legend=True, ax=ax, vmin=g[question].quantile(.1), vmax=g[question].quantile(.9))

    NUTS1.plot(color='None', ax=ax, lw=1)
    ax.set_title('\n'.join(wrap(wording[question], 30)), size=10)

plt.tight_layout()