In [None]:
# TODO: 
# - Deal with Apple users!
# - Deal with multiple devices per user!

In [None]:
import datetime
from pathlib import Path
from textwrap import wrap

import matplotlib
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from datenspende_who5 import styling, utils

In [None]:
# Axis labels, keyed by vital id.
LABEL = {65: 'RHR', 9: 'Activity', 43: 'Sleep duration', 52: 'Sleep onset', 53: 'Sleep offset'}

# Question ids and vital ids shown in the overview grids, in plotting order.
QUESTION_IDS = [49, 50, 54, 55, 56]
VITAL_IDS = [65, 9, 43, 52, 53]

# One output folder per run. The timestamp is formatted explicitly because
# str(datetime.now()) contains a space and colons, and ':' is not a valid
# character in Windows paths. The trailing '/' is required: the plotting
# functions build file names with `OUTPUT_FOLDER + outfile`.
OUTPUT_FOLDER = '../output/{0}/'.format(datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
Path(OUTPUT_FOLDER).mkdir(parents=True, exist_ok=True)

# Utils

In [None]:
def get_title(question, n_wrap):
    """Return the description of a survey question, wrapped for use as a plot label.

    Parameters
    ----------
    question
        Value matched against the ``question`` column of the global ``DATA`` frame.
    n_wrap : int
        Maximum line width handed to ``textwrap.wrap``.

    Returns
    -------
    str
        The ``description`` of the first matching row, with the wrapped
        lines joined by newlines.

    Raises
    ------
    IndexError
        If no row of ``DATA`` matches ``question``.
    """
    matching_rows = DATA[DATA.question == question]
    description = matching_rows.description.values[0]
    wrapped_lines = wrap(description, n_wrap)
    return '\n'.join(wrapped_lines)


def add_age(df, groups={'<40': (0, 40), '40-65': (40, 65), '65+': (65, 100)}, reference_year=2022):
    """Add ``age`` and ``age_group`` columns to ``df`` in place and drop ungrouped rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``birth_date`` column. It is modified in place.
    groups : dict
        Maps a group label to a half-open age interval ``(vmin, vmax)``; a row
        gets the label when ``vmin <= age < vmax``. If intervals overlap, the
        label that comes later in the dict wins. The default dict is only read,
        never modified.
    reference_year : int
        Year the age is computed against (default 2022, the previously
        hard-coded value).

    Returns
    -------
    None
        Rows whose age falls into none of the intervals are removed from ``df``.
    """
    # The 2.5-year shift presumably moves a 5-year birth-year bin to its
    # centre - TODO confirm against how `birth_date` is encoded upstream.
    df['age'] = reference_year - df.birth_date + 2.5

    # Start from an explicit all-missing column so that the column always exists
    # (even for an empty `groups` dict or an empty frame) and labels left over
    # from an earlier call with different groups are not kept.
    df['age_group'] = None

    for label, (vmin, vmax) in groups.items():
        mask = (df.age >= vmin) & (df.age < vmax)
        df.loc[mask, 'age_group'] = label

    # A list is accepted by every pandas version; a bare string only by newer ones.
    df.dropna(subset=['age_group'], inplace=True)

# Load & Preprocess Data

In [None]:
DATA = pd.read_feather('../data/03_derived/full_data_binned.feather')

# Rebin data. The dict is keyed by vital id; the call is made for its effect on
# DATA (its return value is not used).
utils.bin_data(DATA, intervals={52: 0.25, 53: 0.25, 65: 2})

# Set intervals for values that contain all days (weekends + weekdays).
# Every range is a (min, max) tuple. The entry for vital 43 used to be the set
# {240, 540}, which has no defined order and cannot be indexed.
utils.remove_implausible(DATA, ranges={9: (0, 15000), 65: (45, 80), 52: (-3, 2), 43: (240, 540), 53: (4, 9)}, columns=['value_binned'])

# Add user ages. add_age works in place and drops rows that fall into no age group.
add_age(DATA)

# Average vital data for each survey response

In [None]:
def plot_average_vitals_per_survey_response(df, outfile='1.02a-average_vital_per_survey_response.jpg'):
    """Plot the average vital value for every answer choice and save the figure.

    The figure is a 5x5 grid with one row per entry of ``VITAL_IDS`` and one
    column per entry of ``QUESTION_IDS``. Each panel shows the ``mean`` column
    of ``utils.average`` with its ``err`` column as error bars, against the
    answer choice.

    Parameters
    ----------
    df : pandas.DataFrame
        Input handed to ``utils.average`` (value column ``value``, grouped by
        ``question``, ``vitalid`` and ``choice_id``).
    outfile : str
        File name appended to ``OUTPUT_FOLDER``.
    """
    fig, axes = plt.subplots(5, 5, figsize=(10, 7), sharey='row', sharex=True)

    averaged = utils.average(df, value='value', by=['question', 'vitalid', 'choice_id'])

    # Questions run along the columns, vitals along the rows.
    for column, question in enumerate(QUESTION_IDS):
        for row, vital in enumerate(VITAL_IDS):
            stats = averaged.loc[question, vital]
            axes[row, column].errorbar(stats.index, stats['mean'], yerr=stats['err'], fmt='o', markersize=4)

        axes[0, column].set_title(get_title(question, 30), size=8)

    # Only the left-most panel of each row carries the vital's name.
    for row, vital in enumerate(VITAL_IDS):
        axes[row, 0].set_ylabel(LABEL[vital])

    for axis in axes.flatten():
        styling.hide_and_move_axis(axis)
        axis.set_xticks([1, 2, 3, 4, 5])

    plt.tight_layout()
    plt.savefig(OUTPUT_FOLDER + outfile, dpi=400)


In [None]:
# Draw and save the vital-per-answer overview grid for the preprocessed data (default file name).
plot_average_vitals_per_survey_response(DATA)

# Average survey response for given vital data

In [None]:
def plot_average_survey_response_per_vital(df, outfile='1.02b-average_survey_response_per_vital.jpg'):
    """Plot the average answer for every binned vital value and save the figure.

    The figure is a 5x5 grid with one row per entry of ``QUESTION_IDS`` and one
    column per entry of ``VITAL_IDS``. Each panel shows the ``mean`` column of
    ``utils.average`` with its ``err`` column as error bars, against the binned
    vital value.

    Parameters
    ----------
    df : pandas.DataFrame
        Input handed to ``utils.average`` (value column ``choice_id``, grouped
        by ``question``, ``vitalid`` and ``value_binned``).
    outfile : str
        File name appended to ``OUTPUT_FOLDER``.
    """
    fig, axes = plt.subplots(5, 5, figsize=(10, 7), sharex='col')

    averaged = utils.average(df, value='choice_id', by=['question', 'vitalid', 'value_binned'])

    for row, question in enumerate(QUESTION_IDS):
        for column, vital in enumerate(VITAL_IDS):
            stats = averaged.loc[question, vital]
            axes[row, column].errorbar(stats.index, stats['mean'], yerr=stats['err'], fmt='o', markersize=4)

        # The wrapped question text labels the left-most panel of the row.
        axes[row, 0].set_ylabel(get_title(question, 22), size=8)

    # Only the bottom panel of each column carries the vital's name.
    for column, vital in enumerate(VITAL_IDS):
        axes[-1, column].set_xlabel(LABEL[vital])

    for axis in axes.flatten():
        styling.hide_and_move_axis(axis)

    plt.tight_layout()
    plt.savefig(OUTPUT_FOLDER + outfile, dpi=400)

In [None]:
# Draw and save the answer-per-vital overview grid for the preprocessed data (default file name).
plot_average_survey_response_per_vital(DATA)

# Total wellbeing

In [None]:
def plot_total_wellbeing(df, vitals=(65, 9), outfile='1.02c-average_wellbeing.jpg'):
    """Plot the average answer to question 60 against each requested vital and save it.

    Parameters
    ----------
    df : pandas.DataFrame
        Input handed to ``utils.average`` (value column ``choice_id``, grouped
        by ``question``, ``vitalid`` and ``value_binned``).
    vitals : sequence of int
        Vital ids, one panel each; every id must be a key of ``LABEL``.
        A single vital is supported.
    outfile : str
        File name appended to ``OUTPUT_FOLDER``.
    """
    # squeeze=False always yields a 2D array of axes. With the default, a single
    # vital returns a bare Axes object and the indexing below fails.
    f, axarr = plt.subplots(1, len(vitals), figsize=(2 + 2 * len(vitals), 2.8), sharey=True, squeeze=False)
    axarr = axarr[0]

    all_users = utils.average(df, value='choice_id', by=['question', 'vitalid', 'value_binned'])

    for i, vital in enumerate(vitals):
        # Question 60 is plotted as "Average Wellbeing"
        # (presumably the total wellbeing score - confirm).
        plot = all_users.loc[60, vital]
        # Error bars are 1.96 * err (a 95% interval if `err` is a standard error).
        x, y, err = plot.index, plot['mean'], plot.err * 1.96
        axarr[i].errorbar(x, y, yerr=err, fmt='o', markersize=5)
        axarr[i].set_xlabel(LABEL[vital])

    axarr[0].set_ylabel('Average Wellbeing')

    for ax in axarr:
        styling.hide_and_move_axis(ax)

    plt.tight_layout()
    plt.savefig(OUTPUT_FOLDER+outfile, dpi=400)

In [None]:
# Two figures, one file each: wellbeing against vitals 65 and 9, then against the three sleep vitals (43, 52, 53).
plot_total_wellbeing(DATA, vitals=(65, 9), outfile='1.02c-average_wellbeing_rhr_steps.jpg')
plot_total_wellbeing(DATA, vitals=(43, 52, 53), outfile='1.02c-average_wellbeing_sleep.jpg')

# Discriminate by gender

In [None]:
def plot_total_wellbeing_per_gender(df, vitals=(65, 9), outfile='1.02d-average_wellbeing_by_gender.jpg'):
    """Plot the average answer to question 60 per vital, for all users and split by salutation.

    Parameters
    ----------
    df : pandas.DataFrame
        Input handed to ``utils.average``; must provide a ``salutation`` column
        with the values ``'M'`` and ``'F'`` in addition to ``question``,
        ``vitalid``, ``value_binned`` and ``choice_id``.
    vitals : sequence of int
        Vital ids, one panel each; every id must be a key of ``LABEL``.
        A single vital is supported.
    outfile : str
        File name appended to ``OUTPUT_FOLDER``.
    """
    # squeeze=False always yields a 2D array of axes, so one vital works too.
    f, axarr = plt.subplots(1, len(vitals), figsize=(2 + 2 * len(vitals), 2.8), sharey=True, squeeze=False)
    axarr = axarr[0]
    label = {'M': 'Male', 'F': 'Female'}

    all_users = utils.average(df, value='choice_id', by=['question', 'vitalid', 'value_binned'])
    per_gender = utils.average(df, value='choice_id', by=['question', 'vitalid', 'value_binned', 'salutation'])

    for i, vital in enumerate(vitals):
        # Error bars are 1.96 * err (a 95% interval if `err` is a standard error).
        plot = all_users.loc[60, vital]
        x, y, err = plot.index, plot['mean'], plot.err * 1.96
        axarr[i].errorbar(x, y, yerr=err, fmt='o', markersize=5, label='All users')

        for salutation in ('M', 'F'):
            plot = per_gender.loc[60, vital, :, salutation]
            x, y, err = plot.index, plot['mean'], plot.err * 1.96
            axarr[i].errorbar(x, y, yerr=err, fmt='o', markersize=5, label=label[salutation])

        axarr[i].set_xlabel(LABEL[vital])

    axarr[0].set_ylabel('Average Wellbeing')
    # The legend stays on the second panel as before, and falls back to the
    # only panel when a single vital is plotted.
    axarr[min(1, len(axarr) - 1)].legend()

    for ax in axarr:
        styling.hide_and_move_axis(ax)

    plt.tight_layout()
    plt.savefig(OUTPUT_FOLDER+outfile, dpi=400)

In [None]:
# Four panels (vitals 65, 9, 52, 53); saved under the function's default file name.
plot_total_wellbeing_per_gender(DATA, vitals=(65, 9, 52, 53))

# Discriminate by age

In [None]:
def plot_total_wellbeing_per_age(df, vitals=(65, 9), outfile='1.02e-average_wellbeing_by_age.jpg'):
    """Plot the average answer to question 60 per vital, for all users and split by age group.

    Parameters
    ----------
    df : pandas.DataFrame
        Input handed to ``utils.average``; must provide an ``age_group`` column
        in addition to ``question``, ``vitalid``, ``value_binned`` and
        ``choice_id``.
    vitals : sequence of int
        Vital ids, one panel each; every id must be a key of ``LABEL``.
        A single vital is supported.
    outfile : str
        File name appended to ``OUTPUT_FOLDER``.
    """
    # squeeze=False always yields a 2D array of axes, so one vital works too.
    f, axarr = plt.subplots(1, len(vitals), figsize=(2 + 2 * len(vitals), 2.8), sharey=True, squeeze=False)
    axarr = axarr[0]

    all_users = utils.average(df, value='choice_id', by=['question', 'vitalid', 'value_binned'])
    per_group = utils.average(df, value='choice_id', by=['question', 'vitalid', 'value_binned', 'age_group'])

    # Groups are plotted in order of first appearance in `df`. Missing labels
    # are skipped because they cannot be looked up in the grouped result.
    age_groups = df.age_group.dropna().unique()

    for i, vital in enumerate(vitals):
        # Error bars are 1.96 * err (a 95% interval if `err` is a standard error).
        plot = all_users.loc[60, vital]
        x, y, err = plot.index, plot['mean'], plot.err * 1.96
        axarr[i].errorbar(x, y, yerr=err, fmt='o', markersize=5, label='All users')

        for age_group in age_groups:
            plot = per_group.loc[60, vital, :, age_group]
            x, y, err = plot.index, plot['mean'], plot.err * 1.96
            axarr[i].errorbar(x, y, yerr=err, fmt='o', markersize=5, label=age_group)

        axarr[i].set_xlabel(LABEL[vital])

    axarr[0].set_ylabel('Average Wellbeing')
    axarr[-1].legend()

    for ax in axarr:
        styling.hide_and_move_axis(ax)

    plt.tight_layout()

    plt.savefig(OUTPUT_FOLDER+outfile, dpi=400)

In [None]:
# Four panels (vitals 65, 9, 52, 53); saved under the function's default file name.
plot_total_wellbeing_per_age(DATA, vitals=(65, 9, 52, 53))

# Sandbox



In [None]:
# Deliberate stop: sys.exit raises SystemExit, so a top-to-bottom run presumably
# halts at this cell and the sandbox cells below are only run by hand.
# NOTE(review): several cells below use names that are not defined anywhere
# in the visible notebook (e.g. `data`).
import sys
sys.exit(0)

## Some statistics

In [None]:
# User table; the next cell reads its `salutation` and `birth_date` columns.
users = pd.read_feather('../data/02_processed/users.feather')

In [None]:
# Birth-year distribution for salutation 'M' and 'F', drawn as side-by-side bars.
f, ax = plt.subplots(figsize=(6, 4))

# Bin edges are 5 years wide (1927.5, 1932.5, ...), so the bin centres fall on
# 1930, 1935, ... The centres serve as bar positions and as tick labels.
BINS = np.arange(1927.5, 2010, 5)
centers = .5 * (BINS[:-1] + BINS[1:])

# Each group is normalised on its own (density=True) and shifted by one year to
# either side of the bin centre so that the two bars of width 2 do not overlap.
for salutation, shift in (('M', -1), ('F', 1)):
    birth_years = users.loc[users.salutation == salutation].birth_date
    density, _ = np.histogram(birth_years, bins=BINS, density=True)
    ax.bar(centers + shift, density, width=2, label=salutation)

ax.legend()

styling.hide_and_move_axis(ax)
ax.set_xticks(centers)
ax.set_xticklabels(centers.astype(int), rotation=45, ha='right')
ax.set_xlabel('Year of Birth')
ax.set_ylabel('Relative Frequency')

# NOTE(review): written next to, not into, the per-run OUTPUT_FOLDER - confirm this is intended.
plt.savefig('../output/age_distribution.jpg', dpi=400)

# Sleep 

In [None]:
def plot_total_wellbeing_sleep(df):
    """Plot the average answer to question 60 against vitals 52 and 53.

    The styling and layout steps used to sit at module level below the
    function, where the function-local ``axarr`` is not defined; they now run
    inside the function.

    Parameters
    ----------
    df : pandas.DataFrame
        Input handed to ``utils.average`` (value column ``choice_id``, grouped
        by ``question``, ``vitalid`` and ``value_binned``).
    """
    f, axarr = plt.subplots(1, 2, figsize=(6, 2.8), sharey=True)

    df = utils.average(df, value='choice_id', by=['question', 'vitalid', 'value_binned'])

    for i, vital in enumerate((52, 53)):
        plot = df.loc[60, vital]
        # Error bars are 1.96 * err (a 95% interval if `err` is a standard error).
        x, y, err = plot.index, plot['mean'], 1.96 * plot.err
        axarr[i].errorbar(x, y, yerr=err, fmt='o', markersize=5)
        # Label from the shared LABEL table; the disabled draft labelled these
        # sleep panels as RHR and activity.
        axarr[i].set_xlabel(LABEL[vital])

    axarr[0].set_ylabel('Average Wellbeing')

    for ax in axarr:
        styling.hide_and_move_axis(ax)

    plt.tight_layout()

    # Saving stays disabled, as before. NOTE(review): the draft file name below
    # would clash with the gender figure - pick a sleep-specific name before enabling.
    #plt.savefig('../output/average_wellbeing_by_gender.jpg', dpi=400)

In [None]:
# Scatter of the weekday bin against the weekend bin for vital 52 (labelled 'Sleep onset' in LABEL), question 60.
# NOTE(review): `data` is not defined anywhere in the visible notebook (the loaded frame is `DATA`),
# and the columns `value_weekday_binned` / `value_weekend_binned` are not used elsewhere - confirm they exist.
_df = data[(data.vitalid == 52) & (data.question == 60)]
plt.scatter(_df['value_weekday_binned'], _df['value_weekend_binned'], alpha=0.005)

In [None]:
# Rows for question 60, split into vital 52 (onset) and vital 53 (offset) and
# reduced to the columns the merge in the next cell needs.
# NOTE(review): this cell used to read an undefined `data`; `DATA` is the only
# frame this notebook loads - confirm it has `user_id` and `date` columns.
_wellbeing = DATA[DATA.question == 60]
_columns = ['user_id', 'date', 'choice_id', 'value_binned']

onset = _wellbeing[_wellbeing.vitalid == 52][_columns]
offset = _wellbeing[_wellbeing.vitalid == 53][_columns]

In [None]:
# Pair the onset and offset rows of the same user, date and answer. After the merge the
# onset bin is `value_binned_x` (left frame) and the offset bin is `value_binned_y` (right frame).
df = pd.merge(onset, offset, on=['user_id', 'date', 'choice_id'])
# Mean, standard deviation and number of answers per (onset bin, offset bin) pair.
df = df.groupby(['value_binned_x', 'value_binned_y']).choice_id.agg(['mean', 'std', 'count'])

In [None]:
# Grid of onset / offset values in steps of 0.2, rounded to one decimal
# (presumably so the values compare equal to the bin labels in the index - confirm).
# NOTE(review): the preprocessing cell rebins vitals 52 and 53 of `DATA` with an
# interval of 0.25; check that this 0.2 grid matches the bins of the frame actually used.
onset_values = np.round(np.arange(-3, 2.2, 0.2), 1)
offset_values = np.round(np.arange(4.6, 10, 0.2), 1)

In [None]:
# Mean answer per (onset, offset) grid cell. A cell stays NaN when the pair is
# missing from the grouped table or is backed by 100 answers or fewer.
results = np.full((len(onset_values), len(offset_values)), np.nan)

for i, x in enumerate(onset_values):
    for j, y in enumerate(offset_values):
        if (x, y) not in df.index:
            continue
        cell = df.loc[x, y]
        if cell['count'] > 100:
            results[i, j] = cell['mean']

In [None]:
# Heat map of the grid: `results` is transposed so its first axis (onset) runs along x
# and its second axis (offset) along y; the colour scale is clipped to 3.1-3.3.
f, ax = plt.subplots()
ax.imshow(results.T, origin='lower', extent=[-3, 2.2, 4.6, 10], vmin=3.1, vmax=3.3)

In [None]:
# Filled contours of the same grid as the heat map. contourf needs at least the
# height values (calling it without arguments raises a TypeError). Z must have
# shape (len(y), len(x)), hence the transpose of `results`.
# NOTE(review): the arguments are inferred from the imshow cell - confirm this is the intended plot.
plt.contourf(onset_values, offset_values, results.T)

In [None]:
# Mean over the second axis of `results` (the offset axis), ignoring NaN cells; plotted against the row index.
plt.plot(np.nanmean(results, axis=1))

In [None]:
# Distribution of all grid-cell values in `results` (which contains NaN for unfilled cells).
plt.hist(results.flatten())

# Unsorted

In [None]:
# Average answer per question, vital and vital bin. This is the same table that
# plot_average_survey_response_per_vital builds internally; the cells below read
# its `mean` and `err` columns.
# This cell used to call `average_survey_response_per_vital(data)`, but neither
# that function nor `data` is defined in this notebook.
df = utils.average(DATA, value='choice_id', by=['question', 'vitalid', 'value_binned'])

In [None]:
# NOTE(review): this cell used to read an undefined `data`; `DATA` is the only
# frame this notebook loads - confirm it has `user_id` and `date` columns.
rhr = DATA[DATA.vitalid == 65]
steps = DATA[DATA.vitalid == 9]

# Pair the RHR and step rows of the same answer. After the merge the RHR
# columns carry the suffix `_x` and the step columns the suffix `_y`.
combined = pd.merge(rhr, steps, on=['user_id', 'date', 'question', 'choice_id', 'description'])

# Mean, standard deviation and number of answers per (RHR bin, step bin) pair.
cb = combined.groupby(['value_binned_x', 'value_binned_y']).choice_id.agg(['mean', 'std', 'count'])

# NOTE(review): the RHR grid uses steps of 1 while the preprocessing cell rebins
# vital 65 with an interval of 2 - check that the grid matches the bins.
x_values = np.arange(45, 80, 1)
y_values = np.arange(1000, 15001, 1000)

# Mean answer per grid cell. A cell stays NaN when the pair is missing from
# the grouped table or is backed by 200 answers or fewer.
results = np.full((len(x_values), len(y_values)), np.nan)

for i, x in enumerate(x_values):
    for j, y in enumerate(y_values):
        if (x, y) in cb.index and cb.loc[x, y]['count'] > 200:
            results[i, j] = cb.loc[x, y]['mean']

In [None]:
# Three panels: wellbeing against RHR, wellbeing against activity, and the
# RHR x activity heat map built from `results`.
# Requires `import matplotlib` (for `matplotlib.colormaps`) in the import cell.
f, axarr = plt.subplots(1, 3, figsize=(9, 2.8))

ax0, ax1, ax2 = axarr

# Every second row of the question-60 / vital-65 table, up to label 80.
plot = df.loc[60, 65].loc[:80:2]
ax0.errorbar(plot.index, plot['mean'], yerr=1.96 * plot.err, fmt='o', markersize=5, c='purple')

# Question-60 / vital-9 table up to label 15000.
plot = df.loc[60, 9].loc[:15000]
ax1.errorbar(plot.index, plot['mean'], yerr=1.96 * plot.err, fmt='o', markersize=5, c='purple')

ax0.set_xlabel('RHR [bpm]')
ax1.set_xlabel('Activity [Steps]')

for line_ax in (ax0, ax1):
    line_ax.set_ylabel('Average Wellbeing')
    line_ax.set_ylim(2.9, 3.5)

# Five discrete shades of purple; NaN cells of `results` are drawn in light grey.
cmap = matplotlib.colormaps['Purples']
cmap = cmap.resampled(5)
cmap.set_bad('0.9')

c = ax2.imshow(results.T, origin='lower', cmap=cmap, vmin=3, vmax=3.5, aspect=0.0027, extent=[45, 80, 1000, 15000])
ax2.set_xlabel('RHR [bpm]')
ax2.set_ylabel('Activity [steps]')

for panel in axarr:
    styling.hide_and_move_axis(panel)

# Lay out the three panels before the colorbar axes exists: tight_layout cannot
# handle manually placed axes and would warn about it.
plt.tight_layout()

# The colorbar gets its own axes just right of the figure area (x = 1 in figure
# coordinates). bbox_inches='tight' below keeps it inside the saved image.
cax = f.add_axes([1, 0.225, 0.01, 0.6])
plt.colorbar(c, extend='both', cax=cax, label='Average Wellbeing')

# NOTE(review): written next to, not into, the per-run OUTPUT_FOLDER - confirm this is intended.
plt.savefig('../output/1.02c-average_wellbeing.jpg', dpi=400, bbox_inches='tight')

## Heart rate over steps

In [None]:
# Ratio of the RHR bin (`value_binned_x`) to the step value (`value_y`) of each merged row.
# NOTE(review): the numerator is binned but the denominator is `value_y`, not `value_binned_y` -
# confirm this is intended. Rows with zero steps would produce an infinite ratio.
combined['rhr_over_steps'] = combined.value_binned_x / combined.value_y 
#combined.rhr_over_steps = np.round(combined.rhr_over_steps, 3) 
# Bin the ratio on a log scale: round log(ratio) to one decimal, then map back with exp.
combined.rhr_over_steps = np.exp(np.round(np.log(combined.rhr_over_steps), 1))

In [None]:
# Mean answer per ratio bin; `err` is the standard error of the mean (std / sqrt(count)).
cb = combined.groupby('rhr_over_steps').choice_id.agg(['mean', 'std', 'count'])
cb['err'] = cb['std'] / np.sqrt(cb['count'])

f, ax = plt.subplots(figsize = (5, 4))

# Error bars are 1.96 standard errors (95% interval).
ax.errorbar(cb.index, cb['mean'], yerr=1.96 * cb['err'], fmt='o')

ax.set_xlabel('RHR / Activity [bpm / steps]')
ax.set_ylabel('Average Wellbeing')
# Logarithmic x axis, matching the logarithmic binning of the ratio.
ax.semilogx()
styling.hide_and_move_axis(ax)

## Vital data per each choice

In [None]:
# Overview of the distinct (description, question) pairs.
# This cell used to read an undefined `data`; `DATA` is the frame this notebook loads.
DATA[['description', 'question']].drop_duplicates()

In [None]:
# One 2D histogram of vital 52 against vital 53 per answer choice (1-5) of question 60.
# NOTE(review): this cell used to read an undefined `data`; `DATA` is the only
# frame this notebook loads - confirm it has `user_id` and `date` columns.
f, axarr = plt.subplots(1, 5, figsize=(20, 5), sharex=True, sharey=True)

for i in range(1, 6):

    answers = DATA[(DATA.question == 60) & (DATA.choice_id == i)]
    # Vitals 52 and 53 are labelled 'Sleep onset' and 'Sleep offset' in LABEL;
    # the previous variable names (`rhr`, `steps`) were left over from another cell.
    onset_rows = answers[answers.vitalid == 52]
    offset_rows = answers[answers.vitalid == 53]

    paired = pd.merge(onset_rows, offset_rows, on=['user_id', 'date'])
    axarr[i-1].hist2d(paired['value_x'], paired['value_y'], density=True, bins=20)
    # A title per panel replaces `label=i` plus `plt.legend()`, which does not
    # produce a usable legend for a 2D histogram.
    axarr[i-1].set_title('Choice {0}'.format(i))
    axarr[i-1].set_xlabel(LABEL[52])

axarr[0].set_ylabel(LABEL[53])