In [None]:
import os
import gzip
import random
import logging

import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as ss
import statsmodels.formula.api as smf

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm, trange

from IPython.display import display

In [None]:
# Emit timestamp, severity and message for every record at INFO level and above.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

# Logger for this notebook's own messages.
logger = logging.getLogger(__name__)

In [None]:
# Change into the project directory so that the relative `data/...` paths used
# by the loading cells below resolve. NOTE(review): the location is hard-coded
# to a checkout under the user's home directory.
os.chdir(os.path.expanduser('~/github/masthesis/'))

In [None]:
# Single fixed seed shared by every random number source in this notebook.
seed = 2969591811

# Seed the legacy NumPy global generator and the stdlib generator.
np.random.seed(seed)
random.seed(seed)

In [None]:
# Ten-colour 'bright' seaborn palette; each data source gets a fixed entry so
# that colours stay consistent across all figures.
colors = sns.color_palette('bright', 10)

elite_color, radio_color, decahose_color, decahose2_color = (
    colors[i] for i in (3, 0, 9, 4)
)

# Load data

In [None]:
# Per-story CDF values; later cells index the first axis with boolean masks
# built from `stats` and treat the second axis as the time grid.
cdf_vals = np.load('data/paper-round-3/event-annotated/auto-story-cdfs.npy')

In [None]:
# Per-story summary statistics, indexed by story id.
stats = pd.read_csv('data/paper-round-3/event-annotated/auto-story-stats.csv', index_col='story_id')

assert stats.index.is_unique, 'story_id must be unique in auto-story-stats.csv'

# Later cells index `cdf_vals` positionally with boolean masks derived from
# `stats`, so both must have one row per story. Only the row count can be
# checked here; identical row ORDER is assumed -- TODO confirm against the
# notebook that wrote both files.
assert cdf_vals.shape[0] == stats.shape[0], (
    f'cdf_vals has {cdf_vals.shape[0]} rows but stats has {stats.shape[0]}'
)

# Within each (year, kind) every row must carry a distinct `group` value.
rows_per_cell = stats.groupby(['year', 'kind']).size()
groups_per_cell = stats.groupby(['year', 'kind'])['group'].nunique()
assert (rows_per_cell == groups_per_cell).all(), \
    '`group` is not unique within some (year, kind) combination'

In [None]:
# Per-story annotations used to flag Covid-related decahose stories.
selected = pd.read_csv('data/paper-round-3/event-annotated/auto-sample-communities-filter-list.csv', index_col='story_id')
assert selected.index.is_unique

# A story counts as a "2020 Covid decahose" story when it is a 2020 decahose
# story whose `covid` value reaches the threshold.
covid_threshold = 0.1
selected['decahose_covid_2020'] = (selected['year'] == 2020) & (selected['kind'] == 'decahose') & (selected['covid'] >= covid_threshold)

# Align the flag to `stats`; stories absent from `selected` are not flagged.
# Reindexing with `fill_value=False` and casting keeps a real boolean dtype.
# Plain assignment + `fillna(False)` leaves an object column unless pandas
# silently downcasts it, and `~` on an object column returns -1/-2, not bools.
stats['decahose_covid_2020'] = (
    selected['decahose_covid_2020']
    .reindex(stats.index, fill_value=False)
    .astype(bool)
)

In [None]:
# Table of matched stories across sources. The cells below read its
# `story_id_elite`, `story_id_radio` and `story_id_decahose` columns (the
# decahose id may be missing for a row).
matching = pd.read_csv('data/paper-round-3/event-annotated/auto-sample-communities-matching.csv')

matching.shape

## Quality filter

In [None]:
# Keep only stories with at least 10 items ...
length_mask = stats['count'].ge(10)

# ... that also appear somewhere in the matching table (decahose ids may be
# missing, so those are dropped before the lookup).
matched_ids = [
    *matching['story_id_elite'].tolist(),
    *matching['story_id_radio'].tolist(),
    *matching['story_id_decahose'].dropna().tolist(),
]
matching_mask = stats.index.isin(matched_ids)

# Combined quality filter used by every analysis cell below.
mask = length_mask & matching_mask

mask.sum(), stats.shape[0]

# Descriptive stats

In [None]:
# Number of stories by quality-filter outcome, year and kind.
stats.groupby([mask, 'year', 'kind']).size()

In [None]:
# Number of stories by quality-filter outcome and year.
stats.groupby([mask, 'year']).size()

In [None]:
# Number of stories by quality-filter outcome and kind.
stats.groupby([mask, 'kind']).size()

In [None]:
# Total of the per-story `count` column by quality-filter outcome and kind.
stats.groupby([mask, 'kind'])['count'].sum()

In [None]:
# Total of the per-story `count` column by quality-filter outcome, year and kind.
stats.groupby([mask, 'year', 'kind'])['count'].sum()

In [None]:
# Summary statistics of every numeric column for the stories that pass the
# quality filter, shown with three decimals.
with pd.option_context('display.float_format', lambda x: '%.3f' % x):
    display(stats.loc[mask].describe())

In [None]:
# Same summary, split by kind; transposed so kinds are columns, and with the
# row limit lifted so the full table is shown.
with pd.option_context('display.float_format', lambda x: '%.3f' % x, 'display.max_rows', None):
    display(stats.loc[mask].groupby('kind').describe().T)

In [None]:
# Same summary, split by year and kind.
with pd.option_context('display.float_format', lambda x: '%.3f' % x, 'display.max_rows', None):
    display(stats.loc[mask].groupby(['year', 'kind']).describe().T)

In [None]:
# Summary restricted to the two metrics tested below (`avg` and `std`), by kind.
with pd.option_context('display.float_format', lambda x: '%.3f' % x, 'display.max_rows', None):
    display(stats.loc[mask].groupby('kind').describe()[['avg', 'std']].T)

In [None]:
# Summary of `avg` and `std`, by kind and year.
with pd.option_context('display.float_format', lambda x: '%.3f' % x, 'display.max_rows', None):
    display(stats.loc[mask].groupby(['kind', 'year']).describe()[['avg', 'std']].T)

In [None]:
# Summary of the `start`, `end` and `dur` columns by year and kind for
# stories that pass the quality filter.
stats.loc[mask].groupby(['year', 'kind'])[['start', 'end', 'dur']].describe().T

In [None]:
# Histogram (50 bins) of the `dur` column for stories that pass the quality filter.
stats.loc[mask, 'dur'].hist(bins=50)

# Summary metrics based on relative time

In [None]:
def _paired_ttest(x, y):
    """Paired t-test of `x` against `y`, using only pairs where both are present.

    Parameters
    ----------
    x, y : 1-D float arrays of equal length; position i in both arrays belongs
        to the same row of `matching`.

    Returns
    -------
    The result object of `scipy.stats.ttest_rel` on the complete pairs.
    """
    inc = ~np.isnan(x) & ~np.isnan(y)
    return ss.ttest_rel(x[inc], y[inc])


# One row per row of `matching`, holding the stats of the matched story of each
# kind. Ids that are missing or not present in `stats` produce all-NaN rows.
# These lookups do not depend on the metric, so they are done once.
# NOTE(review): the quality filter `mask` (count >= 10) is not applied here,
# whereas the histograms that display these test results do apply it --
# confirm this is intended.
el_stats = stats.reindex(matching['story_id_elite'])
rd_stats = stats.reindex(matching['story_id_radio'])
dh_stats = stats.reindex(matching['story_id_decahose'])

# True where the matched decahose story is flagged as a 2020 Covid story;
# rows without a decahose match compare unequal to True and stay False.
is_covid_2020 = dh_stats['decahose_covid_2020'].eq(True).to_numpy()

# (kind1, kind2) pairs, in the order in which their rows are appended.
comparisons = [
    ('elite', 'radio'),
    ('elite', 'decahose'),
    ('decahose', 'radio'),
    ('elite', 'decahose-no-2020-covid'),   # excluding 2020 Covid stories
    ('decahose-no-2020-covid', 'radio'),   # excluding 2020 Covid stories
]

tests = []
for metric in ['avg', 'std']:
    values = {
        'elite': el_stats[metric].to_numpy(),
        'radio': rd_stats[metric].to_numpy(),
        'decahose': dh_stats[metric].to_numpy(),
    }
    # Blank out flagged decahose stories so their pairs drop out of the test.
    values['decahose-no-2020-covid'] = np.where(is_covid_2020, np.nan, values['decahose'])

    for kind1, kind2 in comparisons:
        stat = _paired_ttest(values[kind1], values[kind2])
        tests.append({
            'metric': metric,
            'kind1': kind1,
            'kind2': kind2,
            'statistic': stat.statistic,
            'pvalue': stat.pvalue,
            'df': stat.df,
        })

tests = (
    pd.DataFrame(tests)
    .sort_values(['kind1', 'kind2'], ascending=False)
    [['kind1', 'kind2', 'metric', 'statistic', 'pvalue', 'df']]
    .reset_index(drop=True)
)

# Index by '<kind1>-<kind2>-<metric>' so plotting cells can look results up by name.
tests['id'] = tests['kind1'] + '-' + tests['kind2'] + '-' + tests['metric']
tests = tests.set_index('id')

tests

# Visualize reltime mean/SD

## Elite vs radio

In [None]:
nbins = 7

# Two panels (mean, SD). Elite counts use the primary y-axis, shared between
# panels via `sharey=True`; radio counts use a twin y-axis on each panel.
fig, axes = plt.subplots(1, 2, figsize=(10, 5), sharey=True)
twins = [ax.twinx() for ax in axes]

# Share the y-axis between the twin (radio) axes as well. `Axes.sharey`
# replaces `get_shared_y_axes().join(...)`, which was deprecated in
# Matplotlib 3.6 and no longer works on current versions.
for twin in twins[1:]:
    twin.sharey(twins[0])
twins[0].autoscale()
# Only the right-most twin keeps its tick labels.
for twin in twins[:-1]:
    twin.yaxis.set_tick_params(labelright=False)

def seconds_to_hours(x, pos):
    """Tick formatter: render a value given in seconds as whole hours, e.g. '2h'."""
    return f'{x / 3600:.0f}h'
fmt = mp.ticker.FuncFormatter(seconds_to_hours)

props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)

for ax, twin, metric in zip(axes, twins, ['avg', 'std']):
    sns.histplot(stats.loc[mask & (stats['kind'] == 'elite'), metric], ax=ax, color=elite_color, alpha=0.3, label='Elite', kde=True, bins=nbins)
    sns.histplot(stats.loc[mask & (stats['kind'] == 'radio'), metric], ax=twin, color=radio_color, alpha=0.3, label='Radio', kde=True, bins=nbins)

    # Drop seaborn's default y labels; the outer ones are set after the loop.
    ax.set_ylabel(None)
    twin.set_ylabel(None)
    ax.set_xlabel('Duration')
    ax.xaxis.set_tick_params(rotation=45)
    ax.xaxis.set_major_formatter(fmt)

    # One legend per panel covering both the primary and the twin axes.
    h1, l1 = ax.get_legend_handles_labels()
    h2, l2 = twin.get_legend_handles_labels()
    ax.legend(h1 + h2, l1 + l2, loc=0)

    # Annotate with the paired t-test result for this metric.
    test_id = f'elite-radio-{metric}'
    textstr = '\n'.join((
        r'$H_0: \rm{\bar{E}} = \rm{\bar{R}}$',
        r'$t=%.2f$' % (tests.loc[test_id, 'statistic'], ),
        r'$p \sim %.3f$' % (tests.loc[test_id, 'pvalue'], ),
    ))
    ax.text(0.77, 0.7, textstr, transform=ax.transAxes, fontsize=12,
            verticalalignment='top', bbox=props)

axes[0].set_ylabel('Count (Elite)', fontsize=11)
twins[1].set_ylabel('Count (Radio)', fontsize=11, rotation=270, labelpad=15)

axes[0].set_title('Mean Story-Level Relative Time')
axes[1].set_title('SD Story-Level Relative Time')

fig.tight_layout()

## Decahose vs radio

In [None]:
nbins = 7

# Two panels (mean, SD). Decahose counts use the primary y-axis and radio
# counts a twin y-axis; no y-axes are shared between panels in this figure.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
twins = [ax.twinx() for ax in axes]

def seconds_to_hours(x, pos):
    """Tick formatter: render a value given in seconds as whole hours, e.g. '2h'."""
    return f'{x / 3600:.0f}h'
fmt = mp.ticker.FuncFormatter(seconds_to_hours)

props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)

panels = [
    ('avg', 'Mean Within-Story Time'),
    ('std', 'SD of Within-Story Time'),
]
for (metric, xlabel), ax, twin in zip(panels, axes, twins):
    sns.histplot(stats.loc[mask & (stats['kind'] == 'decahose'), metric], ax=ax, color=decahose_color, alpha=0.3, label='Decahose', kde=True, bins=nbins)
    sns.histplot(stats.loc[mask & (stats['kind'] == 'radio'), metric], ax=twin, color=radio_color, alpha=0.3, label='Radio', kde=True, bins=nbins)

    # Only the twin's y label is removed; the primary axis keeps seaborn's default.
    twin.set_ylabel(None)
    ax.set_xlabel(xlabel)
    ax.xaxis.set_tick_params(rotation=45)
    ax.xaxis.set_major_formatter(fmt)

    # One legend per panel covering both the primary and the twin axes.
    h1, l1 = ax.get_legend_handles_labels()
    h2, l2 = twin.get_legend_handles_labels()
    ax.legend(h1 + h2, l1 + l2, loc=0)

    # Annotate with the paired t-test result for this metric.
    test_id = f'decahose-radio-{metric}'
    textstr = '\n'.join((
        r'$H_0: \rm{\bar{D}} = \rm{\bar{R}}$',
        r'$t=%.2f$' % (tests.loc[test_id, 'statistic'], ),
        r'$p \sim %.3f$' % (tests.loc[test_id, 'pvalue'], ),
    ))
    ax.text(0.65, 0.65, textstr, transform=ax.transAxes, fontsize=12,
            verticalalignment='top', bbox=props)

fig.tight_layout()

## Decahose vs radio, excluding Covid 2020

In [None]:
nbins = 6

# Two panels (mean, SD). Decahose counts (excluding stories flagged as 2020
# Covid) use the primary y-axis, shared between panels via `sharey=True`;
# radio counts use a twin y-axis on each panel.
fig, axes = plt.subplots(1, 2, figsize=(10, 5), sharey=True)
twins = [ax.twinx() for ax in axes]

# Share the y-axis between the twin (radio) axes as well. `Axes.sharey`
# replaces `get_shared_y_axes().join(...)`, which was deprecated in
# Matplotlib 3.6 and no longer works on current versions.
for twin in twins[1:]:
    twin.sharey(twins[0])
twins[0].autoscale()
# Only the right-most twin keeps its tick labels.
for twin in twins[:-1]:
    twin.yaxis.set_tick_params(labelright=False)

def seconds_to_hours(x, pos):
    """Tick formatter: render a value given in seconds as whole hours, e.g. '2h'."""
    return f'{x / 3600:.0f}h'
fmt = mp.ticker.FuncFormatter(seconds_to_hours)

props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)

for ax, twin, metric in zip(axes, twins, ['avg', 'std']):
    sns.histplot(stats.loc[mask & (stats['kind'] == 'decahose') & ~stats['decahose_covid_2020'], metric], ax=ax, color=decahose2_color, alpha=0.3, label='Firehose', kde=True, bins=nbins)
    sns.histplot(stats.loc[mask & (stats['kind'] == 'radio'), metric], ax=twin, color=radio_color, alpha=0.3, label='Radio', kde=True, bins=nbins)

    # Drop seaborn's default y labels; the outer ones are set after the loop.
    ax.set_ylabel(None)
    twin.set_ylabel(None)
    ax.set_xlabel('Duration')
    ax.xaxis.set_tick_params(rotation=45)
    ax.xaxis.set_major_formatter(fmt)

    # One legend per panel covering both the primary and the twin axes.
    h1, l1 = ax.get_legend_handles_labels()
    h2, l2 = twin.get_legend_handles_labels()
    ax.legend(h1 + h2, l1 + l2, loc=0)

    # Annotate with the paired t-test result for this metric.
    test_id = f'decahose-no-2020-covid-radio-{metric}'
    textstr = '\n'.join((
        r'$H_0: \rm{\bar{D}} = \rm{\bar{R}}$',
        r'$t=%.2f$' % (tests.loc[test_id, 'statistic'], ),
        r'$p \sim %.3f$' % (tests.loc[test_id, 'pvalue'], ),
    ))
    ax.text(0.77, 0.7, textstr, transform=ax.transAxes, fontsize=12,
            verticalalignment='top', bbox=props)

axes[0].set_ylabel('Count (Firehose)', fontsize=11)
twins[1].set_ylabel('Count (Radio)', fontsize=11, rotation=270, labelpad=15)

axes[0].set_title('Mean Story-Level Relative Time')
axes[1].set_title('SD Story-Level Relative Time')

fig.tight_layout()

# Vincentized ecdf / epdf

We obtain the empirical PDF (epdf) from the empirical CDF (ecdf) by numerical differentiation (`np.gradient`).

In [None]:
# as in the 5a notebook that calculated these
cdf_query_end = 7 * 24 * 3600
cdf_query_inc = 60  # one minute; afterward we can sample to a higher frequency if desired

# Row selectors for each pooled curve, all restricted to the quality filter.
pooled_masks = {
    'elite': mask & (stats['kind'] == 'elite'),
    'decahose': mask & (stats['kind'] == 'decahose'),
    'decahose-no-covid-2020': mask & (stats['kind'] == 'decahose') & ~stats['decahose_covid_2020'],
    'radio': mask & (stats['kind'] == 'radio'),
}

# Pooled CDF per group: the mean of the per-story CDFs at every grid point.
cdf_ovrl = pd.DataFrame({
    name: cdf_vals[rows, ...].mean(axis=0)
    for name, rows in pooled_masks.items()
})
# Convert the positional index (grid steps) into seconds.
cdf_ovrl.index = (cdf_ovrl.index.to_series() * cdf_query_inc)

# Pooled PDF: finite-difference gradient of each pooled CDF. np.gradient is
# called with its default unit spacing, i.e. the change per grid step.
pdf_ovrl = cdf_ovrl.copy()
for col in pdf_ovrl.columns:
    pdf_ovrl[col] = np.gradient(pdf_ovrl[col])

# Switch to display names for plotting.
display_names = {'elite': 'Elite', 'radio': 'Radio', 'decahose': 'Firehose', 'decahose-no-covid-2020': 'Firehose ex. Covid'}
cdf_ovrl = cdf_ovrl.rename(columns=display_names)
pdf_ovrl = pdf_ovrl.rename(columns=display_names)

## 48 hours

In [None]:
# Left panel: pooled CDFs; right panel: pooled PDFs; both share the time axis.
fig, axes = plt.subplots(1, 2, figsize=(10, 5), sharex=True)

# in units of cdf_query_inc
plot_max = 48 * 60
plot_interval = 15

# Rows to draw: the first `plot_max` grid steps, every `plot_interval`-th one.
window = slice(0, plot_max, plot_interval)

curves = [
    ('Elite', elite_color),
    ('Radio', radio_color),
    ('Firehose', decahose_color),
    ('Firehose ex. Covid', decahose2_color),
]
for column, color in curves:
    cdf_ovrl.iloc[window][column].plot(ax=axes[0], color=color)
    pdf_ovrl.iloc[window][column].plot(ax=axes[1], color=color)

panel_text = [
    ('Pooled Empirical CDF', 'Proportion'),
    ('Pooled Empirical PDF', 'Density'),
]
for ax, (title, ylabel) in zip(axes, panel_text):
    ax.set_title(title)
    ax.set_xlabel('Time')
    ax.set_ylabel(ylabel)

axes[0].set_ylim(0, 1)

def seconds_to_hours(x, pos):
    """Tick formatter: render a value given in seconds as whole hours, e.g. '2h'."""
    return f'{x / 3600:.0f}h'
fmt = mp.ticker.FuncFormatter(seconds_to_hours)
axes[0].xaxis.set_major_formatter(fmt)

fig.tight_layout()

print(f'Based on {mask.sum()} event(s)')

In [None]:
# Map the display names used for plotting back to the internal column keys.
# After this cell the plotting cell above can no longer find its columns
# unless the pooled cdf/pdf cell is run again first.
names = {
    'Elite': 'elite',
    'Radio': 'radio',
    'Firehose': 'decahose',
    'Firehose ex. Covid': 'decahose-no-covid-2020',
}

cdf_ovrl = cdf_ovrl.rename(columns=names)
pdf_ovrl = pdf_ovrl.rename(columns=names)

## Bootstrap test

In [None]:
# Number of bootstrap resamples drawn per group.
n_sims = 10_000

# Portion of the CDF grid that is tested: the first `test_max` grid steps
# (48 * 60 = 2880), keeping every `test_interval`-th one.
test_max = 48 * 60
test_interval = 15

In [None]:
# Dedicated NumPy Generator for the bootstrap, seeded with the notebook seed.
rng = np.random.default_rng(seed)

### Resampling

In [None]:
# samples[kind]: DataFrame of bootstrap mean CDFs (rows: time in seconds,
#                columns: 's0' .. 's<n_sims-1>').
# actuals[kind]: observed mean CDF on the same time grid.
samples, actuals = {}, {}

# One row selector per kind, plus decahose without the 2020 Covid stories.
samp_masks = {
    kind: mask & (stats['kind'] == kind)
    for kind in stats['kind'].unique()
}
samp_masks['decahose-no-covid-2020'] = mask & (stats['kind'] == 'decahose') & ~stats['decahose_covid_2020']

for kind, samp_mask in tqdm(samp_masks.items()):
    # Stories of this kind, restricted to the tested part of the time grid.
    samp_cdf_vals = cdf_vals[samp_mask, :][:, :test_max:test_interval]
    actuals[kind] = samp_cdf_vals.mean(axis=0)

    n_stories = samp_cdf_vals.shape[0]

    # Resample stories with replacement and average their CDFs. The generator
    # is called once per simulation, in simulation order, so the draws are the
    # same as when each resample was wrapped in a pd.Series; stacking plain
    # arrays just avoids creating n_sims Series objects.
    boot_means = np.stack([
        samp_cdf_vals[rng.choice(n_stories, n_stories, replace=True), ...].mean(axis=0)
        for _ in trange(n_sims)
    ])

    # Grid step -> seconds for the row labels.
    times = pd.RangeIndex(boot_means.shape[1]).to_series() * cdf_query_inc * test_interval

    samples[kind] = pd.DataFrame(
        boot_means.T,
        index=pd.Index(times),
        columns=['s' + str(i) for i in range(n_sims)],
    )

In [None]:
# sanity check - should all be tightly clustered around 0
with pd.option_context('display.float_format', lambda x: '%.7f' % x, 'display.max_rows', None):
    display(pd.concat([
        (samples[kind].mean(axis=1) - actuals[kind]).describe().rename(kind)
        for kind in samples.keys()
    ], axis=1))

### Test results

In [None]:
# Pointwise: is the observed elite mean CDF at or above the radio one at each tested time point?
(actuals['elite'] >= actuals['radio'])

In [None]:
# Pointwise: is the observed elite mean CDF at or above the decahose (ex. 2020 Covid) one?
(actuals['elite'] >= actuals['decahose-no-covid-2020'])

In [None]:
# Pointwise: is the observed decahose (ex. 2020 Covid) mean CDF at or above the radio one?
(actuals['decahose-no-covid-2020'] >= actuals['radio'])

In [None]:
# Per time point: share of bootstrap elite means above the observed radio mean.
comp = (samples['elite'] > actuals['radio'].reshape(-1, 1)).mean(axis=1)
# Two-sided p-value: twice the smaller of the two tail proportions.
pvals = 2 * np.minimum(comp, 1 - comp)

pvals.describe()

In [None]:
# Elite vs radio: p-value at each tested time point (x-axis is the index of `pvals`).
pvals.plot()

In [None]:
# Per time point: share of bootstrap elite means above the observed decahose mean.
comp = (samples['elite'] > actuals['decahose'].reshape(-1, 1)).mean(axis=1)
# Two-sided p-value: twice the smaller of the two tail proportions.
pvals = 2 * np.minimum(comp, 1 - comp)

pvals.describe()

In [None]:
# Elite vs decahose: p-value at each tested time point (x-axis is the index of `pvals`).
pvals.plot()

In [None]:
# Per time point: share of bootstrap elite means above the observed
# decahose (ex. 2020 Covid) mean.
comp = (samples['elite'] > actuals['decahose-no-covid-2020'].reshape(-1, 1)).mean(axis=1)
# Two-sided p-value: twice the smaller of the two tail proportions.
pvals = 2 * np.minimum(comp, 1 - comp)

pvals.describe()

In [None]:
# Elite vs decahose (ex. 2020 Covid): p-value at each tested time point.
pvals.plot()

In [None]:
# Per time point: share of bootstrap decahose means above the observed radio mean.
comp = (samples['decahose'] > actuals['radio'].reshape(-1, 1)).mean(axis=1)
# Two-sided p-value: twice the smaller of the two tail proportions.
pvals = 2 * np.minimum(comp, 1 - comp)

pvals.describe()

In [None]:
# Decahose vs radio: p-value at each tested time point (x-axis is the index of `pvals`).
pvals.plot()

In [None]:
# Per time point: share of bootstrap decahose (ex. 2020 Covid) means above
# the observed radio mean.
comp = (samples['decahose-no-covid-2020'] > actuals['radio'].reshape(-1, 1)).mean(axis=1)
# Two-sided p-value: twice the smaller of the two tail proportions.
pvals = 2 * np.minimum(comp, 1 - comp)

pvals.describe()

In [None]:
# Decahose (ex. 2020 Covid) vs radio: p-value at each tested time point.
pvals.plot()