In [None]:
import os
import gzip
import random
import logging

import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as ss
import statsmodels.formula.api as smf

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm, trange

from IPython.display import display

In [None]:
# Root logging: timestamp, level and message, at INFO and above.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

# Notebook-level logger; created after the root handler is configured.
logger = logging.getLogger(__name__)

In [None]:
# All data paths below are relative, so run from the project checkout
# under the current user's home directory.
project_root = os.path.expanduser('~/github/masthesis/')
os.chdir(project_root)

In [None]:
seed = 2969591811

# Seed both global generators (stdlib `random` and legacy NumPy) with the same value.
for seed_global_rng in (random.seed, np.random.seed):
    seed_global_rng(seed)

In [None]:
# Ten-colour 'bright' palette; each data source gets a fixed slot so that
# every figure uses the same colour for the same source.
colors = sns.color_palette('bright', 10)

elite_color, radio_color, decahose_color, decahose2_color = (
    colors[slot] for slot in (3, 0, 9, 4)
)

## Load data

In [None]:
# Matching table; the cells below read its story_id_elite, story_id_radio
# and story_id_decahose columns.
matching_path = 'data/paper-round-3/event-annotated/auto-sample-communities-matching.csv'
matching = pd.read_csv(matching_path)

In [None]:
# Three per-story tables keyed by story_id: overall, liberal-only and
# conservative-only. Each must have one row per story.
stats_paths = {
    'overall': 'data/paper-round-3/event-annotated/auto-story-stats.csv',
    'lib': 'data/paper-round-3/event-annotated/auto-story-stats-lib.csv',
    'con': 'data/paper-round-3/event-annotated/auto-story-stats-con.csv',
}

stats_tables = {}
for table_name, table_path in stats_paths.items():
    table = pd.read_csv(table_path, index_col='story_id')
    assert table.index.is_unique
    stats_tables[table_name] = table

stats_overall = stats_tables['overall']
stats_lib = stats_tables['lib']
stats_con = stats_tables['con']

# Liberal rows stacked on top of conservative rows; story_id is therefore
# repeated in this frame's index.
stats = pd.concat([stats_lib, stats_con], axis=0)

In [None]:
# Per-story CDF arrays for the liberal and conservative tables.
# NOTE(review): rows are later indexed with masks built from stats_lib /
# stats_con, so row order presumably matches those tables - confirm.
cdf_arrays = []
for cdf_path in (
    'data/paper-round-3/event-annotated/auto-story-cdfs-lib.npy',
    'data/paper-round-3/event-annotated/auto-story-cdfs-con.npy',
):
    with open(cdf_path, 'rb') as f:
        cdf_arrays.append(np.load(f))

cdf_vals_lib, cdf_vals_con = cdf_arrays

## Quality filter

In [None]:
# Filters on the overall table: a count of at least 20, not a decahose
# story, and referenced somewhere in the matching table.
length_mask_overall = stats_overall['count'].ge(20)
kind_mask_overall = stats_overall['kind'].ne('decahose')

matched_story_ids = []
for id_column in ('story_id_elite', 'story_id_radio'):
    matched_story_ids.extend(matching[id_column].tolist())
# decahose ids can be missing in the matching table; skip those rows
matched_story_ids.extend(matching['story_id_decahose'].dropna().tolist())

matching_mask_overall = stats_overall.index.isin(matched_story_ids)

mask_overall = length_mask_overall & kind_mask_overall & matching_mask_overall

# Per-side filters add a minimum count within that side.
min_side_count = 10  # must be >= 2 to avoid nans in SDs
mask_lib = mask_overall & stats_lib['count'].ge(min_side_count)
mask_con = mask_overall & stats_con['count'].ge(min_side_count)

# Stacked lib-then-con, the same order used to build `stats`.
mask = pd.concat([mask_lib, mask_con], axis=0)

mask.sum(), stats.shape[0]

# Descriptive stats

In [None]:
# Total `count` per (conservative, kind), before any quality filtering.
side_kind_groups = stats.groupby(['conservative', 'kind'])
side_kind_groups['count'].sum()

In [None]:
# Total `count` per (conservative, year, kind), before any quality filtering.
side_year_kind_groups = stats.groupby(['conservative', 'year', 'kind'])
side_year_kind_groups['count'].sum()

In [None]:
# Number of distinct `group` values per (conservative, year, kind), unfiltered.
group_column_by_cell = stats.groupby(['conservative', 'year', 'kind'])['group']
group_column_by_cell.nunique()

In [None]:
# Same breakdown, restricted to rows that pass the quality filter.
stats_filtered = stats.loc[mask, :]
stats_filtered.groupby(['conservative', 'year', 'kind'])['group'].nunique()

In [None]:
# Log-scaled histogram of `dur`, one panel per value of `conservative`.
durations = stats['dur']
side_flag = stats['conservative']
durations.hist(by=side_flag, bins=50, log=True)

In [None]:
# Rows per side with `dur` below 2 * 86400 (two days, presumably in
# seconds - confirm the unit of `dur`).
dur_threshold = 86400 * 2
is_short = stats['dur'] < dur_threshold
stats.groupby([stats['conservative'], is_short]).size()

In [None]:
# Distinct `group` values per cell, keeping only rows with a positive count.
has_items = stats['count'] > 0
stats.loc[has_items, :].groupby(['conservative', 'year', 'kind'])['group'].nunique()

In [None]:
# Summary statistics of the quality-filtered rows, shown with 3 decimals.
filtered_summary = stats.loc[mask, :].describe()

with pd.option_context('display.float_format', lambda x: '%.3f' % x):
    display(filtered_summary)

In [None]:
# Same summary split by (kind, conservative); transposed so the groups
# are columns, with every row displayed.
summary_by_kind_side = stats.loc[mask, :].groupby(['kind', 'conservative']).describe()

with pd.option_context('display.float_format', lambda x: '%.3f' % x, 'display.max_rows', None):
    display(summary_by_kind_side.T)

In [None]:
# Summary of `avg_abs` only, one row per group, re-ordered so that
# `conservative` is the outer index level.
avg_abs_by_kind_side = (
    stats.loc[mask, :]
    .groupby(['kind', 'conservative'])
    .describe()['avg_abs']
)

with pd.option_context('display.float_format', lambda x: '%.3f' % x, 'display.max_rows', None):
    display(avg_abs_by_kind_side.swaplevel().sort_index())

In [None]:
# Summary of `avg_abs` per (year, conservative, kind), one row per group.
avg_abs_by_year_side_kind = (
    stats.loc[mask, :]
    .groupby(['year', 'conservative', 'kind'])
    .describe()['avg_abs']
)

with pd.option_context('display.float_format', lambda x: '%.3f' % x, 'display.max_rows', None):
    display(avg_abs_by_year_side_kind)

# Summary metrics based on relative time

## Tests: Overall

In [None]:
# Paired t-tests of elite vs. radio stories, separately for each side.
# Pairs come from the rows of `matching` (story_id_elite / story_id_radio).
tests = []
for c in stats.loc[mask, 'conservative'].unique():
    # Keep one side only, so story_id is unique again (`stats` stacks the
    # lib and con tables), and apply the quality filter so these tests use
    # the same stories as the filtered summaries and figures.
    side = stats.loc[mask & (stats['conservative'] == c)]

    for metric in ['avg_abs', 'std']:
        # reindex keeps the row order of `matching`, so position i in both
        # arrays belongs to the same matched pair; absent or filtered-out
        # stories become NaN.
        el = side[metric].reindex(matching['story_id_elite']).to_numpy()
        rd = side[metric].reindex(matching['story_id_radio']).to_numpy()

        # a pair is usable only when both members have a value
        inc = ~np.isnan(el) & ~np.isnan(rd)
        stat = ss.ttest_rel(el[inc], rd[inc])
        tests.append({
            'metric': metric,
            'kind1': 'elite',
            'kind2': 'radio',
            'conservative': c,
            'statistic': stat.statistic,
            'pvalue': stat.pvalue,
        })

tests = pd.DataFrame(tests) \
    .sort_values(['conservative', 'kind1', 'kind2'], ascending=False) \
    [['conservative', 'kind1', 'kind2', 'metric', 'statistic', 'pvalue']] \
    .reset_index(drop=True)

tests

## Tests: By year

In [None]:
# Paired t-tests of elite vs. radio stories per (year, side).
# Pairs come from the rows of `matching` (story_id_elite / story_id_radio).
tests = []

years = stats.loc[mask, 'year'].unique()
sides = stats.loc[mask, 'conservative'].unique()

for year in years:
    for c in sides:
        # One year and one side, so story_id is unique again, restricted to
        # rows that pass the quality filter (as in the filtered summaries
        # and figures).
        cell = stats.loc[mask & (stats['year'] == year) & (stats['conservative'] == c)]

        for metric in ['avg_abs', 'std']:
            # reindex keeps the row order of `matching`; stories outside
            # this cell become NaN and drop out of the pairing below.
            el = cell[metric].reindex(matching['story_id_elite']).to_numpy()
            rd = cell[metric].reindex(matching['story_id_radio']).to_numpy()

            inc = ~np.isnan(el) & ~np.isnan(rd)
            stat = ss.ttest_rel(el[inc], rd[inc])
            tests.append({
                'year': year,
                'metric': metric,
                'kind1': 'elite',
                'kind2': 'radio',
                'conservative': c,
                'statistic': stat.statistic,
                'pvalue': stat.pvalue,
            })

tests = pd.DataFrame(tests) \
    .sort_values(['year', 'conservative', 'kind1', 'kind2'], ascending=False) \
    [['year', 'conservative', 'kind1', 'kind2', 'metric', 'statistic', 'pvalue']] \
    .reset_index(drop=True)

tests

## Tests: Conservatives vs liberals

In [None]:
# Paired test of liberal vs. conservative `avg_abs` within each kind,
# using only stories that pass the filter on both sides.
passes_both = mask_lib & mask_con

for kind in stats.loc[mask, 'kind'].unique():
    lib_values = stats_lib.loc[passes_both & (stats_lib['kind'] == kind), 'avg_abs']
    con_values = stats_con.loc[passes_both & (stats_con['kind'] == kind), 'avg_abs']
    print(kind, ss.ttest_rel(lib_values, con_values))

In [None]:
# Paired test of liberal vs. conservative `std` within each kind,
# using only stories that pass the filter on both sides.
passes_both = mask_lib & mask_con

for kind in stats.loc[mask, 'kind'].unique():
    lib_values = stats_lib.loc[passes_both & (stats_lib['kind'] == kind), 'std']
    con_values = stats_con.loc[passes_both & (stats_con['kind'] == kind), 'std']
    print(kind, ss.ttest_rel(lib_values, con_values))

In [None]:
# Paired test of liberal vs. conservative `avg_abs` per (year, kind),
# using only stories that pass the filter on both sides.
passes_both = mask_lib & mask_con

for year in stats.loc[mask, 'year'].unique():
    for kind in stats.loc[mask, 'kind'].unique():
        lib_rows = passes_both & (stats_lib['year'] == year) & (stats_lib['kind'] == kind)
        # each table is filtered on its own `year` column (the conservative
        # selection previously read the year from the liberal table)
        con_rows = passes_both & (stats_con['year'] == year) & (stats_con['kind'] == kind)
        print(year, kind, ss.ttest_rel(
            stats_lib.loc[lib_rows, 'avg_abs'],
            stats_con.loc[con_rows, 'avg_abs'],
        ))

In [None]:
# Paired test of liberal vs. conservative `std` per (year, kind),
# using only stories that pass the filter on both sides.
passes_both = mask_lib & mask_con

for year in stats.loc[mask, 'year'].unique():
    for kind in stats.loc[mask, 'kind'].unique():
        lib_rows = passes_both & (stats_lib['year'] == year) & (stats_lib['kind'] == kind)
        # each table is filtered on its own `year` column (the conservative
        # selection previously read the year from the liberal table)
        con_rows = passes_both & (stats_con['year'] == year) & (stats_con['kind'] == kind)
        print(year, kind, ss.ttest_rel(
            stats_lib.loc[lib_rows, 'std'],
            stats_con.loc[con_rows, 'std'],
        ))

# Visualize

In [None]:
# 2x3 grid of elite-vs-radio histograms: rows are the per-story mean and SD,
# columns are overall / liberal / conservative. Every panel is annotated with
# an independent-samples t-test on exactly the two series it plots.
nbins = 20

fig, axes = plt.subplots(2, 3, figsize=(15, 10), sharey=True)
# Radio is drawn on a twin y-axis in each panel, so the two sources do not
# have to share a count scale.
twins = np.asarray([[ax.twinx() for ax in row] for row in axes])

# (title suffix, table, row filter, column holding the mean). The overall
# table is read through 'avg', the per-side rows through 'avg_abs'.
panel_columns = [
    ('Overall', stats_overall, mask_overall, 'avg'),
    ('Liberal', stats, mask & (stats['conservative'] == 0), 'avg_abs'),
    ('Conservative', stats, mask & (stats['conservative'] == 1), 'avg_abs'),
]

props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
panel_tests = {}

for j, (group, table, keep, mean_col) in enumerate(panel_columns):
    for i, (stat_name, col) in enumerate([('Mean', mean_col), ('SD', 'std')]):
        ax, twin = axes[i][j], twins[i][j]

        elite = table.loc[keep & (table['kind'] == 'elite'), col]
        radio = table.loc[keep & (table['kind'] == 'radio'), col]

        sns.histplot(elite, ax=ax, color=elite_color, alpha=0.3, label='Elite', kde=True, bins=nbins)
        sns.histplot(radio, ax=twin, color=radio_color, alpha=0.3, label='Radio', kde=True, bins=nbins)

        ax.set_title(f'{stat_name} {group}', fontsize=16)
        ax.set_xlabel('Time', fontsize=14)
        ax.set_ylabel('Count', fontsize=14)
        twin.set_ylabel(None)

        # x values are divided by 3600 and shown as whole hours
        hours_formatter = mp.ticker.FuncFormatter(lambda x, pos: f'{x / 3600:.0f}h')
        ax.xaxis.set_major_formatter(hours_formatter)

        ax.tick_params(axis='both', which='major', labelsize=13)

        # one legend per panel covering both the main and the twin axes
        h1, l1 = ax.get_legend_handles_labels()
        h2, l2 = twin.get_legend_handles_labels()
        ax.legend(h1 + h2, l1 + l2, loc=0, fontsize=14)

        #
        # t-test on the same two series shown in this panel
        #
        test = ss.ttest_ind(elite, radio)
        panel_tests[(i, j)] = test

        textstr = '\n'.join((
            r'$H_0: \rm{\bar{T}} = \rm{\bar{R}}$',
            r'$t=%.2f$' % (test.statistic, ),
            r'$p = \rm{%.1e}$' % (test.pvalue, ),
        ))
        ax.text(0.65, 0.65, textstr, transform=ax.transAxes, fontsize=14,
                verticalalignment='top', bbox=props)

# Keep the individual results available under their original names.
ovrl_test_mean, lib_test_mean, con_test_mean = (panel_tests[(0, j)] for j in range(3))
ovrl_test_sd, lib_test_sd, con_test_sd = (panel_tests[(1, j)] for j in range(3))

fig.tight_layout()

In [None]:
# 2x2 grid of elite-vs-radio histograms over the quality-filtered rows:
# rows are liberal / conservative, columns are `avg_abs` / `std`.
nbins = 20

fig, axes = plt.subplots(2, 2, figsize=(10, 10))
# Radio is drawn on a twin y-axis in each panel.
twins = np.asarray([[ax.twinx() for ax in row] for row in axes])

sides = [('Liberal', 0), ('Conservative', 1)]
metrics = [('avg_abs', 'Mean Within-Story Time'), ('std', 'SD of Within-Story Time')]
sources = [('elite', 'Elite', elite_color), ('radio', 'Radio', radio_color)]

for i, (side_name, side_flag) in enumerate(sides):
    in_side = mask & (stats['conservative'] == side_flag)

    for j, (col, what) in enumerate(metrics):
        ax, twin = axes[i][j], twins[i][j]

        # elite goes on the main axes, radio on the twin
        for target, (kind, label, color) in zip((ax, twin), sources):
            sns.histplot(
                stats.loc[in_side & (stats['kind'] == kind), col],
                ax=target, color=color, alpha=0.3, label=label, kde=True, bins=nbins
            )

        ax.set_title(f'{side_name}: {what}')
        ax.set_xlabel('Time')
        ax.set_ylabel('Count')
        twin.set_ylabel(None)

        ax.xaxis.set_tick_params(rotation=45)

        # x values are divided by 3600 and shown as whole hours
        hours_formatter = mp.ticker.FuncFormatter(lambda x, pos: f'{x / 3600:.0f}h')
        ax.xaxis.set_major_formatter(hours_formatter)

        # one legend per panel covering both the main and the twin axes
        h1, l1 = ax.get_legend_handles_labels()
        h2, l2 = twin.get_legend_handles_labels()
        ax.legend(h1 + h2, l1 + l2, loc=0)

fig.tight_layout()

In [None]:
# One 2x2 figure per year present in the quality-filtered rows:
# rows are liberal / conservative, columns are `avg_abs` / `std`.
nbins = 20

sides = [('Liberal', 0), ('Conservative', 1)]
metrics = [('avg_abs', 'Mean Within-Story Time'), ('std', 'SD of Within-Story Time')]
sources = [('elite', 'Elite', elite_color), ('radio', 'Radio', radio_color)]

for year in stats.loc[mask, 'year'].unique():
    fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharey=True)
    # Radio is drawn on a twin y-axis in each panel.
    twins = np.asarray([[ax.twinx() for ax in row] for row in axes])

    in_year = mask & (stats['year'] == year)

    for i, (side_name, side_flag) in enumerate(sides):
        in_side = in_year & (stats['conservative'] == side_flag)

        for j, (col, what) in enumerate(metrics):
            ax, twin = axes[i][j], twins[i][j]

            # elite goes on the main axes, radio on the twin
            for target, (kind, label, color) in zip((ax, twin), sources):
                sns.histplot(
                    stats.loc[in_side & (stats['kind'] == kind), col],
                    ax=target, color=color, alpha=0.3, label=label, kde=True, bins=nbins
                )

            ax.set_title(f'{side_name}: {what}')
            ax.set_xlabel('Time')
            ax.set_ylabel('Count')
            twin.set_ylabel(None)

            ax.xaxis.set_tick_params(rotation=45)

            # x values are divided by 3600 and shown as whole hours
            hours_formatter = mp.ticker.FuncFormatter(lambda x, pos: f'{x / 3600:.0f}h')
            ax.xaxis.set_major_formatter(hours_formatter)

            # one legend per panel covering both the main and the twin axes
            h1, l1 = ax.get_legend_handles_labels()
            h2, l2 = twin.get_legend_handles_labels()
            ax.legend(h1 + h2, l1 + l2, loc=0)

    fig.suptitle(str(year))
    fig.tight_layout()

## Vincentized ecdf

We also plot the empirical PDF (EPDF), which we obtain from the ECDF by finite differences (`np.gradient`).

In [None]:
# Query grid of the stored CDFs, as in the 5a notebook that calculated these.
# Both values are in seconds.
cdf_query_inc = 60  # one minute; afterward we can sample to a higher frequency if desired
cdf_query_end = 7 * 24 * 60 * cdf_query_inc  # seven days

In [None]:
def _pooled_cdfs(year=None):
    """Average the per-story CDF rows for each side and kind.

    Rows are selected with the per-side quality masks, optionally restricted
    to one `year`. Returns a frame with columns 'lib-elite', 'lib-radio',
    'con-elite' and 'con-radio', indexed by grid position * cdf_query_inc.
    """
    pooled = {}
    for side, side_stats, side_mask, side_cdfs in (
        ('lib', stats_lib, mask_lib, cdf_vals_lib),
        ('con', stats_con, mask_con, cdf_vals_con),
    ):
        keep = side_mask if year is None else side_mask & (side_stats['year'] == year)
        for kind in ('elite', 'radio'):
            # explicit boolean array: the CDF arrays are indexed by row position
            rows = (keep & (side_stats['kind'] == kind)).to_numpy()
            pooled[f'{side}-{kind}'] = side_cdfs[rows, ...].mean(axis=0)

    cdf = pd.DataFrame(pooled)
    cdf.index = (cdf.index.to_series() * cdf_query_inc)
    return cdf


def _pdf_from_cdf(cdf):
    """Differentiate every CDF column with np.gradient.

    np.gradient is called with its default unit spacing, so the values are
    differences per grid step rather than per index unit.
    """
    pdf = cdf.copy()
    for column in list(pdf):
        pdf[column] = np.gradient(pdf[column])
    return pdf


cdf_ovrl = _pooled_cdfs()
pdf_ovrl = _pdf_from_cdf(cdf_ovrl)

# Same curves per year. Years are taken from the unfiltered table, so a year
# with no rows passing the filter yields all-NaN curves.
year_cdfs, year_pdfs = {}, {}
for year in stats['year'].unique():
    year_cdfs[year] = _pooled_cdfs(year)
    year_pdfs[year] = _pdf_from_cdf(year_cdfs[year])

# The columns keep their '<side>-<kind>' names. The former rename to display
# labels used keys ('elite', 'radio', ...) that never matched a column, so it
# did nothing and has been dropped.

### Plot

In [None]:
# Pooled CDF (left) and PDF (right) for liberal (top) and conservative
# (bottom) stories, elite and radio drawn on the same axes.
fig, axes = plt.subplots(2, 2, figsize=(20, 10), sharex=True)

# in units of cdf_query_inc
plot_max = 3 * 24 * 60
plot_interval = 15

sides = [('lib', 'Liberal'), ('con', 'Conservative')]
curve_sets = [(cdf_ovrl, 'CDF', 'Proportion'), (pdf_ovrl, 'PDF', 'Density')]
sources = [('elite', 'Elite', elite_color), ('radio', 'Radio', radio_color)]

for i, (side, side_name) in enumerate(sides):
    for j, (curves, curve_name, ylabel) in enumerate(curve_sets):
        ax = axes[i][j]

        for kind, label, color in sources:
            # renaming the series sets the line label picked up by the legend
            curves[f'{side}-{kind}'] \
                .iloc[0:plot_max:plot_interval] \
                .rename(label) \
                .plot(ax=ax, color=color)

        ax.set_title(f'Overall {side_name} Pooled Empirical {curve_name}')
        ax.set_xlabel('Time')
        ax.set_ylabel(ylabel)
        if curve_name == 'CDF':
            ax.set_ylim(0, 1)

        # x values are divided by 3600 and shown as whole hours
        hours_formatter = mp.ticker.FuncFormatter(lambda x, pos: f'{x / 3600:.0f}h')
        ax.xaxis.set_major_formatter(hours_formatter)

        # identify which colour is Elite and which is Radio
        ax.legend()

fig.tight_layout()

print(f'Based on {mask.sum()} event(s)')

In [None]:
# Per-year version of the pooled CDF / PDF figure: one 2x2 figure per year,
# liberal on top, conservative below, elite and radio on the same axes.

# in units of cdf_query_inc
plot_max = 3 * 24 * 60
plot_interval = 15

sides = [('lib', 'Liberal'), ('con', 'Conservative')]
sources = [('elite', 'Elite', elite_color), ('radio', 'Radio', radio_color)]

for year in year_cdfs.keys():
    fig, axes = plt.subplots(2, 2, figsize=(20, 10), sharex=True)

    curve_sets = [(year_cdfs[year], 'CDF', 'Proportion'), (year_pdfs[year], 'PDF', 'Density')]

    for i, (side, side_name) in enumerate(sides):
        for j, (curves, curve_name, ylabel) in enumerate(curve_sets):
            ax = axes[i][j]

            for kind, label, color in sources:
                # renaming the series sets the line label picked up by the legend
                curves[f'{side}-{kind}'] \
                    .iloc[0:plot_max:plot_interval] \
                    .rename(label) \
                    .plot(ax=ax, color=color)

            ax.set_title(f'{year} {side_name} Pooled Empirical {curve_name}')
            ax.set_xlabel('Time')
            ax.set_ylabel(ylabel)
            if curve_name == 'CDF':
                ax.set_ylim(0, 1)

            # x values are divided by 3600 and shown as whole hours
            hours_formatter = mp.ticker.FuncFormatter(lambda x, pos: f'{x / 3600:.0f}h')
            ax.xaxis.set_major_formatter(hours_formatter)

            # identify which colour is Elite and which is Radio
            ax.legend()

    fig.tight_layout()

    # count the filtered rows of this year only, not of the whole sample
    n_year = (mask & (stats['year'] == year)).sum()
    print(f'Based on {n_year} event(s)')

In [None]:
# Map display labels back to raw kind names and apply that to the pooled
# CDF / PDF column labels.
# NOTE(review): the pooled frames are built with '<side>-<kind>' columns
# ('lib-elite', ...), which none of these keys match - this looks like a
# no-op; confirm before relying on it.
display_labels = {'elite': 'Elite', 'radio': 'Radio', 'decahose': 'Firehose', 'decahose-no-covid-2020': 'Firehose ex. Covid'}
names = dict(zip(display_labels.values(), display_labels.keys()))

for pooled_frame in (cdf_ovrl, pdf_ovrl):
    pooled_frame.rename(names, axis=1, inplace=True)

### Bootstrap tests

In [None]:
# Number of bootstrap resamples per group.
n_sims = 10_000

# Tested CDF points: every `test_interval`-th grid position below `test_max`.
test_interval = 15
test_max = 2 * 24 * 60

In [None]:
# Dedicated generator for the bootstrap draws, seeded with the notebook seed.
rng = np.random.default_rng(seed)

#### Liberal overall

In [None]:
tw_mask = mask_lib & (stats_lib['kind'] == 'elite')
tw_cdf_vals = cdf_vals_lib[tw_mask, :][:, :test_max:test_interval]
tw_actual = cdf_vals_lib[tw_mask, ...][:, :test_max:test_interval].mean(axis=0)

tw_samples = []
for i in trange(n_sims):
    perm = rng.choice(tw_cdf_vals.shape[0], tw_cdf_vals.shape[0], replace=True)
    tw_samples += [pd.Series(tw_cdf_vals[perm, ...].mean(axis=0)).rename('s' + str(i))]

tw_samples = pd.DataFrame(tw_samples)
tw_samples.index = ['s' + str(i) for i in range(n_sims)]
tw_samples.columns = (tw_samples.columns.to_series() * cdf_query_inc)
tw_samples = tw_samples.T

In [None]:
rd_mask = mask_lib & (stats_lib['kind'] == 'radio')
rd_cdf_vals = cdf_vals_lib[rd_mask, :][:, :test_max:test_interval]
rd_actual = cdf_vals_lib[rd_mask, ...][:, :test_max:test_interval].mean(axis=0)

rd_samples = []
for i in trange(n_sims):
    perm = rng.choice(rd_cdf_vals.shape[0], rd_cdf_vals.shape[0], replace=True)
    rd_samples += [pd.Series(rd_cdf_vals[perm, ...].mean(axis=0))]

rd_samples = pd.DataFrame(rd_samples)
rd_samples.index = ['s' + str(i) for i in range(n_sims)]
rd_samples.columns = (rd_samples.columns.to_series() * cdf_query_inc)
rd_samples = rd_samples.T

In [None]:
# sanity check
with pd.option_context('display.float_format', lambda x: '%.7f' % x, 'display.max_rows', None):
    display(pd.concat([
        (tw_samples.mean(axis=1) - tw_actual).describe().rename('elite'),
        (rd_samples.mean(axis=1) - rd_actual).describe().rename('radio'),
    ], axis=1))

In [None]:
# Share of resamples in which the elite mean CDF exceeds the radio one,
# per tested point, turned into a two-sided p-value.
comp = (tw_samples > rd_samples).mean(axis=1)
pvals = 2 * np.minimum(comp, 1 - comp)

In [None]:
# Summary statistics of the p-values across the tested points.
pvals.describe()

In [None]:
# How often each distinct p-value occurs, ordered by p-value.
pvals.value_counts().sort_index()

In [None]:
# Number of tested points whose p-value falls below each threshold.
tuple((pvals < alpha).sum() for alpha in (0.05, 0.01, 0.001, 0.0001))

In [None]:
# p-value at each tested point, plotted against the series index.
pvals.plot()

#### Conservative overall

In [None]:
tw_mask = mask_con & (stats_con['kind'] == 'elite')
tw_cdf_vals = cdf_vals_con[tw_mask, :][:, :test_max:test_interval]
tw_actual = cdf_vals_con[tw_mask, ...][:, :test_max:test_interval].mean(axis=0)

tw_samples = []
for i in trange(n_sims):
    perm = rng.choice(tw_cdf_vals.shape[0], tw_cdf_vals.shape[0], replace=True)
    tw_samples += [pd.Series(tw_cdf_vals[perm, ...].mean(axis=0)).rename('s' + str(i))]

tw_samples = pd.DataFrame(tw_samples)
tw_samples.index = ['s' + str(i) for i in range(n_sims)]
tw_samples.columns = (tw_samples.columns.to_series() * cdf_query_inc)
tw_samples = tw_samples.T

In [None]:
rd_mask = mask_con & (stats_con['kind'] == 'radio')
rd_cdf_vals = cdf_vals_con[rd_mask, :][:, :test_max:test_interval]
rd_actual = cdf_vals_con[rd_mask, ...][:, :test_max:test_interval].mean(axis=0)

rd_samples = []
for i in trange(n_sims):
    perm = rng.choice(rd_cdf_vals.shape[0], rd_cdf_vals.shape[0], replace=True)
    rd_samples += [pd.Series(rd_cdf_vals[perm, ...].mean(axis=0))]

rd_samples = pd.DataFrame(rd_samples)
rd_samples.index = ['s' + str(i) for i in range(n_sims)]
rd_samples.columns = (rd_samples.columns.to_series() * cdf_query_inc)
rd_samples = rd_samples.T

In [None]:
# sanity check
with pd.option_context('display.float_format', lambda x: '%.7f' % x, 'display.max_rows', None):
    display(pd.concat([
        (tw_samples.mean(axis=1) - tw_actual).describe().rename('elite'),
        (rd_samples.mean(axis=1) - rd_actual).describe().rename('radio'),
    ], axis=1))

In [None]:
# Share of resamples in which the elite mean CDF exceeds the radio one,
# per tested point, turned into a two-sided p-value.
comp = (tw_samples > rd_samples).mean(axis=1)
pvals = 2 * np.minimum(comp, 1 - comp)

In [None]:
# Summary statistics of the p-values across the tested points.
pvals.describe()

In [None]:
# How often each distinct p-value occurs, ordered by p-value.
pvals.value_counts().sort_index()

In [None]:
# Number of tested points whose p-value falls below each threshold.
tuple((pvals < alpha).sum() for alpha in (0.05, 0.01, 0.001, 0.0001))

In [None]:
# p-value at each tested point, plotted against the series index.
pvals.plot()