In [None]:
import os
import gzip
import random
import logging
import itertools as it
import multiprocessing

import numpy as np
import pandas as pd
import scipy.stats as ss
import statsmodels.formula.api as smf
import statsmodels.stats.diagnostic as dg

import seaborn as sns
import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

from IPython.display import display

In [None]:
# Notebook-wide logger; basicConfig sets the root logger to INFO with a
# "timestamp : level : message" line format.
logger = logging.getLogger(__name__)

fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(format=fmt, level=logging.INFO)

In [None]:
# Every data path below is relative, so switch into the project checkout first.
# NOTE(review): the checkout location is hard-coded to ~/github/masthesis/ — adjust when running elsewhere.
os.chdir(os.path.expanduser('~/github/masthesis/'))

In [None]:
# Fixed seed for both the stdlib and NumPy global RNGs, so the random sampling
# done later in the notebook is repeatable on a fresh Run All.
seed = 2969591811

random.seed(seed)
np.random.seed(seed)

In [None]:
# Render every float in displayed frames with three decimals.
pd.set_option('display.float_format', '{:.3f}'.format)

In [None]:
# One fixed colour per medium, taken from seaborn's colorblind palette.
colors = sns.color_palette('colorblind', 10)

radio_color, decahose_color, decahose2_color, elite_color = colors[0], colors[1], colors[2], colors[3]

# Prep data

## Load

In [None]:
# Affect score columns analysed throughout the notebook.
labels = ['negative', 'emotional', 'outraged']

In [None]:
# Item-level scores: gzip-compressed, tab-separated, one row per item keyed by `id`.
scores_path = 'data/paper-round-3/event-annotated/auto-qualitative-scores.csv.gz'

with gzip.open(scores_path, mode='rt') as handle:
    scores = pd.read_csv(handle, sep='\t', index_col='id')

scores.shape

In [None]:
# Story-level metadata (`stats`) and the community filter list (`selected`),
# both keyed by story_id. They are only needed to derive the covid flag and
# are deleted at the end of the cell.
stats = pd.read_csv('data/paper-round-3/event-annotated/auto-story-stats.csv', index_col='story_id')
selected = pd.read_csv('data/paper-round-3/event-annotated/auto-sample-communities-filter-list.csv', index_col='story_id')

assert stats.index.is_unique
# within each (year, kind) every row has its own `group` value
assert (stats.groupby(['year', 'kind']).size() == stats.groupby(['year', 'kind'])['group'].nunique()).all()
assert selected.index.is_unique

# A story is flagged when it is a 2020 decahose story whose `covid` value
# reaches the threshold.
covid_threshold = 0.1
selected['decahose_covid_2020'] = (selected['year'] == 2020) & (selected['kind'] == 'decahose') & (selected['covid'] >= covid_threshold)

# Align the flag onto `stats`. Stories absent from `selected` become False.
# reindex(fill_value=False) keeps a real bool column; plain assignment followed
# by fillna(False) leaves an object-dtype column when anything is missing.
stats['decahose_covid_2020'] = selected['decahose_covid_2020'].reindex(stats.index, fill_value=False)

scores['decahose_covid_2020'] = scores['story_id'].isin(stats.index[stats['decahose_covid_2020']])

del stats, selected

## Format scores

In [None]:
# Two within-story clocks are derived from `reltime`:
#   ws_reltime   - time elapsed since the story's first item
#   frac_reltime - the same, divided by the story's total span
# NOTE: a story whose items all share one timestamp has span 0, so its
# frac_reltime is 0/0 = NaN.

# First-item time per story, broadcast back onto the items with a merge.
story_start = (
    scores
    .groupby('story_id')['reltime']
    .min()
    .rename('min_reltime')
    .reset_index()
)
scores = scores.assign(frac_reltime=scores['reltime']).merge(story_start, how='inner', on='story_id')
scores['frac_reltime'] = scores['frac_reltime'] - scores['min_reltime']

scores['ws_reltime'] = scores['frac_reltime']

# Largest elapsed time per story, i.e. the story's span.
story_span = (
    scores
    .groupby('story_id')['frac_reltime']
    .max()
    .rename('max_reltime')
    .reset_index()
)
scores = scores.merge(story_span, how='inner', on='story_id')
scores['frac_reltime'] = scores['frac_reltime'] / scores['max_reltime']

scores = scores.drop(columns=['min_reltime', 'max_reltime'])

In [None]:
# Attach the number of items in each story to every one of its items.
items_per_story = scores.groupby('story_id').size().rename('n_items').reset_index()
scores = scores.merge(items_per_story, on='story_id', how='inner')

In [None]:
by_story = scores.groupby('story_id')

# One row per story: the mean of every numeric item-level column.
story_level = (
    by_story
    .mean(numeric_only=True)
    .reset_index()
    .drop(columns='reltime')
    .rename(columns={'frac_reltime': 'mean_frac_reltime', 'ws_reltime': 'mean_ws_reltime'})
)
# The medium is the second dash-separated token of the story id.
story_level['kind'] = story_level['story_id'].str.split('-').apply(lambda s: s[1])

# Index by story so the per-story aggregates below align on story_id.
story_level = story_level.set_index('story_id')
story_level['std_frac_reltime'] = by_story['frac_reltime'].std()
story_level['std_ws_reltime'] = by_story['ws_reltime'].std()
story_level['duration'] = by_story['reltime'].max() - by_story['reltime'].min()
story_level = story_level.reset_index()

# no decahose/firehose confusion
assert set(scores['kind'].unique()) == set(story_level['kind'].unique())

In [None]:
# Copy each story's duration onto its items.
story_durations = story_level.loc[:, ['story_id', 'duration']]
scores = scores.merge(story_durations, on='story_id', how='inner')

In [None]:
# Log-odds (logit) of every affect probability, at item and at story level.
# NOTE: a probability of exactly 0 or 1 maps to -inf / +inf.
for frame in (scores, story_level):
    for label in labels:
        odds = frame[label] / (1 - frame[label])
        frame['logodds_' + label] = np.log(odds)

In [None]:
# Quadratic time terms: squared deviation from the median of the SAME variable
# within the item's medium. The original centred frac_reltime (a 0-1 fraction)
# on the ws_reltime median, mixing two different scales.
kind_median = scores.groupby('kind')['ws_reltime'].transform('median')
frac_kind_median = scores.groupby('kind')['frac_reltime'].transform('median')

scores['ws_reltime_quad'] = (scores['ws_reltime'] - kind_median) ** 2
scores['frac_reltime_quad'] = (scores['frac_reltime'] - frac_kind_median) ** 2

# Analysis 1: Simple medium differences

## Item-level

In [None]:
# Item-level mean of each affect score per medium (labels as rows).
scores.groupby('kind')[labels].mean().transpose()

In [None]:
# Item-level distribution summary of each affect score per medium.
scores.groupby('kind')[labels].describe().T

In [None]:
kinds = scores['kind'].unique()

# Each unordered pair of media is visited once: only (kind1, kind2) with
# kind1 sorting after kind2 is kept.
kind_pairs = [(k1, k2) for k1 in kinds for k2 in kinds if k1 > k2]

# Two-sided Mann-Whitney U test per media pair and affect label.
mwu_rows = []
for kind1, kind2 in kind_pairs:
    for label in labels:
        test = ss.mannwhitneyu(
            scores.loc[scores['kind'] == kind1, label],
            scores.loc[scores['kind'] == kind2, label],
            alternative='two-sided',
            nan_policy='raise',
        )
        mwu_rows.append({
            'kind1': kind1,
            'kind2': kind2,
            'label': label,
            'statistic': test.statistic,
            'pvalue': test.pvalue,
        })

ttest_res = pd.DataFrame(mwu_rows)

ttest_res.sort_values(['label', 'kind1'])

In [None]:
# Temporarily widen the float format to 10 decimals; the notebook-wide
# 3-decimal format would hide the magnitude of small p-values.
with pd.option_context('display.float_format', '{:,.10f}'.format):
    display(ttest_res['pvalue'].describe())

In [None]:
# Significance threshold: upper-tail probability of a standard normal beyond 5
# (a "five sigma" cutoff).
alpha = 1 - ss.norm(0, 1).cdf(5)
# True only when every pairwise test clears that threshold.
(ttest_res['pvalue'] <= alpha).all()

In [None]:
# Item-level OLS of each label's log-odds on medium (`kind`) with categorical
# year effects, i.e. differences between media controlling for year.
for label in labels:
    print(smf.ols(f'logodds_{label} ~ kind + C(year)', data=scores).fit().summary())
    print('\n')

In [None]:
# Item-level bar chart: mean affect score per medium with (1 - alpha)
# confidence-interval whiskers.
alpha = 0.05

tmp = scores.copy()
tmp.loc[tmp['kind'] == 'decahose', 'kind'] = 'firehose'

# Rows are affect labels, columns are media, both title-cased for display.
means = tmp.groupby('kind')[labels].mean().T
means.columns = [c.title() for c in means.columns]
means.index = [c.title() for c in means.index]
means = means[['Firehose', 'Elite', 'Radio']]

sems = tmp.groupby('kind')[labels].sem().T
sems.columns = [c.title() for c in sems.columns]
sems.index = [c.title() for c in sems.index]
sems = sems[['Firehose', 'Elite', 'Radio']]

fig, ax = plt.subplots(1, 1, figsize=(10, 10))

# t-distribution multiplier for the given alpha. Degrees of freedom use the
# pooled row count rather than each group's own size.
t_multiplier = ss.t.ppf(1 - alpha / 2, tmp.shape[0] - 1)

x = np.arange(len(labels))
width = 0.2  # Width of the bars

# Each medium's whiskers come from that medium's own SEM column. The original
# passed sems.iloc[0] (the 'Negative' row across media) to all three bar groups.
ax.bar(x - width, means['Firehose'], width, yerr=t_multiplier*sems['Firehose'],
       capsize=4, label='Firehose', color=decahose2_color, hatch='/')
ax.bar(x, means['Elite'], width, yerr=t_multiplier*sems['Elite'],
       capsize=4, label='Elite', color=elite_color, hatch='-')
ax.bar(x + width, means['Radio'], width, yerr=t_multiplier*sems['Radio'],
       capsize=4, label='Radio', color=radio_color, hatch='\\')

ax.set_xticks(x)
ax.set_xticklabels(means.index.tolist())

ax.set_ylim(0.2, 0.8)

ax.tick_params(axis='both', which='major', labelsize=12)
ax.set_xlabel('Affect metric', fontsize=14, labelpad=12)
ax.set_ylabel('Average probability', fontsize=14)

ax.legend(fontsize=14, loc='upper left')

fig.tight_layout()

## Story-level

In [None]:
# Story-level mean of each affect score per medium (labels as rows).
story_level.groupby('kind')[labels].mean().transpose()

In [None]:
# Story-level distribution summary of each affect score per medium.
story_level.groupby('kind')[labels].describe().T

In [None]:
kinds = story_level['kind'].unique()

# Each unordered pair of media is visited once: only (kind1, kind2) with
# kind1 sorting after kind2 is kept.
kind_pairs = [(k1, k2) for k1 in kinds for k2 in kinds if k1 > k2]

# Two-sided Mann-Whitney U test per media pair and affect label, on story means.
mwu_rows = []
for kind1, kind2 in kind_pairs:
    for label in labels:
        test = ss.mannwhitneyu(
            story_level.loc[story_level['kind'] == kind1, label],
            story_level.loc[story_level['kind'] == kind2, label],
            alternative='two-sided',
            nan_policy='raise',
        )
        mwu_rows.append({
            'kind1': kind1,
            'kind2': kind2,
            'label': label,
            'statistic': test.statistic,
            'pvalue': test.pvalue,
        })

ttest_res = pd.DataFrame(mwu_rows)

ttest_res.sort_values(['label', 'kind1'])

In [None]:
# Temporarily widen the float format to 10 decimals; the notebook-wide
# 3-decimal format would hide the magnitude of small p-values.
with pd.option_context('display.float_format', '{:,.10f}'.format):
    display(ttest_res['pvalue'].describe())

In [None]:
# Significance threshold: upper-tail probability of a standard normal beyond 5
# (a "five sigma" cutoff).
alpha = 1 - ss.norm(0, 1).cdf(5)
# True only when every pairwise story-level test clears that threshold.
(ttest_res['pvalue'] <= alpha).all()

In [None]:
# Story-level OLS of each label's log-odds on medium (`kind`) with categorical
# year effects, i.e. differences between media controlling for year.
for label in labels:
    print(smf.ols(f'logodds_{label} ~ kind + C(year)', data=story_level).fit().summary())
    print('\n')

In [None]:
# Story-level bar chart: mean affect score per medium with (1 - alpha)
# confidence-interval whiskers.
alpha = 0.05

tmp = story_level.copy()
tmp.loc[tmp['kind'] == 'decahose', 'kind'] = 'firehose'

# Rows are affect labels, columns are media, both title-cased for display.
means = tmp.groupby('kind')[labels].mean().T
means.columns = [c.title() for c in means.columns]
means.index = [c.title() for c in means.index]
means = means[['Firehose', 'Elite', 'Radio']]

sems = tmp.groupby('kind')[labels].sem().T
sems.columns = [c.title() for c in sems.columns]
sems.index = [c.title() for c in sems.index]
sems = sems[['Firehose', 'Elite', 'Radio']]

fig, ax = plt.subplots(1, 1, figsize=(10, 10))

# t-distribution multiplier for the given alpha. Degrees of freedom use the
# pooled row count rather than each group's own size.
t_multiplier = ss.t.ppf(1 - alpha / 2, tmp.shape[0] - 1)

x = np.arange(len(labels))
width = 0.2  # Width of the bars

# Each medium's whiskers come from that medium's own SEM column. The original
# passed sems.iloc[0] (the 'Negative' row across media) to all three bar groups.
ax.bar(x - width, means['Firehose'], width, yerr=t_multiplier*sems['Firehose'],
       capsize=4, label='Firehose', color=decahose2_color, hatch='/')
ax.bar(x, means['Elite'], width, yerr=t_multiplier*sems['Elite'],
       capsize=4, label='Elite', color=elite_color, hatch='-')
ax.bar(x + width, means['Radio'], width, yerr=t_multiplier*sems['Radio'],
       capsize=4, label='Radio', color=radio_color, hatch='\\')

ax.set_xticks(x)
ax.set_ylim(0.2, 0.8)

ax.set_xticklabels(means.index.tolist(), fontsize=16)
ax.tick_params(axis='both', which='major', labelsize=16)

ax.set_xlabel('Affect metric', fontsize=18, labelpad=18)
ax.set_ylabel('Average probability', fontsize=18)

ax.legend(fontsize=16, loc=(0.16, 0.86))

fig.tight_layout()

### Table for paper

In [None]:
# Story-level mean of each affect score per medium, labels as rows.
tmp = story_level.groupby('kind')[['negative', 'emotional', 'outraged']].mean().T
tmp = tmp.rename(columns={'elite': 'Elite Twitter', 'decahose': 'Firehose'})

# Rebuilding both axes as fresh unnamed indexes title-cases the entries and
# drops the 'kind' axis name from the rendered table.
tmp.index = pd.Index([row.title() for row in tmp.index])
tmp.columns = pd.Index([col.title() for col in tmp.columns])

latex_table = tmp.style.format(precision=3).to_latex(
    hrules=True,
    position='ht',
    label='tab:affect-metrics',
)
print(latex_table)

# Analysis 2: Within a story, is there a time trend of affect?

Or is this about story fixed effects?

In [None]:
# Analysis sample. This is an alias of `scores` (same object, not a copy), with no filtering applied.
eligible = scores

In [None]:
# Time variables that the affect scores are compared against below.
comps = ['frac_reltime', 'ws_reltime']

In [None]:
# Pooled Pearson correlations of the time variables and affect scores with each affect score.
(
    eligible[labels + comps]
    .corr(method='pearson', numeric_only=True)
    .loc[comps + labels, labels]
)

In [None]:
# Pooled Spearman (rank) correlations of the time variables and affect scores with each affect score.
(
    eligible[labels + comps]
    .corr(method='spearman', numeric_only=True)
    .loc[comps + labels, labels]
)

In [None]:
# Pearson correlations computed separately within each medium.
(
    eligible
    .groupby('kind')[labels + comps]
    .corr(method='pearson', numeric_only=True)
    .loc[pd.IndexSlice[:, labels + comps], labels]
)

In [None]:
# Spearman (rank) correlations computed separately within each medium.
(
    eligible
    .groupby('kind')[labels + comps]
    .corr(method='spearman', numeric_only=True)
    .loc[pd.IndexSlice[:, labels + comps], labels]
)

In [None]:
# Scatter a random sample of 2000 items: one row of panels per time scale,
# one column per affect score.
tmp = eligible.sample(2000)

fig, axes = plt.subplots(3, 3, figsize=(15, 15))

x_specs = [
    ('frac_reltime', tmp['frac_reltime']),
    ('ws_reltime', tmp['ws_reltime']),
    # small offset keeps log() finite at ws_reltime == 0
    ('log(ws_reltime)', np.log(tmp['ws_reltime'] + 1e-6)),
]

for row_axes, (x_name, x_vals) in zip(axes, x_specs):
    for ax, affect in zip(row_axes, ['negative', 'emotional', 'outraged']):
        ax.scatter(x_vals, tmp[affect])
        ax.set_title(affect)
        ax.set_ylabel(affect)
        ax.set_xlabel(x_name)

fig.tight_layout()

In [None]:
# For every affect score and time variable: one figure with a panel per medium
# showing the mean score inside each time quintile. The dashed red line is the
# mean of the five bin means.
for dv in labels:
    for bin_var in ['ws_reltime', 'frac_reltime']:
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))

        for kind, ax in zip(sorted(eligible['kind'].unique()), axes):
            kind_data = eligible.loc[eligible['kind'] == kind]

            grp = pd.qcut(kind_data[bin_var], 5)
            by_bin = kind_data.groupby(grp)

            tmp = pd.concat([
                by_bin.size().rename('n_items'),
                by_bin['story_id'].nunique().rename('n_active'),
                by_bin[dv].mean().rename(dv),
            ], axis=1)

            tmp[dv].plot(ax=ax, kind='bar', rot=45)

            ax.set_title(kind)
            ax.axhline(tmp[dv].mean(), color='red', linestyle='--')

            fig.suptitle(dv + ' | ' + bin_var)

## Plot sentiment over time

In [None]:
def plot_for_kind(kind_data, ax, twin, grp, ax_cols=None, twin_cols=None, legend=True, legend_loc='best',
                  time_ticks=True):
    """Plot binned activity and mean affect scores over time on a twin-axis pair.

    Parameters
    ----------
    kind_data : DataFrame
        Items to plot; must contain 'story_id', 'negative', 'emotional' and
        'outraged'.
    ax, twin : matplotlib Axes
        `ax_cols` are drawn on `ax` and `twin_cols` on `twin` (the caller
        creates `twin`, e.g. with `ax.twinx()`).
    grp : Series of intervals
        Bin assignment of every row of `kind_data` (e.g. from `pd.cut`). Bin
        midpoints are divided by 3600 to get hours, so bins are presumably in
        seconds — confirm against the caller.
    ax_cols, twin_cols : list of str, optional
        Columns for each axis, chosen from 'n_items', 'active_events',
        'negative', 'emotional', 'outraged'. Give both or neither. The default
        is 'active_events' on `ax` and 'outraged', 'negative' on `twin`.
    legend : bool
        Whether to draw one combined legend on `ax`.
    legend_loc
        Passed to `ax.legend(loc=...)`.
    time_ticks : bool
        If True the x axis is bin midpoints in hours with eight evenly spaced
        ticks; otherwise it is the bin ordinal.

    Returns
    -------
    DataFrame
        One row per bin with the aggregated columns plus 'midpt' and
        'midpt_hours'.
    """
    all_cols = ['n_items', 'active_events', 'negative', 'emotional', 'outraged']
    assert (ax_cols is None and twin_cols is None) or (ax_cols is not None and twin_cols is not None)
    if ax_cols is None:
        ax_cols = ['active_events']
        twin_cols = ['outraged', 'negative']
    assert set(ax_cols) <= set(all_cols)
    assert set(twin_cols) <= set(all_cols)

    # Group once and reuse the groupby for every aggregate.
    binned = kind_data.groupby(grp)
    tmp = pd.concat([
        binned.size().rename('n_items'),
        binned['story_id'].nunique().rename('active_events'),
        binned['negative'].mean().rename('negative'),
        binned['emotional'].mean().rename('emotional'),
        binned['outraged'].mean().rename('outraged'),
    ], axis=1)

    tmp['midpt'] = [(i.left + i.right) / 2 for i in tmp.index]
    tmp['midpt_hours'] = tmp['midpt'] / 3600

    # ax series take the first tab10 colours, twin series the ones after them.
    ax_colors = mp.cm.tab10(range(len(ax_cols)))
    twin_colors = mp.cm.tab10(range(len(ax_cols), len(twin_cols) + len(ax_cols)))

    # Only the four styles that actually draw a line, cycled if more series are
    # requested. Slicing Line2D.lineStyles directly also yields the invisible
    # 'None' / ' ' / '' entries once more than four series are plotted.
    drawable_styles = ['-', '--', '-.', ':']
    n_series = len(twin_cols) + len(ax_cols)
    styles = [drawable_styles[i % len(drawable_styles)] for i in range(n_series)]
    twin_line_styles = styles[:len(twin_cols)]
    ax_line_styles = styles[len(twin_cols):]

    xvals = tmp['midpt_hours'] if time_ticks else np.arange(tmp.shape[0])

    # Handles from both axes are collected so a single legend can list them all.
    lines = []

    for col, color, style in zip(twin_cols, twin_colors, twin_line_styles):
        lines += [twin.plot(xvals, tmp[col], label=col.replace('_', ' ').title(),
                            color=color, linestyle=style)[0]]

    for col, color, style in zip(ax_cols, ax_colors, ax_line_styles):
        lines += [ax.plot(xvals, tmp[col], label=col.replace('_', ' ').title(),
                          color=color, linestyle=style)[0]]

    if time_ticks:
        n_ticks = 8
        tick_locs = np.linspace(min(tmp['midpt_hours']), max(tmp['midpt_hours']), n_ticks)
        ax.set_xticks(tick_locs)
        ax.set_xticklabels([f"{x:.0f}h" for x in tick_locs])

    ax.set_xlabel('Time' if time_ticks else 'N-tile of Time')
    ax.set_ylabel('Active Stories')
    twin.set_ylabel('Score', rotation=270, labelpad=12)

    if legend:
        ax.legend(lines, [l.get_label() for l in lines], loc=legend_loc)

    return tmp

### Negative and outraged

In [None]:
# One panel per medium: active stories (left axis) against mean outraged and
# negative scores (right axis) over 50 equal-width bins of ws_reltime.
kinds = eligible['kind'].unique()

# squeeze=False always returns a 2-D array, so a single medium works too
# (a bare Axes object has no len()).
fig, axes = plt.subplots(ncols=len(kinds), figsize=(5*len(kinds), 5), squeeze=False)
axes = axes.ravel()
# Legend on the first panel only. The list grows with the number of media, so
# zip() below never silently drops a panel.
legend_locs = [(0.15, 0.8)] + [None] * (len(kinds) - 1)

for ax, kind, loc in zip(axes, kinds, legend_locs):
    kind_data = eligible.loc[(eligible['kind'] == kind)]
    twin = ax.twinx()

    plot_for_kind(
        kind_data,
        ax, twin,
        pd.cut(kind_data['ws_reltime'], 50),
        legend=(loc is not None), legend_loc=loc,
        time_ticks=True,
    )

    title = 'Firehose' if kind == 'decahose' else kind

    ax.set_title(title.title(), fontsize=14)
    # Resize tick labels through tick_params. Re-setting the label texts on
    # auto-located ticks freezes them, and they can end up at the wrong
    # positions once tight_layout() changes the tick locations.
    ax.tick_params(axis='both', labelsize=12)
    ax.set_xlabel(ax.get_xlabel(), fontsize=12)
    ax.set_ylabel(ax.get_ylabel(), fontsize=12)
    twin.tick_params(axis='y', labelsize=12)
    twin.set_ylabel(twin.get_ylabel(), fontsize=12)

    legend = ax.get_legend()
    if legend is not None:
        for text in legend.get_texts():
            text.set_fontsize(12)

fig.tight_layout()
plt.show()

In [None]:
# Same panels as above plus an extra one, 'decahose-nc': decahose items with
# the flagged 2020 covid stories removed.
kinds = eligible['kind'].unique().tolist() + ['decahose-nc']

# squeeze=False always returns a 2-D array, whatever the panel count.
fig, axes = plt.subplots(nrows=1, ncols=len(kinds), figsize=(5*len(kinds), 5), squeeze=False)
axes = axes.ravel()
# Legend on the first panel only. The list grows with the number of panels, so
# zip() below never silently drops one.
legend_locs = [(0.15, 0.8)] + [None] * (len(kinds) - 1)

for ax, kind, loc in zip(axes, kinds, legend_locs):
    if kind != 'decahose-nc':
        kind_data = eligible.loc[(eligible['kind'] == kind)]
    else:
        kind_data = eligible.loc[(eligible['kind'] == 'decahose') & (~eligible['decahose_covid_2020'])]

    twin = ax.twinx()

    plot_for_kind(
        kind_data,
        ax, twin,
        pd.cut(kind_data['ws_reltime'], 50),
        legend=(loc is not None), legend_loc=loc,
        time_ticks=True,
    )

    if kind == 'decahose':
        title = 'Firehose'
    elif kind == 'decahose-nc':
        title = 'Firehose ex. Covid 2020'
    else:
        title = kind

    ax.set_title(title.title(), fontsize=14)
    # Resize tick labels through tick_params. Re-setting the label texts on
    # auto-located ticks freezes them, and they can end up at the wrong
    # positions once tight_layout() changes the tick locations.
    ax.tick_params(axis='both', labelsize=12)
    ax.set_xlabel(ax.get_xlabel(), fontsize=12)
    ax.set_ylabel(ax.get_ylabel(), fontsize=12)
    twin.tick_params(axis='y', labelsize=12)
    twin.set_ylabel(twin.get_ylabel(), fontsize=12)

    legend = ax.get_legend()
    if legend is not None:
        for text in legend.get_texts():
            text.set_fontsize(12)

fig.tight_layout()
plt.show()

### Emotional

In [None]:
# One panel per medium: active stories (left axis) against the mean emotional
# score (right axis) over 50 equal-width bins of ws_reltime.
kinds = eligible['kind'].unique()

# squeeze=False always returns a 2-D array, so a single medium works too
# (a bare Axes object has no len()).
fig, axes = plt.subplots(ncols=len(kinds), figsize=(5*len(kinds), 5), squeeze=False)
axes = axes.ravel()
# Legend on the first panel only. The list grows with the number of media, so
# zip() below never silently drops a panel.
legend_locs = [(0.3, 0.85)] + [None] * (len(kinds) - 1)

for ax, kind, loc in zip(axes, kinds, legend_locs):
    kind_data = eligible.loc[(eligible['kind'] == kind)]
    twin = ax.twinx()

    plot_for_kind(
        kind_data,
        ax, twin,
        pd.cut(kind_data['ws_reltime'], 50),
        ax_cols=['active_events'],
        twin_cols=['emotional'],
        legend=(loc is not None), legend_loc=loc,
        time_ticks=True,
    )

    ax.set_title(kind.title() if kind != 'decahose' else 'Firehose')

fig.tight_layout()
plt.show()

## Stats: How many high-outrage tail items / stories are there?

In [None]:
# Total number of items in the analysis sample.
len(eligible)

In [None]:
# Item counts inside (True) and outside (False) the early window:
# reltime up to 24h for elite items, up to 48h for every other medium.
is_elite = eligible['kind'] == 'elite'
within_24h = eligible['reltime'] <= 3600 * 24
within_48h = eligible['reltime'] <= 3600 * 48
in_window = (is_elite & within_24h) | (~is_elite & within_48h)

eligible.groupby(in_window).size()

In [None]:
# Number of distinct stories in the analysis sample.
eligible['story_id'].nunique()

In [None]:
# Distinct stories with items inside (True) and outside (False) the early
# window: reltime up to 24h for elite items, up to 48h for every other medium.
is_elite = eligible['kind'] == 'elite'
within_24h = eligible['reltime'] <= 3600 * 24
within_48h = eligible['reltime'] <= 3600 * 48
in_window = (is_elite & within_24h) | (~is_elite & within_48h)

eligible.groupby(in_window)['story_id'].nunique()

In [None]:
# Affect score distributions inside (True) and outside (False) the early
# window: reltime up to 24h for elite items, up to 48h for every other medium.
is_elite = eligible['kind'] == 'elite'
within_24h = eligible['reltime'] <= 3600 * 24
within_48h = eligible['reltime'] <= 3600 * 48
in_window = (is_elite & within_24h) | (~is_elite & within_48h)

eligible.groupby(in_window)[labels].describe().T

## Models: Without story FEs

### Utils

In [None]:
def standardize(dat, cols):
    """Return a copy of `dat` with the given columns z-scored.

    `cols` is one column name or an iterable of names. Each named column is
    replaced by (value - column mean) / column sample std. `dat` itself is
    left untouched.
    """
    names = [cols] if isinstance(cols, str) else cols

    out = dat.copy()
    for name in names:
        column = out[name]
        out[name] = (column - column.mean()) / column.std()

    return out

In [None]:
def assemble_results(mods, thresh=None, bonferroni=True):
    """Collect coefficient statistics from a dict of fitted models into one frame.

    Parameters
    ----------
    mods : dict
        Maps ``(kind, pred_var, dv)`` to a fitted results object, a
        one-element sequence holding one, or a ``(model_fit, results)`` pair.
        In the pair form, r-squared, condition number and term names come from
        ``model_fit`` while coefficients, p-values, t-statistics and standard
        errors come from ``results`` (e.g. a robust-covariance refit).
        ``pred_var`` is one term name or an iterable of term names.
    thresh : float, optional
        Significance level. Defaults to the upper-tail probability of a
        standard normal beyond 5.
    bonferroni : bool
        If True, ``thresh`` is divided by the number of rows in the output.

    Returns
    -------
    DataFrame
        Indexed by ``(pred_var, dv, kind)`` with columns coef, pval, tstat,
        stderr, r2, cond and signif.
    """
    if thresh is None:
        thresh = 1 - ss.norm(0, 1).cdf(5)

    stats = []

    for (kind, pred_var, dv), obj in mods.items():
        if not hasattr(obj, '__len__'):
            mod = obj
            results = obj
        elif len(obj) == 1:
            mod = obj[0]
            results = obj[0]
        else:
            mod, results = obj

        if isinstance(pred_var, str):
            pred_var = [pred_var]

        term_name_to_index = {name: idx for idx, name in enumerate(mod.model.exog_names)}

        # The statistics may be plain arrays or label-indexed Series. Convert
        # them to arrays so the integer lookup below is always positional;
        # `series[int]` on a string-labelled Series relies on a deprecated
        # pandas fallback.
        params = np.asarray(results.params)
        pvalues = np.asarray(results.pvalues)
        tvalues = np.asarray(results.tvalues)
        bse = np.asarray(results.bse)

        for pv in pred_var:
            pos = term_name_to_index[pv]

            stats.append({
                'dv': dv,
                'pred_var': pv,
                'kind': kind,
                'coef': params[pos],
                'pval': pvalues[pos],
                'tstat': tvalues[pos],
                'stderr': bse[pos],
                'r2': mod.rsquared,
                'cond': mod.condition_number,
            })

    stats = pd.DataFrame(stats).set_index(['pred_var', 'dv', 'kind']).sort_index()

    # The Bonferroni correction divides the level by the number of reported coefficients.
    denom = stats.shape[0] if bonferroni else 1
    stats['signif'] = stats['pval'] <= (thresh / denom)

    return stats

In [None]:
def pval_fmt(s, thresh=None):
    """Format a p-value as the right-hand side of a LaTeX ``p ...`` expression.

    Values above the threshold are shown as ``= x.xx``. Anything else becomes
    ``< threshold``, where the default five-sigma threshold is written
    symbolically and a custom one is printed with three decimals.
    """
    five_sigma = 1 - ss.norm(0, 1).cdf(5)
    cutoff = five_sigma if thresh is None else thresh

    if s > cutoff:
        return '= ' + '{0:.2f}'.format(s)

    if cutoff == five_sigma:
        return '< ' + r'\hspace{0.35em} p_{5\sigma}'

    return '< ' + '{0:.3f}'.format(cutoff)

In [None]:
def format_disp(coef, pval, signif, stderr=None):
    """Render one table cell as ``coef (stderr, $p ...$)`` for LaTeX.

    The coefficient is printed with three decimals and wrapped in ``\\textbf``
    when `signif` is true. The p-value part comes from `pval_fmt`. Without
    `stderr` the cell is ``coef ($p ...$)``. The opening parenthesis is always
    emitted; previously it was dropped together with the standard error,
    leaving an unmatched ``)``.
    """
    coef_fmt = '{0:.3f}'.format(coef)
    pval_str = pval_fmt(pval)

    if signif:
        coef_fmt = '\\textbf{' + coef_fmt + '}'

    stderr_part = '{0:.3f}'.format(stderr) + ', ' if stderr is not None else ''

    return coef_fmt + ' (' + stderr_part + '$p ' + pval_str + '$)'

### Main effects

In [None]:
# Pooled OLS (no story fixed effects): one model per medium and affect label,
# regressing the label's log-odds on the standardized time predictor.
nofe_main_score_mods = {}

pred_vars = ('ws_reltime',)

for kind in eligible['kind'].unique():
    tmp = eligible.loc[eligible['kind'] == kind]

    for dv, pred_var in it.product(labels, pred_vars):
        nofe_main_score_mods[(kind, pred_var, dv)] = smf.ols(
            f'logodds_{dv} ~ {pred_var}',
            data=standardize(tmp, pred_vars),
        ).fit()

nofe_main_stats = assemble_results(nofe_main_score_mods)
print(nofe_main_stats.to_string(formatters={'pval':'{:,.10f}'.format}))

### Inter-medium differences

In [None]:
# Pooled OLS on each pair of media: the time-by-medium interaction term tests
# whether the time slope differs between the two media.
nofe_inter_score_mods = {}

pred_vars = ('ws_reltime',)

for dv, pred_var in it.product(labels, pred_vars):
    for kind1, kind2 in it.combinations(eligible['kind'].unique(), 2):
        in_pair = (eligible['kind'] == kind1) | (eligible['kind'] == kind2)
        tmp = eligible.loc[in_pair]

        mod = smf.ols(
            f'logodds_{dv} ~ {pred_var} * kind',
            data=standardize(tmp, pred_vars),
        ).fit()

        # Keep only the interaction terms; those are the coefficients reported.
        pvs = tuple(
            term for term in mod.params.index.tolist()
            if term != 'Intercept' and term.startswith(pred_var + ':kind')
        )
        nofe_inter_score_mods[((kind1, kind2), pvs, dv)] = mod

nofe_inter_stats = assemble_results(nofe_inter_score_mods)
print(nofe_inter_stats.to_string(formatters={'pval':'{:,.10f}'.format}))

## With story FEs

### Main effects

In [None]:
def fit_main_model(params):
    """Fit one story-fixed-effects model; a worker for the multiprocessing pool.

    `params` is the tuple ``(kind, dv, pred_var, tmp_standardized, tmp)``. The
    model is fitted on `tmp_standardized`, and standard errors are then
    recomputed clustered by the `story_id` column of `tmp`.

    Returns ``(kind, pred_var, dv, fitted_model, clustered_results)``.
    """
    kind, dv, pred_var, tmp_standardized, tmp = params

    formula = f'logodds_{dv} ~ {pred_var} + C(story_id)'
    mod = smf.ols(formula, data=tmp_standardized).fit()

    results = mod.get_robustcov_results(cov_type='cluster', groups=tmp['story_id'])

    return (kind, pred_var, dv, mod, results)

# Story-fixed-effects models, one per medium and affect label, fitted in
# parallel worker processes.
fe_main_score_mods = {}
pred_vars = (
    'ws_reltime',
)

inputs = []
for kind in eligible['kind'].unique():
    tmp = eligible.loc[(eligible['kind'] == kind)]
    tmp_standardized = standardize(tmp, pred_vars)

    for dv in labels:
        for pred_var in pred_vars:
            inputs.append((kind, dv, pred_var, tmp_standardized, tmp))

# Pool() rejects a size of 0, so keep at least one worker.
n_cpus = max(1, min(multiprocessing.cpu_count(), len(inputs)))

results = []
# Both the pool and the progress bar are context-managed, so the bar is closed
# even when a worker raises and r.get() re-raises here.
with multiprocessing.Pool(n_cpus) as pool, tqdm(total=len(inputs)) as pbar:
    result_objects = [
        pool.apply_async(fit_main_model, (param, ), callback=lambda _: pbar.update())
        for param in inputs
    ]

    # Collect in submission order.
    for r in result_objects:
        results.append(r.get())

for kind, pred_var, dv, mod, result in results:
    fe_main_score_mods[(kind, pred_var, dv)] = (mod, result)

fe_main_stats = assemble_results(fe_main_score_mods)
print(fe_main_stats.to_string(formatters={'pval':'{:,.10f}'.format}))

### Inter-medium differences

In [None]:
def fit_inter_model(params):
    """Fit one pairwise interaction model with story fixed effects (pool worker).

    `params` is the tuple
    ``(kind1, kind2, dv, pred_var, tmp_standardized, tmp)``. The model is
    fitted on `tmp_standardized`, and standard errors are then recomputed
    clustered by the `story_id` column of `tmp`.

    Returns ``((kind1, kind2), interaction_terms, dv, fitted_model,
    clustered_results)``, where `interaction_terms` is the tuple of term names
    starting with ``'<pred_var>:kind'``.
    """
    kind1, kind2, dv, pred_var, tmp_standardized, tmp = params

    formula = f'logodds_{dv} ~ {pred_var} * kind + C(story_id)'
    mod = smf.ols(formula, data=tmp_standardized).fit()

    results = mod.get_robustcov_results(cov_type='cluster', groups=tmp['story_id'])

    interaction_prefix = pred_var + ':kind'
    pvs = tuple(
        term for term in mod.params.index.tolist()
        if term != 'Intercept' and term.startswith(interaction_prefix)
    )
    return ((kind1, kind2), pvs, dv, mod, results)

# Story-fixed-effects interaction models, one per pair of media and affect
# label, fitted in parallel worker processes.
fe_inter_score_mods = {}
pred_vars = (
    'ws_reltime',
)

inputs = []
for kind1, kind2 in it.combinations(eligible['kind'].unique(), 2):
    tmp = eligible.loc[(eligible['kind'] == kind1) | (eligible['kind'] == kind2)]
    tmp_standardized = standardize(tmp, pred_vars)

    for dv in labels:
        for pred_var in pred_vars:
            inputs.append((kind1, kind2, dv, pred_var, tmp_standardized, tmp))

# Pool() rejects a size of 0, so keep at least one worker.
n_cpus = max(1, min(multiprocessing.cpu_count(), len(inputs)))

results = []
# Both the pool and the progress bar are context-managed, so the bar is closed
# even when a worker raises and r.get() re-raises here.
with multiprocessing.Pool(n_cpus) as pool, tqdm(total=len(inputs)) as pbar:
    result_objects = [
        pool.apply_async(fit_inter_model, (param, ), callback=lambda _: pbar.update())
        for param in inputs
    ]

    # Collect in submission order.
    for r in result_objects:
        results.append(r.get())

for (kind1, kind2), pvs, dv, mod, result in results:
    fe_inter_score_mods[((kind1, kind2), pvs, dv)] = (mod, result)

fe_inter_stats = assemble_results(fe_inter_score_mods)
print(fe_inter_stats.to_string(formatters={'pval':'{:,.10f}'.format}))

### Tables for paper

#### Main effects

In [None]:
# Paper table of main effects: stack the pooled (fe=False) and story-fixed-effects
# (fe=True) estimates, then reshape to one row per (affect, F.E.) with one
# column per medium.
main_stats = pd.concat([
    nofe_main_stats \
        .loc[pd.IndexSlice[:, labels, :]] \
        .assign(fe=False) \
        .drop(['cond', 'r2'], axis=1) \
        .reset_index(),
    
    fe_main_stats \
        .loc[pd.IndexSlice[:, labels, :]] \
        .assign(fe=True) \
        .drop(['cond', 'r2'], axis=1) \
        .reset_index(),
], axis=0).set_index(['dv', 'pred_var', 'fe', 'kind']).sort_index()

# Each cell becomes the LaTeX string "coef (stderr, $p ...$)", bold when significant.
main_stats['disp'] = main_stats.apply(lambda x: format_disp(x['coef'], x['pval'], x['signif'], x['stderr']), axis=1)

main_stats = main_stats.drop(['coef', 'pval', 'tstat', 'signif', 'stderr'], axis=1)

# Pivot the `kind` index level into columns.
main_stats = main_stats.unstack(3).reset_index()

# Flatten the two-level column index: the medium name where there is one,
# otherwise the first-level name.
main_stats.columns = [
    c2 if c2 != '' else c1
    for c1, c2 in main_stats.columns
]

main_stats['dv'] = main_stats['dv'].str.title()

# Swap predictor names for their math symbols (the column is dropped again below).
main_stats.loc[main_stats['pred_var'] == 'ws_reltime', 'pred_var'] = '$t_i$'
main_stats.loc[main_stats['pred_var'] == 'ws_reltime_quad', 'pred_var'] = '$q_i^{(c)}$'

main_stats.rename({
    'dv': 'Affect',
    'fe': 'F.E.',
    'pred_var': 'Predictor',
    'decahose': 'Firehose',
    'elite': 'Elite',
    'radio': 'Radio',
}, axis=1, inplace=True)
main_stats.drop('Predictor', axis=1, inplace=True)
main_stats['F.E.'] = main_stats['F.E.'].map({True: 'Y', False: 'N'})
main_stats['Affect'] = main_stats['Affect'].map({'Emotional': 'Emo.', 'Negative': 'Neg.', 'Outraged': 'Out.'})
main_stats = main_stats.set_index([
    'Affect',
    # 'Predictor',
    'F.E.'
])

display(main_stats)
print(main_stats.style.to_latex(
    column_format='ll|r|r|r',
    hrules=True,
    convert_css=True,
))

#### Interaction effects

In [None]:
# Paper table of interaction effects: stack the pooled (fe=False) and
# story-fixed-effects (fe=True) estimates, then reshape to one row per
# (affect, F.E.) with one column per pair of media.
# ignore_index=True gives the stacked frame unique row labels, so the column
# edits below act on each row itself.
inter_stats = pd.concat([
    nofe_inter_stats
        .loc[pd.IndexSlice[:, labels, :]]
        .assign(fe=False)
        .drop(['cond', 'r2'], axis=1)
        .reset_index(),

    fe_inter_stats
        .loc[pd.IndexSlice[:, labels, :]]
        .assign(fe=True)
        .drop(['cond', 'r2'], axis=1)
        .reset_index(),
], axis=0, ignore_index=True)

# Strip the treatment-level suffix so every interaction term shares one name.
# This edits the column in place; the original assigned from a reset_index()
# copy, which on the duplicate row labels filled the FE rows from the no-FE rows.
inter_stats['pred_var'] = inter_stats['pred_var'].str.replace(r'\[T\.(elite|radio)\]', '', regex=True)

inter_stats = inter_stats.set_index(['dv', 'pred_var', 'fe', 'kind']).sort_index()

# Each cell becomes the LaTeX string "coef (stderr, $p ...$)", bold when significant.
inter_stats['disp'] = inter_stats.apply(lambda x: format_disp(x['coef'], x['pval'], x['signif'], x['stderr']), axis=1)

inter_stats = inter_stats.drop(['coef', 'pval', 'tstat', 'signif', 'stderr'], axis=1)

# Pivot the `kind` index level (a pair of media) into columns.
inter_stats = inter_stats.unstack(3).reset_index()

# Flatten the two-level column index: the media pair where there is one,
# otherwise the first-level name.
inter_stats.columns = [
    c2 if c2 != '' else c1
    for c1, c2 in inter_stats.columns
]

inter_stats['dv'] = inter_stats['dv'].str.title()

# Swap predictor names for their math symbols (the column is dropped again below).
inter_stats.loc[inter_stats['pred_var'] == 'ws_reltime:kind', 'pred_var'] = '$t_i G_i^{(c)}$'
inter_stats.loc[inter_stats['pred_var'] == 'ws_reltime_quad:kind', 'pred_var'] = '$q_i^{(c)} G_i^{(c)}$'

# The pair tuples follow the order of eligible['kind'].unique(), so both
# orderings are mapped to the same header.
# NOTE(review): the header does not encode the sign convention; which medium
# is the regression's reference level should be confirmed before interpreting signs.
inter_stats.rename({
    'dv': 'Affect',
    'fe': 'F.E.',
    'pred_var': 'Predictor',
    ('elite', 'decahose'): 'Elite vs. Firehose',
    ('decahose', 'elite'): 'Elite vs. Firehose',
    ('elite', 'radio'): 'Elite vs. Radio',
    ('radio', 'elite'): 'Elite vs. Radio',
    ('radio', 'decahose'): 'Radio vs. Firehose',
    ('decahose', 'radio'): 'Radio vs. Firehose',
}, axis=1, inplace=True)
inter_stats.drop('Predictor', axis=1, inplace=True)
inter_stats['F.E.'] = inter_stats['F.E.'].map({True: 'Y', False: 'N'})
inter_stats['Affect'] = inter_stats['Affect'].map({'Emotional': 'Emo.', 'Negative': 'Neg.', 'Outraged': 'Out.'})
inter_stats = inter_stats.set_index([
    'Affect',
    # 'Predictor',
    'F.E.'
])

display(inter_stats)
print(inter_stats.style.to_latex(
    column_format='ll|r|r|r',
    hrules=True,
    convert_css=True,
))