In [None]:
import os
import math
import logging

import numpy as np
import pandas as pd

from statsmodels.distributions.empirical_distribution import ECDF

import seaborn as sns
import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

from IPython.display import display

In [None]:
# Root logging: timestamped INFO-level messages for the whole notebook.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

# Notebook-level logger, plus a quieter threshold for gensim's own logger.
logger = logging.getLogger(__name__)
logging.getLogger("gensim").setLevel(logging.WARNING)

In [None]:
# Every data path used below is relative to this directory.
repo_root = os.path.expanduser('~/github/masthesis/')
os.chdir(repo_root)

In [None]:
# One shared palette so each data source keeps the same colour in every figure.
colors = sns.color_palette('bright', 10)

elite_color, radio_color = colors[3], colors[0]
decahose_color, decahose2_color = colors[9], colors[4]

# Load data

In [None]:
# Event metadata; `date` is the calendar date of each event's timestamp.
events = (
    pd.read_csv('data/paper-round-3/metadata/event-terms.csv', parse_dates=['timestamp'])
    .assign(date=lambda df: df['timestamp'].dt.date)
)

# Tick tables for the two media; timestamps are converted right after loading.
radio_ticks = (
    pd.read_csv('data/paper-round-3/event-annotated/ticks-radio.csv')
    .assign(timestamp=lambda df: pd.to_datetime(df['timestamp']))
)

elite_ticks = (
    pd.read_csv('data/paper-round-3/event-annotated/ticks-elite.csv')
    .assign(timestamp=lambda df: pd.to_datetime(df['timestamp']))
)

# Radio: keep rows whose `is_public` flag is known, drop the station-level
# breakdown columns, and turn the flag into a `conservative` indicator
# (is_public == 0 -> 1, anything else -> 0). The column keeps its position.
radio_ticks_overall = (
    radio_ticks
    .loc[radio_ticks['is_public'].notna(), :]
    .drop(['station_census_region', 'am_band', 'syndicated'], axis=1)
    .assign(is_public=lambda df: (df['is_public'] == 0).astype(int))
    .rename({'is_public': 'conservative'}, axis=1)
)

# Elite: keep rows whose `conservative` label is known and store it as int.
elite_ticks_overall = (
    elite_ticks
    .loc[elite_ticks['conservative'].notna(), :]
    .drop('is_retweet', axis=1)
    .astype({'conservative': int})
)

# Nothing downstream handles missing values, so fail loudly here.
assert radio_ticks_overall.isna().sum().sum() == 0
assert elite_ticks_overall.isna().sum().sum() == 0

In [None]:
# Event indicator columns present in BOTH tick tables.
# Sorted because set iteration order for strings can differ between kernel
# sessions (hash randomisation), which would reshuffle the figures and result
# rows from one run to the next.
event_cols = sorted(
    set(c for c in list(radio_ticks) if c.startswith('event_')) &
    set(c for c in list(elite_ticks) if c.startswith('event_'))
)

In [None]:
# Index both tick tables by (freq, timestamp, conservative) for the
# pd.IndexSlice lookups used below.
tick_index_cols = ['freq', 'timestamp', 'conservative']

# Guarded so that re-running this cell is a no-op; an unguarded second
# set_index would raise KeyError because the columns are already in the index.
if list(radio_ticks_overall.index.names) != tick_index_cols:
    radio_ticks_overall = radio_ticks_overall.set_index(tick_index_cols)
if list(elite_ticks_overall.index.names) != tick_index_cols:
    elite_ticks_overall = elite_ticks_overall.set_index(tick_index_cols)

# Visualize raw counts

## 15 minutes

In [None]:
# One 2x2 figure per event at 15-minute resolution:
# rows are radio / elite, columns are liberal (0) / conservative (1).
period = '15min'

# (source column, conservative flag, grid row, grid col, panel title, line colour),
# listed in drawing order.
panel_layout = [
    ('radio', 0, 0, 0, 'Radio / Liberal', radio_color),
    ('elite', 0, 1, 0, 'Elite / Liberal', elite_color),
    ('radio', 1, 0, 1, 'Radio / Conservative', radio_color),
    ('elite', 1, 1, 1, 'Elite / Conservative', elite_color),
]

for event in event_cols:
    event_name = event.replace('event_', '')

    # Radio and elite counts for this event side by side, still indexed by
    # (freq, timestamp, conservative).
    tmp = pd.concat([
        radio_ticks_overall.loc[pd.IndexSlice[period, :], event].rename('radio'),
        elite_ticks_overall.loc[pd.IndexSlice[period, :], event].rename('elite'),
    ], axis=1)

    # Wide layout: one column per (source, conservative) pair.
    tmp = tmp.reset_index().pivot(
        index=['freq', 'timestamp'],
        columns='conservative',
        values=['radio', 'elite']
    )

    # Only one frequency was selected, so the freq level carries no information.
    assert tmp.index.get_level_values(0).unique().shape[0] == 1
    tmp.index = tmp.index.droplevel(0)
    tmp = tmp.reset_index()

    # .item() raises unless exactly one metadata row matches the event.
    timestamp = events.loc[events['event'] == event_name, 'timestamp'].item()

    # Show from 6 hours before the event to 3 days after it.
    window_start = pd.Timestamp(timestamp) - pd.Timedelta('6h')
    window_end = pd.Timestamp(timestamp) + pd.Timedelta('3d')
    tmp = tmp.loc[
        (tmp['timestamp'] >= window_start) & (tmp['timestamp'] <= window_end),
    :]

    tmp = tmp.set_index('timestamp')

    fig, axes = plt.subplots(2, 2, figsize=(12, 10), sharex=True)

    fig.suptitle(event_name.replace('_', ' ').title().replace('Nba', 'NBA'))

    # Red vertical line marks the event time in every panel.
    for ax in axes.flatten():
        ax.axvline(timestamp, color='red')

    for source, conservative, grid_row, grid_col, title, color in panel_layout:
        ax = axes[grid_row][grid_col]
        ax.set_title(title)
        tmp[source][conservative].plot(ax=ax, rot=45, label=source, color=color)

    fig.tight_layout()

## 6 hours

In [None]:
# One 2x2 figure per event at 6-hour resolution:
# rows are radio / elite, columns are liberal (0) / conservative (1).
period = '6H'

# (source column, conservative flag, grid row, grid col, panel title, line colour),
# listed in drawing order.
panel_layout = [
    ('radio', 0, 0, 0, 'Radio / Liberal', radio_color),
    ('elite', 0, 1, 0, 'Elite / Liberal', elite_color),
    ('radio', 1, 0, 1, 'Radio / Conservative', radio_color),
    ('elite', 1, 1, 1, 'Elite / Conservative', elite_color),
]

for event in event_cols:
    event_name = event.replace('event_', '')

    # Radio and elite counts for this event side by side, still indexed by
    # (freq, timestamp, conservative).
    tmp = pd.concat([
        radio_ticks_overall.loc[pd.IndexSlice[period, :], event].rename('radio'),
        elite_ticks_overall.loc[pd.IndexSlice[period, :], event].rename('elite'),
    ], axis=1)

    # Wide layout: one column per (source, conservative) pair.
    tmp = tmp.reset_index().pivot(
        index=['freq', 'timestamp'],
        columns='conservative',
        values=['radio', 'elite']
    )

    # Only one frequency was selected, so the freq level carries no information.
    assert tmp.index.get_level_values(0).unique().shape[0] == 1
    tmp.index = tmp.index.droplevel(0)
    tmp = tmp.reset_index()

    # .item() raises unless exactly one metadata row matches the event.
    timestamp = events.loc[events['event'] == event_name, 'timestamp'].item()

    # Show from 6 hours before the event to 3 days after it.
    window_start = pd.Timestamp(timestamp) - pd.Timedelta('6h')
    window_end = pd.Timestamp(timestamp) + pd.Timedelta('3d')
    tmp = tmp.loc[
        (tmp['timestamp'] >= window_start) & (tmp['timestamp'] <= window_end),
    :]

    tmp = tmp.set_index('timestamp')

    fig, axes = plt.subplots(2, 2, figsize=(12, 10), sharex=True)

    fig.suptitle(event_name.replace('_', ' ').title().replace('Nba', 'NBA'))

    # Red vertical line marks the event time in every panel.
    for ax in axes.flatten():
        ax.axvline(timestamp, color='red')

    for source, conservative, grid_row, grid_col, title, color in panel_layout:
        ax = axes[grid_row][grid_col]
        ax.set_title(title)
        tmp[source][conservative].plot(ax=ax, rot=45, label=source, color=color)

    fig.tight_layout()

## 1 day

In [None]:
# One 2x2 figure per event at daily resolution:
# rows are radio / elite, columns are liberal (0) / conservative (1).
period = '1D'

# (source column, conservative flag, grid row, grid col, panel title, line colour),
# listed in drawing order.
panel_layout = [
    ('radio', 0, 0, 0, 'Radio / Liberal', radio_color),
    ('elite', 0, 1, 0, 'Elite / Liberal', elite_color),
    ('radio', 1, 0, 1, 'Radio / Conservative', radio_color),
    ('elite', 1, 1, 1, 'Elite / Conservative', elite_color),
]

for event in event_cols:
    event_name = event.replace('event_', '')

    # Radio and elite counts for this event side by side, still indexed by
    # (freq, timestamp, conservative).
    tmp = pd.concat([
        radio_ticks_overall.loc[pd.IndexSlice[period, :], event].rename('radio'),
        elite_ticks_overall.loc[pd.IndexSlice[period, :], event].rename('elite'),
    ], axis=1)

    # Wide layout: one column per (source, conservative) pair.
    tmp = tmp.reset_index().pivot(
        index=['freq', 'timestamp'],
        columns='conservative',
        values=['radio', 'elite']
    )

    # Only one frequency was selected, so the freq level carries no information.
    assert tmp.index.get_level_values(0).unique().shape[0] == 1
    tmp.index = tmp.index.droplevel(0)
    tmp = tmp.reset_index()

    # .item() raises unless exactly one metadata row matches the event.
    timestamp = events.loc[events['event'] == event_name, 'timestamp'].item()

    # Show from 6 hours before the event to 3 days after it.
    window_start = pd.Timestamp(timestamp) - pd.Timedelta('6h')
    window_end = pd.Timestamp(timestamp) + pd.Timedelta('3d')
    tmp = tmp.loc[
        (tmp['timestamp'] >= window_start) & (tmp['timestamp'] <= window_end),
    :]

    tmp = tmp.set_index('timestamp')

    fig, axes = plt.subplots(2, 2, figsize=(12, 10), sharex=True)

    fig.suptitle(event_name.replace('_', ' ').title().replace('Nba', 'NBA'))

    # Red vertical line marks the event time in every panel.
    for ax in axes.flatten():
        ax.axvline(timestamp, color='red')

    for source, conservative, grid_row, grid_col, title, color in panel_layout:
        ax = axes[grid_row][grid_col]
        ax.set_title(title)
        tmp[source][conservative].plot(ax=ax, rot=45, label=source, color=color)

    fig.tight_layout()

# Summary stats

## Utils

In [None]:
def repeat_reltime_row(row):
    """Return a list holding `row['reltime']` repeated `row['cnt']` times."""
    copies = []
    for _ in range(row['cnt']):
        copies.append(row['reltime'])
    return copies

def repeat_reltime(df):
    """Expand a tick table into one `reltime` observation per counted mention.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `reltime` column and a `cnt` column; each `reltime`
        value is repeated `cnt` times.

    Returns
    -------
    pandas.Series
        The repeated `reltime` values in row order, on a fresh default index.
        Empty (rather than an error) when `df` has no rows.
    """
    # Cast counts to int so a float-typed `cnt` column works; clip at zero so
    # non-positive counts contribute nothing, as `repeat_reltime_row` does.
    counts = df['cnt'].astype(int).clip(lower=0).to_numpy()

    # Vectorised repeat: no per-row Python loop and well-defined on empty input.
    return pd.Series(np.repeat(df['reltime'].to_numpy(), counts))

In [None]:
def iqr(df):
    """Interquartile range (75th minus 25th percentile) of the `reltime`
    values of `df` after expansion by `repeat_reltime`."""
    q25, q75 = np.percentile(repeat_reltime(df), [25, 75])
    return q75 - q25

def avg(df):
    """Mean of the `reltime` values of `df` after expansion by `repeat_reltime`."""
    return repeat_reltime(df).mean()

def std(df):
    """Standard deviation (pandas `Series.std` defaults) of the `reltime`
    values of `df` after expansion by `repeat_reltime`."""
    return repeat_reltime(df).std()

## Compute stats

In [None]:
# Per (event, conservative) summary of when mentions occur relative to the
# event: total count, mean and standard deviation of the offset in seconds,
# for radio and elite separately, over the 4 days after the event.
period = '15min'

records = []
for event_col in event_cols:
    event = event_col.replace('event_', '')

    # .item() raises unless exactly one metadata row matches the event.
    focal_dt = events.loc[events['event'] == event, 'timestamp'].item()
    start_dt = focal_dt
    end_dt = start_dt + pd.Timedelta('4d')

    for conservative in (0, 1):
        tmp = pd.concat([
            radio_ticks_overall.loc[pd.IndexSlice[period, :, conservative], event_col].rename('radio'),
            elite_ticks_overall.loc[pd.IndexSlice[period, :, conservative], event_col].rename('elite'),
        ], axis=1)

        # Same preparation for both sources: keep ticks with a positive count
        # inside [start_dt, end_dt] and express their time as seconds since
        # start_dt.
        reltimes = {}
        for source in ('radio', 'elite'):
            rt = tmp.loc[tmp[source] > 0, :].reset_index()[[source, 'timestamp']].rename({source: 'cnt'}, axis=1)
            in_window = (rt['timestamp'] >= start_dt) & (rt['timestamp'] <= end_dt)
            # Explicit copy: columns are assigned below, and writing to a
            # filtered slice would trigger SettingWithCopyWarning.
            rt = rt.loc[in_window, :].copy()
            rt['reltime'] = (rt['timestamp'] - start_dt).dt.total_seconds()
            assert rt['cnt'].isna().sum() == 0
            rt['cnt'] = rt['cnt'].astype(int)
            reltimes[source] = rt

        radio_rt, elite_rt = reltimes['radio'], reltimes['elite']

        records.append({'conservative': conservative, 'event': event, 'metric': 'count', 'elite': elite_rt['cnt'].sum(), 'radio': radio_rt['cnt'].sum()})
        records.append({'conservative': conservative, 'event': event, 'metric': 'avg', 'elite': avg(elite_rt), 'radio': avg(radio_rt)})
        records.append({'conservative': conservative, 'event': event, 'metric': 'std', 'elite': std(elite_rt), 'radio': std(radio_rt)})

out = pd.DataFrame(records)
out['radio_minus_elite'] = out['radio'] - out['elite']

out.sort_values(['metric', 'conservative'])

## Mean reltime

In [None]:
# Mean relative time per (event, conservative), rounded to whole numbers.
avg_rows = out.loc[out['metric'] == 'avg', :].drop('metric', axis=1)
tmp = (
    avg_rows
    .set_index(['event', 'conservative'])
    .sort_index()
    .round(0)
    .astype(int)
)

tmp

In [None]:
# Overall means across every row of `tmp`; the last entry is the mean
# radio-minus-elite gap expressed as a fraction of the mean elite value.
elite_mean = tmp['elite'].mean()
radio_mean = tmp['radio'].mean()
gap_mean = tmp['radio_minus_elite'].mean()

elite_mean, radio_mean, gap_mean / elite_mean

In [None]:
# Same summary split by the `conservative` flag. Note that the entry labelled
# 'radio_minus_elite' holds the mean gap divided by the mean elite value.
tmp.groupby('conservative').apply(
    lambda grp: pd.Series({
        'elite': grp['elite'].mean(),
        'radio': grp['radio'].mean(),
        'radio_minus_elite': grp['radio_minus_elite'].mean() / grp['elite'].mean(),
    })
)

## SD reltime

In [None]:
# Standard deviation of relative time per (event, conservative), rounded to
# whole numbers.
std_rows = out.loc[out['metric'] == 'std', :].drop('metric', axis=1)
tmp = (
    std_rows
    .set_index(['event', 'conservative'])
    .sort_index()
    .round(0)
    .astype(int)
)

tmp

In [None]:
# Overall means across every row of `tmp`; the last entry is the mean
# radio-minus-elite gap expressed as a fraction of the mean elite value.
elite_mean = tmp['elite'].mean()
radio_mean = tmp['radio'].mean()
gap_mean = tmp['radio_minus_elite'].mean()

elite_mean, radio_mean, gap_mean / elite_mean

In [None]:
# Same summary split by the `conservative` flag. Note that the entry labelled
# 'radio_minus_elite' holds the mean gap divided by the mean elite value.
tmp.groupby('conservative').apply(
    lambda grp: pd.Series({
        'elite': grp['elite'].mean(),
        'radio': grp['radio'].mean(),
        'radio_minus_elite': grp['radio_minus_elite'].mean() / grp['elite'].mean(),
    })
)

# Plot the average empirical CDF/PDF

In [None]:
# in seconds
# doesn't make sense to go lower than 15 mins because of tick frequency
cdf_query_end = 48 * 3600
cdf_query_inc = 15 * 60

# Offsets (seconds after the event) at which every ECDF is evaluated.
cdf_query_pts = np.arange(0, cdf_query_end, cdf_query_inc)

period = '15min'

# One evaluated ECDF per event, keyed by the conservative flag.
tw_curves = {0: [], 1: []}
rd_curves = {0: [], 1: []}
for event_col in event_cols:
    event = event_col.replace('event_', '')

    # .item() raises unless exactly one metadata row matches the event.
    focal_dt = events.loc[events['event'] == event, 'timestamp'].item()
    start_dt = focal_dt
    end_dt = start_dt + pd.Timedelta('2d')

    for conservative in (0, 1):
        tmp = pd.concat([
            radio_ticks_overall.loc[pd.IndexSlice[period, :, conservative], event_col].rename('radio'),
            elite_ticks_overall.loc[pd.IndexSlice[period, :, conservative], event_col].rename('elite'),
        ], axis=1)

        # Same preparation for both sources: keep ticks with a positive count
        # inside [start_dt, end_dt] and express their time as seconds since
        # start_dt.
        reltimes = {}
        for source in ('radio', 'elite'):
            rt = tmp.loc[tmp[source] > 0, :].reset_index()[[source, 'timestamp']].rename({source: 'cnt'}, axis=1)
            in_window = (rt['timestamp'] >= start_dt) & (rt['timestamp'] <= end_dt)
            # Explicit copy: columns are assigned below, and writing to a
            # filtered slice would trigger SettingWithCopyWarning.
            rt = rt.loc[in_window, :].copy()
            rt['reltime'] = (rt['timestamp'] - start_dt).dt.total_seconds()
            assert rt['cnt'].isna().sum() == 0
            rt['cnt'] = rt['cnt'].astype(int)
            reltimes[source] = rt

        radio_rt, elite_rt = reltimes['radio'], reltimes['elite']

        # ECDF of the count-expanded offsets, sampled on the common grid.
        tw_curves[conservative].append(ECDF(repeat_reltime(elite_rt))(cdf_query_pts))
        rd_curves[conservative].append(ECDF(repeat_reltime(radio_rt))(cdf_query_pts))

# Stack into (n_events, n_query_points) arrays; built as new dicts so the
# values never switch type from list to ndarray in place.
tw_vals = {side: np.stack(curves) for side, curves in tw_curves.items()}
rd_vals = {side: np.stack(curves) for side, curves in rd_curves.items()}

assert tw_vals[0].shape == rd_vals[0].shape
assert tw_vals[1].shape == rd_vals[1].shape

In [None]:
# in units of cdf_query_inc
plot_max = 96
plot_interval = 1

# For each side, average the per-event ECDF curves and take a numerical
# gradient of that average as a PDF estimate.
frames = {}
for side, conservative in (('lib', 0), ('con', 1)):
    cdf = pd.DataFrame({
        'Elite': tw_vals[conservative][:, 0:plot_max:plot_interval].mean(axis=0),
        'Radio': rd_vals[conservative][:, 0:plot_max:plot_interval].mean(axis=0),
    })
    # Convert the positional index into multiples of cdf_query_inc.
    cdf.index = (cdf.index.to_series() * cdf_query_inc * plot_interval)

    # np.gradient runs with its default unit spacing, i.e. slope per plotted step.
    pdf = cdf.copy()
    for column in ('Elite', 'Radio'):
        pdf[column] = np.gradient(pdf[column])

    frames[side] = (cdf, pdf)

cdf_lib, pdf_lib = frames['lib']
cdf_con, pdf_con = frames['con']

fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True)

# (data, axes, title, y label) for each panel, row by row.
panels = [
    (cdf_lib, axes[0][0], 'Liberal: Pooled Empirical CDF', 'Proportion'),
    (pdf_lib, axes[0][1], 'Liberal: Pooled Empirical PDF', 'Density'),
    (cdf_con, axes[1][0], 'Conservative: Pooled Empirical CDF', 'Proportion'),
    (pdf_con, axes[1][1], 'Conservative: Pooled Empirical PDF', 'Density'),
]

# Draw the elite line on every panel first, then the radio line.
for column, color in (('Elite', elite_color), ('Radio', radio_color)):
    for frame, ax, _title, _ylabel in panels:
        frame[column].plot(ax=ax, color=color)

for _frame, ax, title, ylabel in panels:
    ax.set_title(title)
    ax.set_xlabel('Time')
    ax.set_ylabel(ylabel)

# CDF panels (left column) share the full 0-1 range.
for cdf_ax in (axes[0][0], axes[1][0]):
    cdf_ax.set_ylim(0, 1)

# Tick labels: x values divided by 3600, shown as whole hours.
fmt = mp.ticker.FuncFormatter(lambda x, pos: f'{x / 3600:.0f}h')
for ax in axes.flatten():
    ax.xaxis.set_major_formatter(fmt)

fig.tight_layout()

print(f'Based on {tw_vals[0].shape[0]} event(s)')

In [None]:
# in units of cdf_query_inc
plot_max = 192
plot_interval = 1

# For each side, average the per-event ECDF curves and take a numerical
# gradient of that average as a PDF estimate.
frames = {}
for side, conservative in (('lib', 0), ('con', 1)):
    cdf = pd.DataFrame({
        'Elite': tw_vals[conservative][:, 0:plot_max:plot_interval].mean(axis=0),
        'Radio': rd_vals[conservative][:, 0:plot_max:plot_interval].mean(axis=0),
    })
    # Convert the positional index into multiples of cdf_query_inc.
    cdf.index = (cdf.index.to_series() * cdf_query_inc * plot_interval)

    # np.gradient runs with its default unit spacing, i.e. slope per plotted step.
    pdf = cdf.copy()
    for column in ('Elite', 'Radio'):
        pdf[column] = np.gradient(pdf[column])

    frames[side] = (cdf, pdf)

cdf_lib, pdf_lib = frames['lib']
cdf_con, pdf_con = frames['con']

fig, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=True)

# (data, axes, title, y label) for each panel, row by row.
panels = [
    (cdf_lib, axes[0][0], 'Liberal: Pooled Empirical CDF', 'Proportion'),
    (pdf_lib, axes[0][1], 'Liberal: Pooled Empirical PDF', 'Density'),
    (cdf_con, axes[1][0], 'Conservative: Pooled Empirical CDF', 'Proportion'),
    (pdf_con, axes[1][1], 'Conservative: Pooled Empirical PDF', 'Density'),
]

# Draw the elite line on every panel first, then the radio line.
for column, color in (('Elite', elite_color), ('Radio', radio_color)):
    for frame, ax, _title, _ylabel in panels:
        frame[column].plot(ax=ax, color=color)

for _frame, ax, title, ylabel in panels:
    ax.set_title(title)
    ax.set_xlabel('Time')
    ax.set_ylabel(ylabel)

# CDF panels (left column) share the full 0-1 range.
for cdf_ax in (axes[0][0], axes[1][0]):
    cdf_ax.set_ylim(0, 1)

# Tick labels: x values divided by 3600, shown as whole hours.
fmt = mp.ticker.FuncFormatter(lambda x, pos: f'{x / 3600:.0f}h')
for ax in axes.flatten():
    ax.xaxis.set_major_formatter(fmt)

fig.tight_layout()

print(f'Based on {tw_vals[0].shape[0]} event(s)')