In [None]:
import os
import math
import logging

import numpy as np
import pandas as pd

from statsmodels.distributions.empirical_distribution import ECDF

import seaborn as sns

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

from IPython.display import display

In [None]:
# Notebook-wide logging: INFO-level records formatted as
# "time : level : message"; the logger named "gensim" is raised to WARNING.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

logger = logging.getLogger(__name__)

gensim_logger = logging.getLogger("gensim")
gensim_logger.setLevel(logging.WARNING)

In [None]:
# The data paths used below are relative, so make the project checkout the
# working directory first.
project_root = os.path.expanduser('~/github/masthesis/')
os.chdir(project_root)

In [None]:
# Ten-entry seaborn 'colorblind' palette; the first four entries are assigned
# to the data sources so every figure uses the same color per source.
colors = sns.color_palette('colorblind', n_colors=10)

radio_color, decahose_color, decahose2_color, elite_color = colors[:4]

# Load data

In [None]:
def _read_ticks(path):
    """Read a tick-count CSV and convert its ``timestamp`` column to datetimes."""
    ticks = pd.read_csv(path)
    ticks['timestamp'] = pd.to_datetime(ticks['timestamp'])
    return ticks


def _overall_rows(ticks, facet_cols):
    """Return the rows where every column in ``facet_cols`` is NaN, minus those columns.

    NOTE(review): presumably a row with all facet columns missing is the
    unfaceted (overall) aggregate -- confirm against the upstream export.
    """
    all_facets_missing = ticks[facet_cols].isna().all(axis=1)
    return ticks.loc[all_facets_missing, :].drop(columns=facet_cols)


# Event metadata; `date` is the calendar date of each row's timestamp.
events = pd.read_csv('data/paper-round-3/metadata/event-terms.csv', parse_dates=['timestamp'])
events['date'] = events['timestamp'].dt.date

event_dates = events.groupby('event')['date'].max()

# Per-source tick counts.
radio_ticks = _read_ticks('data/paper-round-3/event-annotated/ticks-radio.csv')
elite_ticks = _read_ticks('data/paper-round-3/event-annotated/ticks-elite.csv')
decahose_ticks = _read_ticks('data/paper-round-3/event-annotated/ticks-decahose.csv')

# No trailing backslash continuations here: a dangling `\` before a blank line
# silently swallows whatever statement is later written on that line.
radio_ticks_overall = _overall_rows(
    radio_ticks, ['is_public', 'station_census_region', 'am_band', 'syndicated']
)
elite_ticks_overall = _overall_rows(elite_ticks, ['is_retweet', 'conservative'])
decahose_ticks_overall = _overall_rows(decahose_ticks, ['is_retweet'])

# Once the facet columns are gone, nothing else may be missing.
assert radio_ticks_overall.isna().sum().sum() == 0, 'radio overall ticks contain NaN'
assert elite_ticks_overall.isna().sum().sum() == 0, 'elite overall ticks contain NaN'
assert decahose_ticks_overall.isna().sum().sum() == 0, 'decahose overall ticks contain NaN'

In [None]:
# Event columns present in all three sources.
# Sorted on purpose: iteration order of a set of strings changes between
# kernel restarts (hash randomization), which would otherwise reshuffle the
# figures and the row order of every per-event result built from this list.
event_cols = sorted(
    set(c for c in radio_ticks.columns if c.startswith('event_'))
    & set(c for c in elite_ticks.columns if c.startswith('event_'))
    & set(c for c in decahose_ticks.columns if c.startswith('event_'))
)

In [None]:
# Index every overall frame by (freq, timestamp) so later cells can slice a
# single tick frequency with pd.IndexSlice.
index_keys = ['freq', 'timestamp']

radio_ticks_overall, elite_ticks_overall, decahose_ticks_overall = [
    ticks.set_index(index_keys)
    for ticks in (radio_ticks_overall, elite_ticks_overall, decahose_ticks_overall)
]

# Summarize events

In [None]:
def _display_name(raw_event):
    """Turn an event key into a title-cased, space-separated label."""
    return raw_event.replace('event_', '').replace('_', ' ').title().replace('Nba', 'NBA')


def _three_decimals(x):
    """Format a float with three decimal places for display."""
    return '%.3f' % x


# Table of events: replace the timestamp by its timezone-stripped time of day.
tmp = events.copy().set_index('event')
tmp = (
    tmp
    .assign(time=tmp['timestamp'].dt.tz_localize(None).dt.time)
    .drop(columns='timestamp')
    .rename(columns=str.title)
)

tmp.index = [_display_name(raw_event) for raw_event in tmp.index]
tmp.index.name = 'Event'

with pd.option_context('display.float_format', _three_decimals, 'display.max_rows', None):
    display(tmp)

In [None]:
# Emit the event table as a LaTeX `table` environment.
latex_options = dict(
    hrules=True,
    column_format='l|r|r',
    position='ht',
    label='tab:manual_events',
    position_float='centering',
    environment='table',
    convert_css=True,
)

print(tmp.style.to_latex(**latex_options))

# Visualize raw counts

## 15min

In [None]:
# (column name, overall tick frame, panel title, line color), left to right.
panels = [
    ('radio', radio_ticks_overall, 'Radio', radio_color),
    ('elite', elite_ticks_overall, 'Elite', elite_color),
    ('decahose', decahose_ticks_overall, 'Decahose', decahose_color),
]

# One three-panel figure per event: 15-minute counts from 6 hours before to
# 3 days after the event timestamp, which is marked by a red vertical line.
for event in event_cols:
    period = '15min'

    tmp = pd.concat(
        [ticks.loc[pd.IndexSlice[period, :], event].rename(column)
         for column, ticks, _, _ in panels],
        axis=1,
    )
    tmp = tmp.reset_index().drop(columns='freq')

    is_focal = events['event'] == event.replace('event_', '')
    timestamp = events.loc[is_focal, 'timestamp'].item()
    date = events.loc[is_focal, 'date'].item()

    window_start = pd.Timestamp(timestamp) - pd.Timedelta('6h')
    window_end = pd.Timestamp(timestamp) + pd.Timedelta('3d')
    in_window = tmp['timestamp'].between(window_start, window_end)
    tmp = tmp.loc[in_window, :].set_index('timestamp')

    fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharex=True)
    axes = axes.flatten()

    for ax, (_, _, title, _) in zip(axes, panels):
        ax.set_title(title)

    for ax, (column, _, _, color) in zip(axes, panels):
        tmp[column].plot(ax=ax, rot=45, label=column, color=color)

    for ax in axes:
        ax.axvline(timestamp, color='red')

    fig.suptitle(event.replace('event_', '').replace('_', ' ').title().replace('Nba', 'NBA'))

    fig.tight_layout()

## 6 hours

In [None]:
# (column name, overall tick frame, panel title, line color), left to right.
panels = [
    ('radio', radio_ticks_overall, 'Radio', radio_color),
    ('elite', elite_ticks_overall, 'Elite', elite_color),
    ('decahose', decahose_ticks_overall, 'Decahose', decahose_color),
]

# One three-panel figure per event: 6-hour counts from 6 hours before to
# 3 days after the event timestamp, which is marked by a red vertical line.
for event in event_cols:
    period = '6H'

    tmp = pd.concat(
        [ticks.loc[pd.IndexSlice[period, :], event].rename(column)
         for column, ticks, _, _ in panels],
        axis=1,
    )
    tmp = tmp.reset_index().drop(columns='freq')

    is_focal = events['event'] == event.replace('event_', '')
    timestamp = events.loc[is_focal, 'timestamp'].item()
    date = events.loc[is_focal, 'date'].item()

    window_start = pd.Timestamp(timestamp) - pd.Timedelta('6h')
    window_end = pd.Timestamp(timestamp) + pd.Timedelta('3d')
    in_window = tmp['timestamp'].between(window_start, window_end)
    tmp = tmp.loc[in_window, :].set_index('timestamp')

    fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharex=True)
    axes = axes.flatten()

    for ax, (_, _, title, _) in zip(axes, panels):
        ax.set_title(title)

    for ax, (column, _, _, color) in zip(axes, panels):
        tmp[column].plot(ax=ax, rot=45, label=column, color=color)

    for ax in axes:
        ax.axvline(timestamp, color='red')

    fig.suptitle(event.replace('event_', '').replace('_', ' ').title().replace('Nba', 'NBA'))

    fig.tight_layout()

## 1 day

In [None]:
# (column name, overall tick frame, panel title, line color), left to right.
panels = [
    ('radio', radio_ticks_overall, 'Radio', radio_color),
    ('elite', elite_ticks_overall, 'Elite', elite_color),
    ('decahose', decahose_ticks_overall, 'Decahose', decahose_color),
]

# One three-panel figure per event: daily counts from 6 hours before to
# 3 days after the event timestamp, which is marked by a red vertical line.
for event in event_cols:
    period = '1D'

    tmp = pd.concat(
        [ticks.loc[pd.IndexSlice[period, :], event].rename(column)
         for column, ticks, _, _ in panels],
        axis=1,
    )
    tmp = tmp.reset_index().drop(columns='freq')

    is_focal = events['event'] == event.replace('event_', '')
    timestamp = events.loc[is_focal, 'timestamp'].item()
    date = events.loc[is_focal, 'date'].item()

    window_start = pd.Timestamp(timestamp) - pd.Timedelta('6h')
    window_end = pd.Timestamp(timestamp) + pd.Timedelta('3d')
    in_window = tmp['timestamp'].between(window_start, window_end)
    tmp = tmp.loc[in_window, :].set_index('timestamp')

    fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharex=True)
    axes = axes.flatten()

    for ax, (_, _, title, _) in zip(axes, panels):
        ax.set_title(title)

    for ax, (column, _, _, color) in zip(axes, panels):
        tmp[column].plot(ax=ax, rot=45, label=column, color=color)

    for ax in axes:
        ax.axvline(timestamp, color='red')

    fig.suptitle(event.replace('event_', '').replace('_', ' ').title().replace('Nba', 'NBA'))

    fig.tight_layout()

## For paper

In [None]:
# Paper figure: 15-minute counts for a single event, from 3 hours before to
# 2 days after the event timestamp (dashed red line), one panel per source.
event = 'event_bernie_drops_out'
period = '15min'

tmp = pd.concat(
    [
        ticks.loc[pd.IndexSlice[period, :], event].rename(column)
        for column, ticks in (
            ('radio', radio_ticks_overall),
            ('elite', elite_ticks_overall),
            ('decahose', decahose_ticks_overall),
        )
    ],
    axis=1,
)
tmp = tmp.reset_index().drop(columns='freq')

is_focal = events['event'] == event.replace('event_', '')
timestamp = events.loc[is_focal, 'timestamp'].item()
date = events.loc[is_focal, 'date'].item()

window_start = pd.Timestamp(timestamp) - pd.Timedelta('3h')
window_end = pd.Timestamp(timestamp) + pd.Timedelta('2d')
tmp = tmp.loc[tmp['timestamp'].between(window_start, window_end), :].set_index('timestamp')

fig, axes = plt.subplots(1, 3, figsize=(15, 5), sharex=True)
axes = axes.flatten()

# (column, panel title, line label, color) in left-to-right panel order.
paper_panels = [
    ('elite', 'Elite', 'elite', elite_color),
    ('decahose', 'Firehose', 'firehose', decahose2_color),
    ('radio', 'Radio', 'radio', radio_color),
]

for ax, (_, title, _, _) in zip(axes, paper_panels):
    ax.set_title(title, fontsize=14)

for ax, (column, _, label, color) in zip(axes, paper_panels):
    tmp[column].plot(ax=ax, rot=45, label=label, color=color)

for ax in axes:
    ax.axvline(timestamp, 0, 1, color='red', linewidth=1, linestyle='--')

for ax in axes:
    ax.set_xlabel('Time', fontsize=12)

for ax in axes:
    ax.set_ylabel('Count', fontsize=12)

# Leave headroom above the peak and a small margin below zero.
for ax, (column, _, _, _) in zip(axes, paper_panels):
    peak = tmp[column].max()
    ax.set_ylim(-0.05 * peak, 1.25 * peak)

fig.tight_layout()

# Summary stats

## Utils

In [None]:
def repeat_reltime_row(row):
    """Return ``row['reltime']`` repeated ``row['cnt']`` times, as a list.

    ``cnt`` is coerced with ``int()`` because a row taken from an all-numeric
    frame via ``DataFrame.apply(axis=1)`` arrives upcast to float, which
    ``range()`` rejects. Non-positive counts yield an empty list.
    """
    return [row['reltime']] * int(row['cnt'])


def repeat_reltime(df):
    """Expand per-tick counts into one ``reltime`` observation per count.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``reltime`` column (the value to repeat) and a ``cnt`` column
        (how many times to repeat it). Other columns are ignored.

    Returns
    -------
    pandas.Series
        The ``reltime`` values in row order, each repeated ``cnt`` times, on a
        fresh default integer index. Empty when ``df`` has no rows.

    Notes
    -----
    Vectorized with ``numpy.repeat`` rather than a row-wise ``apply``: the
    row-wise form fails on empty frames (``apply`` hands back a DataFrame,
    which has no ``tolist``) and on all-numeric frames (float-upcast counts),
    and builds one Python object per counted item.
    """
    # Negative counts contribute nothing, as range() treated them.
    counts = np.clip(df['cnt'].to_numpy(dtype=int), 0, None)
    reltimes = df['reltime'].to_numpy()

    return pd.Series(np.repeat(reltimes, counts))

In [None]:
def _counts_and_reltimes(df):
    """Return ``(counts, reltimes)`` from ``df`` as float arrays; negative counts become 0."""
    counts = np.clip(df['cnt'].to_numpy(dtype=float), 0, None)
    reltimes = df['reltime'].to_numpy(dtype=float)
    return counts, reltimes


def avg(df):
    """Count-weighted mean of ``df['reltime']``.

    Equals the mean of the series obtained by repeating each ``reltime``
    ``cnt`` times, but is computed from the weights directly so no expanded
    series is materialized. Returns NaN when the total count is zero.
    """
    counts, reltimes = _counts_and_reltimes(df)
    total = counts.sum()
    if total <= 0:
        return float('nan')

    return float(np.dot(counts, reltimes) / total)


def std(df):
    """Count-weighted sample standard deviation (ddof=1) of ``df['reltime']``.

    Equals ``Series.std()`` of the series obtained by repeating each
    ``reltime`` ``cnt`` times. Returns NaN when fewer than two observations
    are counted, as the pandas default does.
    """
    counts, reltimes = _counts_and_reltimes(df)
    total = counts.sum()
    if total <= 1:
        return float('nan')

    # Two-pass form: center on the weighted mean before squaring.
    center = np.dot(counts, reltimes) / total
    sum_sq = np.dot(counts, (reltimes - center) ** 2)

    return float(np.sqrt(sum_sq / (total - 1)))

## Compute stats

In [None]:
def _stats_window_ticks(frame, column, start_dt, end_dt):
    """Positive counts of ``column`` within ``[start_dt, end_dt]``.

    Returns a frame with columns ``cnt`` (integer count), ``timestamp`` and
    ``reltime`` (seconds elapsed since ``start_dt``).
    """
    has_mentions = frame[column].notna() & (frame[column] > 0)
    ticks = frame.loc[has_mentions, :].reset_index()[[column, 'timestamp']].rename({column: 'cnt'}, axis=1)
    in_window = (ticks['timestamp'] >= start_dt) & (ticks['timestamp'] <= end_dt)
    ticks = ticks.loc[in_window, :]
    ticks['reltime'] = (ticks['timestamp'] - start_dt).dt.total_seconds()
    ticks['cnt'] = ticks['cnt'].astype(int)
    return ticks


def _total_count(ticks):
    """Sum of the ``cnt`` column."""
    return ticks['cnt'].sum()


# Per event and source: total count, mean and SD of the time (seconds) since
# the event, over the four days starting at the event timestamp.
out = []
for event in event_cols:
    event = event.replace('event_', '')

    focal_dt = events.loc[events['event'] == event, 'timestamp'].item()
    start_dt = focal_dt
    end_dt = start_dt + pd.Timedelta('4d')

    period = '15min'

    tmp = pd.concat(
        [
            ticks.loc[pd.IndexSlice[period, :], 'event_' + event].rename(column)
            for column, ticks in (
                ('radio', radio_ticks_overall),
                ('elite', elite_ticks_overall),
                ('decahose', decahose_ticks_overall),
            )
        ],
        axis=1,
    )

    radio_rt = _stats_window_ticks(tmp, 'radio', start_dt, end_dt)
    elite_rt = _stats_window_ticks(tmp, 'elite', start_dt, end_dt)
    decahose_rt = _stats_window_ticks(tmp, 'decahose', start_dt, end_dt)

    for metric, stat in (('count', _total_count), ('avg', avg), ('std', std)):
        out.append({
            'event': event,
            'metric': metric,
            'elite': stat(elite_rt),
            'radio': stat(radio_rt),
            'decahose': stat(decahose_rt),
        })

out = pd.DataFrame(out)
out['radio_minus_elite'] = out['radio'] - out['elite']
out['radio_minus_decahose'] = out['radio'] - out['decahose']

out.sort_values('metric')

## Mean reltime

In [None]:
# Mean time-since-event per event and source, rounded to whole seconds.
is_avg_row = out['metric'] == 'avg'
tmp = out.loc[is_avg_row, :].drop(columns='metric').set_index('event')

tmp.round(0).astype(int)

In [None]:
# Across events: elite mean, radio mean, and the mean radio-minus-elite gap
# as a fraction of the elite mean.
elite_mean = tmp['elite'].mean()
radio_mean = tmp['radio'].mean()
elite_mean, radio_mean, tmp['radio_minus_elite'].mean() / tmp['elite'].mean()

In [None]:
# Across events: decahose mean, radio mean, and the mean radio-minus-decahose
# gap as a fraction of the decahose mean.
decahose_mean = tmp['decahose'].mean()
radio_mean = tmp['radio'].mean()
decahose_mean, radio_mean, tmp['radio_minus_decahose'].mean() / tmp['decahose'].mean()

## SD reltime

In [None]:
# SD of time-since-event per event and source, rounded to whole seconds.
is_std_row = out['metric'] == 'std'
tmp = out.loc[is_std_row, :].drop(columns='metric').set_index('event')

tmp.round(0).astype(int)

In [None]:
# Across events: elite mean SD, radio mean SD, and the mean radio-minus-elite
# gap as a fraction of the elite mean.
elite_mean = tmp['elite'].mean()
radio_mean = tmp['radio'].mean()
elite_mean, radio_mean, tmp['radio_minus_elite'].mean() / tmp['elite'].mean()

In [None]:
# Across events: decahose mean SD, radio mean SD, and the mean
# radio-minus-decahose gap as a fraction of the decahose mean.
decahose_mean = tmp['decahose'].mean()
radio_mean = tmp['radio'].mean()
decahose_mean, radio_mean, tmp['radio_minus_decahose'].mean() / tmp['decahose'].mean()

# Plot the average empirical CDF/PDF

In [None]:
# in seconds
# doesn't make sense to go lower than 15 mins because of tick frequency
cdf_query_end = 48 * 3600
cdf_query_inc = 15 * 60

cdf_query_pts = np.arange(0, cdf_query_end, cdf_query_inc)


def _cdf_window_ticks(frame, column, start_dt, end_dt):
    """Positive counts of ``column`` within ``[start_dt, end_dt]``.

    Returns a frame with columns ``cnt`` (integer count), ``timestamp`` and
    ``reltime`` (seconds elapsed since ``start_dt``).
    """
    has_mentions = frame[column].notna() & (frame[column] > 0)
    ticks = frame.loc[has_mentions, :].reset_index()[[column, 'timestamp']].rename({column: 'cnt'}, axis=1)
    in_window = (ticks['timestamp'] >= start_dt) & (ticks['timestamp'] <= end_dt)
    ticks = ticks.loc[in_window, :]
    ticks['reltime'] = (ticks['timestamp'] - start_dt).dt.total_seconds()
    ticks['cnt'] = ticks['cnt'].astype(int)
    return ticks


# One row per event: the empirical CDF of time-since-event, evaluated at the
# query points, for elite (tw), radio (rd) and decahose (dh).
tw_vals, rd_vals, dh_vals = [], [], []
for event in event_cols:
    event = event.replace('event_', '')

    focal_dt = events.loc[events['event'] == event, 'timestamp'].item()
    start_dt = focal_dt
    end_dt = start_dt + pd.Timedelta(days=2)

    period = '15min'

    tmp = pd.concat(
        [
            ticks.loc[pd.IndexSlice[period, :], 'event_' + event].rename(column)
            for column, ticks in (
                ('radio', radio_ticks_overall),
                ('elite', elite_ticks_overall),
                ('decahose', decahose_ticks_overall),
            )
        ],
        axis=1,
    )

    radio_rt = _cdf_window_ticks(tmp, 'radio', start_dt, end_dt)
    elite_rt = _cdf_window_ticks(tmp, 'elite', start_dt, end_dt)
    decahose_rt = _cdf_window_ticks(tmp, 'decahose', start_dt, end_dt)

    tw = repeat_reltime(elite_rt)
    tw_vals.append(ECDF(tw)(cdf_query_pts))

    rd = repeat_reltime(radio_rt)
    rd_vals.append(ECDF(rd)(cdf_query_pts))

    dh = repeat_reltime(decahose_rt)
    dh_vals.append(ECDF(dh)(cdf_query_pts))

tw_vals = np.stack(tw_vals)
rd_vals = np.stack(rd_vals)
dh_vals = np.stack(dh_vals)

assert tw_vals.shape == rd_vals.shape
assert tw_vals.shape == dh_vals.shape

In [None]:
# in units of cdf_query_inc
plot_max = 96
plot_interval = 1


def seconds_to_hours(x, pos):
    """Tick formatter: render a value in seconds as whole hours, e.g. '12h'."""
    return f'{x / 3600:.0f}h'


# Average the per-event ECDFs at each query point; index the result in seconds.
shown = slice(0, plot_max, plot_interval)
cdf = pd.DataFrame({
    'elite': tw_vals[:, shown].mean(axis=0),
    'radio': rd_vals[:, shown].mean(axis=0),
    'decahose': dh_vals[:, shown].mean(axis=0),
})
cdf.index = (cdf.index.to_series() * cdf_query_inc * plot_interval)

# PDF panel: finite-difference gradient of each averaged CDF.
pdf = cdf.copy()
for source in ('elite', 'radio', 'decahose'):
    pdf[source] = np.gradient(pdf[source])

fig, axes = plt.subplots(1, 2, figsize=(10, 5), sharex=True)

display_names = {'elite': 'Elite', 'radio': 'Radio', 'decahose': 'Firehose'}
cdf = cdf.rename(display_names, axis=1)
pdf = pdf.rename(display_names, axis=1)

curve_styles = [
    ('Elite', elite_color, 'solid'),
    ('Firehose', decahose2_color, 'dashdot'),
    ('Radio', radio_color, 'dotted'),
]
for ax, curves in zip(axes, (cdf, pdf)):
    for label, color, linestyle in curve_styles:
        curves[label].plot(ax=ax, color=color, linestyle=linestyle)

for ax, title in zip(axes, ('Pooled Empirical CDF', 'Pooled Empirical PDF')):
    ax.set_title(title)

for ax in axes:
    ax.set_xlabel('Time')

for ax, ylabel in zip(axes, ('Proportion', 'Density')):
    ax.set_ylabel(ylabel)

axes[0].set_ylim(0, 1)

fmt = mp.ticker.FuncFormatter(seconds_to_hours)
axes[0].xaxis.set_major_formatter(fmt)

axes[1].legend()
fig.tight_layout()

print(f'Based on {tw_vals.shape[0]} event(s)')

In [None]:
# in units of cdf_query_inc
plot_max = 192
plot_interval = 1


def seconds_to_hours(x, pos):
    """Tick formatter: render a value in seconds as whole hours, e.g. '12h'."""
    return f'{x / 3600:.0f}h'


# Average the per-event ECDFs at each query point; index the result in seconds.
shown = slice(0, plot_max, plot_interval)
cdf = pd.DataFrame({
    'elite': tw_vals[:, shown].mean(axis=0),
    'radio': rd_vals[:, shown].mean(axis=0),
    'decahose': dh_vals[:, shown].mean(axis=0),
})
cdf.index = (cdf.index.to_series() * cdf_query_inc * plot_interval)

# PDF panel: finite-difference gradient of each averaged CDF.
pdf = cdf.copy()
for source in ('elite', 'radio', 'decahose'):
    pdf[source] = np.gradient(pdf[source])

fig, axes = plt.subplots(1, 2, figsize=(10, 5), sharex=True)

display_names = {'elite': 'Elite', 'radio': 'Radio', 'decahose': 'Firehose'}
cdf = cdf.rename(display_names, axis=1)
pdf = pdf.rename(display_names, axis=1)

curve_styles = [
    ('Elite', elite_color, 'solid'),
    ('Firehose', decahose2_color, 'dashdot'),
    ('Radio', radio_color, 'dotted'),
]
for ax, curves in zip(axes, (cdf, pdf)):
    for label, color, linestyle in curve_styles:
        curves[label].plot(ax=ax, color=color, linestyle=linestyle)

for ax, title in zip(axes, ('Pooled Empirical CDF', 'Pooled Empirical PDF')):
    ax.set_title(title)

for ax in axes:
    ax.set_xlabel('Time')

for ax, ylabel in zip(axes, ('Proportion', 'Density')):
    ax.set_ylabel(ylabel)

axes[0].set_ylim(0, 1)

fmt = mp.ticker.FuncFormatter(seconds_to_hours)
axes[0].xaxis.set_major_formatter(fmt)

axes[1].legend()
fig.tight_layout()

print(f'Based on {tw_vals.shape[0]} event(s)')