In [None]:
import os
import re
import gzip
import random
import pickle
import logging

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

from statsmodels.distributions.empirical_distribution import ECDF

from IPython.display import display
from tqdm.notebook import tqdm

In [None]:
# Configure the root logger first, then grab this notebook's own logger.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

logger = logging.getLogger(__name__)

In [None]:
# Work from the project directory so the relative 'data/...' paths used below resolve.
os.chdir(os.path.expanduser('~/github/masthesis/'))

In [None]:
seed = 2969591811

# Seed the stdlib and the numpy global generators with the same value.
for seed_fn in (random.seed, np.random.seed):
    seed_fn(seed)

# Load data

In [None]:
# Processed radio snippets, indexed by snippet_id and restricted to the 2022 rows.
radio_path = 'data/paper-round-3/radio/new-data-processed.csv.gz'
with gzip.open(radio_path, 'rt') as radio_file:
    radio_data = pd.read_csv(radio_file, parse_dates=['timestamp'], index_col='snippet_id')

is_2022 = radio_data['year'] == 2022
radio_data = radio_data.loc[is_2022]

In [None]:
# Sanity check: `parse_dates` must have produced a nanosecond datetime64 column.
assert radio_data['timestamp'].dtype == np.dtype('<M8[ns]')

In [None]:
# Annotated item sample; every row must have its own `id`.
sample_path = 'data/paper-round-3/event-annotated/auto-sample.csv.gz'
with gzip.open(sample_path, 'rt') as sample_file:
    dat = pd.read_csv(sample_file, index_col='id', parse_dates=['timestamp'])

assert dat.index.is_unique

dat.shape

In [None]:
# Community (group) assignment per item id, before filtering.
comms_path = 'data/paper-round-3/event-annotated/auto-sample-communities-merged-pre-filter.csv.gz'
with gzip.open(comms_path, 'rt') as comms_file:
    comms = pd.read_csv(comms_file, index_col='id')

assert comms.index.is_unique

comms.shape

In [None]:
# Attach each item's group (aligned on the shared `id` index), drop items that
# have no group, and store group/year as ints. Built as one chained expression
# so the column casts never write into a filtered slice of `dat`, which is what
# triggered SettingWithCopyWarning in the step-by-step version.
dat = (
    dat
    .assign(group=comms['group'])
    .loc[lambda df: df['group'].notna(), :]
    .astype({'group': int, 'year': int})
)

In [None]:
# Load the precomputed story CDF array.
cdf_path = 'data/paper-round-3/event-annotated/auto-story-cdfs.npy'
with open(cdf_path, 'rb') as cdf_file:
    cdf_vals = np.load(cdf_file)

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
embs_path = 'data/paper-round-3/event-annotated/auto-sample-mean-embs-uniques.pkl'
with open(embs_path, 'rb') as embs_file:
    all_uniques, all_mean_embs = pickle.load(embs_file)

# story_id has the form "<year>-<kind>-<group>".
year_part = all_uniques['year'].astype(str)
group_part = all_uniques['group'].astype(str)
all_uniques['story_id'] = year_part + '-' + all_uniques['kind'] + '-' + group_part

In [None]:
# Per-story statistics, one row per story_id.
stats_path = 'data/paper-round-3/event-annotated/auto-story-stats.csv'
stats = pd.read_csv(stats_path, index_col='story_id')

assert stats.index.is_unique

# Within each (year, kind) every row must carry a distinct group id.
by_year_kind = stats.groupby(['year', 'kind'])
assert (by_year_kind.size() == by_year_kind['group'].nunique()).all()

In [None]:
selected = pd.read_csv('data/paper-round-3/event-annotated/auto-sample-communities-filter-list.csv', index_col='story_id')
assert selected.index.is_unique

# Flag 2020 decahose stories whose `covid` value reaches the threshold.
covid_threshold = 0.1
selected['decahose_covid_2020'] = (
    (selected['year'] == 2020)
    & (selected['kind'] == 'decahose')
    & (selected['covid'] >= covid_threshold)
)

# Align the flag to `stats` by story_id; stories absent from `selected` get False.
# reindex(fill_value=False) keeps the column boolean, whereas assigning and then
# calling fillna(False) goes through an object column with NaN and relies on
# pandas' deprecated silent downcast.
stats['decahose_covid_2020'] = selected['decahose_covid_2020'].reindex(stats.index, fill_value=False)

In [None]:
# Cross-source story matches; the story_id_* and group_* columns are used further down.
matching = pd.read_csv('data/paper-round-3/event-annotated/auto-sample-communities-matching.csv')

In [None]:
# Keep rows of `stats` whose `count` is at least 10 ...
length_mask = stats['count'] >= 10

# ... and that are either on the filter list or take part in a match.
selected_mask = stats.index.isin(selected.index)

# The decahose side of a match can be missing, so drop its NaNs before pooling ids.
matched_story_ids = (
    matching['story_id_elite'].tolist()
    + matching['story_id_radio'].tolist()
    + matching['story_id_decahose'].dropna().tolist()
)
matching_mask = stats.index.isin(matched_story_ids)

mask = length_mask & (selected_mask | matching_mask)

mask.sum(), stats.shape[0]

In [None]:
# Rows of `stats` per year, before applying `mask`.
stats.groupby('year').size()

In [None]:
# Rows of `stats` per kind, before applying `mask`.
stats.groupby('kind').size()

In [None]:
# Rows of `stats` per (year, kind), before applying `mask`.
stats.groupby(['year', 'kind']).size()

In [None]:
# Rows per year among those kept by `mask`.
stats.loc[mask].groupby('year').size()

In [None]:
# Rows per kind among those kept by `mask`.
stats.loc[mask].groupby('kind').size()

In [None]:
# Rows per (year, kind) among those kept by `mask`.
stats.loc[mask].groupby(['year', 'kind']).size()

In [None]:
# Mean of the `dur` column per (year, kind) among the rows kept by `mask`.
stats.loc[mask].groupby(['year', 'kind'])['dur'].mean()

In [None]:
# Histogram of the `start` column, one panel per year, for the rows kept by `mask`.
stats.loc[mask, 'start'].hist(by=stats.loc[mask, 'year'])

# Utilities

In [None]:
# cdf_query_end and cdf_query_inc are as also defined in the 5a notebook which calculates the cdfs
def show_example(year, kind, group_ids, cdf_query_end=2*24*3600, cdf_query_inc=60):
    """Show sample rows, a timestamp histogram and the ECDF for one or more groups.

    Rows of the notebook-level frame ``dat`` matching ``year``, ``kind`` and any
    of ``group_ids`` are pooled; their ``reltime`` is shifted so the earliest row
    sits at 0 before the ECDF is evaluated.

    Parameters
    ----------
    year, kind
        Values matched against the ``year`` and ``kind`` columns of ``dat``.
    group_ids
        A single group id (Python or numpy integer) or a list of ids.
    cdf_query_end
        Exclusive upper bound of the ECDF evaluation grid, in ``reltime`` units
        (the x-axis labels divide by 3600 and print hours, i.e. treat them as seconds).
    cdf_query_inc
        Step of the evaluation grid, in the same units.

    Raises
    ------
    ValueError
        If no row of ``dat`` matches the selection.
    """
    # np.integer covers scalar ids taken straight out of a DataFrame or Index,
    # which are not instances of the builtin int.
    if isinstance(group_ids, (int, np.integer)):
        group_ids = [group_ids]

    selected_rows = (dat['year'] == year) & (dat['kind'] == kind) & dat['group'].isin(group_ids)
    tmp = dat.loc[selected_rows, :].copy()

    # Fail with a readable message instead of an opaque error from inside ECDF.
    if tmp.empty:
        raise ValueError(
            'no items for year=' + str(year) + ', kind=' + str(kind)
            + ', group id(s)=' + ','.join(str(c) for c in group_ids)
        )

    tmp['reltime'] -= tmp['reltime'].min()

    cdf_query_pts = np.arange(0, cdf_query_end, cdf_query_inc)
    cdf = ECDF(tmp['reltime'])(cdf_query_pts)

    n_shown = min(tmp.shape[0], 10)
    with pd.option_context('display.max_colwidth', 0):
        print('group id(s): ' + ','.join(str(c) for c in group_ids))
        print('number of items: ' + str(tmp.shape[0]))
        display(tmp.sample(n_shown))
        display(tmp.head(n_shown))

    fig, axes = plt.subplots(1, 2, figsize=(10, 5))

    # Left panel: timestamps of the items that fall inside the ECDF window.
    in_window = tmp['reltime'] <= tmp['reltime'].min() + cdf_query_end
    tmp.loc[in_window, :].hist('timestamp', ax=axes[0], xrot=45)

    # Right panel: the ECDF over the query grid, with the x-axis labelled in hours.
    axes[1].plot(cdf_query_pts, cdf)

    axes[0].set_title('Item times')
    axes[1].set_title('ECDF')
    axes[1].set_ylim(0, 1)

    # Named hour_fmt so it does not shadow the notebook-level logging `fmt`.
    hour_fmt = mp.ticker.FuncFormatter(lambda x, pos: f'{x / 3600:.0f}h')
    axes[1].xaxis.set_major_formatter(hour_fmt)

# 2019

## Decahose

### Example 1

In [None]:
# Pinned 2019 decahose story; uncomment the sampler line to browse a random one instead.
year, kind = 2019, 'decahose'
group_id = 536
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

### Example 2

In [None]:
# Pinned 2019 decahose story; uncomment the sampler line to browse a random one instead.
year, kind = 2019, 'decahose'
group_id = 1003
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

### Example 3

In [None]:
# Pinned 2019 decahose story; uncomment the sampler line to browse a random one instead.
year, kind = 2019, 'decahose'
group_id = 872
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

## Elite

### Example 1

In [None]:
# Pinned 2019 elite story; uncomment the sampler line to browse a random one instead.
year, kind = 2019, 'elite'
group_id = 511
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

### Example 2

In [None]:
# Pinned 2019 elite story; uncomment the sampler line to browse a random one instead.
year, kind = 2019, 'elite'
group_id = 128
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

### Example 3

In [None]:
# Pinned 2019 elite story; uncomment the sampler line to browse a random one instead.
year, kind = 2019, 'elite'
group_id = 15
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

## Radio

### Example 1

In [None]:
# Pinned 2019 radio story; uncomment the sampler line to browse a random one instead.
year, kind = 2019, 'radio'
group_id = 100
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

### Example 2

In [None]:
# Pinned 2019 radio story; uncomment the sampler line to browse a random one instead.
year, kind = 2019, 'radio'
group_id = 194
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

### Example 3

In [None]:
# Pinned 2019 radio story; uncomment the sampler line to browse a random one instead.
year, kind = 2019, 'radio'
group_id = 33
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

# 2020

## Decahose

### Example 1

In [None]:
# Pinned 2020 decahose story; uncomment the sampler line to browse a random one instead.
year, kind = 2020, 'decahose'
group_id = 419
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

### Example 2

In [None]:
# Pinned 2020 decahose story; uncomment the sampler line to browse a random one instead.
year, kind = 2020, 'decahose'
group_id = 873
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

### Example 3

In [None]:
# Pinned 2020 decahose story; uncomment the sampler line to browse a random one instead.
year, kind = 2020, 'decahose'
group_id = 129
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

## Elite

### Example 1

In [None]:
# Pinned 2020 elite story; uncomment the sampler line to browse a random one instead.
year, kind = 2020, 'elite'
group_id = 347
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

### Example 2

In [None]:
# Pinned 2020 elite story; uncomment the sampler line to browse a random one instead.
year, kind = 2020, 'elite'
group_id = 314
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

### Example 3

In [None]:
# Pinned 2020 elite story; uncomment the sampler line to browse a random one instead.
year, kind = 2020, 'elite'
group_id = 616
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

## Radio

### Example 1

In [None]:
# Pinned 2020 radio story; uncomment the sampler line to browse a random one instead.
year, kind = 2020, 'radio'
group_id = 606
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

### Example 2

In [None]:
# Pinned 2020 radio story; uncomment the sampler line to browse a random one instead.
year, kind = 2020, 'radio'
group_id = 894
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

### Example 3

In [None]:
# Pinned 2020 radio story; uncomment the sampler line to browse a random one instead.
year, kind = 2020, 'radio'
group_id = 551
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

# 2021

## Decahose

### Example 1

In [None]:
# Pinned 2021 decahose story; uncomment the sampler line to browse a random one instead.
year, kind = 2021, 'decahose'
group_id = 1033
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

### Example 2

In [None]:
# Pinned 2021 decahose story; uncomment the sampler line to browse a random one instead.
year, kind = 2021, 'decahose'
group_id = 0  # the J6 riot story, fun
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

### Example 3

In [None]:
# Pinned 2021 decahose story; uncomment the sampler line to browse a random one instead.
year, kind = 2021, 'decahose'
group_id = 1363
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

## Elite

### Example 1

In [None]:
# Pinned 2021 elite story; uncomment the sampler line to browse a random one instead.
year, kind = 2021, 'elite'
group_id = 86
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

### Example 2

In [None]:
# Pinned 2021 elite story; uncomment the sampler line to browse a random one instead.
year, kind = 2021, 'elite'
group_id = 7
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

### Example 3

In [None]:
# Pinned 2021 elite story; uncomment the sampler line to browse a random one instead.
year, kind = 2021, 'elite'
group_id = 53
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

## Radio

### Example 1

In [None]:
# Pinned 2021 radio story; uncomment the sampler line to browse a random one instead.
year, kind = 2021, 'radio'
group_id = 99
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

### Example 2

In [None]:
# Pinned 2021 radio story; uncomment the sampler line to browse a random one instead.
year, kind = 2021, 'radio'
group_id = 179
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

### Example 3

In [None]:
# Pinned 2021 radio story; uncomment the sampler line to browse a random one instead.
year, kind = 2021, 'radio'
group_id = 124
# group_id = stats.loc[mask & (stats['kind'] == kind) & (stats['year'] == year), 'group'].sample(1).item()

show_example(year, kind, group_id)

# Inspect matches

## Example 1

In [None]:
i = 4
# i = np.random.randint(0, matching.shape[0])

match_row = matching.iloc[i, :]

year = int(match_row['year'])
group_elite = int(match_row['group_elite'])
group_radio = int(match_row['group_radio'])
# The decahose side of a match may be missing; int(NaN) raises, so keep None
# in that case (the decahose cell below already guards against None).
group_decahose = int(match_row['group_decahose']) if pd.notna(match_row['group_decahose']) else None

print(i, year, group_elite, group_radio, group_decahose)
print(match_row['sim_elite_radio'])

In [None]:
# Elite side of the selected match.
show_example(year, kind='elite', group_ids=group_elite)

In [None]:
# Radio side of the selected match.
show_example(year, kind='radio', group_ids=group_radio)

In [None]:
# Decahose side, shown only when the match has one.
if not (group_decahose is None or np.isnan(group_decahose)):
    show_example(year, kind='decahose', group_ids=group_decahose)

## Example 2

In [None]:
i = 32
# i = np.random.randint(0, matching.shape[0])

match_row = matching.iloc[i, :]

year = int(match_row['year'])
group_elite = int(match_row['group_elite'])
group_radio = int(match_row['group_radio'])
# The decahose side of a match may be missing; int(NaN) raises, so keep None
# in that case (the decahose cell below already guards against None).
group_decahose = int(match_row['group_decahose']) if pd.notna(match_row['group_decahose']) else None

print(i, year, group_elite, group_radio, group_decahose)
print(match_row['sim_elite_radio'])

In [None]:
# Elite side of the selected match.
show_example(year, kind='elite', group_ids=group_elite)

In [None]:
# Radio side of the selected match.
show_example(year, kind='radio', group_ids=group_radio)

In [None]:
# Decahose side, shown only when the match has one.
if not (group_decahose is None or np.isnan(group_decahose)):
    show_example(year, kind='decahose', group_ids=group_decahose)

## Example 3

In [None]:
i = 13
# i = np.random.randint(0, matching.shape[0])

match_row = matching.iloc[i, :]

year = int(match_row['year'])
group_elite = int(match_row['group_elite'])
group_radio = int(match_row['group_radio'])
# The decahose side of a match may be missing; int(NaN) raises, so keep None
# in that case (the decahose cell below already guards against None).
group_decahose = int(match_row['group_decahose']) if pd.notna(match_row['group_decahose']) else None

print(i, year, group_elite, group_radio, group_decahose)
print(match_row['sim_elite_radio'])

In [None]:
# Elite side of the selected match.
show_example(year, kind='elite', group_ids=group_elite)

In [None]:
# Radio side of the selected match.
show_example(year, kind='radio', group_ids=group_radio)

In [None]:
# Decahose side, shown only when the match has one.
if not (group_decahose is None or np.isnan(group_decahose)):
    show_example(year, kind='decahose', group_ids=group_decahose)

# Identify some of the manual events

In [None]:
# One tuple per manually identified event:
# (event key, description, event time, regex searched for in item content).
# Regexes are raw strings so backslash escapes reach the regex engine intact.
events = [
    ('bolton', 'John Bolton fired', '2019-09-10T16:00:00+04:00', r'bolton'),
    ('purdue', 'Purdue bankruptcy', '2019-09-16T03:15:00+04:00', r'purdue'),
    ('brady', 'Tom Brady free agent', '2020-03-17T12:45:00+04:00', r'brady'),
    ('gillis', 'Shane Gillis fired', '2019-09-16T20:00:00+04:00', r'gillis'),
    ('manning', 'Manning released', '2020-03-12T21:15:00+04:00', r'manning'),
    ('huffman', 'Huffman sentencing', '2019-09-13T18:00:00+04:00', r'huffman'),
    ('bernie', 'Bernie drops out', '2020-04-08T15:15:00+04:00', r'bernie'),
    # Previously 'nba|(n\.\b\.a)': in a non-raw string '\b' is a backspace
    # character, so the "n.b.a" spelling could never match. The group is
    # non-capturing so pandas' str.contains does not warn about match groups.
    ('nba', 'NBA season cancelled', '2020-03-12T01:30:00+04:00', r'nba|(?:n\.b\.a)'),
    ('warren', 'Warren drops out', '2020-03-05T15:30:00+04:00', r'warren'),
    ('impeach', 'Trump impeachment', '2019-09-24T18:30:00+04:00', r'impeach'),
]

events = pd.DataFrame(events, columns=['event', 'description', 'timestamp', 'regex']).set_index('event')
events['timestamp'] = pd.to_datetime(events['timestamp'])

events

## Utils

In [None]:
def query_event(event, kind, mask=None, flags=re.I):
    """Compute, per group, how often an event's regex occurs in item content.

    The event's year and regex are read from the notebook-level ``events``
    table. Candidate groups come from the rows of ``stats`` with that year and
    ``kind`` (optionally restricted by ``mask``); their items come from ``dat``.

    Parameters
    ----------
    event
        Key into the index of ``events``.
    kind
        Value matched against the ``kind`` column of ``stats`` and ``dat``.
    mask
        Optional boolean mask over the rows of ``stats``; all rows when None.
    flags
        ``re`` flags passed to ``Series.str.contains``.

    Returns
    -------
    pandas.DataFrame
        Indexed by group id, with columns ``group``, ``rate`` (fraction of the
        group's items whose content matches), ``size``, ``start`` and ``end``
        (earliest and latest item timestamp). The columns are present even when
        no group qualifies.
    """
    year = events.loc[event, 'timestamp'].year
    term = events.loc[event, 'regex']

    if mask is None:
        mask = np.repeat(True, stats.shape[0])

    groups = stats.loc[mask & (stats['year'] == year) & (stats['kind'] == kind), 'group'].unique()

    # The year/kind filter is the same for every group, so apply it once
    # instead of rescanning all of `dat` inside the loop.
    candidates = dat.loc[(dat['year'] == year) & (dat['kind'] == kind)]

    columns = ['group', 'rate', 'size', 'start', 'end']
    records = []
    for g in tqdm(groups):
        tmp = candidates.loc[candidates['group'] == g]

        size = tmp.shape[0]
        if size:
            # na=False: items with missing content count as non-matching.
            hits = tmp['content'].str.contains(term, flags=flags, regex=True, na=False).sum()
            rate = hits / size
        else:
            # A group listed in `stats` but without items in `dat` has no defined rate.
            rate = np.nan

        records.append({
            'group': g,
            'rate': rate,
            'size': size,
            'start': tmp['timestamp'].min(),
            'end': tmp['timestamp'].max(),
        })

    # Building from records keeps real dtypes per column (a transposed
    # dict-of-dicts is all object dtype), and passing `columns` keeps the schema
    # stable when `records` is empty.
    return pd.DataFrame(records, index=[r['group'] for r in records], columns=columns)

def query(targets, window=3*24*60*60):
    """Find radio and elite groups that mention each target event near its time.

    For every event in ``targets`` and each of the kinds 'radio' and 'elite',
    runs ``query_event`` (restricted by the notebook-level ``mask``) and keeps
    the groups with a positive match rate whose first item falls strictly
    within ``window`` seconds before or after the event timestamp, sorted by
    rate in descending order.

    Parameters
    ----------
    targets
        One event key or a list of event keys from the index of ``events``.
    window
        Half-width of the accepted start-time window, in seconds.

    Returns
    -------
    dict
        ``{event: {'radio': DataFrame, 'elite': DataFrame}}``.
    """
    if isinstance(targets, str):
        targets = [targets]

    inc = pd.Timedelta(window, unit='s')

    rates = {}
    # Iterate the argument: the previous version looped over the global
    # `target_events` and silently ignored `targets`.
    for event in tqdm(targets):
        event_time = events.loc[event, 'timestamp']
        start_date = event_time - inc
        end_date = event_time + inc

        per_kind = rates.setdefault(event, {})
        for kind in ['radio', 'elite']:
            # `mask` here is the notebook-level story filter over `stats`.
            rr = query_event(event, kind, mask)
            rr = rr.loc[rr['rate'] > 0, :]
            rr = rr.sort_values('rate', ascending=False)
            rr = rr.loc[(rr['start'] > start_date) & (rr['start'] < end_date)]

            per_kind[kind] = rr

    return rates

## Compute term occurrence rates

In [None]:
# Events to look up; each key must be present in the index of `events`.
target_events = ['bolton', 'impeach', 'warren']

rates = query(target_events)

## John Bolton gets fired

In [None]:
# Lift the row limit so every radio group returned for 'bolton' is listed.
with pd.option_context('display.max_rows', None):
    display(rates['bolton']['radio'])

In [None]:
# Lift the row limit so every elite group returned for 'bolton' is listed.
with pd.option_context('display.max_rows', None):
    display(rates['bolton']['elite'])

### Radio

In [None]:
# Pool the listed 2019 radio groups into one example.
show_example(year=2019, kind='radio', group_ids=[299, 13])

### Twitter

In [None]:
# Pool the listed 2019 elite groups into one example.
bolton_elite_groups = [313, 318, 315, 306, 310, 317, 307, 308, 316, 314, 312]
show_example(year=2019, kind='elite', group_ids=bolton_elite_groups)

## Trump impeachment announcement

The announcement came on the afternoon of 2019-09-24, with days of lead-up and fall-off around it.

In [None]:
# Lift the row limit so every radio group returned for 'impeach' is listed.
with pd.option_context('display.max_rows', None):
    display(rates['impeach']['radio'])

In [None]:
# Lift the row limit so every elite group returned for 'impeach' is listed.
with pd.option_context('display.max_rows', None):
    display(rates['impeach']['elite'])

### Radio

In [None]:
# Pool the listed 2019 radio groups into one example.
show_example(year=2019, kind='radio', group_ids=[176, 298])

### Twitter

In [None]:
# Pool every elite group in which more than half of the items match the 'impeach' regex.
impeach_elite = rates['impeach']['elite']
majority_groups = impeach_elite.loc[impeach_elite['rate'] > 0.5].index.tolist()
show_example(2019, 'elite', majority_groups)

## Warren dropping out, Super Tuesday

In [None]:
# Lift the row limit so every radio group returned for 'warren' is listed.
with pd.option_context('display.max_rows', None):
    display(rates['warren']['radio'])

In [None]:
# Lift the row limit so every elite group returned for 'warren' is listed.
with pd.option_context('display.max_rows', None):
    display(rates['warren']['elite'])

### Radio

In [None]:
# Single 2020 radio group.
show_example(year=2020, kind='radio', group_ids=648)

### Twitter

In [None]:
# Pool the listed 2020 elite groups into one example.
show_example(year=2020, kind='elite', group_ids=[158, 171, 152, 172])