In [None]:
import os
import gzip
import random
import pickle
import logging

import numpy as np
import pandas as pd

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

import sentence_transformers as st

from statsmodels.distributions.empirical_distribution import ECDF

from IPython.display import display
from tqdm.notebook import tqdm, trange

In [None]:
# Emit "<time> : <level> : <message>" records at INFO and above.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

logger = logging.getLogger(__name__)

In [None]:
# Work from the project directory so the relative `data/...` paths used below resolve.
project_dir = os.path.expanduser('~/github/masthesis/')
os.chdir(project_dir)

In [None]:
seed = 2969591811

# Seed both global RNGs (stdlib `random` first, then NumPy) so that shuffles and
# samples drawn later in the notebook are repeatable.
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(seed)

# Load data

In [None]:
# Item-level table, one row per `id`; `timestamp` is parsed to datetimes on load.
dat_path = 'data/paper-round-3/event-annotated/auto-sample.csv.gz'
with gzip.open(dat_path, mode='rt') as fh:
    dat = pd.read_csv(fh, index_col='id', parse_dates=['timestamp'])

# Later cells align other tables to `dat` by id, so ids must not repeat.
assert dat.index.is_unique

dat.shape

In [None]:
# Community assignments keyed by the same `id` index as the item table.
comms_path = 'data/paper-round-3/event-annotated/auto-sample-communities-merged-pre-filter.csv.gz'
with gzip.open(comms_path, mode='rt') as fh:
    comms = pd.read_csv(fh, index_col='id')

# Sanity checks: one row per id, and every row carries a year.
assert comms.index.is_unique
assert comms['year'].isna().sum() == 0

comms.shape

In [None]:
# Attach each item's community id from `comms`; the assignment aligns on the shared
# `id` index, so items that are absent from `comms` get NaN.
dat['group'] = comms['group']

# NOTE: `has_group_mask` is reused further down to filter the embedding matrix, so it
# must describe the *unfiltered* `dat`. Re-running this cell after `dat` has already
# been filtered would shrink the mask; reload `dat` first in that case.
has_group_mask = dat['group'].notna()

# Take an explicit copy so the dtype conversions below write to an independent frame
# instead of to a slice of the original (which triggers SettingWithCopyWarning).
dat = dat.loc[has_group_mask, :].copy()

dat['group'] = dat['group'].astype(int)
dat['year'] = dat['year'].astype(int)

# Average embeddings by community

In [None]:
with open('data/paper-round-3/event-annotated/auto-sample-embeds.npy', 'rb') as f:
    embs = np.load(f)

# Drop the embedding rows of items that were not assigned to a community.
# NOTE(review): assumes the rows of the .npy file are in the same order as the rows
# of the unfiltered `dat` that `has_group_mask` was computed from -- confirm.
embs = embs[has_group_mask.to_numpy(), :]
assert embs.shape[0] == dat.shape[0], 'embedding rows and item rows are misaligned'

# One record (and one mean embedding) per (year, kind, group) community, in the order
# the combinations are first encountered in `dat`.
all_uniques, all_mean_embs = [], []
for year in tqdm(dat['year'].unique()):
    in_year = dat['year'] == year
    for kind in tqdm(dat.loc[in_year, 'kind'].unique()):
        in_year_kind = in_year & (dat['kind'] == kind)
        for group in tqdm(dat.loc[in_year_kind, 'group'].unique()):
            mask = in_year_kind & (dat['group'] == group)
            count = mask.sum()
            assert count > 0

            all_mean_embs.append(embs[mask.to_numpy(), :].mean(axis=0))

            timestamps = dat.loc[mask, 'timestamp']
            first_ts, last_ts = timestamps.min(), timestamps.max()

            all_uniques.append({
                # The id is "<year>-<kind>-<group>". The original referenced an
                # undefined name `c` here, which raised NameError on a fresh kernel.
                'story_id': str(int(year)) + '-' + kind + '-' + str(group),
                'year': int(year),
                'kind': kind,
                'group': group,
                'count': count,
                'start': first_ts,
                'end': last_ts,
                'dur': (last_ts - first_ts).total_seconds(),
            })

all_uniques = pd.DataFrame(all_uniques)
all_mean_embs = np.stack(all_mean_embs, axis=0)

# Score every community against a fixed COVID probe sentence.
# NOTE(review): only the community embeddings are normalised here; the product is a
# cosine similarity only if the model returns unit-length vectors -- confirm.
model = st.SentenceTransformer('all-mpnet-base-v2')
probe = 'Pandemic of coronavirus disease 2019'
all_norm_mean_embs = (all_mean_embs / np.linalg.norm(all_mean_embs, axis=1).reshape(-1, 1))
probe_emb = model.encode(probe)
probe_sims = all_norm_mean_embs @ probe_emb[None, ...].T
all_uniques['covid'] = probe_sims
del model

# Cache the (metadata, mean-embedding) pair so later cells can reload it without
# repeating the loop above.
with open('data/paper-round-3/event-annotated/auto-sample-mean-embs-uniques.pkl', 'wb') as f:
    pickle.dump((all_uniques, all_mean_embs), f)

In [None]:
# Reload the cached community metadata and mean embeddings written by the cell above.
# The pickle is produced by this notebook; do not point this at untrusted files.
with open('data/paper-round-3/event-annotated/auto-sample-mean-embs-uniques.pkl', 'rb') as fh:
    cached = pickle.load(fh)
all_uniques, all_mean_embs = cached

# Mask out comparisons we don't want

We want stories that overlap in time (incl being in the same year) and are from different media.

In [None]:
# Integer timestamps (the int64 representation floor-divided by 10**9), as column vectors.
start = (all_uniques['start'].astype(np.int64) // 10**9).to_numpy()[:, np.newaxis]
end = (all_uniques['end'].astype(np.int64) // 10**9).to_numpy()[:, np.newaxis]

# time_mask[i, j] is True when story j starts before story i ends and ends after
# story i starts, i.e. the two intervals strictly overlap.
time_mask = (end > start.T) & (start < end.T)

In [None]:
# year_mask[i, j] is True when stories i and j fall in the same year.
years = all_uniques['year'].astype(int).to_numpy()
year_mask = years[:, np.newaxis] == years[np.newaxis, :]

In [None]:
# kind_mask[i, j] is True when stories i and j are of different kinds.
kinds = all_uniques['kind'].to_numpy()
kind_mask = kinds[:, np.newaxis] != kinds[np.newaxis, :]

# Compute intercommunity similarity

We want pairs of communities that a) overlap in time, b) are from the same year, and c) are of different kinds.

In [None]:
# Cosine similarity between every pair of community mean embeddings.
row_norms = np.linalg.norm(all_mean_embs, axis=1, keepdims=True)
all_norm_mean_embs = all_mean_embs / row_norms
all_sims = all_norm_mean_embs @ all_norm_mean_embs.T

# Positions (i, j) of the pairs that pass all three masks.
row_inds, col_inds = (time_mask & year_mask & kind_mask).nonzero()

pair_cols = ['year', 'kind', 'group', 'count', 'covid']

# Metadata of the first ("1") and second ("2") member of each admissible pair.
te = (
    all_uniques.iloc[row_inds][pair_cols]
    .rename(columns={col: col + '1' for col in pair_cols})
    .reset_index(drop=True)
)
tc = (
    all_uniques.iloc[col_inds][pair_cols]
    .rename(columns={col: col + '2' for col in pair_cols})
    .reset_index(drop=True)
)

matched = pd.concat([te, tc], axis=1)
matched['sim'] = all_sims[row_inds, col_inds]

# Keep only the orientation with the elite story first and a decahose/radio story second.
first_is_elite = matched['kind1'] == 'elite'
second_is_other = matched['kind2'].isin(['decahose', 'radio'])
matched = matched.loc[first_is_elite & second_is_other].drop(columns='kind1')

pair_renames = {f'{col}1': f'elite_{col}' for col in pair_cols if col != 'kind'}
pair_renames.update({f'{col}2': f'other_{col}' for col in pair_cols})
matched = matched.rename(columns=pair_renames)

# Invariants: only the expected kinds remain, and every pair shares a non-missing year.
assert (matched['other_kind'].isin(['decahose', 'radio'])).all()
for year_col in ('elite_year', 'other_year'):
    assert matched[year_col].isna().sum() == 0
assert (matched['elite_year'] == matched['other_year']).all()
for year_col in ('elite_year', 'other_year'):
    matched[year_col] = matched[year_col].astype(int)

# Filter stories

## Examine similarities

### How many total stories?

In [None]:
# Number of distinct (year, kind, group) decahose/radio stories that have at least one
# candidate elite pairing; the first element of the shape is the count.
matched[['other_year', 'other_kind', 'other_group']].drop_duplicates().shape

### Distribution of sims

In [None]:
# Summary statistics of the pairwise similarity over all matched pairs.
matched['sim'].describe()

In [None]:
# Histogram (50 bins) of the pairwise similarity over all matched pairs.
matched['sim'].hist(bins=50)

In [None]:
# Histogram of pairwise similarities, one panel per value of `other_kind`.
matched['sim'].hist(by=matched['other_kind'], bins=50)

In [None]:
# Histogram of pairwise similarities, one panel per (other_kind, other_year) combination.
matched['sim'].hist(by=[matched['other_kind'], matched['other_year']], bins=50, figsize=(10, 10))

In [None]:
# (number of matched pairs, number of columns)
matched.shape

In [None]:
# For each `other_kind`, count the distinct (other_year, other_group) stories, split by
# whether the story has at least 100 items (other_count >= 100).
matched.groupby([
    matched['other_kind'],
    (matched['other_count'] >= 100)
]).apply(lambda s: s[['other_year', 'other_group']].drop_duplicates().shape[0])

In [None]:
# Number of distinct (other_year, other_kind, other_group) stories present in `matched`.
matched[['other_year', 'other_kind', 'other_group']].drop_duplicates().shape

### Covid scores

In [None]:
# Summary statistics of the COVID probe similarity for the elite and the other story of each pair.
matched[['elite_covid', 'other_covid']].describe()

In [None]:
# Histograms of the COVID probe similarity, one panel per column.
matched[['elite_covid', 'other_covid']].hist()

In [None]:
# Summary statistics of the per-pair difference in probe similarity (elite minus other).
(matched['elite_covid'] - matched['other_covid']).describe()

In [None]:
# Histogram of the per-pair difference in probe similarity (elite minus other).
(matched['elite_covid'] - matched['other_covid']).hist()

In [None]:
# Histogram of the elite story's probe similarity, one panel per year.
matched['elite_covid'].hist(by=matched['elite_year'])

In [None]:
# Histogram of the other story's probe similarity, one panel per year.
matched['other_covid'].hist(by=matched['other_year'])

In [None]:
# Histogram of the mean of the two probe similarities in each pair, one panel per year.
((matched['elite_covid'] + matched['other_covid']) / 2).hist(by=matched['elite_year'])

### Grouped

In [None]:
# Per-story averages: mean similarity and mean probe scores over all of a
# decahose/radio story's candidate elite pairings.
story_keys = ['other_year', 'other_kind', 'other_group']
score_cols = ['sim', 'elite_covid', 'other_covid']
tmp = matched.groupby(story_keys)[score_cols].mean().reset_index()

In [None]:
# Summary statistics of the per-story mean similarity held in `tmp`.
tmp['sim'].describe()

In [None]:
# Histogram of the per-story mean similarity, one panel per kind.
tmp['sim'].hist(by=tmp['other_kind'])

In [None]:
# Histogram of the per-story mean similarity, one panel per year.
tmp['sim'].hist(by=tmp['other_year'])

In [None]:
# Histogram of the per-story mean similarity, one panel per (kind, year) combination.
tmp['sim'].hist(by=[tmp['other_kind'], tmp['other_year']], figsize=(10, 10))

In [None]:
# Histogram of the per-story mean elite probe similarity, one panel per (kind, year) combination.
tmp['elite_covid'].hist(by=[tmp['other_kind'], tmp['other_year']], figsize=(10, 10))

In [None]:
# Histogram of the per-story mean other-side probe similarity, one panel per (kind, year) combination.
tmp['other_covid'].hist(by=[tmp['other_kind'], tmp['other_year']], figsize=(10, 10))

## Select stories to keep

In [None]:
# Minimum similarity to an elite story required to keep a story of each kind.
# Both kinds currently share the same cut-off.
decahose_threshold = radio_threshold = 0.6

In [None]:
# A story is kept when at least one of its elite pairings reaches the threshold for its
# kind, i.e. the cut-off is applied to the maximum pairwise similarity. The alternative
# of thresholding the per-story mean similarity is not used.
thresholds = {'decahose': decahose_threshold, 'radio': radio_threshold}
selected = pd.concat(
    [
        matched.loc[(matched['other_kind'] == other_kind) & (matched['sim'] >= min_sim)]
        for other_kind, min_sim in thresholds.items()
    ],
    axis=0,
)

# Reduce to one row per story and shuffle the rows, so the first few rows form a
# random sample of the selection.
story_renames = {
    'other_year': 'year',
    'other_kind': 'kind',
    'other_group': 'group',
    'other_covid': 'covid',
}
selected = (
    selected[list(story_renames)]
    .drop_duplicates()
    .sample(frac=1)
    .rename(columns=story_renames)
)

selected.shape

In [None]:
# Number of selected stories per (year, kind).
selected.groupby(['year', 'kind']).size()

## Hand-audit some selected stories

They should be about news and they are. Note we randomly sorted the selected stories; the first few are a random sample of all of them.

In [None]:
# The defaults of cdf_query_end and cdf_query_inc match the values defined in the 5a
# notebook, which calculates the CDFs.
def show_example(year, kind, group_ids, cdf_query_end=2*24*3600, cdf_query_inc=60):
    """Display the items of one or more communities and plot their timing.

    Selects the rows of the module-level ``dat`` frame that match ``year``, ``kind``
    and ``group_ids``, prints a short header, displays a random sample and the first
    rows, and draws three panels: a histogram of item timestamps, the empirical CDF
    of ``reltime`` (shifted so the earliest item is at 0) and its numerical gradient.

    Parameters
    ----------
    year, kind
        Values compared for equality against ``dat['year']`` and ``dat['kind']``.
    group_ids : int or iterable of int
        A single community id (Python or NumPy integer) or several ids.
    cdf_query_end : number
        Exclusive upper end of the grid on which the ECDF is evaluated, in the units
        of ``reltime``. The axis labels divide by 3600 and print hours, so the unit
        is presumably seconds -- confirm.
    cdf_query_inc : number
        Spacing of the ECDF evaluation grid, in the same units.

    Returns
    -------
    None. The output is displayed as a side effect.
    """
    # Accept NumPy integer scalars too: values pulled out of pandas objects are
    # np.integer, not int, and a bare scalar is not valid input to `.isin`.
    if isinstance(group_ids, (int, np.integer)):
        group_ids = [group_ids]

    tmp = dat.loc[(dat['year'] == year) & (dat['kind'] == kind) & dat['group'].isin(group_ids), :].copy()
    tmp['reltime'] -= tmp['reltime'].min()

    cdf_query_pts = np.arange(0, cdf_query_end, cdf_query_inc)
    cdf = ECDF(tmp['reltime'])(cdf_query_pts)
    # Finite-difference slope of the ECDF per grid step (not rescaled by cdf_query_inc).
    pdf = np.gradient(cdf)

    with pd.option_context('display.max_colwidth', 0):
        print(f'year: {year}, kind: {kind}, group id(s): ' + ','.join(str(g) for g in group_ids))
        print('number of items: ' + str(tmp.shape[0]))
        display(tmp.sample(min(tmp.shape[0], 10)))
        display(tmp.head(min(tmp.shape[0], 10)))

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    # Only items inside the ECDF query window are shown in the timestamp histogram.
    tmp.loc[tmp['reltime'] <= tmp['reltime'].min() + cdf_query_end, :].hist('timestamp', ax=axes[0], xrot=45)

    axes[1].plot(cdf_query_pts, cdf)
    axes[2].plot(cdf_query_pts, pdf)

    axes[0].set_title('Item times')
    axes[1].set_title('ECDF')
    axes[2].set_title('EPDF')

    # Named `hour_fmt` so it does not shadow the module-level logging format `fmt`.
    hour_fmt = mp.ticker.FuncFormatter(lambda x, pos: f'{x / 3600:.0f}h')
    axes[1].xaxis.set_major_formatter(hour_fmt)
    axes[2].xaxis.set_major_formatter(hour_fmt)

In [None]:
# Audit the story at position 0 of the shuffled selection.
i = 0
example = selected.iloc[i]
show_example(
    year=int(example['year']),
    kind=example['kind'],
    group_ids=int(example['group']),
)

In [None]:
# Audit the story at position 1 of the shuffled selection.
i = 1
example = selected.iloc[i]
show_example(
    year=int(example['year']),
    kind=example['kind'],
    group_ids=int(example['group']),
)

In [None]:
# Audit the story at position 2 of the shuffled selection.
i = 2
example = selected.iloc[i]
show_example(
    year=int(example['year']),
    kind=example['kind'],
    group_ids=int(example['group']),
)

In [None]:
# Audit the story at position 3 of the shuffled selection.
i = 3
example = selected.iloc[i]
show_example(
    year=int(example['year']),
    kind=example['kind'],
    group_ids=int(example['group']),
)

In [None]:
# Audit the story at position 4 of the shuffled selection.
i = 4
example = selected.iloc[i]
show_example(
    year=int(example['year']),
    kind=example['kind'],
    group_ids=int(example['group']),
)

# Write out the selected stories

In [None]:
# The filter list is the selected decahose/radio stories plus every elite story.
story_cols = ['year', 'kind', 'group', 'covid']
elite_stories = all_uniques.loc[all_uniques['kind'] == 'elite', story_cols]

out = pd.concat([selected, elite_stories], axis=0)
out['year'] = out['year'].astype(int)

# story_id has the form "<year>-<kind>-<group>".
year_part = out['year'].astype(str)
group_part = out['group'].astype(str)
out['story_id'] = year_part + '-' + out['kind'] + '-' + group_part

out.to_csv('data/paper-round-3/event-annotated/auto-sample-communities-filter-list.csv', index=False)