In [None]:
import os
import gzip
import random
import pickle
import logging

import numpy as np
import pandas as pd
import networkx as nx

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

from statsmodels.distributions.empirical_distribution import ECDF

from IPython.display import display
from tqdm.notebook import tqdm, trange

In [None]:
# Notebook-wide logging: timestamped, INFO-level records.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

logger = logging.getLogger(__name__)

In [None]:
# All data paths below are relative to the project checkout in the user's home.
project_root = os.path.expanduser('~/github/masthesis/')
os.chdir(project_root)

In [None]:
# Seed both the stdlib and the NumPy global generators so that the shuffle
# and the random row samples further down are reproducible on a fresh run.
seed = 2969591811

for seed_fn in (random.seed, np.random.seed):
    seed_fn(seed)

# Load data

In [None]:
# Item-level sample: one row per item, keyed by `id`, with `timestamp`
# parsed into datetimes.
with gzip.open('data/paper-round-3/event-annotated/auto-sample.csv.gz', mode='rt') as fh:
    dat = pd.read_csv(fh, index_col='id', parse_dates=['timestamp'])

# The community labels are joined onto this frame by index later on, so
# duplicate ids would make that alignment ambiguous.
assert dat.index.is_unique

dat.shape

In [None]:
# Community assignments for the sampled items, keyed by the same `id` as `dat`.
with gzip.open('data/paper-round-3/event-annotated/auto-sample-communities-merged-pre-filter.csv.gz', mode='rt') as fh:
    comms = pd.read_csv(fh, index_col='id')

# One row per item, and every row must carry a year.
assert comms.index.is_unique
assert comms['year'].isna().sum() == 0

comms.shape

In [None]:
# Attach each item's community id (aligned on the shared `id` index) and keep
# only the items that were assigned to a community.
dat['group'] = comms['group']
has_group_mask = dat['group'].notna()

# Take an explicit copy: the dtype casts below assign into `dat`, and
# assigning into a `.loc` slice of the loaded frame is chained assignment
# (SettingWithCopyWarning, with no guarantee which object gets written).
dat = dat.loc[has_group_mask, :].copy()

# After dropping the unassigned rows `group` has no NaNs left, so it can be
# cast to a plain integer dtype.
dat['group'] = dat['group'].astype(int)
dat['year'] = dat['year'].astype(int)

# Average embeddings by community

In [None]:
# Precomputed per-community table (`all_uniques`) and the matching mean
# embeddings (`all_mean_embs`).
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# this from a trusted, locally produced artifact.
with open('data/paper-round-3/event-annotated/auto-sample-mean-embs-uniques.pkl', mode='rb') as fh:
    all_uniques, all_mean_embs = pickle.load(fh)

# Mask out comparisons we don't want

We want to compare only stories that overlap in time (which includes being in the same year) and that come from different media.

In [None]:
# Community start/end as integer column vectors of shape (n, 1).
# NOTE(review): the `// 10**9` presumably converts nanosecond timestamps to
# whole seconds -- confirm the dtype of `start`/`end` in `all_uniques`.
start = (all_uniques['start'].astype(np.int64) // 10**9).to_numpy()[:, np.newaxis]
end = (all_uniques['end'].astype(np.int64) // 10**9).to_numpy()[:, np.newaxis]

# time_mask[i, j] is True when intervals i and j strictly overlap:
# j starts before i ends, and j ends after i starts.
time_mask = (end > start.T) & (start < end.T)

In [None]:
# year_mask[i, j] is True when communities i and j are from the same year.
years = all_uniques['year'].astype(int).to_numpy()
year_mask = years[:, None] == years[None, :]

In [None]:
# kind_mask[i, j] is True when communities i and j are of different kinds.
kinds = all_uniques['kind'].to_numpy()
kind_mask = kinds[:, None] != kinds[None, :]

# Compute intercommunity similarity

We want pairs of communities that (a) overlap in reltime, (b) are from the same year, and (c) are of different kinds.

In [None]:
# Cosine similarity between every pair of community mean embeddings:
# L2-normalize the rows, then take all pairwise dot products.
all_norm_mean_embs = all_mean_embs / np.linalg.norm(all_mean_embs, axis=1, keepdims=True)
all_sims = all_norm_mean_embs @ all_norm_mean_embs.T

# Positions (i, j) of the pairs that pass all three masks.
row_inds, col_inds = np.nonzero(time_mask & year_mask & kind_mask)

side_cols = ['year', 'kind', 'group', 'count', 'dur']

# First member of each pair, columns suffixed with 1.
te = (
    all_uniques.iloc[row_inds][side_cols]
    .rename(columns={'year': 'year1', 'kind': 'kind1', 'group': 'group1', 'count': 'count1', 'dur': 'dur1'})
    .reset_index(drop=True)
)

# Second member of each pair, columns suffixed with 2.
tc = (
    all_uniques.iloc[col_inds][side_cols]
    .rename(columns={'year': 'year2', 'kind': 'kind2', 'group': 'group2', 'count': 'count2', 'dur': 'dur2'})
    .reset_index(drop=True)
)

# Both halves share the same fresh 0..n-1 index, so they line up row by row.
matchable = pd.concat([te, tc], axis=1)
matchable['sim'] = all_sims[row_inds, col_inds]

# Sanity checks: every pair has a year on both sides, and the years agree
# (guaranteed by year_mask).
for year_col in ('year1', 'year2'):
    assert matchable[year_col].isna().sum() == 0
    matchable[year_col] = matchable[year_col].astype(int)
assert (matchable['year1'] == matchable['year2']).all()

# Story identifier of the form "<year>-<kind>-<group>".
matchable['story_id1'] = matchable['year1'].astype(str) + '-' + matchable['kind1'] + '-' + matchable['group1'].astype(str)
matchable['story_id2'] = matchable['year2'].astype(str) + '-' + matchable['kind2'] + '-' + matchable['group2'].astype(str)

# Larger-over-smaller ratios, so they are the same whichever side is "1".
counts = matchable[['count1', 'count2']]
durs = matchable[['dur1', 'dur2']]
matchable['count_ratio'] = counts.max(axis=1) / counts.min(axis=1)
matchable['dur_ratio'] = durs.max(axis=1) / durs.min(axis=1)

# Every pair appears as both (i, j) and (j, i); keep only the orientation in
# which story_id1 sorts before story_id2.
is_canonical = matchable['story_id1'] < matchable['story_id2']
matchable = matchable.loc[is_canonical, :].set_index(['story_id1', 'story_id2'])

# Examine similarities

In [None]:
# Summary statistics of every numeric column, one row per column.
matchable.describe().T

In [None]:
# Summary statistics of the pairwise similarity alone.
matchable['sim'].describe()

In [None]:
# Distribution of similarity over all candidate pairs.
matchable['sim'].hist(bins=50)

In [None]:
# Same distribution, one panel per (kind1, kind2) combination.
matchable['sim'].hist(by=[matchable['kind1'], matchable['kind2']], bins=50, figsize=(10, 10))

In [None]:
# Same distribution, one panel per (year, kind1, kind2) combination.
matchable['sim'].hist(by=[matchable['year1'], matchable['kind1'], matchable['kind2']], bins=50, figsize=(10, 10))

# Matching

In [None]:
# Matching parameters used by the cells below.
# Minimum `count` a story needs to be eligible for matching.
min_story_size = 30
# Largest allowed larger/smaller `count` ratio between two matched stories.
count_ratio_max = 7
# Largest allowed larger/smaller `dur` ratio between two matched stories.
dur_ratio_max = 3
# Minimum similarity a match needs to survive the final filter (applied
# after matching, not to the candidate edges).
sim_threshold = 0.5

## elite $\times$ radio

In [None]:
# Candidate pairs with no decahose story on either side, where both stories
# are big enough and the two are comparable in size and duration.
no_decahose = (matchable['kind1'] != 'decahose') & (matchable['kind2'] != 'decahose')
big_enough = (matchable['count1'] >= min_story_size) & (matchable['count2'] >= min_story_size)
comparable = (matchable['count_ratio'] <= count_ratio_max) & (matchable['dur_ratio'] <= dur_ratio_max)

tmp_matchable = matchable.loc[no_decahose & big_enough & comparable, :].reset_index()

# Stories are nodes; each candidate pair is an edge weighted by similarity.
G = nx.from_pandas_edgelist(
    tmp_matchable,
    source='story_id1',
    target='story_id2',
    edge_attr='sim',
    create_using=nx.Graph,
)

# One-to-one assignment: every story is used in at most one pair.
er_pairs = list(nx.max_weight_matching(G, weight='sim'))

# The matching returns each pair in arbitrary orientation; re-orient every
# pair so the smaller id comes first, which is how `matchable` is keyed.
er_matched = pd.DataFrame(
    [[a, b] for a, b in er_pairs if a < b] +
    [[b, a] for a, b in er_pairs if b < a],
    columns=['story_id1', 'story_id2'],
)

# Pull the full pair attributes back in for the matched pairs only.
er_matched = matchable.reset_index().merge(er_matched, how='inner', on=['story_id1', 'story_id2'])

# NOTE(review): naming side 1 "elite" and side 2 "radio" relies on the ids
# sorting as "<year>-elite-..." < "<year>-radio-..."; the kind asserts in the
# Combine cell check this.
er_matched = er_matched.rename(columns={
    'story_id1': 'story_id_elite',
    'year1': 'year_elite',
    'kind1': 'kind_elite',
    'group1': 'group_elite',
    'count1': 'count_elite',
    'dur1': 'dur_elite',

    'story_id2': 'story_id_radio',
    'year2': 'year_radio',
    'kind2': 'kind_radio',
    'group2': 'group_radio',
    'count2': 'count_radio',
    'dur2': 'dur_radio',

    'sim': 'sim_elite_radio',
    'count_ratio': 'count_ratio_elite_radio',
    'dur_ratio': 'dur_ratio_elite_radio',
})

## (elite $\times$ radio) $\times$ decahose

In [None]:
# Every story that ended up in an elite/radio match, from either side.
er_story_ids = pd.concat([er_matched['story_id_elite'], er_matched['story_id_radio']])

id1 = matchable.index.get_level_values(0)
id2 = matchable.index.get_level_values(1)

# Candidate edges join an already-matched elite/radio story to a big-enough
# decahose story, in either orientation of the pair.
er_first = (
    id1.isin(er_story_ids) &
    (matchable['kind2'] == 'decahose') &
    (matchable['count2'] >= min_story_size)
)
er_second = (
    id2.isin(er_story_ids) &
    (matchable['kind1'] == 'decahose') &
    (matchable['count1'] >= min_story_size)
)
comparable = (matchable['count_ratio'] <= count_ratio_max) & (matchable['dur_ratio'] <= dur_ratio_max)

tmp_matchable = matchable.loc[(er_first | er_second) & comparable, :].reset_index()

G = nx.from_pandas_edgelist(
    tmp_matchable,
    source='story_id1',
    target='story_id2',
    edge_attr='sim',
    create_using=nx.Graph,
)

# One-to-one assignment between decahose stories and elite/radio stories.
dh_pairs = list(nx.max_weight_matching(G, weight='sim'))

# Re-orient every pair so the smaller id comes first, which is how
# `matchable` is keyed.
dh_matched = pd.DataFrame(
    [[a, b] for a, b in dh_pairs if a < b] +
    [[b, a] for a, b in dh_pairs if b < a],
    columns=['story_id1', 'story_id2'],
)

dh_matched = matchable.reset_index().merge(dh_matched, how='inner', on=['story_id1', 'story_id2'])

# NOTE(review): treating side 1 as the decahose story relies on
# "<year>-decahose-..." sorting before the elite and radio ids; the kind
# asserts in the Combine cell check this. Side 2's attributes are already
# present in er_matched, so only its id is kept (as `story_id_other`).
dh_matched = dh_matched.drop(columns=['year2', 'kind2', 'group2', 'count2', 'dur2'])
dh_matched = dh_matched.rename(columns={
    'story_id1': 'story_id_decahose',
    'year1': 'year_decahose',
    'kind1': 'kind_decahose',
    'group1': 'group_decahose',
    'count1': 'count_decahose',
    'dur1': 'dur_decahose',

    'story_id2': 'story_id_other',
    'sim': 'sim_decahose_other',
    'count_ratio': 'count_ratio_decahose_other',
    'dur_ratio': 'dur_ratio_decahose_other',
})

## Combine

In [None]:
# For every elite/radio pair, look up the decahose story matched to its elite
# member and the one matched to its radio member (either may be missing).
#
# ignore_index=True matters: each merge result carries its own 0..n-1 index,
# so a plain concat has every label twice. idxmax() below returns index
# *labels*; with duplicated labels (and positional .iloc lookup) the selection
# always landed on the elite-side row, silently discarding a better
# radio-side decahose match.
matched = pd.concat([
    er_matched
        .merge(dh_matched, how='left', left_on='story_id_elite', right_on='story_id_other'),

    er_matched
        .merge(dh_matched, how='left', left_on='story_id_radio', right_on='story_id_other')
], axis=0, ignore_index=True)

# Keep, per elite/radio pair, the candidate row with the highest decahose
# similarity. Missing similarities are temporarily set to -inf so that they
# lose against any real match but idxmax still returns a row when neither
# side has a decahose match.
matched['sim_decahose_other'] = matched['sim_decahose_other'].fillna(-np.inf)
best_rows = matched.groupby(['story_id_elite', 'story_id_radio'])['sim_decahose_other'].idxmax()
matched = matched.loc[best_rows, :]
matched['sim_decahose_other'] = matched['sim_decahose_other'].replace(-np.inf, np.nan)

# Sanity checks: each side has the kind its column name claims, and all
# stories in a row are from the same year.
assert (matched['kind_elite'] == 'elite').all()
assert (matched['kind_radio'] == 'radio').all()
assert ((matched['kind_decahose'] == 'decahose') | matched['kind_decahose'].isna()).all()
assert (matched['year_elite'] == matched['year_radio']).all()
assert ((matched['year_elite'] == matched['year_decahose']) | matched['year_decahose'].isna()).all()

# The kind columns are now constant and the year columns redundant.
matched = matched.drop(['kind_elite', 'kind_radio', 'kind_decahose', 'year_radio', 'year_decahose'], axis=1)
matched = matched.rename({'year_elite': 'year', 'story_id_other': 'story_id_decahose_matched'}, axis=1)

# Shuffle so that the first rows are a random sample for the hand audit; this
# draws from the global NumPy RNG seeded at the top of the notebook.
matched = matched.sample(frac=1)
matched = matched.reset_index(drop=True)

In [None]:
# Number of elite/radio pairs (rows) and attributes (columns) before thresholding.
matched.shape

In [None]:
# Matched pairs per year.
matched.groupby('year').size()

In [None]:
# How many pairs clear the similarity threshold on each of the two matches.
# A missing decahose similarity (NaN) counts as not clearing it.
er_above = matched['sim_elite_radio'] >= sim_threshold
dh_above = matched['sim_decahose_other'] >= sim_threshold

matched.groupby([er_above, dh_above]).size()

In [None]:
# Keep only the triples where both matches are similar enough. Rows without
# a decahose match have a NaN similarity and are dropped here.
passes_threshold = (
    matched['sim_elite_radio'].ge(sim_threshold) &
    matched['sim_decahose_other'].ge(sim_threshold)
)
matched = matched.loc[passes_threshold]

## Hand-audit some selected stories

They should be about news, and they are. Note that we shuffled the selected stories above, so the first few rows are a random sample of all of them.

In [None]:
# Summary statistics of the selected stories, one row per numeric column.
matched.describe().T

In [None]:
# The defaults for cdf_query_end and cdf_query_inc are the same as those defined in the 5a notebook, which calculates the CDFs
def show_example(year, kind, group_ids, cdf_query_end=2*24*3600, cdf_query_inc=60):
    """Show a sample of one story's items and plot how they arrive over time.

    Selects the rows of the notebook-level ``dat`` frame with the given year,
    kind and community id(s), prints a header, displays up to 10 randomly
    sampled rows, and draws three panels: a histogram of item timestamps, the
    empirical CDF of ``reltime`` (shifted so the first item is at 0), and the
    gradient of that CDF.

    Parameters
    ----------
    year : int
        Value to match against ``dat['year']``.
    kind : str
        Value to match against ``dat['kind']``.
    group_ids : int or iterable of int
        One community id, or several whose items are pooled.
    cdf_query_end : number
        The CDF is evaluated on ``[0, cdf_query_end)``; the histogram panel is
        restricted to items within this offset of the first one.
    cdf_query_inc : number
        Spacing between the CDF evaluation points.

    Returns
    -------
    None
    """
    # np.integer covers scalars taken straight out of a DataFrame, which are
    # not instances of the builtin int.
    if isinstance(group_ids, (int, np.integer)):
        group_ids = [group_ids]

    tmp = dat.loc[(dat['year'] == year) & (dat['kind'] == kind) & dat['group'].isin(group_ids), :].copy()

    print(f'year: {year}, kind: {kind}, group id(s): ' + ','.join(str(c) for c in group_ids))
    print('number of items: ' + str(tmp.shape[0]))

    # Nothing to sample or plot; the ECDF of an empty series is undefined.
    if tmp.empty:
        print('no items match this selection')
        return

    # Measure time from the first item of the story.
    tmp['reltime'] -= tmp['reltime'].min()

    cdf_query_pts = np.arange(0, cdf_query_end, cdf_query_inc)
    cdf = ECDF(tmp['reltime'])(cdf_query_pts)
    # Gradient with unit spacing: change in the CDF per query step, not per
    # unit of reltime.
    pdf = np.gradient(cdf)

    with pd.option_context('display.max_colwidth', 0):
        display(tmp.sample(min(tmp.shape[0], 10)))

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    # reltime now starts at 0, so this keeps the items inside the CDF window.
    tmp.loc[tmp['reltime'] <= cdf_query_end, :].hist('timestamp', ax=axes[0], xrot=45)

    axes[1].plot(cdf_query_pts, cdf)
    axes[2].plot(cdf_query_pts, pdf)

    axes[0].set_title('Item times')
    axes[1].set_title('ECDF')
    axes[2].set_title('EPDF')

    # Tick labels divide the offsets by 3600 and label them as hours, i.e.
    # reltime is treated as seconds.
    hour_fmt = mp.ticker.FuncFormatter(lambda x, pos: f'{x / 3600:.0f}h')
    axes[1].xaxis.set_major_formatter(hour_fmt)
    axes[2].xaxis.set_major_formatter(hour_fmt)

In [None]:
# Position (in the shuffled `matched` frame) of the story to audit; change
# this and re-run the cells below to look at another one.
i = 0

In [None]:
# Elite/radio similarity of the story being audited.
print(matched.iloc[i, :]['sim_elite_radio'])

In [None]:
# Elite side of the audited match.
audit_row = matched.iloc[i, :]
show_example(int(audit_row['year']), 'elite', int(audit_row['group_elite']))

In [None]:
# Radio side of the audited match.
audit_row = matched.iloc[i, :]
show_example(int(audit_row['year']), 'radio', int(audit_row['group_radio']))

# Write out the selected stories

In [None]:
# Persist the thresholded matches; the positional index carries no
# information after the shuffle, so it is not written.
matched.to_csv(
    path_or_buf='data/paper-round-3/event-annotated/auto-sample-communities-matching.csv',
    index=False,
)