In [None]:
import os
import gzip
import random
import pickle
import logging

import numpy as np
import pandas as pd

import networkx as nx

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

from IPython.display import display
from tqdm.notebook import tqdm, trange

In [None]:
# Configure the root logger once: timestamp, level and message at INFO.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

# Logger used for messages emitted from this notebook.
logger = logging.getLogger(__name__)

In [None]:
# Every data path below is relative, so run from the project checkout.
project_root = os.path.expanduser('~/github/masthesis/')
os.chdir(project_root)

In [None]:
# One fixed seed for both global RNGs used in this notebook
# (the stdlib `random` module and numpy's legacy global generator).
seed = 2969591811

for seeder in (random.seed, np.random.seed):
    seeder(seed)

# Load data

In [None]:
# Read the annotated sample; rows are keyed by the `id` column and
# `timestamp` is parsed into datetimes.
dat_path = 'data/paper-round-3/event-annotated/auto-sample.csv.gz'
with gzip.open(dat_path, mode='rt') as fh:
    dat = pd.read_csv(fh, index_col='id', parse_dates=['timestamp'])

# The community assignments are aligned on this index, so ids must not repeat.
assert dat.index.is_unique

dat.shape

In [None]:
# Read the community assignments, keyed by the same `id` column as the sample.
comms_path = 'data/paper-round-3/event-annotated/auto-sample-communities.csv.gz'
with gzip.open(comms_path, mode='rt') as fh:
    comms = pd.read_csv(fh, index_col='id')

# Duplicate ids would make the index-aligned join with the sample ambiguous.
assert comms.index.is_unique

comms.shape

In [None]:
# Attach each row's community id. pandas aligns comms['group'] to dat on the
# shared `id` index, so ids that have no community end up as NaN.
dat['group'] = comms['group']

# Boolean Series over the *unfiltered* rows of dat. It is reused further down
# to drop the same rows from the embedding matrix, so it has to be computed
# before dat itself is filtered.
has_group_mask = dat['group'].notna()

# Take an explicit copy of the filtered rows: assigning columns on the result
# of a .loc slice otherwise raises SettingWithCopyWarning, and whether the
# write reaches the intended frame is then left to pandas internals.
dat = dat.loc[has_group_mask, :].copy()

# With the NaN rows gone, both columns can be stored as plain integers.
dat['group'] = dat['group'].astype(int)
dat['year'] = dat['year'].astype(int)

# Move `id` back into a regular column; dat now has a 0..n-1 index whose
# labels coincide with row positions.
dat = dat.reset_index()

# Average embeddings by community

In [None]:
with open('data/paper-round-3/event-annotated/auto-sample-embeds.npy', 'rb') as f:
    embs = np.load(f)

# Drop the embedding rows whose dat row had no community, using the same
# boolean mask that filtered dat. Index with a plain ndarray so the selection
# is purely positional.
# NOTE(review): assumes the rows of the .npy file are in the same order as the
# rows of the sample CSV — confirm.
embs = embs[has_group_mask.to_numpy(), :]
assert embs.shape[0] == dat.shape[0], 'embeddings and dat rows are out of sync'

# One record (and one mean embedding) per distinct (year, kind, group) triple.
# The year and year+kind masks are built once per outer iteration instead of
# being recomputed over the whole frame for every group.
all_uniques, all_mean_embs = [], []
for year in tqdm(dat['year'].unique()):
    in_year = dat['year'] == year

    for kind in tqdm(dat.loc[in_year, 'kind'].unique()):
        in_year_kind = in_year & (dat['kind'] == kind)

        for group in tqdm(dat.loc[in_year_kind, 'group'].unique()):
            mask = in_year_kind & (dat['group'] == group)
            count = mask.sum()
            assert count > 0

            # Mean embedding over all rows of this community.
            all_mean_embs.append(embs[mask.to_numpy(), :].mean(axis=0))

            # Scan the community's timestamps once and reuse the extremes.
            timestamps = dat.loc[mask, 'timestamp']
            first_ts, last_ts = timestamps.min(), timestamps.max()

            all_uniques.append({
                'year': year,
                'kind': kind,
                'group': group,
                'count': count,
                'start': first_ts,
                'end': last_ts,
                'dur': (last_ts - first_ts).total_seconds(),
            })

# Row i of all_uniques describes the community whose mean embedding is
# row i of all_mean_embs.
all_uniques = pd.DataFrame(all_uniques)
all_mean_embs = np.stack(all_mean_embs, axis=0)

# Cache the result so the following cells can start from the pickle.
with open('data/paper-round-3/event-annotated/auto-sample-mean-embs-uniques-pre-filter.pkl', 'wb') as f:
    pickle.dump((all_uniques, all_mean_embs), f)

In [None]:
# Restore the per-community table and mean embeddings from the cache file
# written by the previous cell (a locally produced pickle).
cache_path = 'data/paper-round-3/event-annotated/auto-sample-mean-embs-uniques-pre-filter.pkl'
with open(cache_path, mode='rb') as fh:
    cached = pickle.load(fh)

all_uniques, all_mean_embs = cached

# Mask out comparisons we don't want

We only compare pairs of stories whose time spans don't overlap, and only within the same year and medium (`kind`), because newsLens is run separately for each year and medium.

In [None]:
def _as_int_seconds_column(timestamps):
    """Return the int64 form of `timestamps`, floor-divided by 10**9, as an (n, 1) array.

    NOTE(review): the result is in whole seconds only if the underlying
    datetimes have nanosecond resolution — confirm.
    """
    seconds = timestamps.astype(np.int64) // 10**9
    return seconds.to_numpy()[:, np.newaxis]


start = _as_int_seconds_column(all_uniques['start'])
end = _as_int_seconds_column(all_uniques['end'])

# time_mask[i, j] is True when stories i and j do NOT overlap in time, i.e.
# one of them has ended by the time the other one starts.
time_mask = (start.T >= end) | (end.T <= start)

In [None]:
# year_mask[i, j] is True when stories i and j come from the same year.
years = all_uniques['year'].astype(int).to_numpy()
year_mask = (years[np.newaxis, :] == years[:, np.newaxis])

In [None]:
# kind_mask[i, j] is True when stories i and j have the same `kind`.
kinds = all_uniques['kind'].to_numpy()
kind_mask = (kinds[np.newaxis, :] == kinds[:, np.newaxis])

# Compute intercommunity similarity

In [None]:
# Scale every community's mean embedding to unit length so that the matrix
# product below gives pairwise cosine similarities.
norms = np.linalg.norm(all_mean_embs, axis=1, keepdims=True)
all_norm_mean_embs = all_mean_embs / norms
all_sims = all_norm_mean_embs @ all_norm_mean_embs.T

# Keep each unordered pair exactly once (strictly above the diagonal, so no
# self-comparisons) and only where the time/year/kind masks allow it.
# Everything else becomes NaN. np.triu alone would leave 0.0 on and below the
# diagonal, and those zeros could pass a later `> thresh` test whenever the
# threshold is not positive.
upper_mask = np.triu(np.ones(all_sims.shape, dtype=bool), 1)
all_sims = np.where(upper_mask & time_mask & year_mask & kind_mask, all_sims, np.nan)

# Find a threshold

In [None]:
# Subset of stories to inspect separately while choosing the threshold:
# here, the stories whose `count` is at least 10.
mask = all_uniques['count'].ge(10)

In [None]:
# Summary statistics of the similarities strictly above the diagonal.
upper_sims = all_sims[np.triu_indices(all_sims.shape[0], 1)]

with pd.option_context('display.float_format', lambda x: '%.3f' % x):
    display(pd.Series(upper_sims.flatten()).describe())

In [None]:
# Same summary, restricted to pairs in which both stories are in `mask`.
subset_sims = all_sims[mask, :][:, mask]
subset_upper_sims = subset_sims[np.triu_indices(mask.sum(), 1)]

with pd.option_context('display.float_format', lambda x: '%.3f' % x):
    display(pd.Series(subset_upper_sims.flatten()).describe())

In [None]:
# Similarities strictly above the diagonal: all stories vs. the `mask` subset.
hist_all = pd.Series(all_sims[np.triu_indices(all_sims.shape[0], 1)].flatten())
hist_subset = pd.Series(all_sims[mask, :][:, mask][np.triu_indices(mask.sum(), 1)].flatten())

# Side-by-side histograms, one panel per set of pairs.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
ax_all, ax_subset = axes

hist_all.hist(bins=50, ax=ax_all)
ax_all.set_title('all sims')

hist_subset.hist(bins=50, ax=ax_subset)
ax_subset.set_title('mask subset')

In [None]:
# Threshold: the 99.99th percentile of the non-NaN similarities strictly
# above the diagonal.
candidate_sims = all_sims[np.triu_indices(all_sims.shape[0], 1)]
thresh = np.nanpercentile(candidate_sims.flatten(), 99.99)

thresh

# Merge matches

In [None]:
# One row per pair whose similarity exceeds the threshold. `source` and
# `target` are the row and column positions in all_sims. NaN entries never
# compare greater, so masked-out pairs drop out automatically.
matched = pd.DataFrame(np.argwhere(all_sims > thresh), columns=['source', 'target'])

matched.shape

In [None]:
# Undirected graph with one edge per matched pair of stories.
G = nx.from_pandas_edgelist(matched, source='source', target='target', create_using=nx.Graph)

In [None]:
# Connected components of the match graph, largest first. Each component is
# a set of story positions that are merged together.
match_comps = sorted(nx.connected_components(G), key=len, reverse=True)

# Inspect matches

In [None]:
# check: every story should appear in exactly one component
n_merged_stories = sum(map(len, match_comps))
assert n_merged_stories == len(set().union(*match_comps))

# how many stories are being merged?
n_merged_stories

In [None]:
# How many merged stories will there be? Each connected component of the
# match graph is one group of stories to merge.
len(match_comps)

In [None]:
# Does the rate of being selected for merging differ between our mask subset
# and the rest of the stories? Flag: 1 if the story is a node of the match
# graph, 0 otherwise.
merged_positions = list(G.nodes)
tmp = pd.Series(0, index=all_uniques.index)
tmp.iloc[merged_positions] = 1

tmp.groupby(mask).describe()

In [None]:
# Does the rate of being selected for merging differ between Twitter and
# radio? Flag: 1 if the story is a node of the match graph, 0 otherwise.
merged_positions = list(G.nodes)
tmp = pd.Series(0, index=all_uniques.index)
tmp.iloc[merged_positions] = 1

tmp.groupby(all_uniques['kind']).describe()

In [None]:
# Does the rate of being selected for merging differ between 2019 and 2020?
# Flag: 1 if the story is a node of the match graph, 0 otherwise.
merged_positions = list(G.nodes)
tmp = pd.Series(0, index=all_uniques.index)
tmp.iloc[merged_positions] = 1

tmp.groupby(all_uniques['year']).describe()

In [None]:
# Edge list of matched story pairs; `source` and `target` are row positions
# in all_uniques.
matched

In [None]:
# How many to-be-merged groups of size 2, 3, ... are there?
component_sizes = pd.Series([len(comp) for comp in match_comps])
component_sizes.value_counts().sort_index()

In [None]:
# Examine a random match: show both stories' summary rows and up to ten
# sampled rows from each of them.
with pd.option_context('display.max_colwidth', 0):
    i1, i2 = matched.sample(1).iloc[0, :].tolist()
    story1, story2 = all_uniques.iloc[i1, :], all_uniques.iloc[i2, :]

    # The (year, kind, group) triple identifies a story's rows in dat.
    year1, kind1, group1 = (story1[key] for key in ('year', 'kind', 'group'))
    year2, kind2, group2 = (story2[key] for key in ('year', 'kind', 'group'))

    tmp1 = dat.loc[(dat['year'] == year1) & (dat['kind'] == kind1) & (dat['group'] == group1), :]
    tmp2 = dat.loc[(dat['year'] == year2) & (dat['kind'] == kind2) & (dat['group'] == group2), :]

    display(story1)
    display(tmp1.sample(min(tmp1.shape[0], 10)))

    print('\n')

    display(story2)
    display(tmp2.sample(min(tmp2.shape[0], 10)))

In [None]:
# How far apart in time are the stories we're merging?
def _component_date_row(ind, comp):
    """Summarise one connected component of the match graph as a table row.

    `ind` is the component's position in match_comps and `comp` is its set of
    graph nodes. The nodes are row *positions* in all_uniques, so they are
    looked up with .iloc (as in the other cells) rather than by label, and
    the member rows are fetched once instead of once per statistic.
    """
    members = all_uniques.iloc[list(comp)]
    starts, ends = members['start'], members['end']

    return [
        ind,
        # .item() raises if the component mixes more than one kind.
        members['kind'].unique().item(),
        len(comp),

        starts.min(), starts.max(),
        ends.min(), ends.max(),

        # Mean duration of the individual stories before merging.
        (ends - starts).mean(),
    ]


dates = pd.DataFrame(
    [_component_date_row(i, comp) for i, comp in enumerate(match_comps)],
    columns=['match_comps_ind', 'kind', 'num_merged_stories', 'start_min', 'start_max', 'end_min', 'end_max', 'avg_story_duration'],
)

dates = dates.set_index('match_comps_ind')

# Span from the earliest start to the latest end of the merged story.
dates['merged_duration'] = dates['end_max'] - dates['start_min']

dates

# Write out merged communities

In [None]:
# Work on a copy and keep the original assignment next to the merged one.
comms_merged = comms.copy()
comms_merged['group_pre_merge'] = comms_merged['group'].copy()

# Relabel every community in a matched component with a single id: the
# largest group id among the component's members.
# NOTE(review): relies on comms having 'year' and 'kind' columns, which is
# not visible from this notebook — confirm.
for c in match_comps:
    members = all_uniques.iloc[list(c)]

    # .item() raises if a component mixes more than one year or kind.
    year = members['year'].unique().item()
    kind = members['kind'].unique().item()

    groups = members['group'].unique()
    new_group_id = groups.max()

    same_year = comms_merged['year'] == year
    same_kind = comms_merged['kind'] == kind
    in_component = comms_merged['group'].isin(groups)

    comms_merged.loc[same_year & same_kind & in_component, 'group'] = new_group_id

In [None]:
# Persist the merged assignments, keeping the index as the first CSV column.
merged_path = 'data/paper-round-3/event-annotated/auto-sample-communities-merged-pre-filter.csv.gz'
with gzip.open(merged_path, mode='wt') as fh:
    comms_merged.to_csv(fh, index=True)