In [None]:
import os, io
import gzip
import random
import pickle
import logging

import numpy as np
import pandas as pd

import networkx as nx
from networkx.algorithms.community import louvain_communities, modularity, partition_quality

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

from IPython.display import display
from tqdm.notebook import tqdm, trange

In [None]:
# Root logging configuration: "timestamp : level : message", INFO and above.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

# Logger used by the cells of this notebook.
logger = logging.getLogger(__name__)

In [None]:
# Render floats with three decimals in every displayed frame.
pd.set_option('display.float_format', '{:.3f}'.format)

In [None]:
# Switch into the project directory; the data paths used below are relative to it.
project_dir = os.path.expanduser('~/github/masthesis/')
os.chdir(project_dir)

In [None]:
# One fixed seed for both global RNGs (NumPy and the stdlib `random` module).
seed = 2_969_591_811

np.random.seed(seed)
random.seed(seed)

# Load data

In [None]:
# Read the gzip-compressed sample CSV; `timestamp` is parsed as a datetime column.
dat_path = 'data/paper-round-3/event-annotated/auto-sample.csv.gz'
with gzip.open(dat_path, mode='rt') as fh:
    dat = pd.read_csv(fh, parse_dates=['timestamp'])

# `id` must identify rows uniquely (nunique ignores NaN, so nulls fail this too).
n_rows = len(dat)
assert dat['id'].nunique() == n_rows

In [None]:
# Per-(year, kind) threshold table; the file's `split` column is exposed as `kind`.
thresholds_path = 'data/paper-round-3/event-annotated/auto-sample-sim-thresholds.csv'
thresholds = pd.read_csv(thresholds_path).rename(columns={'split': 'kind'})

# Keep the threshold shipped in the file (only the disabled sanity check below
# reads it) and record how many standard deviations above the mean to cut at.
thresholds = thresholds.assign(
    old_threshold=thresholds['threshold'],
    threshold_num_sds=5,
)
# thresholds.loc[(thresholds['kind'] == 'radio') & (thresholds['year'] == 2022), 'threshold_num_sds'] = 3

# Recompute the cutoff as mean + k * sd, overwriting the file's `threshold` column.
thresholds = thresholds.assign(
    threshold=thresholds['mean'] + thresholds['threshold_num_sds'] * thresholds['sd'],
)

# assert (thresholds['threshold'] >= thresholds['old_threshold'] - thresholds['sd']).all()  # edges above this value are kept
thresholds = thresholds.drop(columns='old_threshold')

In [None]:
# Load every edge file under the edge directory and keep only the rows whose
# `sim` is not below the threshold of their own (year, kind).
edge_dir = 'data/paper-round-3/event-annotated/auto-sample-sim-edges/'

edges = []
with tqdm() as pbar:
    for root, dirs, files in os.walk(edge_dir):
        for name in sorted(files):
            # Files whose name starts with '2022-' are left out of this analysis.
            if name.startswith('2022-'):
                continue

            try:
                with gzip.open(os.path.join(root, name), 'rt') as f:
                    tmp = pd.read_csv(f, dtype={'kind': 'category'})

                if len(tmp) == 0:
                    continue

                assert tmp['year'].notna().all()
                assert tmp['kind'].notna().all()

                tmp['year'] = tmp['year'].astype(int)

                # Resolve the threshold per (year, kind) group instead of from
                # the first row only, so a file that mixes groups is still
                # filtered with the right cutoff for every row.
                for (year, kind), grp in tmp.groupby(['year', 'kind'], observed=True):
                    # `.item()` raises unless exactly one threshold row matches.
                    threshold = thresholds.loc[
                        (thresholds['year'] == year) & (thresholds['kind'] == kind),
                        'threshold',
                    ].item()

                    # Drop rows strictly below the cutoff; rows with NaN `sim`
                    # are not below it and therefore stay, as before.
                    kept = grp.loc[~(grp['sim'] < threshold)].assign(threshold=threshold)
                    edges.append(kept)

                # Counts files that were actually loaded (skipped files excluded).
                pbar.update(1)
            except Exception:
                # Same reporting style as the community-detection cell.
                logger.exception(f'failed while loading edges from {os.path.join(root, name)}')
                raise

edges = pd.concat(edges, axis=0)

# Every edge endpoint must be a known item id.
assert edges['source'].isin(dat['id']).all()
assert edges['target'].isin(dat['id']).all()

## Examine

In [None]:
# Display the threshold table as it stands after the recomputation above.
thresholds

In [None]:
# Row counts in `dat` for each (year, kind) combination.
dat_rows_per_group = dat.groupby(['year', 'kind']).size()
dat_rows_per_group

In [None]:
# Number of loaded edges per kind.
edges_per_kind = edges.groupby(['kind']).size()
edges_per_kind

In [None]:
# Number of loaded edges per (kind, year).
edges_per_kind_year = edges.groupby(['kind', 'year']).size()
edges_per_kind_year

In [None]:
# Summary statistics of `sim` within each kind.
sim_by_kind = edges.groupby(['kind'])['sim']
sim_by_kind.describe()

In [None]:
# Summary statistics of `sim` within each (kind, year).
sim_by_kind_year = edges.groupby(['kind', 'year'])['sim']
sim_by_kind_year.describe()

In [None]:
# Per-kind count of edges whose `sim` reaches the row's own threshold.
above_threshold = edges['sim'].ge(edges['threshold'])
edges.loc[above_threshold].groupby(['kind']).size()

In [None]:
# Per-(kind, year) count of edges whose `sim` reaches the row's own threshold.
above_threshold = edges['sim'].ge(edges['threshold'])
edges.loc[above_threshold].groupby(['kind', 'year']).size()

In [None]:
# Random subset of edges for the histograms below. The size is capped at the
# number of available rows (sampling without replacement would otherwise raise
# on a small frame), and the draw is pinned to the notebook seed so re-running
# this cell yields the same sample.
tmp = edges.sample(n=min(10000, len(edges)), random_state=seed)

In [None]:
# Histogram of `sim` in the sampled edges, one panel per kind
# (layout=(1, 3) provides three panels in a single row).
tmp['sim'].hist(by=[tmp['kind']], bins=50, figsize=(15, 5), layout=(1, 3))

In [None]:
# Histogram of `sim` in the sampled edges, one panel per (kind, year)
# (layout=(4, 3) provides twelve panels).
tmp['sim'].hist(by=[tmp['kind'], tmp['year']], bins=50, figsize=(15, 20), layout=(4, 3))

# Find communities

In [None]:
# Directory that holds one cached pickle per (kind, year); created on first use.
cache_dir = 'data/paper-round-3/event-annotated/auto-sample-communities/'
os.makedirs(name=cache_dir, exist_ok=True)

In [None]:
# Louvain community detection per (kind, year), cached as one pickle per spec.
#
# For each spec the thresholded edges form an undirected graph; its connected
# components are sorted by size and every component with at least
# `min_comp_size` nodes is partitioned with Louvain (weighted by `sim`).
# Each pickle stores (graph stats, per-component quality stats, components,
# communities). Specs whose pickle already exists are skipped.
min_comp_size = 300  # components with fewer nodes are not partitioned

# Compute the thresholded edge set once instead of re-evaluating the mask per loop.
kept_edges = edges.loc[edges['sim'] >= edges['threshold']]

for kind in tqdm(kept_edges['kind'].unique()):
    for year in tqdm(kept_edges.loc[kept_edges['kind'] == kind, 'year'].unique()):
        spec = (kind, year)

        pth = os.path.join(cache_dir, '-'.join([str(s) for s in spec]) + '.pkl')
        if os.path.exists(pth):
            logger.warning(f'File {pth} exists; skipping')
            continue

        try:
            G = nx.from_pandas_edgelist(
                kept_edges.loc[(kept_edges['kind'] == kind) & (kept_edges['year'] == year), :],
                edge_attr='sim',
                create_using=nx.Graph
            )

            # (number of nodes, number of edges)
            tmp_G_stats = (G.order(), G.size())

            tmp_comps = sorted(nx.connected_components(G), key=len, reverse=True)

            # Components are sorted largest-first, so the ones to partition are
            # a prefix of the list. Counting them (instead of taking the max
            # index of a filtered Series) also works when none qualifies; the
            # old expression produced NaN there and the slice raised TypeError.
            n_large = sum(1 for c in tmp_comps if len(c) >= min_comp_size)

            tmp_comms, tmp_comm_stats = [], {}
            for i, c in enumerate(tmp_comps[:n_large]):
                sub = G.subgraph(c)

                comp_comms = louvain_communities(
                    sub,
                    weight='sim',
                    resolution=1,
                    seed=seed,
                )

                # Edges carry only a `sim` attribute, so the default weight key
                # is absent and the first modularity is effectively unweighted.
                mod = modularity(sub, comp_comms)
                modw = modularity(sub, comp_comms, weight='sim')
                pq = partition_quality(sub, comp_comms)  # (coverage, performance)

                tmp_comms += comp_comms
                tmp_comm_stats[tuple(list(spec) + [i])] = {
                    'modularity': mod,
                    'modularity_weighted': modw,
                    'coverage': pq[0],
                    'performance': pq[1],
                }
        except Exception:
            logger.exception(f'failed on community finding for {spec}')
            raise
        else:
            # Write to a temporary file and rename it into place, so that an
            # interrupted run cannot leave a truncated pickle that later runs
            # would skip over as already cached.
            tmp_pth = pth + '.tmp'
            with open(tmp_pth, 'wb') as f:
                pickle.dump(
                    (tmp_G_stats, tmp_comm_stats, tmp_comps, tmp_comms),
                    f
                )
            os.replace(tmp_pth, pth)

In [None]:
# Reload the cached result of every (kind, year) spec into dictionaries.
# G_stats / comps / comms are keyed by spec; comm_stats merges the
# per-component entries stored in each pickle.
G_stats, comm_stats, comps, comms = {}, {}, {}, {}

above_threshold = edges['sim'] >= edges['threshold']
for kind in tqdm(edges.loc[above_threshold, 'kind'].unique()):
    years = edges.loc[above_threshold & (edges['kind'] == kind), 'year'].unique()
    for year in tqdm(years):
        spec = (kind, year)
        pth = os.path.join(cache_dir, '-'.join(str(s) for s in spec) + '.pkl')

        if not os.path.exists(pth):
            raise RuntimeError(f'File {pth} does not exist')

        # NOTE: unpickling can execute arbitrary code; only load caches that
        # this notebook wrote itself.
        with open(pth, 'rb') as f:
            cached = pickle.load(f)

        tmp_G_stats, tmp_comm_stats, tmp_comps, tmp_comms = cached
        G_stats[spec] = tmp_G_stats
        comps[spec] = tmp_comps
        comms[spec] = tmp_comms
        comm_stats.update(tmp_comm_stats)

# Inspect

## Graph sizes

In [None]:
# One row per (kind, year) with the graph's node count (`order`) and edge
# count (`size`). This rebinds G_stats from a dict to a DataFrame, so the cell
# is not idempotent: re-run the loading cell before re-running this one.
G_stats = pd.DataFrame(G_stats).transpose().rename(columns={0: 'order', 1: 'size'})

In [None]:
# Display the per-(kind, year) graph sizes.
G_stats

## Components

In [None]:
# Component sizes per (kind, year), in the stored (largest-first) order.
comp_lens = {}
for spec_key, spec_comps in comps.items():
    comp_lens[spec_key] = pd.Series([len(members) for members in spec_comps])

In [None]:
# Summary statistics of component sizes, one row per (kind, year).
comp_len_summary = pd.DataFrame({
    spec_key: sizes.describe()
    for spec_key, sizes in comp_lens.items()
})
comp_len_summary.T

In [None]:
# Sizes of the first 15 components of each (kind, year), one row per spec.
comp_len_top = pd.DataFrame({
    spec_key: sizes.head(15)
    for spec_key, sizes in comp_lens.items()
})
comp_len_top.T

## Communities

In [None]:
# Community sizes per (kind, year).
comm_lens = {}
for spec_key, spec_comms in comms.items():
    comm_lens[spec_key] = pd.Series([len(members) for members in spec_comms])

In [None]:
# Summary statistics of community sizes, one row per (kind, year).
comm_len_summary = pd.DataFrame({
    spec_key: sizes.describe()
    for spec_key, sizes in comm_lens.items()
})
comm_len_summary.T

In [None]:
# One row per (kind, year, component index) with the partition-quality metrics.
# This rebinds comm_stats from a dict to a DataFrame, so the cell is not
# idempotent: re-run the loading cell before re-running this one.
comm_stats = pd.DataFrame(comm_stats).transpose()

In [None]:
# Display the per-component partition-quality table.
comm_stats

# Assemble communities

In [None]:
# Flatten the communities into one row per (kind, year, id); `group` is the
# community's position in that spec's community list.
# The columns are fixed explicitly so the frame keeps its schema even when no
# community was found at all (an empty record list would otherwise give a
# column-less frame and the check below would raise KeyError).
assignment_columns = ['kind', 'year', 'group', 'id']
assignments = pd.DataFrame(
    [
        {'kind': k, 'year': y, 'group': i, 'id': idv}
        for (k, y), obj in comms.items()
        for i, c in enumerate(obj)
        for idv in c
    ],
    columns=assignment_columns,
)

# An id may belong to at most one community within a (kind, year).
assert not assignments[['kind', 'year', 'id']].duplicated().any()

In [None]:
# tmp = dat.loc[dat['id'].isin(assignments['id'])]
# tmp['reltime'].hist(by=tmp['year'])

# Write them out

In [None]:
# Persist the community assignments as a gzip-compressed CSV without the index.
out_path = 'data/paper-round-3/event-annotated/auto-sample-communities.csv.gz'
with gzip.open(out_path, mode='wt') as fh:
    assignments.to_csv(fh, index=False)