In [None]:
import os
import gzip
import random
import logging

import numpy as np
import pandas as pd

from IPython.display import display
from tqdm.notebook import tqdm

In [None]:
# Configure the root logger once: INFO level, timestamped messages.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

# Logger used by this notebook's own messages.
logger = logging.getLogger(__name__)

In [None]:
# All data paths in this notebook are relative to the project checkout,
# so make it the working directory first.
project_dir = os.path.expanduser('~/github/masthesis/')
os.chdir(project_dir)

# Params

In [None]:
# Tunable parameters for the similarity-threshold estimation below.
# Each sampling window starts at one block and extends up to `n_blocks`
# blocks forward (clipped at the end of the year's time range).
n_blocks = 4
# Width of one time block, in the units of the `reltime` column.
block_len = 4000  # in seconds
# Default number of standard deviations above the mean similarity used as
# the threshold (specific kind/year combinations are overridden later).
thresh_sds = 5

In [None]:
# One fixed seed drives every random number generator used here.
seed = 2969591811

# Explicit numpy Generator used for the sampling below.
rng = np.random.default_rng(seed)

# The legacy global generators (numpy and stdlib) are seeded too,
# just to be safe.
np.random.seed(seed)
random.seed(seed)

# Load data

In [None]:
# Read the gzip-compressed sample table, parsing `timestamp` as datetimes.
sample_csv = 'data/paper-round-3/event-annotated/auto-sample.csv.gz'
with gzip.open(sample_csv, mode='rt') as f:
    dat = pd.read_csv(f, parse_dates=['timestamp'])

# Sanity check: there must be exactly as many distinct ids as rows.
assert len(dat) == dat['id'].nunique()

In [None]:
# Load the embedding matrix stored alongside the sample table.
with open('data/paper-round-3/event-annotated/auto-sample-embeds.npy', 'rb') as f:
    embs = np.load(f)

# Later cells select rows of `embs` with boolean masks built from `dat`,
# so the two must line up row-for-row. Fail here, with a clear message,
# rather than in the middle of the long-running sampling loop.
assert embs.ndim == 2, f'expected a 2-D embedding matrix, got shape {embs.shape}'
assert embs.shape[0] == dat.shape[0], (
    f'embeddings have {embs.shape[0]} rows but the sample table has {dat.shape[0]}'
)

# Estimate mean and SD of embedding similarities

In [None]:
# For every (year, kind) pair, estimate the mean and SD of the dot-product
# similarity between randomly paired embeddings that fall into the same
# sliding time window, then derive a threshold of mean + k * SD.
#
# Result: `sims`, one row per (year, kind) with columns
#   kind, mean, sd, year, threshold_num_sds, threshold

sample_size = 100        # rows drawn per side of each window's similarity matrix
min_last_window = 10000  # the final window is skipped when it has fewer rows than this

kinds = dat['kind'].unique()

records = []
for year in tqdm(dat['year'].unique(), desc='year'):
    # NOTE(review): years are enumerated from the `year` column but rows are
    # selected by the year of `timestamp`; this assumes the two agree - confirm.
    year_mask = (dat['timestamp'].dt.year == year).to_numpy()
    if not year_mask.any():
        # Without this guard the window count below would be int(ceil(NaN)).
        logger.warning('no rows with timestamp year %s; skipping', year)
        continue

    year_kinds = dat.loc[year_mask, 'kind']
    year_reltimes = dat.loc[year_mask, 'reltime']
    # Re-base times so the year's earliest row is at 0. A new Series is
    # created instead of using in-place `-=` on a `.loc` selection.
    year_reltimes = year_reltimes - year_reltimes.min()
    year_embs = embs[year_mask, :]

    # Number of `block_len`-wide blocks needed to cover the year (min is 0 now).
    total = int(np.ceil(year_reltimes.max() / block_len))
    # Visit window start blocks in a seeded random order; each window spans
    # up to `n_blocks` blocks and is clipped at the end of the range.
    starts = pd.Series(np.arange(total)).sample(frac=1, random_state=seed)
    ends = (starts + n_blocks).clip(upper=total)
    last_start = total - 1

    for kind in tqdm(kinds, desc=f'{year} kinds', leave=False):
        kind_mask = (year_kinds == kind)

        kind_sims = []
        for start, end in tqdm(zip(starts, ends), total=len(starts),
                               desc=str(kind), leave=False):
            time_mask = (
                (year_reltimes >= start * block_len)
                & (year_reltimes <= end * block_len)
            )
            mask = (kind_mask & time_mask).to_numpy()
            n_rows = int(mask.sum())
            if n_rows == 0:
                continue

            if start == last_start and n_rows < min_last_window:  # last batch can be small
                continue

            # Select the window's embeddings once and reuse them for both sides.
            window_embs = year_embs[mask, :]

            # Two independent random subsets of the window's rows; the
            # similarity matrix holds all pairwise dot products between them.
            p0 = rng.permutation(n_rows)[:sample_size]
            p1 = rng.permutation(n_rows)[:sample_size]
            kind_sims.append((window_embs[p0, :] @ window_embs[p1, :].T).ravel())

        if not kind_sims:
            # Previously this case crashed in pd.concat with
            # "No objects to concatenate".
            logger.warning('no similarity samples for kind=%r in year %s; skipping',
                           kind, year)
            continue

        kind_sims = pd.Series(np.concatenate(kind_sims))
        records.append({
            'kind': kind,
            'mean': float(kind_sims.mean()),
            'sd': float(kind_sims.std()),  # sample SD (ddof=1), as in describe()
            'year': year,
        })

if not records:
    raise ValueError('no similarity samples were collected for any (year, kind) pair')

sims = pd.DataFrame(records)
sims['year'] = sims['year'].astype(int)

# Number of SDs above the mean that defines the threshold: the global default,
# with hand-picked overrides for specific kinds / years.
sims['threshold_num_sds'] = thresh_sds
sims.loc[(sims['kind'] == 'decahose'), 'threshold_num_sds'] = 4
sims.loc[(sims['kind'] == 'radio') & (sims['year'] == 2022), 'threshold_num_sds'] = 3
sims['threshold'] = sims['mean'] + sims['threshold_num_sds'] * sims['sd']

## Threshold figures

In [None]:
def _five_decimals(value):
    """Format a float with five digits after the decimal point."""
    return '%.5f' % value


# Show the threshold table with a temporary float format; the global pandas
# display options are restored when the context exits.
with pd.option_context('display.float_format', _five_decimals):
    display(sims)

# Save thresholds

In [None]:
# Persist the per-(year, kind) thresholds; the DataFrame index is not written.
thresholds_csv = 'data/paper-round-3/event-annotated/auto-sample-sim-thresholds.csv'
sims.to_csv(thresholds_csv, index=False)