In [None]:
import os
import gzip
import random
import logging

import psycopg2
import numpy as np
import pandas as pd

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

from IPython.display import display
from tqdm.notebook import tqdm

In [None]:
# Logger for this notebook's own messages.
logger = logging.getLogger(__name__)

# Root logging configuration: INFO and above, each record prefixed with
# its time and level.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

# Raise the threshold of the "gensim" logger so only WARNING and above pass.
gensim_logger = logging.getLogger("gensim")
gensim_logger.setLevel(logging.WARNING)

In [None]:
# Work from the project checkout so the relative data/ paths used in the
# cells below resolve.
project_dir = os.path.expanduser('~/github/masthesis/')
os.chdir(project_dir)

In [None]:
# Single fixed seed, reused as random_state by the sampling cells below.
seed = 2969591811

# Seed both the stdlib and the NumPy global generators with it.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(seed)

# Twitter

In [None]:
# Load the processed tweets, indexed by their 'id' column, with 'timestamp'
# parsed as datetimes.
elite_data = pd.read_csv(
    'data/twitter/tweets-processed.csv',
    index_col='id',
    parse_dates=['timestamp'],
)

# Keep rows that fall inside either study window (both bounds inclusive)
# and that have non-missing content.
elite_ts = elite_data['timestamp']
in_2019_window = elite_ts.between('2019-09-01', '2019-11-01')
in_2020_window = elite_ts.between('2020-03-01', '2020-05-01')
has_content = elite_data['content'].notna()

elite_data = elite_data.loc[(in_2019_window | in_2020_window) & has_content, :]

# Decahose

In [None]:
# Read the first 18,000,000 rows of the decahose sample file, indexed by 'id'.
decahose_data = pd.read_csv(
    'data/samples/decahose/decahose-10pct-random-sort-20230410.csv',
    nrows=18_000_000,
    index_col='id',
    parse_dates=['postedtime'],
)

# Use the same column names as the other sources.
decahose_data = decahose_data.rename(
    columns={'postedtime': 'timestamp', 'body': 'content'},
)

# When an id occurs more than once, keep only its first row.
is_repeat_id = decahose_data.index.duplicated()
decahose_data = decahose_data.loc[~is_repeat_id, :]

# Attach the UTC timezone to the timestamps (tz_localize requires them to
# be tz-naive at this point).
utc_timestamps = decahose_data['timestamp'].dt.tz_localize('utc')
decahose_data['timestamp'] = utc_timestamps

# Radio

In [None]:
# Load the gzipped radio snippets, indexed by 'snippet_id', with
# 'timestamp' parsed as datetimes.
radio_path = 'data/paper-round-3/radio/paper-round-3-snippets.csv.gz'
with gzip.open(radio_path, mode='rt') as radio_file:
    radio_data = pd.read_csv(
        radio_file,
        index_col='snippet_id',
        parse_dates=['timestamp'],
    )

# Drop snippets with missing content.
has_content = radio_data['content'].notna()
radio_data = radio_data.loc[has_content, :]

# Known-bad snippets: a snippet is dropped when its entire content is
# exactly one of these strings. Removing them here saves dealing with
# them in later steps.
bad_lines = [
    '[noise]',
    '[noise] [noise]',
    '<unk>',
    '[laughter]',
    'thank you',
    'mm',
    'and',
    'the',
    'thanks',
    'a',
    '[noise] [noise] [noise]',
    'oh',
    'um',
    'i',
    'na',
]

is_bad_line = radio_data['content'].isin(bad_lines)
radio_data = radio_data.loc[~is_bad_line, :]

# Combined rebalanced dataset

In [None]:
def make_reltime(s):
    """Express a datetime Series as seconds elapsed since its earliest value.

    Parameters
    ----------
    s : pandas.Series
        Timestamps. They are subtracted from a UTC-aware epoch, so they
        need to be timezone-aware.

    Returns
    -------
    pandas.Series
        Float seconds with the same index as ``s``; the smallest value is 0.
        The input Series is not modified.
    """
    unix_epoch = pd.Timestamp('1970-01-01T00:00:00+00:00')

    # Seconds since the Unix epoch, then shifted so the earliest row is 0.
    seconds_since_epoch = (s - unix_epoch).dt.total_seconds()
    return seconds_since_epoch - seconds_since_epoch.min()

## 2019

In [None]:
# Elite rows dated before 2020: keep content and timestamp, turn the 'id'
# index into a column, and tag each row with its source and year.
elite_before_2020 = elite_data['timestamp'] < '2020-01-01'
ed2019 = (
    elite_data
    .loc[elite_before_2020, ['content', 'timestamp']]
    .reset_index()
    .assign(kind='elite', year=2019)
)

In [None]:
# Radio rows dated before 2020: keep content and timestamp, turn the
# 'snippet_id' index into a column named 'id', and tag source and year.
radio_before_2020 = radio_data['timestamp'] < '2020-01-01'
rd2019 = (
    radio_data
    .loc[radio_before_2020, ['content', 'timestamp']]
    .reset_index()
    .assign(kind='radio', year=2019)
    .rename(columns={'snippet_id': 'id'})
)

In [None]:
# Decahose rows dated before 2020: keep content and timestamp, turn the
# 'id' index into a column, and tag each row with its source and year.
decahose_before_2020 = decahose_data['timestamp'] < '2020-01-01'
dd2019 = (
    decahose_data
    .loc[decahose_before_2020, ['content', 'timestamp']]
    .reset_index()
    .assign(kind='decahose', year=2019)
)

In [None]:
# Row counts of the 2019 frames, in the order (elite, decahose, radio).
tuple(len(frame) for frame in (ed2019, dd2019, rd2019))

In [None]:
# Decahose is oversampled by this factor: there's a lot of irrelevant cruft
# in it that isn't about news; we filter it out after detecting stories and
# empirically about 1/6 of the content is relevant.
decahose_factor = 6

# Largest per-source sample size that all three frames can supply without
# replacement. Decahose has to provide decahose_factor * size rows, so it
# bounds size by its row count divided by the factor. Bounding by its raw
# row count instead makes DataFrame.sample(replace=False) raise ValueError
# whenever decahose holds fewer than decahose_factor * size rows. When
# decahose is large enough, this gives the same size as the plain minimum
# of the three row counts.
size = min(
    ed2019.shape[0],
    rd2019.shape[0],
    dd2019.shape[0] // decahose_factor,
)

# Equal-sized elite and radio samples, plus the oversampled decahose draw.
# NOTE: this cell overwrites its inputs, so re-running it on its own
# resamples the already-sampled frames.
ed2019 = ed2019.sample(n=size, replace=False, random_state=seed)
rd2019 = rd2019.sample(n=size, replace=False, random_state=seed)
dd2019 = dd2019.sample(n=decahose_factor * size, replace=False, random_state=seed)

In [None]:
# Stack the three 2019 samples into one frame.
frames_2019 = [ed2019, dd2019, rd2019]
dat2019 = pd.concat(frames_2019, axis=0)

# Prefix every id with a letter for its source and check that the
# resulting ids are all distinct.
source_prefix = dat2019['kind'].map({'elite': 'E', 'radio': 'R', 'decahose': 'D'})
dat2019['id'] = source_prefix + dat2019['id'].astype(str)
assert dat2019['id'].nunique() == dat2019.shape[0]

# Seconds since the earliest 2019 row; order the frame chronologically.
reltime_2019 = make_reltime(dat2019['timestamp'])
dat2019['reltime'] = reltime_2019
dat2019 = dat2019.sort_values('reltime')

## 2020

In [None]:
# Elite rows dated 2020 or later: keep content and timestamp, turn the
# 'id' index into a column, and tag each row with its source and year.
elite_from_2020 = elite_data['timestamp'] >= '2020-01-01'
ed2020 = (
    elite_data
    .loc[elite_from_2020, ['content', 'timestamp']]
    .reset_index()
    .assign(kind='elite', year=2020)
)

In [None]:
# Decahose rows dated 2020 or later: keep content and timestamp, turn the
# 'id' index into a column, and tag each row with its source and year.
decahose_from_2020 = decahose_data['timestamp'] >= '2020-01-01'
dd2020 = (
    decahose_data
    .loc[decahose_from_2020, ['content', 'timestamp']]
    .reset_index()
    .assign(kind='decahose', year=2020)
)

In [None]:
# Radio rows dated 2020 or later: keep content and timestamp, turn the
# 'snippet_id' index into a column named 'id', and tag source and year.
radio_from_2020 = radio_data['timestamp'] >= '2020-01-01'
rd2020 = (
    radio_data
    .loc[radio_from_2020, ['content', 'timestamp']]
    .reset_index()
    .assign(kind='radio', year=2020)
    .rename(columns={'snippet_id': 'id'})
)

In [None]:
# Row counts of the 2020 frames, in the order (elite, decahose, radio).
tuple(len(frame) for frame in (ed2020, dd2020, rd2020))

In [None]:
# Decahose is oversampled by this factor: there's a lot of irrelevant cruft
# in it that isn't about news; we filter it out after detecting stories and
# empirically about 1/6 of the content is relevant.
decahose_factor = 6

# Largest per-source sample size that all three frames can supply without
# replacement. Decahose has to provide decahose_factor * size rows, so it
# bounds size by its row count divided by the factor. Bounding by its raw
# row count instead makes DataFrame.sample(replace=False) raise ValueError
# whenever decahose holds fewer than decahose_factor * size rows. When
# decahose is large enough, this gives the same size as the plain minimum
# of the three row counts.
size = min(
    ed2020.shape[0],
    rd2020.shape[0],
    dd2020.shape[0] // decahose_factor,
)

# Equal-sized elite and radio samples, plus the oversampled decahose draw.
# NOTE: this cell overwrites its inputs, so re-running it on its own
# resamples the already-sampled frames.
ed2020 = ed2020.sample(n=size, replace=False, random_state=seed)
rd2020 = rd2020.sample(n=size, replace=False, random_state=seed)
dd2020 = dd2020.sample(n=decahose_factor * size, replace=False, random_state=seed)

In [None]:
# Stack the three 2020 samples into one frame.
frames_2020 = [ed2020, dd2020, rd2020]
dat2020 = pd.concat(frames_2020, axis=0)

# Prefix every id with a letter for its source and check that the
# resulting ids are all distinct.
source_prefix = dat2020['kind'].map({'elite': 'E', 'radio': 'R', 'decahose': 'D'})
dat2020['id'] = source_prefix + dat2020['id'].astype(str)
assert dat2020['id'].nunique() == dat2020.shape[0]

# Seconds since the earliest 2020 row; order the frame chronologically.
reltime_2020 = make_reltime(dat2020['timestamp'])
dat2020['reltime'] = reltime_2020
dat2020 = dat2020.sort_values('reltime')

## Combine

In [None]:
# Put the 2019 rows and the 2020 rows into a single frame, 2019 first.
yearly_frames = [dat2019, dat2020]
dat = pd.concat(yearly_frames, axis=0)

# Ids must stay distinct across both years.
n_distinct_ids = dat['id'].nunique()
assert n_distinct_ids == dat.shape[0]

In [None]:
# Drop rows whose content is missing or is the empty string.
content = dat['content']
has_text = content.notna() & content.ne('')
dat = dat.loc[has_text, :]

# Inspect

In [None]:
# Number of rows per source ('kind').
dat['kind'].value_counts()

In [None]:
# Number of rows per year.
dat['year'].value_counts()

In [None]:
# Number of rows for each (year, kind) combination.
dat.groupby(['year', 'kind']).size()

In [None]:
# Summary statistics of reltime over all rows. reltime is computed
# separately for each year, so both years' values start at 0.
dat['reltime'].describe()

In [None]:
# Summary statistics of reltime within each year.
dat.groupby('year')['reltime'].describe()

In [None]:
# Summary statistics of reltime within each (year, kind) combination.
dat.groupby(['year', 'kind'])['reltime'].describe()

In [None]:
# Show ten randomly chosen rows with the column-width limit lifted, so the
# cell contents are displayed untruncated.
sampled_rows = dat.sample(10)
with pd.option_context('max_colwidth', None):
    display(sampled_rows)

# Write out

In [None]:
# Write the combined sample as a gzipped CSV, without the index column.
out_path = 'data/paper-round-3/event-annotated/auto-sample-pre-whisper.csv.gz'
with gzip.open(out_path, mode='wt') as out_file:
    dat.to_csv(out_file, index=False)