In [None]:
import os
import gzip
import random
import logging

import numpy as np
import pandas as pd

In [None]:
# Root logging configuration: timestamp, level name, then the message.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

# Logger named after the current module's `__name__`.
logger = logging.getLogger(__name__)

In [None]:
# All paths used below are relative, so make the project checkout the
# working directory first.
project_root = os.path.expanduser('~/github/masthesis/')
os.chdir(project_root)

In [None]:
# One fixed seed shared by the stdlib and NumPy global generators.
seed = 2969591811

for seed_generator in (random.seed, np.random.seed):
    seed_generator(seed)

In [None]:
%%bash

# Recreate the output directory from scratch so files from an earlier run
# cannot linger next to the fresh exports.
out_dir=data/paper-round-3/replication-datasets/
rm -rf "$out_dir"
mkdir -p "$out_dir"

# Manual data

## Twitter

In [None]:
def _export_manual_tweet_ids(source_csv, target_gz):
    """Write the ``id`` column of ``source_csv`` to a gzipped CSV.

    Asserts that the ids are unique, writes them (without the index) to
    ``target_gz``, prints the number of rows and returns the single-column
    frame that was written.
    """
    ids = pd.read_csv(source_csv)[['id']]
    assert ids.nunique().item() == ids.shape[0]
    with gzip.open(target_gz, 'wt') as handle:
        ids.to_csv(handle, index=False)

    print(ids.shape[0])
    return ids


raw_elite = _export_manual_tweet_ids(
    'data/paper-round-3/event-annotated/raw-elite.csv',
    'data/paper-round-3/replication-datasets/manual-elite-tweet-ids.csv.gz',
)

# The decahose annotations are published under the "firehose" file name.
raw_decahose = _export_manual_tweet_ids(
    'data/paper-round-3/event-annotated/raw-decahose.csv',
    'data/paper-round-3/replication-datasets/manual-firehose-tweet-ids.csv.gz',
)

## Radio

### Ticks

In [None]:
ticks = pd.read_csv('data/paper-round-3/event-annotated/ticks-radio.csv')

# Columns that are published: the time/count fields plus every event_* column.
cols = [
    name for name in ticks.columns
    if name in ('timestamp', 'total', 'freq') or name.startswith('event_')
]

# A row is kept only if every column that is NOT published is empty in it,
# and only at the 15-minute frequency.
withheld = [name for name in ticks.columns if name not in cols]
row_ok = ticks[withheld].isna().all(axis=1) & (ticks['freq'] == '15min')
ticks = ticks.loc[row_ok, cols]

ticks.to_csv('data/paper-round-3/replication-datasets/manual-radio-ticks.csv', index=False)

### Item-level data

In [None]:
def _is_public_radio_column(name):
    """True for ``snippet_id``, ``timestamp`` and any ``event_*`` column."""
    return name in ('snippet_id', 'timestamp') or name.startswith('event_')


raw_radio = pd.read_csv('data/paper-round-3/event-annotated/raw-radio.csv')

# Keep only the public columns, in their original order.
cols = list(filter(_is_public_radio_column, raw_radio.columns))
raw_radio = raw_radio.loc[:, cols]

with gzip.open('data/paper-round-3/replication-datasets/manual-radio-raw.csv.gz', 'wt') as out:
    raw_radio.to_csv(out, index=False)

# Automated data

## Load common data

In [None]:
with gzip.open('data/paper-round-3/event-annotated/auto-sample.csv.gz', 'rt') as f:
    dat = pd.read_csv(f, index_col='id', parse_dates=['timestamp'])

# Later cells align other tables on this index, so it must be unique.
assert dat.index.is_unique

# These two columns are left out of everything exported below.
dat = dat.drop(columns=['has_whisper', 'content'])

# Remove rows dated after 2021 (rows with a missing year are not matched
# by the comparison and therefore stay).
after_2021 = dat.index[dat['year'] > 2021]
dat = dat.drop(index=after_2021)

print(dat.shape[0])

## Twitter

In [None]:
def _export_auto_tweet_ids(kind, prefix, target_gz):
    """Export the numeric tweet ids for one ``kind`` of row in ``dat``.

    Selects the rows of ``dat`` whose ``kind`` column equals ``kind``, removes
    the literal ``prefix`` from their ``id`` index values, converts the result
    to 64-bit integers, asserts the ids are unique, writes them gzipped to
    ``target_gz`` and prints the row count.

    Returns the single-column frame that was written.
    """
    ids = dat.loc[dat['kind'] == kind].reset_index()[['id']].copy()

    # regex=False: the prefix is a literal string, not a pattern.
    # int64 is spelled out because plain `int` resolves to a 32-bit integer on
    # some platforms (e.g. Windows with NumPy < 2), which large tweet ids
    # would overflow.
    ids['id'] = ids['id'].str.replace(prefix, '', regex=False).astype('int64')

    assert ids['id'].nunique() == ids.shape[0], 'duplicate tweet ids for kind ' + kind
    with gzip.open(target_gz, 'wt') as handle:
        ids.to_csv(handle, index=False)

    print(ids.shape[0])
    return ids


elite_tweet_ids = _export_auto_tweet_ids(
    'elite', 'E',
    'data/paper-round-3/replication-datasets/auto-elite-tweet-ids.csv.gz',
)

# Rows of kind 'decahose' are published under the "firehose" file name.
firehose_tweet_ids = _export_auto_tweet_ids(
    'decahose', 'D',
    'data/paper-round-3/replication-datasets/auto-firehose-tweet-ids.csv.gz',
)

## Radio

### Item-level

In [None]:
# Everything below works on radio rows only.
not_radio = dat.index[dat['kind'] != 'radio']
dat = dat.drop(index=not_radio)

In [None]:
with gzip.open('data/paper-round-3/event-annotated/auto-sample-communities-merged-pre-filter.csv.gz', 'rt') as f:
    comms = pd.read_csv(f, index_col='id')

# Sanity checks on the community table: unique ids, no missing year.
assert comms.index.is_unique
assert not comms['year'].isna().any()

# Index-aligned assignment: rows of `dat` without an entry in `comms` get NaN.
dat = dat.assign(group=comms['group'])
del comms

print(dat.shape[0])

In [None]:
# Tab-separated despite the .csv extension.
affect_scores = pd.read_csv(
    'data/paper-round-3/event-annotated/auto-qualitative-scores.csv',
    index_col='id',
    sep='\t',
)
assert affect_scores.index.is_unique

# Left join on the shared `id` index keeps every row of `dat`; the index is
# then turned back into an ordinary `id` column.
dat = pd.merge(
    dat,
    affect_scores[['negative', 'emotional', 'outraged']],
    left_index=True,
    right_index=True,
    how='left',
)
dat = dat.reset_index()
assert dat['id'].nunique() == dat.shape[0]
del affect_scores

print(dat.shape[0])

In [None]:
def story_id(s):
    """Build the story identifier ``"<year>-<kind>-<group>"`` for one row.

    Parameters
    ----------
    s : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``'group'``, ``'year'`` and ``'kind'``.
        ``'year'`` and ``'group'`` are truncated to integers; ``'kind'`` must
        be a string.

    Returns
    -------
    str or float
        The identifier, or ``np.nan`` when ``s['group']`` is missing.
    """
    group = s['group']

    # pd.isna treats NaN, None and pd.NA alike as missing; np.isnan raises
    # TypeError on None.
    if pd.isna(group):
        return np.nan

    return '-'.join([str(int(s['year'])), s['kind'], str(int(group))])
        
# Row-wise: one identifier per snippet, NaN where the row has no group.
per_row_story_id = dat.apply(story_id, axis=1)
dat['story_id'] = per_row_story_id

print(dat.shape[0])

In [None]:
# Remove the 'R' marker from the ids, store them as integers, and publish
# the column under the name `snippet_id`.
numeric_ids = dat['id'].str.replace('R', '').astype(int)
dat = dat.assign(id=numeric_ids).rename(columns={'id': 'snippet_id'})

In [None]:
# Checked element-wise so that a stray non-radio row produces a clear
# AssertionError; asserting on an array-vs-string comparison raises ValueError
# as soon as more than one kind is present.
assert (dat['kind'] == 'radio').all(), 'non-radio rows left in the radio sample'
assert dat['snippet_id'].nunique() == dat.shape[0], 'duplicate snippet_id values'

with gzip.open('data/paper-round-3/replication-datasets/auto-radio-sample.csv.gz', 'wt') as f:
    dat.to_csv(f, index=False)

print(dat.shape[0])

### Story-level

In [None]:
selected = pd.read_csv('data/paper-round-3/event-annotated/auto-sample-communities-filter-list.csv')

# Only the radio entries of the filter list are published.
is_radio = selected['kind'] == 'radio'
selected = selected[is_radio].copy()

selected.to_csv('data/paper-round-3/replication-datasets/auto-radio-story-selected.csv', index=False)

print(selected.shape[0])

In [None]:
stats = pd.read_csv('data/paper-round-3/event-annotated/auto-story-stats.csv')
assert stats.shape[0] == stats['story_id'].nunique()

# These two columns are not part of the published file.
stats = stats.drop(columns=['avg_abs', 'covid_frac'])

# `mask` is computed on the unfiltered table; the CDF cell below reuses it to
# pick the matching rows of the array.
mask = stats['kind'].eq('radio')
stats = stats[mask].copy()

stats.to_csv('data/paper-round-3/replication-datasets/auto-radio-story-stats.csv', index=False)

print(stats.shape[0])

In [None]:
cdfs = np.load('data/paper-round-3/event-annotated/auto-story-cdfs.npy')

# Select the radio rows with the boolean mask built from the story stats;
# the filtered array must end up with one row per exported story.
radio_rows = mask.to_numpy()
cdfs = cdfs[radio_rows, ...]
assert stats.shape[0] == cdfs.shape[0]

with gzip.open('data/paper-round-3/replication-datasets/auto-radio-story-cdfs.npy.gz', 'wb') as out:
    np.save(out, cdfs)

print(cdfs.shape[0])

In [None]:
def _export_radio_story_stats(source_csv, target_csv):
    """Filter one story-stats table to radio rows and write it out.

    Reads ``source_csv``, drops the ``covid_frac`` column, keeps the rows
    whose ``kind`` is ``'radio'``, writes them to ``target_csv`` and prints
    the row count.

    Returns ``(radio_frame, row_mask)`` where ``row_mask`` is the boolean
    Series over the unfiltered table.
    """
    frame = pd.read_csv(source_csv)
    frame = frame.drop(columns=['covid_frac'])
    row_mask = frame['kind'] == 'radio'
    radio_frame = frame.loc[row_mask].copy()

    radio_frame.to_csv(target_csv, index=False)
    print(radio_frame.shape[0])
    return radio_frame, row_mask


stats_lib, mask_lib = _export_radio_story_stats(
    'data/paper-round-3/event-annotated/auto-story-stats-lib.csv',
    'data/paper-round-3/replication-datasets/auto-radio-story-stats-lib.csv',
)

stats_con, mask_con = _export_radio_story_stats(
    'data/paper-round-3/event-annotated/auto-story-stats-con.csv',
    'data/paper-round-3/replication-datasets/auto-radio-story-stats-con.csv',
)

In [None]:
def _export_radio_story_cdfs(source_npy, target_gz, row_mask, n_expected):
    """Filter one CDF array to the masked rows and save it gzipped.

    Loads the array from ``source_npy``, keeps the rows selected by
    ``row_mask`` along the first axis, asserts that ``n_expected`` rows
    remain, saves the result to ``target_gz`` and prints the row count.

    Returns the filtered array.
    """
    with open(source_npy, 'rb') as src:
        arr = np.load(src)
    arr = arr[row_mask, ...]
    assert arr.shape[0] == n_expected
    with gzip.open(target_gz, 'wb') as dst:
        np.save(dst, arr)

    print(arr.shape[0])
    return arr


cdfs_lib = _export_radio_story_cdfs(
    'data/paper-round-3/event-annotated/auto-story-cdfs-lib.npy',
    'data/paper-round-3/replication-datasets/auto-radio-story-cdfs-lib.npy.gz',
    mask_lib,
    stats_lib.shape[0],
)

cdfs_con = _export_radio_story_cdfs(
    'data/paper-round-3/event-annotated/auto-story-cdfs-con.npy',
    'data/paper-round-3/replication-datasets/auto-radio-story-cdfs-con.npy.gz',
    mask_con,
    stats_con.shape[0],
)

# Inspect the results

In [None]:
# List the exported files with human-readable sizes.
!ls -lh data/paper-round-3/replication-datasets/