In [None]:
import os
import gzip
import random
import logging

import numpy as np
import pandas as pd

import transformers as tf
import torch

from tqdm.notebook import tqdm

In [None]:
# Configure the root logger: timestamp, level, then the message itself.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

# Notebook-level logger, named after the current module.
logger = logging.getLogger(__name__)

In [None]:
# Every data path in this notebook is relative, so run from the repository root.
_repo_root = os.path.expanduser('~/github/masthesis/')
os.chdir(_repo_root)

In [None]:
# One seed shared by every random number generator the notebook touches.
seed = 2969591811

random.seed(seed)
np.random.seed(seed)
# torch keeps its own generator state; seed it too so that any stochastic
# torch operation is reproducible across kernel restarts
torch.manual_seed(seed)

In [None]:
def _three_decimals(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Show floats in displayed frames with three decimals.
pd.set_option('display.float_format', _three_decimals)

# Utilities

In [None]:
def grouper(it, n=None):
    """Yield the items of `it` as consecutive lists of at most `n` items.

    Parameters
    ----------
    it : iterable
        Source of items; consumed once, in order.
    n : int or None
        Maximum chunk length. With None, everything is yielded as one list
        (an empty list if `it` is empty).

    Yields
    ------
    list
        Chunks of length `n`, except possibly a shorter final chunk. When `n`
        is given and `it` is empty, nothing is yielded.
    """
    assert n is None or n > 0

    if n is None:
        yield list(it)
        return

    chunk = []
    for item in it:
        # a full chunk is handed out only once the next item has arrived
        if len(chunk) == n:
            yield chunk
            chunk = []

        chunk.append(item)

    # whatever is left is a final chunk (possibly short, possibly exactly n)
    if chunk:
        yield chunk

# Load data

## Item-level

In [None]:
# Per-item table that carries the `group` column used further down; keyed by item id.
comms_path = 'data/paper-round-3/event-annotated/auto-sample-communities-merged-pre-filter.csv.gz'
with gzip.open(comms_path, mode='rt') as fh:
    comms = pd.read_csv(fh, index_col='id')

# the id index must be unique for index-aligned assignment to be well defined
assert comms.index.is_unique

comms.shape

In [None]:
# Item-level sample, keyed by item id; `year` is read as int and `timestamp`
# is parsed into datetimes.
dat_path = 'data/paper-round-3/event-annotated/auto-sample.csv.gz'
with gzip.open(dat_path, mode='rt') as fh:
    dat = pd.read_csv(
        fh,
        index_col='id',
        dtype={'year': int},
        parse_dates=['timestamp'],
    )

assert dat.index.is_unique

dat.shape

In [None]:
# Both frames are indexed by item id, so this assignment aligns on id; items
# with no group in `comms` end up with NaN.
dat['group'] = comms['group']

# only selected stories, keep this manageably small
# (.copy() makes the filtered frame independent of the original, so the column
# assignments below do not raise SettingWithCopyWarning)
dat = dat.loc[dat['group'].notna()].copy()
dat['group'] = dat['group'].astype(int)

# story key of the form "<year>-<kind>-<group>"
dat['story_id'] = dat['year'].astype(str) + '-' + dat['kind'] + '-' + dat['group'].astype(str)

# a missing `kind` would propagate to a missing story_id; fail loudly if so
assert dat['story_id'].isna().sum() == 0

del comms

## Story-level

In [None]:
# Story-level statistics, one row per story_id.
stats_path = 'data/paper-round-3/event-annotated/auto-story-stats.csv'
stats = pd.read_csv(stats_path, index_col='story_id')
assert stats.index.is_unique

# within each (year, kind) pair, every row must carry a distinct group value
by_year_kind = stats.groupby(['year', 'kind'])
assert (by_year_kind.size() == by_year_kind['group'].nunique()).all()

In [None]:
# Filter list of stories to keep, keyed by story_id.
selected_path = 'data/paper-round-3/event-annotated/auto-sample-communities-filter-list.csv'
selected = pd.read_csv(selected_path, index_col='story_id')
assert selected.index.is_unique

## Filter to only selected stories

In [None]:
# stories whose `count` is at least 10 ...
length_mask = stats['count'].ge(10)
# ... and that also appear in the filter list
selected_mask = stats.index.isin(selected.index)

mask = length_mask & selected_mask

# (stories kept, stories in total)
mask.sum(), stats.shape[0]

In [None]:
# items belonging to the stories that passed the mask
kept_story_ids = stats.loc[mask].index
tmp_dat = dat.loc[dat['story_id'].isin(kept_story_ids)]

# Longest texts first: character length stands in for token length, which makes
# the batching much more efficient, wastes less compute and finishes faster.
by_length_desc = tmp_dat['content'].str.len().sort_values(ascending=False).index
tmp_dat = tmp_dat.loc[by_length_desc, :]

# move `id` out of the index into a column, leaving a positional 0..n-1 index
tmp_dat = tmp_dat.reset_index()

tmp_dat.shape

# Calculate various kinds of sentiment

In [None]:
# Prefer the second GPU (the original choice), but fall back to the first GPU
# or to the CPU so the notebook still runs on machines with fewer devices.
# NOTE: CPU inference with a large model will be very slow.
if torch.cuda.device_count() > 1:
    device = 'cuda:1'
elif torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# Hugging Face model identifier used for both the model and the tokenizer.
modspec = 'facebook/bart-large-mnli'

In [None]:
# Tokenizer and sequence-classification model come from the same checkpoint.
tokenizer = tf.AutoTokenizer.from_pretrained(modspec)

model = tf.AutoModelForSequenceClassification.from_pretrained(modspec)
model = model.to(device)

In [None]:
# Hypothesis template; `{label}` is filled in with each candidate label in turn.
hypothesis = 'This example is {label}.'

# Candidate labels; each becomes one score column.
labels = ['negative', 'emotional', 'outraged']

In [None]:
batch_size = 64

# Set to a positive int to score only the first `n_batches` batches (useful
# for a quick trial run); leave as None to score every text.
n_batches = None

texts = tmp_dat['content'].tolist()
idx = tmp_dat.index

if n_batches is not None:
    total = batch_size * n_batches
    texts = texts[:total]
    idx = idx[:total]

# one (batch, n_labels) array of probabilities per batch
batch_scores = []
n_total_batches = int(np.ceil(len(texts) / batch_size))
for batch in tqdm(grouper(texts, batch_size), total=n_total_batches):
    # one 1-d array of probabilities per label, in `labels` order
    out = []
    for lab in labels:
        # pair every text in the batch with the same hypothesis for this label
        hyp = [hypothesis.format(label=lab)] * len(batch)
        x = tokenizer(batch, hyp, return_tensors='pt', padding='longest', truncation=True)
        input_ids, attention_mask = x['input_ids'].to(device), x['attention_mask'].to(device)

        # inference only: no gradients needed
        with torch.no_grad():
            logits = model(input_ids, attention_mask)[0]

        # we throw away "neutral" (dim 1) and take the probability of
        # "entailment" (2) as the probability of the label being true
        probs = logits[:, [0, 2]].softmax(dim=1)
        probs = probs[:, 1]

        out.append(probs.cpu().numpy())

    # labels become columns: shape (len(batch), len(labels))
    batch_scores.append(np.stack(out, axis=1))

# rows come out in the same order as `texts`, so they line up with `idx`
scores = pd.DataFrame(np.concatenate(batch_scores, axis=0), columns=labels, index=idx)

# attach the identifying columns of each scored item next to its scores
scores = pd.concat([
    scores,
    tmp_dat.loc[idx, ['id', 'kind', 'year', 'group', 'reltime', 'story_id']]
], axis=1)

# Write out

In [None]:
# NOTE: the file is tab-separated even though its name ends in .csv.gz;
# the positional index is not written.
scores_path = 'data/paper-round-3/event-annotated/auto-qualitative-scores.csv.gz'
with gzip.open(scores_path, mode='wt') as fh:
    scores.to_csv(fh, sep='\t', index=False)