In [None]:
import os
import re
import csv
import json
import gzip
import random
import logging

import numpy as np
import pandas as pd

from IPython.display import display
from tqdm.notebook import tqdm

In [None]:
# Notebook-level logger; records are timestamped and shown from INFO upwards.
logger = logging.getLogger(__name__)

fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(format=fmt, level=logging.INFO)

# Only let gensim messages through at WARNING level and above.
logging.getLogger("gensim").setLevel(logging.WARNING)

In [None]:
# The data paths used in this notebook are relative, so run from this directory.
# NOTE(review): assumes the repository is checked out at ~/github/masthesis/.
os.chdir(os.path.expanduser('~/github/masthesis/'))

In [None]:
# Single seed shared by the global generators and by every `.sample(...)` call below.
seed = 2969591811

# Seed both the stdlib and the numpy global random generators.
random.seed(seed)
np.random.seed(seed)

# Twitter

In [None]:
# Distinct user ids found in the previously processed tweet file; used below
# to restrict the 2021 data to the same set of accounts.
user_ids = (
    pd.read_csv(
        'data/twitter/tweets-processed.csv',
        index_col='id',
        parse_dates=['timestamp'],
    )['user_id']
    .unique()
)

## 2021

In [None]:
tw2021 = pd.read_csv(
    'data/paper-round-3/twitter/social-polls-2021.csv.gz',
    sep='\t',
)

# The raw file also includes pollster accounts, which we don't want: keep
# only the elite journalist-type users collected in `user_ids`.
tw2021 = tw2021[tw2021['user_id'].isin(user_ids)]

# Every remaining row must carry a distinct tweet id.
assert tw2021['id'].nunique() == len(tw2021['id'])

## 2022

In [None]:
# Status messages of this form appear in the export; they are not tweets and
# are removed from each line before it is parsed.
rx = '{"info":{"message":"Replay Request Completed","sent":"[^"]+","activity_count":[0-9]+}}'

tw2022 = []
with gzip.open('data/paper-round-3/twitter/elite-twitter-2022.jsonl.gz', 'rt') as f:
    for line in f:
        # Lines read from a file keep their trailing newline, so an "empty"
        # line is never literally ''. Strip after removing the status message
        # so that blank lines (and lines holding nothing but that message) are
        # skipped instead of being reported as JSON errors.
        line = re.sub(rx, '', line).strip()
        if not line:
            continue

        try:
            tw2022.append(json.loads(line))
        except json.JSONDecodeError:
            # Show genuinely unparseable lines rather than dropping them silently.
            print(line)

tw2022 = pd.DataFrame(tw2022)

# Every parsed record must carry a distinct id.
assert tw2022['id'].nunique() == tw2022['id'].shape[0]

In [None]:
def get_tw2022_content(row):
    """Pick the most complete text available for one 2022 tweet record.

    Candidate texts are collected, in this order, from ``row['body']``,
    ``row['long_object']['body']``, ``row['object']['body']``,
    ``row['object']['long_object']['body']`` and
    ``row['object']['object']['body']``, using whichever of them exist.

    When the top-level body starts with ``'RT @'``, every candidate that lacks
    that prefix gets one, so all candidates are comparable.

    Parameters
    ----------
    row : mapping (a DataFrame row or a dict)
        Must provide ``'body'`` (str), ``'actor'`` (dict with a string
        ``'preferredUsername'``), ``'long_object'`` (dict or a non-dict
        placeholder) and ``'object'`` (dict).

    Returns
    -------
    str
        The first candidate that does not contain ``'…'``; if all of them
        contain it, the longest candidate (the earliest one on ties).
    """
    body = row['body']
    is_rt = body.startswith('RT @')
    screen_name = row['actor']['preferredUsername']
    assert isinstance(screen_name, str)

    # `row['actor']` is the account that posted this record, so building the
    # prefix from it would label retweeted text with the retweeter's own
    # handle. The top-level body of a retweet already starts with the right
    # 'RT @handle: ' prefix, so reuse that; only fall back to the actor-based
    # prefix when the body does not match the expected shape.
    rt_prefix = f'RT @{screen_name}: '
    if is_rt:
        body_prefix = re.match(r'RT @\w+: ', body)
        if body_prefix is not None:
            rt_prefix = body_prefix.group(0)

    options = [body]

    # Top-level long_object is only usable when it is a dict.
    long_object = row['long_object']
    if isinstance(long_object, dict) and 'body' in long_object:
        options.append(long_object['body'])

    inner = row['object']
    if 'body' in inner:
        options.append(inner['body'])

    if 'long_object' in inner and 'body' in inner['long_object']:
        options.append(inner['long_object']['body'])

    if 'object' in inner and 'body' in inner['object']:
        options.append(inner['object']['body'])

    if is_rt:
        options = [
            opt if opt.startswith('RT @') else rt_prefix + opt
            for opt in options
        ]

    # Prefer the first candidate without an ellipsis character.
    # NOTE(review): '…' presumably marks truncated text — confirm.
    for opt in options:
        if '…' not in opt:
            return opt

    # Every candidate contains '…': fall back to the longest one.
    return max(options, key=len)

# Derive the flat columns used downstream from the nested 2022 records.
tw2022['content'] = tw2022.apply(get_tw2022_content, axis=1)
tw2022['is_retweet'] = tw2022['verb'].eq('share')

# Sanity checks: a single provider and a single object type are expected.
assert tw2022['provider'].map(lambda p: p['displayName']).unique().tolist() == ['Twitter']
assert tw2022['objectType'].unique().tolist() == ['activity']

# Posting client, plus a coarse grouping of it ('Other' for anything unlisted).
tw2022['source'] = tw2022['generator'].map(lambda g: g['displayName'])
tw2022['source_collapsed'] = (
    tw2022['source']
    .map({
        'Twitter for iPhone': 'iPhone',
        'Twitter for Android': 'Android',
        'Twitter Web App': 'Desktop',
        'Twitter Web Client': 'Desktop',
        'TweetDeck': 'Desktop',
    })
    .fillna('Other')
)

tw2022['is_reply'] = tw2022['inReplyTo'].notna()
tw2022['is_quote_tweet'] = tw2022['twitter_quoted_status'].notna()

# Strip the URI-style prefixes so that only the bare ids remain.
tw2022['user_id'] = (
    tw2022['actor']
    .map(lambda actor: actor['id'])
    .str.replace('id:twitter.com:', '', regex=False)
    .astype(int)
)
tw2022['truncated'] = False
tw2022['id'] = tw2022['id'].str.replace('tag:search.twitter.com,2005:', '', regex=False)

# Rename to the target column names and keep only the columns listed below.
tw2022 = tw2022.rename(columns={
    'retweetCount': 'retweet_count',
    'favoritesCount': 'favorite_count',
    'twitter_lang': 'lang',
    'postedTime': 'timestamp',
})
tw2022 = tw2022[[
    'id', 'user_id', 'content', 'timestamp',
    'lang', 'source', 'truncated', 'is_retweet',
    'is_reply', 'is_quote_tweet', 'retweet_count',
    'favorite_count', 'source_collapsed',
]]

## Combine

In [None]:
# Stack both years into one frame keyed by tweet id.
elite_data = pd.concat([tw2021, tw2022], axis=0).set_index('id')
elite_data['timestamp'] = pd.to_datetime(elite_data['timestamp'])

# Keep only the two study windows (both bounds inclusive).
elite_data = elite_data.loc[
    elite_data['timestamp'].between('2021-01-01', '2021-03-01') |
    elite_data['timestamp'].between('2022-03-01', '2022-05-01')
]

# Drop rows without any text.
elite_data = elite_data.loc[elite_data['content'].notna()]

with gzip.open('data/paper-round-3/twitter/new-data-processed.jsonl.gz', 'wt') as f:
    elite_data.reset_index().to_json(f, orient='records', lines=True)

# The per-year frames are no longer needed.
del tw2022, tw2021

In [None]:
# Reload the processed elite tweets written by the previous cell.
with gzip.open('data/paper-round-3/twitter/new-data-processed.jsonl.gz', 'rt') as f:
    elite_data = pd.read_json(f, lines=True)

elite_data = elite_data.set_index('id')

# Parse the timestamps and mark them as UTC.
elite_data['timestamp'] = (
    pd.to_datetime(elite_data['timestamp'])
    .dt.tz_localize('utc')
)

# Decahose

In [None]:
total = 317774406  # zcat decahose-10pct-random-sort-20230509.csv.gz | grep '^"[0-9]\+"' | wc -l
take = (np.random.random((total,)) <= 0.057)  # ~18m rows

# Stream the file once, keeping only the rows whose position was drawn above.
with gzip.open('data/paper-round-3/decahose/decahose-10pct-random-sort-20230509.csv.gz', 'rt') as f:
    decahose_data = pd.DataFrame([
        row
        for i, row in tqdm(enumerate(csv.DictReader(f)), total=total)
        if take[i]
    ])

# Key by id, keeping the first occurrence of any repeated id.
decahose_data = decahose_data.set_index('id')
decahose_data = decahose_data.loc[~decahose_data.index.duplicated()]

decahose_data = decahose_data.rename(columns={'postedtime': 'timestamp', 'body': 'content'})
decahose_data['timestamp'] = pd.to_datetime(decahose_data['timestamp'])

# Coarse cut first, then the two study windows (both bounds inclusive).
decahose_data = decahose_data.loc[decahose_data['timestamp'] >= '2021-01-01']
decahose_data = decahose_data.loc[
    decahose_data['timestamp'].between('2021-01-01', '2021-03-01') |
    decahose_data['timestamp'].between('2022-03-01', '2022-05-01')
]

with gzip.open('data/paper-round-3/decahose/new-data-processed.csv.gz', 'wt') as f:
    decahose_data.to_csv(f, index=True)

In [None]:
# Reload the sampled decahose rows written by the previous cell.
with gzip.open('data/paper-round-3/decahose/new-data-processed.csv.gz', 'rt') as f:
    decahose_data = pd.read_csv(f, index_col='id', parse_dates=['timestamp'])

# Mark the timestamps as UTC.
# NOTE(review): tz_localize only accepts naive timestamps — confirm the CSV
# stores them without a UTC offset.
decahose_data['timestamp'] = decahose_data['timestamp'].dt.tz_localize('utc')

# Radio

In [None]:
with gzip.open('data/paper-round-3/radio/new-data-processed.csv.gz', 'rt') as f:
    radio_data = pd.read_csv(f, index_col='snippet_id', parse_dates=['timestamp'])

# see the 3a notebook for how we got these - they're converted from a Unix
# timestamp and thus are in UTC
radio_data['timestamp'] = radio_data['timestamp'].dt.tz_localize('utc')

# common lines we already know are useless -- excluded up front so later
# steps never have to deal with them
bad_lines = [
    '[noise]', '[noise] [noise]', '<unk>', '[laughter]', 'thank you',
    'mm', 'and', 'the', 'thanks', 'a',
    '[noise] [noise] [noise]', 'oh', 'um', 'i', 'na',
]

# no empty snippets, and none of the known-bad lines
radio_data = radio_data.loc[radio_data['content'].notna()]
radio_data = radio_data.loc[~radio_data['content'].isin(bad_lines)]

# keep only the two study windows (both bounds inclusive)
radio_data = radio_data.loc[
    radio_data['timestamp'].between('2021-01-01', '2021-03-01') |
    radio_data['timestamp'].between('2022-03-01', '2022-05-01')
]

# Combined rebalanced dataset

In [None]:
def make_reltime(s):
    """Convert timestamps to seconds elapsed since the earliest one in `s`.

    `s` is a Series of timezone-aware timestamps (it is subtracted from a
    UTC epoch). The result is a float Series whose minimum is 0.0.
    """
    unix_epoch = pd.Timestamp('1970-01-01T00:00:00+00:00')
    seconds = (s - unix_epoch).dt.total_seconds()
    return seconds - seconds.min()

## 2021

In [None]:
# Elite tweets before 2022, reduced to id/content/timestamp and tagged.
ed2021 = (
    elite_data
    .loc[elite_data['timestamp'] < '2022-01-01', ['content', 'timestamp']]
    .reset_index()
    .assign(kind='elite', year=2021)
)

In [None]:
# Radio snippets before 2022; the snippet id becomes the common `id` column.
rd2021 = (
    radio_data
    .loc[radio_data['timestamp'] < '2022-01-01', ['content', 'timestamp']]
    .reset_index()
    .assign(kind='radio', year=2021)
    .rename(columns={'snippet_id': 'id'})
)

In [None]:
# Decahose tweets before 2022, reduced to id/content/timestamp and tagged.
dd2021 = (
    decahose_data
    .loc[decahose_data['timestamp'] < '2022-01-01', ['content', 'timestamp']]
    .reset_index()
    .assign(kind='decahose', year=2021)
)

In [None]:
# Row counts per 2021 source: elite, decahose, radio.
(len(ed2021), len(dd2021), len(rd2021))

In [None]:
# Elite and radio are both cut down to the size of the smallest 2021 source.
size = min(len(ed2021), len(dd2021), len(rd2021))

# there's a lot of irrelevant cruft in the decahose that isn't about news;
# we filter it out after detecting stories and empirically about
# 1/6 of the content is relevant -- so take up to six times as much of it
dd_size = min(6 * size, len(dd2021))

ed2021 = ed2021.sample(n=size, replace=False, random_state=seed)
rd2021 = rd2021.sample(n=size, replace=False, random_state=seed)
dd2021 = dd2021.sample(n=dd_size, replace=False, random_state=seed)

In [None]:
dat2021 = pd.concat([ed2021, dd2021, rd2021], axis=0)

# Prefix each id with a letter for its source, so ids from different sources
# cannot collide.
dat2021['id'] = (
    dat2021['kind'].map({'elite': 'E', 'radio': 'R', 'decahose': 'D'})
    + dat2021['id'].astype(str)
)

# Keep the first row for any repeated id, then confirm ids are unique.
dat2021 = dat2021.drop_duplicates(subset='id', keep='first')
assert dat2021['id'].nunique() == len(dat2021)

# Seconds since the earliest 2021 item, and chronological order.
dat2021 = (
    dat2021
    .assign(reltime=make_reltime(dat2021['timestamp']))
    .sort_values('reltime')
)

## 2022

In [None]:
# Elite tweets from 2022 onwards, reduced to id/content/timestamp and tagged.
ed2022 = (
    elite_data
    .loc[elite_data['timestamp'] >= '2022-01-01', ['content', 'timestamp']]
    .reset_index()
    .assign(kind='elite', year=2022)
)

In [None]:
# Decahose tweets from 2022 onwards, reduced to id/content/timestamp and tagged.
dd2022 = (
    decahose_data
    .loc[decahose_data['timestamp'] >= '2022-01-01', ['content', 'timestamp']]
    .reset_index()
    .assign(kind='decahose', year=2022)
)

In [None]:
# Radio snippets from 2022 onwards; the snippet id becomes the common `id` column.
rd2022 = (
    radio_data
    .loc[radio_data['timestamp'] >= '2022-01-01', ['content', 'timestamp']]
    .reset_index()
    .assign(kind='radio', year=2022)
    .rename(columns={'snippet_id': 'id'})
)

In [None]:
# Row counts per 2022 source: elite, decahose, radio.
(len(ed2022), len(dd2022), len(rd2022))

In [None]:
# Elite and radio are taken at full size here, so sampling them only shuffles
# the rows; the decahose gets up to six times the elite count.
# NOTE(review): unlike 2021, elite and radio are not cut down to the smallest
# source here — confirm this asymmetry is intentional.
ed_size = len(ed2022)
rd_size = len(rd2022)
dd_size = min(6 * ed_size, len(dd2022))

ed2022 = ed2022.sample(n=ed_size, replace=False, random_state=seed)
rd2022 = rd2022.sample(n=rd_size, replace=False, random_state=seed)
dd2022 = dd2022.sample(n=dd_size, replace=False, random_state=seed)

In [None]:
dat2022 = pd.concat([ed2022, dd2022, rd2022], axis=0)

# Prefix each id with a letter for its source, so ids from different sources
# cannot collide.
dat2022['id'] = (
    dat2022['kind'].map({'elite': 'E', 'radio': 'R', 'decahose': 'D'})
    + dat2022['id'].astype(str)
)

# Keep the first row for any repeated id, then confirm ids are unique.
dat2022 = dat2022.drop_duplicates(subset='id', keep='first')
assert dat2022['id'].nunique() == len(dat2022)

# Seconds since the earliest 2022 item, and chronological order.
dat2022 = (
    dat2022
    .assign(reltime=make_reltime(dat2022['timestamp']))
    .sort_values('reltime')
)

## Combine

In [None]:
# Stack the two yearly datasets; prefixed ids must stay unique across years.
dat = pd.concat([dat2021, dat2022], axis=0)

assert dat['id'].nunique() == len(dat)

In [None]:
# Drop rows whose content is missing or an empty string.
dat = dat.loc[dat['content'].notna() & dat['content'].ne('')]

# Inspect

In [None]:
# Number of rows per source kind.
dat['kind'].value_counts()

In [None]:
# Number of rows per year.
dat['year'].value_counts()

In [None]:
# Number of rows per (year, kind) combination.
dat.groupby(['year', 'kind']).size()

In [None]:
# Summary statistics of the relative time over the whole dataset.
dat['reltime'].describe()

In [None]:
# Summary statistics of the relative time, per year.
dat.groupby('year')['reltime'].describe()

In [None]:
# Summary statistics of the relative time, per (year, kind).
dat.groupby(['year', 'kind'])['reltime'].describe()

In [None]:
# Show ten random rows with the column width limit lifted for this display.
with pd.option_context('max_colwidth', None):
    display(dat.sample(10))

# Write out

In [None]:
# Write the combined dataset as a gzipped CSV, without the index column.
with gzip.open('data/paper-round-3/event-annotated/auto-sample-newdata-pre-whisper.csv.gz', 'wt') as f:
    dat.to_csv(f, index=False)