In [None]:
import os
import gzip
import random
import logging

import numpy as np
import pandas as pd

from statsmodels.distributions.empirical_distribution import ECDF

from tqdm.notebook import tqdm

In [None]:
# Root logging configuration for the notebook: INFO and above, formatted as
# "<timestamp> : <level> : <message>".
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

# Logger named after the current module, for use by later cells.
logger = logging.getLogger(__name__)

In [None]:
# Work from the project checkout so that the relative 'data/...' paths used by
# the cells below resolve. The location is taken relative to the user's home.
os.chdir(os.path.expanduser('~/github/masthesis/'))

In [None]:
# One fixed seed shared by both global random number generators used here
# (NumPy's legacy global generator and the stdlib `random` module).
seed = 2969591811

np.random.seed(seed)
random.seed(seed)

# Load data

In [None]:
# Read the gzipped sample CSV: `id` becomes the index and `timestamp` is parsed
# into datetimes.
sample_path = 'data/paper-round-3/event-annotated/auto-sample.csv.gz'
with gzip.open(sample_path, 'rt') as f:
    dat = pd.read_csv(f, index_col='id', parse_dates=['timestamp'])

# Later cells align and join on `id`, so each id may occur only once.
assert dat.index.is_unique

dat.shape

In [None]:
# Read the gzipped community-assignment CSV, indexed by `id` so that it can be
# aligned against `dat` by index.
comms_path = 'data/paper-round-3/event-annotated/auto-sample-communities-merged-pre-filter.csv.gz'
with gzip.open(comms_path, 'rt') as f:
    comms = pd.read_csv(f, index_col='id')

# Index alignment below requires exactly one row per id.
assert comms.index.is_unique

comms.shape

In [None]:
# Attach each item's group label. Column assignment aligns on the shared `id`
# index, so ids missing from `comms` receive NaN.
dat['group'] = comms['group']

# Keep only items that were assigned to a group. The explicit copy makes `dat`
# an independent frame, so the column assignments below write to `dat` itself
# and do not raise SettingWithCopyWarning on the filtered slice.
dat = dat.loc[dat['group'].notna(), :].copy()

# With the NaN groups removed, both keys can be stored as plain integers.
dat['group'] = dat['group'].astype(int)
dat['year'] = dat['year'].astype(int)

# Length of the text in characters, and in whitespace-separated tokens after
# underscores have been removed.
dat['content_len'] = dat['content'].str.len()
dat['word_len'] = dat['content'].apply(lambda s: len(s.replace('_', '').split()))

def covid_words(s):
    """Count the COVID-related tokens in the string `s`.

    Underscores are removed from `s` first, the result is split on whitespace,
    and every token that exactly equals one of the query words is counted
    (matching is case-sensitive and repeated tokens count each time).
    """
    query_words = {'covid', 'coronavirus', 'corona', 'pandemic', 'virus', 'masks', 'lockdown', 'lockdowns'}

    n_hits = 0
    for token in s.replace('_', '').split():
        if token in query_words:
            n_hits += 1
    return n_hits
# Per-item count of COVID query words; used later as the numerator of the
# per-story `covid_frac` (with `word_len` as the denominator).
dat['covid_word_len'] = dat['content'].apply(covid_words)

In [None]:
# The group labels now live on `dat`; drop the lookup table to free memory.
del comms

## Liberal/conservative status

In [None]:
def _read_radio_snippets(path):
    """Read one gzipped radio snippet CSV and prefix every snippet_id with 'R'.

    The prefixed ids are what gets compared against the sample's index when the
    ideology table is built further down.
    """
    with gzip.open(path, 'rt') as handle:
        snippets = pd.read_csv(handle, parse_dates=['timestamp'])
        snippets['snippet_id'] = 'R' + snippets['snippet_id'].astype(str)
    return snippets


# Stack the older and newer radio batches, keeping only the public-station flag
# and indexing by the prefixed snippet id.
radio_data = pd.concat(
    [
        _read_radio_snippets(path)[['snippet_id', 'is_public']]
        for path in (
            'data/paper-round-3/radio/paper-round-3-snippets.csv.gz',
            'data/paper-round-3/radio/new-data-processed.csv.gz',
        )
    ],
    axis=0,
).set_index('snippet_id')

In [None]:
# Older batch of elite tweets, stored as a plain CSV.
elite_data_old = pd.read_csv('data/twitter/tweets-processed.csv', parse_dates=['timestamp'])

# Newer batch, stored as gzipped JSON lines; timestamps are converted explicitly.
with gzip.open('data/paper-round-3/twitter/new-data-processed.jsonl.gz', 'rt') as f:
    elite_data_new = pd.read_json(f, lines=True)
elite_data_new['timestamp'] = pd.to_datetime(elite_data_new['timestamp'])

# Only the tweet id and its author are needed; stack both batches.
elite_data = pd.concat(
    [batch[['id', 'user_id']] for batch in (elite_data_old, elite_data_new)],
    axis=0,
)

# Prefix ids with 'E'; these prefixed ids are later matched against the sample's index.
elite_data = elite_data.assign(id='E' + elite_data['id'].astype(str))

del elite_data_old, elite_data_new

In [None]:
# Per-user follow-community assignments; rows without a community are dropped.
ci = pd.read_csv('data/twitter/community-ideology.csv')
ci = ci.loc[ci['follow_community'].notna(), ['user_id', 'follow_community']]

# Attach the community to every tweet through its author.
elite_data = pd.merge(elite_data, ci, on='user_id', how='left')

# Follow-community 3 is the one coded as conservative here.
# NOTE(review): authors absent from `ci` have a NaN community after the left
# merge and therefore compare unequal to 3, i.e. they are coded 0 rather than
# missing -- confirm this is intended.
elite_data['conservative'] = elite_data['follow_community'].eq(3).astype(int)

In [None]:
# Ids (index labels of `dat`) of the sampled radio and elite items.
sample_radio_ids = dat.loc[dat['kind'].eq('radio')].index
sample_elite_ids = dat.loc[dat['kind'].eq('elite')].index

In [None]:
# Radio: a snippet is coded conservative (1) exactly when `is_public` equals 0.
radio_status = (
    radio_data.loc[radio_data.index.isin(sample_radio_ids), 'is_public']
    .apply(lambda is_public: int(is_public == 0))
    .reset_index()
    .rename(columns={'snippet_id': 'id', 'is_public': 'conservative'})
    .assign(kind='radio')
)

# Elite: reuse the flag derived from the author's follow-community.
elite_status = (
    elite_data.loc[elite_data['id'].isin(sample_elite_ids), ['id', 'conservative']]
    .assign(kind='elite')
)

# One table of (id, conservative, kind) covering the sampled radio and elite items.
ideology_status = pd.concat([radio_status, elite_status], axis=0)

assert ideology_status['conservative'].notna().all()

# Share of items coded conservative, per kind.
ideology_status.groupby('kind')['conservative'].mean()

In [None]:
# Attach the `conservative` flag to every sampled item; kinds without an entry
# in `ideology_status` get NaN from the left merge.
#
# `validate='many_to_one'` makes pandas raise a MergeError if any id occurs more
# than once in `ideology_status`; without the check such duplicates would
# silently duplicate rows of `dat` and inflate every per-story count below.
#
# NOTE(review): `id` is an index level of `dat` but a column of
# `ideology_status`; in that mixed case pandas appears to return `id` as a
# regular column with a fresh RangeIndex -- confirm nothing later relies on
# `dat` still being indexed by id.
dat = dat.merge(
    ideology_status[['id', 'conservative']],
    how='left',
    on='id',
    validate='many_to_one',
)

In [None]:
# Every radio and elite item must have received an ideology flag from the merge.
has_ideology_source = dat['kind'].isin(['radio', 'elite'])
assert dat.loc[has_ideology_source, 'conservative'].notna().all()

# Story stats

## Calculate

In [None]:
# Each story's empirical CDF is sampled on a fixed grid that starts at the
# story's (or subset's) first `reltime` and covers the following seven days.
cdf_query_end = 7 * 24 * 3600
cdf_query_inc = 60  # one minute; afterward we can sample to a higher frequency if desired
n_cdf_pts = int(cdf_query_end / cdf_query_inc)


def _summarize_rows(rows, story_start=None):
    """Return the timing and text statistics for one set of items.

    Parameters
    ----------
    rows : DataFrame with `reltime`, `covid_word_len`, `word_len` and
        `content_len` columns.
    story_start : earliest `reltime` of the whole story, or None. When given,
        `avg_abs` is the mean offset of `rows` from that story-wide start;
        when None, `avg_abs` is NaN.

    Returns
    -------
    dict whose keys appear in the order they are written to the stats CSVs
    (after the identifying columns). For an empty `rows`, `count` is 0 and
    every other value is NaN.
    """
    if rows.shape[0] == 0:
        return {
            'count': 0,

            'start': np.nan,
            'end': np.nan,
            'dur': np.nan,
            'iqr': np.nan,
            'std': np.nan,
            'avg': np.nan,
            'avg_abs': np.nan,

            'covid_frac': np.nan,
            'word_len_mean': np.nan,
            'word_len_std': np.nan,
            'text_len_mean': np.nan,
            'text_len_std': np.nan,
        }

    reltime = rows['reltime']
    first, last = reltime.min(), reltime.max()

    return {
        'count': reltime.shape[0],

        'start': first,
        'end': last,
        'dur': last - first,
        'iqr': np.percentile(reltime, 75) - np.percentile(reltime, 25),
        'std': reltime.std(),
        # Mean offset from this subset's own first item ...
        'avg': (reltime - first).mean(),
        # ... versus mean offset from the first item of the whole story.
        'avg_abs': np.nan if story_start is None else (reltime - story_start).mean(),

        'covid_frac': rows['covid_word_len'].sum() / rows['word_len'].sum(),
        'word_len_mean': rows['word_len'].mean(),
        'word_len_std': rows['word_len'].std(),
        'text_len_mean': rows['content_len'].mean(),
        'text_len_std': rows['content_len'].std(),
    }


def _windowed_cdf(rows):
    """Evaluate the empirical CDF of `rows['reltime']` on the fixed query grid.

    The grid starts at the earliest `reltime` in `rows` and advances in steps of
    `cdf_query_inc` up to (but excluding) `cdf_query_end` later. The result
    always has `n_cdf_pts` entries; an empty `rows` yields an all-NaN vector so
    that the stacked arrays stay row-aligned with the stats tables.
    """
    if rows.shape[0] == 0:
        return np.full((n_cdf_pts,), np.nan)

    reltime = rows['reltime']
    first = reltime.min()
    query_pts = np.arange(first, first + cdf_query_end, cdf_query_inc)

    # The slice guards against np.arange returning one extra point due to
    # floating-point rounding of the interval length.
    return ECDF(reltime)(query_pts)[0:n_cdf_pts]


stats, stats_lib, stats_con = [], [], []
cdf_vals, cdf_vals_lib, cdf_vals_con = [], [], []
for year in tqdm(dat['year'].unique()):
    # Filter once per year / kind instead of re-masking all of `dat` per story.
    dat_year = dat.loc[dat['year'] == year, :]

    for kind in tqdm(dat['kind'].unique()):
        dat_year_kind = dat_year.loc[dat_year['kind'] == kind, :]

        # sort=False yields the groups in order of first appearance (the same
        # order as `.unique()`), and groupby never yields an empty group.
        for c, tmp in tqdm(dat_year_kind.groupby('group', sort=False)):
            story_key = {
                'story_id': str(int(year)) + '-' + kind + '-' + str(c),
                'year': int(year),
                'kind': kind,
                'group': c,
            }
            story_start = tmp['reltime'].min()

            # All items of the story, regardless of ideology.
            stats += [{**story_key, 'conservative': np.nan, **_summarize_rows(tmp)}]
            cdf_vals += [_windowed_cdf(tmp)]

            # Liberal (0) and conservative (1) subsets. A row is emitted even
            # when the subset is empty, so all three outputs share one row order.
            for flag, stats_out, cdf_out in (
                (0, stats_lib, cdf_vals_lib),
                (1, stats_con, cdf_vals_con),
            ):
                subset = tmp.loc[tmp['conservative'] == flag, :]
                stats_out.append({
                    **story_key,
                    'conservative': flag,
                    **_summarize_rows(subset, story_start=story_start),
                })
                cdf_out.append(_windowed_cdf(subset))

stats = pd.DataFrame(stats)
stats_lib = pd.DataFrame(stats_lib)
stats_con = pd.DataFrame(stats_con)

# One row per story and one column per query point; rows match the stats tables.
cdf_vals = np.stack(cdf_vals)
cdf_vals_lib = np.stack(cdf_vals_lib)
cdf_vals_con = np.stack(cdf_vals_con)

# Write out stats + data

In [None]:
# Per-story statistics over all items (one row per year/kind/group); the
# DataFrame index is not written.
stats.to_csv('data/paper-round-3/event-annotated/auto-story-stats.csv', index=False)

In [None]:
# Per-story statistics restricted to items with conservative == 0; same row
# order as the all-items table.
stats_lib.to_csv('data/paper-round-3/event-annotated/auto-story-stats-lib.csv', index=False)

In [None]:
# Per-story statistics restricted to items with conservative == 1; same row
# order as the all-items table.
stats_con.to_csv('data/paper-round-3/event-annotated/auto-story-stats-con.csv', index=False)

In [None]:
# Item-level ideology table (id, conservative, kind) for the sampled radio and
# elite items.
ideology_status.to_csv('data/paper-round-3/event-annotated/auto-ideology-status.csv', index=False)

In [None]:
# Save the three CDF matrices (all items, liberal, conservative) in NumPy's
# binary .npy format, one file each.
for cdf_path, cdf_matrix in (
    ('data/paper-round-3/event-annotated/auto-story-cdfs.npy', cdf_vals),
    ('data/paper-round-3/event-annotated/auto-story-cdfs-lib.npy', cdf_vals_lib),
    ('data/paper-round-3/event-annotated/auto-story-cdfs-con.npy', cdf_vals_con),
):
    with open(cdf_path, 'wb') as f:
        np.save(f, cdf_matrix)