In [None]:
import os
import re
import csv
import json
import gzip
import random
import logging

# import psycopg2
import numpy as np
import pandas as pd

from IPython.display import display
from tqdm.notebook import tqdm

In [None]:
# Notebook-wide logging: timestamped messages at INFO level and above
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

logger = logging.getLogger(__name__)

# Only surface warnings and errors from gensim
logging.getLogger("gensim").setLevel(logging.WARNING)

In [None]:
# Work from the project checkout so the relative data/ paths used in this notebook resolve.
# NOTE(review): the checkout location is hard-coded to ~/github/masthesis/ — adjust per machine.
os.chdir(os.path.expanduser('~/github/masthesis/'))

In [None]:
# Make sure the output directories written to by this notebook exist;
# exist_ok=True keeps the cell safe to re-run.
os.makedirs('data/paper-round-3/event-annotated/', exist_ok=True)
os.makedirs('data/paper-round-3/metadata/', exist_ok=True)

In [None]:
# Fixed seed so anything drawn from `random` or `np.random` in this notebook
# is reproducible from a fresh kernel.
seed = 2969591811

random.seed(seed)
np.random.seed(seed)

# Define event keywords

In [None]:
def matches_bolton_fired(s):
    """
    Count the whitespace-separated tokens of ``s`` that are a Bolton keyword.

    Matching is case-insensitive and on whole tokens only, so a token with
    attached punctuation (e.g. ``"bolton,"``) is not counted. Every occurrence
    counts, so the result can exceed the number of keywords.
    """
    keywords = {'bolton', 'john_bolton'}
    tokens = s.lower().strip().split()
    return sum(1 for token in tokens if token in keywords)

def matches_manning_released(s):
    """
    Count how many of the Chelsea Manning keywords occur in ``s``.

    Matching is a case-insensitive substring test. Each keyword contributes at
    most 1 regardless of how often it appears, so the bare surname plus a full
    name yields 2.

    Parameters
    ----------
    s : str
        Text to scan.

    Returns
    -------
    int
        Number of distinct keywords found in ``s``.
    """
    values = [
        'manning',
        # each full name is listed both underscore-joined and space-separated
        'chelsea_manning', 'chelsea manning',
        'bradley_manning', 'bradley manning',
    ]

    s = s.lower().strip()

    return sum(1 for v in values if v in s)

def matches_tom_brady_free_agent(s):
    """
    Count how many Tom Brady / free agency keywords occur in ``s``.

    Case-insensitive substring matching; each keyword counts at most once.
    """
    keywords = (
        'tom_brady', 'tom brady',
        'free_agent', 'free agent',
    )

    text = s.lower().strip()
    return len([kw for kw in keywords if kw in text])

def matches_shane_gillis_fired(s):
    """
    Count how many Shane Gillis / SNL keywords occur in ``s``.

    Case-insensitive substring matching; each keyword counts at most once.
    """
    keywords = (
        'shane_gillis', 'shane gillis',
        'saturday_night_live', 'saturday night live',
        'sheen gillis',  # misrecognition in radio data
    )

    text = s.lower().strip()
    return sum(1 for kw in keywords if kw in text)

def matches_purdue_bankruptcy(s):
    """
    Count how many Purdue Pharma keywords occur in ``s``.

    Case-insensitive substring matching; each keyword counts at most once, so
    a full "purdue pharma" mention scores 2 (the bare name plus the full name).
    """
    keywords = ('purdue', 'purdue_pharma', 'purdue pharma')

    text = s.lower().strip()

    hits = 0
    for kw in keywords:
        hits += 1 if kw in text else 0
    return hits

def matches_huffman_sentencing(s):
    """
    Count how many Felicity Huffman / Varsity Blues keywords occur in ``s``.

    Case-insensitive substring matching; each keyword counts at most once.
    """
    keywords = (
        'felicity huffman', 'felicity_huffman',
        'varsity blues', 'varsity_blues',
        'felicityhuffman',
    )

    text = s.lower().strip()
    return sum(1 for kw in keywords if kw in text)

def matches_bernie_drops_out(s):
    """
    Count how many Sanders / dropping-out keywords occur in ``s``.

    Case-insensitive substring matching; each keyword counts at most once.
    The "drop out" variants count on their own, with or without "sanders".
    """
    keywords = ('sanders', 'drop out', 'drops out', 'dropped out')

    text = s.lower().strip()
    return len([kw for kw in keywords if kw in text])

def matches_nba_season_cancelled(s):
    """
    Score ``s`` for mentions of the NBA season suspension.

    The score is the number of keywords found as case-insensitive substrings
    plus the number of "suspend ... season" regex patterns that match. Each
    keyword and each pattern contributes at most 1.
    """
    keywords = (
        'nba',
        'n._b._a.',  # radio data format
        'adam silver',
        'adam_silver',
    )
    patterns = (
        'suspending.*season',
        'suspended.*season',
        'season.*suspended',
    )

    text = s.lower().strip()

    keyword_hits = sum(1 for kw in keywords if kw in text)
    pattern_hits = sum(1 for pat in patterns if re.search(pat, text))

    return keyword_hits + pattern_hits

def matches_warren_drops_out(s):
    """
    Count how many Warren / dropping-out keywords occur in ``s``.

    Case-insensitive substring matching; each keyword counts at most once.
    The "drop out" variants count on their own, with or without "warren".
    """
    keywords = ('warren', 'drop out', 'drops out', 'dropped out')

    text = s.lower().strip()
    return sum(1 for kw in keywords if kw in text)

def matches_trump_impeachment(s):
    """
    Count how many impeachment-inquiry phrases occur in ``s``.

    Case-insensitive substring matching; each phrase counts at most once.
    The bare word "impeachment" does not count.
    """
    phrases = ('impeachment inquiry', 'impeachment investigation')

    text = s.lower().strip()
    return len([phrase for phrase in phrases if phrase in text])

In [None]:
# Registry of events: each name maps to the timestamp at which the story broke
# and the keyword matcher used to flag content about it.
events = dict(
    # exact timestamps are for the start time of the 15 minute period during which the story breaks

    # 9/10: John Bolton fired
    john_bolton_fired=dict(
        timestamp='2019-09-10 16:00:00+00:00',
        func=matches_bolton_fired,
    ),

    # note that ~3am 9/16 UTC is ~11pm 9/15 EDT
    # 9/15: Purdue Pharma files for bankruptcy
    purdue_bankruptcy=dict(
        timestamp='2019-09-16 03:15:00+00:00',
        func=matches_purdue_bankruptcy,
    ),

    # 3/17: Tom Brady goes free agent
    tom_brady_free_agent=dict(
        timestamp='2020-03-17 12:45:00+00:00',
        func=matches_tom_brady_free_agent,
    ),

    # 9/16: Shane Gillis, comedian, fired from SNL
    shane_gillis_fired=dict(
        timestamp='2019-09-16 20:00:00+00:00',
        func=matches_shane_gillis_fired,
    ),

    # 3/12: Chelsea Manning ordered released from prison
    manning_released=dict(
        timestamp='2020-03-12 21:15:00+00:00',
        func=matches_manning_released,
    ),

    # Twitter starts after previous radio discussion, because it was known in
    # advance she'd be sentenced that day
    # 9/13: Felicity Huffman sentenced for college admissions bribery
    huffman_sentencing=dict(
        timestamp='2019-09-13 18:00:00+00:00',
        func=matches_huffman_sentencing,
    ),

    # 4/8: Bernie Sanders drops out, Joe Biden becomes presumptive Dem nominee for prez
    bernie_drops_out=dict(
        timestamp='2020-04-08 15:15:00+00:00',
        func=matches_bernie_drops_out,
    ),

    # note that 1:30am 3/12 UTC is 9:30pm 3/11 EDT
    # 3/11: NBA season canceled for Covid
    nba_season_cancelled=dict(
        timestamp='2020-03-12 01:30:00+00:00',
        func=matches_nba_season_cancelled,
    ),

    # 3/5: Elizabeth Warren drops out
    warren_drops_out=dict(
        timestamp='2020-03-05 15:30:00+00:00',
        func=matches_warren_drops_out,
    ),

    # the timestamp here is when the news she definitely would announce an impeachment
    # inquiry first broke; there had been a long buildup all day of Dems announcing they
    # supported one
    # 9/24: Pelosi announces impeachment inquiry into Trump
    trump_impeachment=dict(
        timestamp='2019-09-24 18:30:00+00:00',
        func=matches_trump_impeachment,
    ),
)

In [None]:
# Save one row per event (name + timestamp); the matcher callables are dropped
# before writing.
(
    pd.DataFrame(events)
    .T
    .reset_index()
    .drop(columns='func')
    .rename(columns={'index': 'event'})
    .to_csv('data/paper-round-3/metadata/event-terms.csv', index=False)
)

# Phrase detection

To match radio data format

# Utils

In [None]:
# Names of the per-event match-count columns, in the order the events are declared
event_cols = [f'event_{name}' for name in events]

In [None]:
def find_matches(data, events):
    """
    Score every row of ``data`` against every event's keyword matcher.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have a ``content`` column of text. Missing content (None/NaN) is
        treated as empty text, so such rows score 0 instead of raising inside
        the matcher.
    events : dict
        Maps event name to a dict whose ``'func'`` entry is a callable taking
        a string and returning a match count.

    Returns
    -------
    pandas.DataFrame
        Same index as ``data``, with one integer column ``event_<name>`` per
        event, in the iteration order of ``events``.
    """
    # The matchers call str methods on their input; a missing value has none
    content = data['content'].fillna('')

    event_matches = [
        content.apply(params['func'])
            .astype(int)
            .rename('event_' + event)
        for event, params in tqdm(events.items())
    ]

    return pd.concat(event_matches, axis=1)

In [None]:
def make_ticks(event_matches, freqs=['15min'], by=[], cols=event_cols):
    """
    Aggregate per-row event match counts into time buckets.

    For each frequency in ``freqs`` the rows of ``event_matches`` are bucketed
    on their ``timestamp`` column and the ``cols`` columns are summed per
    bucket. ``total`` holds the number of rows in the bucket and ``freq`` the
    frequency string. The same aggregation is repeated for every column named
    in ``by``, additionally split by that column's values.

    Returns one long DataFrame: for each frequency, the per-``by`` breakdowns
    followed by the overall series, whose ``by`` columns are NaN.
    """

    def summarise(keys, freq):
        # Sum the event columns and count rows for one grouping
        grouped = event_matches.groupby(keys)
        summary = grouped[cols].sum()
        summary['total'] = grouped.size()
        summary['freq'] = freq
        return summary

    frames = []

    for freq in tqdm(freqs):
        bucket = pd.Grouper(key='timestamp', freq=freq)
        overall = summarise(bucket, freq)

        for split_col in tqdm(by):
            overall[split_col] = np.nan  # make them concat-able
            frames.append(summarise([split_col, bucket], freq).reset_index())

        frames.append(overall.reset_index())

    return pd.concat(frames, axis=0)

# Elite Twitter

In [None]:
# Per-user community assignments; users without a follow community are dropped
ci = (
    pd.read_csv('data/twitter/community-ideology.csv')
    .dropna(subset=['follow_community'])
    .loc[:, ['user_id', 'follow_community']]
)

In [None]:
# Load the elite Twitter sample: a gzipped JSON-lines file, one JSON object per line.
# `total` is only used to size the progress bar.
total = 11_519_042  # !zcat data/paper-round-3/twitter/thesis.jsonl.gz | wc -l

elite_data = []
with gzip.open('data/paper-round-3/twitter/thesis.jsonl.gz', 'rt') as f:
    for line in tqdm(f, total=total):
        elite_data += [json.loads(line)]
elite_data = pd.DataFrame(elite_data)

# Index by `id` and keep only the first row seen for each id
elite_data.set_index('id', inplace=True)
elite_data = elite_data.loc[~elite_data.index.duplicated(), :]

# Use the column names the matching/aggregation helpers expect
elite_data.rename({'full_text': 'content'}, axis=1, inplace=True)
elite_data.rename({'created_at': 'timestamp'}, axis=1, inplace=True)
elite_data['timestamp'] = pd.to_datetime(elite_data['timestamp'])

# Pull the needed fields out of the nested `user` object into flat columns
elite_data['user_id'] = elite_data['user'].apply(lambda s: s['id'])
elite_data['username'] = elite_data['user'].apply(lambda s: s['screen_name'])
elite_data['friends_count'] = elite_data['user'].apply(lambda s: s['friends_count'])
elite_data['followers_count'] = elite_data['user'].apply(lambda s: s['followers_count'])
elite_data['statuses_count'] = elite_data['user'].apply(lambda s: s['statuses_count'])
# A row is flagged as a retweet when it has a non-null `retweeted_status`
elite_data['is_retweet'] = elite_data['retweeted_status'].notna()

# Keep only the two study windows (both bounds inclusive)
elite_data = elite_data.loc[
    ((elite_data['timestamp'] >= '2019-09-01') & (elite_data['timestamp'] <= '2019-11-01')) |
    ((elite_data['timestamp'] >= '2020-03-01') & (elite_data['timestamp'] <= '2020-05-01'))
]

# Drop every column that is not used downstream
elite_data = elite_data[['timestamp', 'user_id', 'username', 'friends_count', 'followers_count', 'statuses_count', 'content', 'is_retweet']]

In [None]:
# Attach each user's follow community. The left join keeps tweets from users
# with no community; those get conservative == 0.
elite_data = elite_data.reset_index().merge(ci, on='user_id', how='left')
elite_data['conservative'] = elite_data['follow_community'].eq(3).astype(int)
elite_data = elite_data.set_index('id')

In [None]:
# Score every tweet against every event's keyword matcher
elite_event_matches = find_matches(elite_data, events)

# Carry over the columns the tick aggregation buckets and splits on
for col in ('timestamp', 'is_retweet', 'conservative'):
    elite_event_matches[col] = elite_data[col]

elite_event_matches.to_csv('data/paper-round-3/event-annotated/raw-elite.csv', index=True)

# The raw tweets are not used again below
del elite_data

In [None]:
# Bucket the per-tweet matches at three resolutions, overall and split by
# retweet flag and by the conservative flag
elite_ticks = make_ticks(
    elite_event_matches,
    freqs=['15min', '6H', '1D'],
    by=['is_retweet', 'conservative'],
    cols=event_cols,
)

# Keep only buckets that start inside the two study windows (bounds inclusive)
elite_ticks = elite_ticks.loc[
    elite_ticks['timestamp'].between('2019-09-01', '2019-11-01')
    | elite_ticks['timestamp'].between('2020-03-01', '2020-05-01')
]

elite_ticks.to_csv('data/paper-round-3/event-annotated/ticks-elite.csv', index=False)

del elite_event_matches, elite_ticks

# Decahose

In [None]:
# Load a random subsample of the decahose CSV.
# `take` is a boolean mask drawn once up front: each row position is kept with
# probability 0.057.
total = 317_774_406  # zcat decahose-10pct-random-sort-20230509.csv.gz | grep '^"[0-9]\+"' | wc -l
take = (np.random.random((total,)) <= 0.057)  # ~18m rows

decahose_data = []
with gzip.open('data/paper-round-3/decahose/decahose-10pct-random-sort-20230509.csv.gz', 'rt') as f:
    reader = csv.DictReader(f)
    for i, row in tqdm(enumerate(reader), total=total):
        # `take` has exactly `total` entries, so a file with more records than
        # `total` raises IndexError here
        if take[i]:
            decahose_data += [row]
decahose_data = pd.DataFrame(decahose_data)

# Index by `id` and keep only the first row seen for each id
decahose_data.set_index('id', inplace=True)
decahose_data = decahose_data.loc[~decahose_data.index.duplicated(), :]

# Use the column names the matching/aggregation helpers expect
decahose_data.rename({'postedtime': 'timestamp', 'body': 'content'}, axis=1, inplace=True)
decahose_data['timestamp'] = pd.to_datetime(decahose_data['timestamp'])

# Keep only the two study windows.
# NOTE(review): the first window's upper bound is exclusive (`<`) here, while the
# second window and every other window filter in this notebook use `<=` — confirm
# which is intended.
decahose_data = decahose_data.loc[
    ((decahose_data['timestamp'] >= '2019-09-01') & (decahose_data['timestamp'] < '2019-11-01')) |
    ((decahose_data['timestamp'] >= '2020-03-01') & (decahose_data['timestamp'] <= '2020-05-01')),
:]

# A row is flagged as a retweet when its `verb` is 'share'
decahose_data['is_retweet'] = (decahose_data['verb'] == 'share')
# Label the parsed timestamps as UTC.
# NOTE(review): presumably `postedtime` is already in UTC — confirm.
decahose_data['timestamp'] = decahose_data['timestamp'].dt.tz_localize('utc')

# Drop every column that is not used downstream
decahose_data = decahose_data[['timestamp', 'user_id', 'username', 'friends_count', 'followers_count', 'statuses_count', 'content', 'is_retweet']]

In [None]:
# Score every sampled tweet against every event's keyword matcher
decahose_event_matches = find_matches(decahose_data, events)

# Carry over the columns the tick aggregation buckets and splits on
for col in ('timestamp', 'is_retweet'):
    decahose_event_matches[col] = decahose_data[col]

decahose_event_matches.to_csv('data/paper-round-3/event-annotated/raw-decahose.csv', index=True)

# The raw tweets are not used again below
del decahose_data

In [None]:
# Bucket the per-tweet matches at three resolutions, overall and split by
# retweet flag
decahose_ticks = make_ticks(
    decahose_event_matches,
    freqs=['15min', '6H', '1D'],
    by=['is_retweet'],
    cols=event_cols,
)

# Keep only buckets that start inside the two study windows (bounds inclusive)
decahose_ticks = decahose_ticks.loc[
    decahose_ticks['timestamp'].between('2019-09-01', '2019-11-01')
    | decahose_ticks['timestamp'].between('2020-03-01', '2020-05-01')
]

decahose_ticks.to_csv('data/paper-round-3/event-annotated/ticks-decahose.csv', index=False)

del decahose_event_matches, decahose_ticks

# Radio

In [None]:
# %%time

# with psycopg2.connect(host='localhost', dbname='thesis') as conn:
#     cur = conn.cursor()
    
#     cur.execute('''
#     drop table if exists datasets.paper_round_3_snippets;
#     create table datasets.paper_round_3_snippets as
#     select
#         sn.snippet_id,
        
#         sn.start_dt as timestamp,
        
#         case ds.census_region_5way
#             when 'South' then 'S'
#             when 'Northeast' then 'N'
#             when 'West' then 'W'
#             when 'Midwest' then 'M'
#             when 'Pacific' then 'P'
#             else null
#         end as station_census_region,
#         (st.format = 'Public Radio')::int as is_public,
#         (st.band = 'AM')::int as am_band,
#         ss.syndicated::int as syndicated,
        
#         sn.content
#     from radio.snippet sn
#         inner join datasets.radio_best_episode_confidence_only be on
#             be.date = sn.date and
#             be.show_id = sn.show_id and
#             be.station_id = sn.station_id
            
#         inner join radio.station st on st.station_id = sn.station_id
#         inner join dim.state ds on ds.state_id = st.state_id
#         left join radio.show sh on sh.show_id = sn.show_id
        
#         left join
#         (
#             select
#                 ssi.snippet_id,
#                 ssi.syndicated
#             from radio.snippet_syndication ssi
#             where
#                 ssi.syndication_method = 'show'
#         ) ss on ss.snippet_id = sn.snippet_id
#     where
#         not st.exclude and
#         not coalesce(sh.exclude, false) and
#         sn.content <> '';
#     ''')

In [None]:
%%time

# Load the radio snippets export, indexed by `snippet_id`, with `timestamp`
# parsed as a datetime column.
with gzip.open('data/paper-round-3/radio/paper-round-3-snippets.csv.gz', 'rt') as f:
    radio_data = pd.read_csv(f, index_col='snippet_id', parse_dates=['timestamp'])

## Use this to run only on the whisper-transcribed subset, where the results are very similar
# with gzip.open('data/paper-round-3/event-annotated/auto-sample-whisper-transcripts.csv.gz', 'rt') as f:
#     whisper_transcripts = pd.read_csv(f)

# def check_int(s):
#     try:
#         int(s)
#     except ValueError:
#         return False
#     else:
#         return True

# is_int = whisper_transcripts['snippet_id'].apply(check_int)

# # there's one bad row where whisper generated some punctuation
# # that breaks the csv format; let's ignore it
# assert whisper_transcripts.shape[0] - is_int.sum() == 1

# whisper_transcripts = whisper_transcripts.loc[is_int, :]
# whisper_transcripts['snippet_id'] = whisper_transcripts['snippet_id'].astype(int)
# whisper_transcripts = whisper_transcripts.set_index('snippet_id')

# assert whisper_transcripts.shape[0] - whisper_transcripts.index.isin(radio_data.index).sum() == 0

# radio_data['whisper_content'] = whisper_transcripts['content']
# radio_data = radio_data.loc[radio_data['whisper_content'].notna(), :]
# radio_data = radio_data.drop('content', axis=1).rename({'whisper_content': 'content'}, axis=1)

In [None]:
%%time

# Score every radio snippet against every event's keyword matcher
radio_event_matches = find_matches(radio_data, events)

# Carry over the timestamp and the station/show attributes the tick
# aggregation buckets and splits on
for by in ['timestamp', 'is_public', 'station_census_region', 'am_band', 'syndicated']:
    radio_event_matches[by] = radio_data[by]

radio_event_matches.to_csv('data/paper-round-3/event-annotated/raw-radio.csv', index=True)

# The raw snippets are not used again below
del radio_data

In [None]:
%%time

# Bucket the per-snippet matches at three resolutions, overall and split by
# each station/show attribute
radio_ticks = make_ticks(
    radio_event_matches,
    freqs=['15min', '6H', '1D'],
    by=['is_public', 'station_census_region', 'am_band', 'syndicated'],
    cols=event_cols,
)

# Keep only buckets that start inside the two study windows (bounds inclusive)
radio_ticks = radio_ticks.loc[
    radio_ticks['timestamp'].between('2019-09-01', '2019-11-01')
    | radio_ticks['timestamp'].between('2020-03-01', '2020-05-01')
]

radio_ticks.to_csv('data/paper-round-3/event-annotated/ticks-radio.csv', index=False)

del radio_event_matches, radio_ticks