In [None]:
import os
import re
import json
import gzip
import random
import html
import string
import pickle
import logging

import numpy as np
import pandas as pd

import unidecode

from nltk.corpus import wordnet as wn
from num2words import num2words

from IPython.display import display, Audio
from tqdm.notebook import tqdm

In [None]:
# Notebook-wide logging: timestamped records at INFO level and above.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

logger = logging.getLogger(__name__)

# The AWS SDK loggers are only let through at WARNING and above.
for sdk_logger_name in ('boto3', 'botocore'):
    logging.getLogger(sdk_logger_name).setLevel(logging.WARNING)

In [None]:
# Work from ~/github/masthesis/ so the relative `data/...` paths used below resolve.
repo_dir = os.path.expanduser('~/github/masthesis/')
os.chdir(repo_dir)

In [None]:
# One fixed seed for both global RNGs (stdlib `random` and NumPy's legacy
# global state), so the `.sample()` calls below are repeatable on a fresh run.
seed = 2969591811

np.random.seed(seed)
random.seed(seed)

# Load data

In [None]:
# First batch of the pre-Whisper event-annotated sample (gzip opened in text mode for pandas).
sample_old_path = 'data/paper-round-3/event-annotated/auto-sample-pre-whisper.csv.gz'
with gzip.open(sample_old_path, mode='rt') as sample_old_file:
    full_sample_old = pd.read_csv(sample_old_file)

In [None]:
# Second ("newdata") batch of the pre-Whisper event-annotated sample.
sample_new_path = 'data/paper-round-3/event-annotated/auto-sample-newdata-pre-whisper.csv.gz'
with gzip.open(sample_new_path, mode='rt') as sample_new_file:
    full_sample_new = pd.read_csv(sample_new_file)

In [None]:
# Stack the two batches row-wise under a fresh 0..n-1 index, then free the inputs.
full_sample = pd.concat([full_sample_old, full_sample_new], ignore_index=True)
del full_sample_old
del full_sample_new

In [None]:
# Keep only rows whose `kind` is one of the three expected values.
# drop a small number of decahose examples that had weird content + didn't save correctly
known_kind = full_sample['kind'].isin(['decahose', 'radio', 'elite'])
print(f"Dropped {(~known_kind).sum()} misparsed decahose rows")

# .copy() yields an independent frame, so the column assignments made to
# `full_sample` further down do not write into a slice of the unfiltered
# frame (which pandas reports as SettingWithCopyWarning).
full_sample = full_sample.loc[known_kind, :].copy()

In [None]:
# Whisper transcripts for the sample (gzip-compressed CSV).
whisper_path = 'data/paper-round-3/event-annotated/auto-sample-whisper-transcripts.csv.gz'
with gzip.open(whisper_path, mode='rt') as whisper_file:
    whisper_transcripts = pd.read_csv(whisper_file)

In [None]:
# Row count before any cleaning, for comparison with the "Dropped ..." counts below.
print(f'Total rows {len(whisper_transcripts)}')

def check_int(s):
    """Return True if ``int(s)`` succeeds, False otherwise.

    Used to flag rows whose ``snippet_id`` cannot be converted to an integer.
    Besides unparseable strings and NaN (ValueError), ``None`` and other
    unsupported types (TypeError) and infinite floats (OverflowError) are
    reported as non-integers instead of raising out of ``Series.apply``.
    """
    try:
        int(s)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

# Flag the rows whose snippet_id converts to an integer; the rest are treated as misparsed.
is_int = whisper_transcripts['snippet_id'].apply(check_int)

# we have one weird badly parsed row
print(f'Dropped {whisper_transcripts.shape[0] - is_int.sum()} misparsed rows')

# .copy() so the dtype conversion below assigns into an independent frame
# rather than into a slice of the unfiltered one (SettingWithCopyWarning).
whisper_transcripts = whisper_transcripts.loc[is_int, :].copy()
whisper_transcripts['snippet_id'] = whisper_transcripts['snippet_id'].astype(int)

# Rows with a null `content` carry no transcript text and are removed as well.
has_text = whisper_transcripts['content'].notna()
print(f"Dropped {(~has_text).sum()} rows with no ASR'd text")
whisper_transcripts = whisper_transcripts.loc[has_text, :]

In [None]:
# Audio keys for the first batch of radio snippets.
audio_old_path = 'data/paper-round-3/radio/paper-round-3-snippets-audio-keys.csv.gz'
with gzip.open(audio_old_path, mode='rt') as audio_old_file:
    audio_old = pd.read_csv(audio_old_file)

In [None]:
# Processed radio snippets from the new-data batch.
audio_new_path = 'data/paper-round-3/radio/new-data-processed.csv.gz'
with gzip.open(audio_new_path, mode='rt') as audio_new_file:
    audio_new = pd.read_csv(audio_new_file)

In [None]:
# Keep only the audio-key columns, rename `timestamp` to `start_dt`, then parse
# both interval endpoints as datetimes and localize them to UTC.
audio_new = (
    audio_new[[
        'snippet_id', 'audio_key', 'audio_file_offset',
        'audio_file_index', 'timestamp', 'end_dt', 'duration',
    ]]
    .rename(columns={'timestamp': 'start_dt'})
    .assign(
        start_dt=lambda df: pd.to_datetime(df['start_dt']).dt.tz_localize('utc'),
        end_dt=lambda df: pd.to_datetime(df['end_dt']).dt.tz_localize('utc'),
    )
)

In [None]:
# Stack the rows of `audio_new` beneath those of `audio_old` (original indexes kept), then free the inputs.
audio = pd.concat([audio_old, audio_new])
del audio_old
del audio_new

In [None]:
# Radio-only view of the sample: the numeric part of `id` (everything after the
# first character) becomes the integer `snippet_id`, which is then used to
# left-join the Whisper transcripts as `whisper_content`.
radio = (
    full_sample
    .loc[full_sample['kind'] == 'radio', :]
    .assign(id=lambda df: df['id'].str[1:].astype(int))
    .rename(columns={'id': 'snippet_id'})
    .merge(
        whisper_transcripts.rename(columns={'content': 'whisper_content'}),
        how='left',
        on='snippet_id',
    )
)

# Inspect new transcripts

## Stats

In [None]:
# Row counts per `kind` value in the combined sample.
full_sample['kind'].value_counts()

In [None]:
# Row counts per first character of `id` (the prefix that is stripped off when ids are converted to integers).
full_sample['id'].str[0].value_counts()

In [None]:
# How many radio rows have no Whisper transcript after the left merge (True = missing).
radio['whisper_content'].isna().value_counts()

## Where did Whisper recognize no speech?

In [None]:
# Random look at 10 radio rows that have no Whisper transcript.
# No random_state is passed, so the draw uses the global NumPy RNG and the rows
# shown depend on which sampling cells ran before this one.
radio.loc[radio['whisper_content'].isna(), :].sample(10)

## Compare recognition outputs

In [None]:
# Show the `content` and `whisper_content` columns side by side for 10 random radio rows.
# max_colwidth=None turns off cell truncation for this display only.
with pd.option_context('display.max_colwidth', None):
    display(radio[['snippet_id', 'content', 'whisper_content']].sample(10))

## Listen to some audio

In [None]:
# import transcribe as tr

# def play_snippet(snippet_id):
#     props = audio.loc[
#         audio['snippet_id'] == snippet_id,
#         ['audio_key', 'audio_file_offset', 'duration']
#     ]
#     assert props.shape[0] == 1
#     props = props.iloc[0].to_dict()
#     print(props)
    
#     return tr.play_s3_audio(
#         bucket='cortico-data',
#         key=props['audio_key'],
#         start_time=props['audio_file_offset'],
#         duration=props['duration'],
#         aws_profile='cortico',
#     )

In [None]:
# play_snippet(92087140)

# More formatting

In [None]:
# !mv data/paper-round-3/event-annotated/auto-sample.csv.gz data/paper-round-3/event-annotated/auto-sample-old-version-before-new-data.csv.gz

In [None]:
# Attach Whisper transcripts to the full sample. Only radio rows get a join key:
# `snippet_id` is the numeric part of `id` for radio rows and NaN otherwise, so
# non-radio rows cannot match a transcript.
# The key is built with assign()/where() rather than a masked .loc write, so no
# new column is assigned into a frame that may itself be a filtered slice.
is_radio = full_sample['kind'] == 'radio'

full_sample = (
    full_sample
    .assign(snippet_id=full_sample['id'].str[1:].astype(int).where(is_radio))
    .merge(
        whisper_transcripts.rename(columns={'content': 'whisper_content'}),
        on='snippet_id',
        how='left',
    )
)
# NOTE(review): a repeated snippet_id in whisper_transcripts would duplicate
# sample rows in this merge — confirm the ids are unique.

# has_whisper is 1 where a transcript was found; those rows take the Whisper
# text as `content`, all others keep the content they already had.
full_sample['has_whisper'] = full_sample['whisper_content'].notna().astype(int)
full_sample['content'] = full_sample['whisper_content'].combine_first(full_sample['content'])

# The join key and the raw transcript column were only needed for the merge.
full_sample = full_sample.drop(columns=['whisper_content', 'snippet_id'])

In [None]:
# full_sample = full_sample.loc[full_sample['year'] != 2022]

# Shrink the decahose

We oversampled the decahose earlier, but this may not really work: too many edges lead to running out of memory.

In [None]:
# sizes = full_sample \
#     .loc[full_sample['kind'] != 'decahose'] \
#     .groupby(['year', 'kind']) \
#     .size() \
#     .reset_index() \
#     .groupby('year') \
#     [0].max() \
#     .reset_index()

# sizes['year'] = sizes['year'].astype(int)

# sizes = sizes.set_index('year')[0]
# sizes = dict(zip(sizes.index, sizes.tolist()))

# sizes

In [None]:
# full_sample = pd.concat([
#     full_sample.loc[full_sample['kind'] != 'decahose'],
    
#     full_sample.loc[(full_sample['kind'] == 'decahose') & (full_sample['year'] == 2019)].sample(sizes[2019]),
#     full_sample.loc[(full_sample['kind'] == 'decahose') & (full_sample['year'] == 2020)].sample(sizes[2020]),
#     full_sample.loc[(full_sample['kind'] == 'decahose') & (full_sample['year'] == 2021)].sample(sizes[2021]),
#     # full_sample.loc[(full_sample['kind'] == 'decahose') & (full_sample['year'] == 2022)].sample(sizes[2022]),
# ], axis=0)

In [None]:
# full_sample.groupby(['year', 'kind']).size()

# Prep tweet content

In [None]:
# Read the tweet dump one JSON object per line, keeping only each tweet's id
# and its full text.
tweet_records = []
with gzip.open('data/paper-round-3/twitter/thesis.jsonl.gz', 'rt') as f:
    for raw_line in tqdm(f):
        tweet = json.loads(raw_line)

        # if 'quoted_status' in tweet.keys():
        #     text += ' ' + tweet['quoted_status']['full_text']
        tweet_records.append({
            'id': tweet['id'],
            'raw_twitter_content': tweet['full_text'],
        })

raw_twitter_texts = pd.DataFrame(tweet_records)

# Each tweet id must occur exactly once, because this frame is merged onto the sample by id below.
assert len(raw_twitter_texts) == raw_twitter_texts['id'].nunique()

In [None]:
# Look up each sample row's raw tweet text by the numeric part of `id` (the
# string minus its one-character prefix). The lookup table's `id` is renamed to
# the temporary key so the merge leaves no id_x / id_y columns to clean up.
full_sample = (
    full_sample
    .assign(id_num=full_sample['id'].str[1:].astype('int64'))
    .merge(raw_twitter_texts.rename(columns={'id': 'id_num'}), on='id_num', how='left')
    .drop(columns='id_num')
)

# 2019/2020 elite rows take the raw tweet text as `content`, with newlines,
# carriage returns and tabs flattened to spaces.
is_target = (full_sample['kind'] == 'elite') & full_sample['year'].isin([2019, 2020])
has_raw_text = full_sample['raw_twitter_content'].notna()

# Target rows with no match in the tweet file keep their existing content
# instead of being overwritten with NaN by the assignment below.
print(f'{(is_target & ~has_raw_text).sum()} elite 2019/2020 rows had no raw tweet text; kept existing content')

replace_mask = is_target & has_raw_text
full_sample.loc[replace_mask, 'content'] = full_sample.loc[replace_mask, 'raw_twitter_content'] \
    .str.replace(r'[\n\r\t]', ' ', regex=True)

full_sample = full_sample.drop(columns='raw_twitter_content')

# Write out combined data

In [None]:
# Persist the combined sample as a gzip-compressed CSV, without the index column.
output_path = 'data/paper-round-3/event-annotated/auto-sample.csv.gz'
with gzip.open(output_path, mode='wt') as out_file:
    full_sample.to_csv(out_file, index=False)