In [None]:
import os
import gzip
import random
import logging

import numpy as np
import pandas as pd

In [None]:
# Record layout: "<timestamp> : <LEVEL> : <message>".
fmt = '%(asctime)s : %(levelname)s : %(message)s'

# Configure root logging with that layout and an INFO threshold
# (basicConfig does nothing if the root logger already has handlers).
logging.basicConfig(level=logging.INFO, format=fmt)

# Logger named after the current module.
logger = logging.getLogger(__name__)

In [None]:
# Work from the project directory under the user's home so that the relative
# 'data/...' paths used by the loading cells resolve against it.
project_dir = os.path.expanduser('~/github/masthesis/')
os.chdir(project_dir)

In [None]:
# Fixed seed applied to the global random number generators below.
seed = 2969591811

# Seed NumPy's legacy global generator and the stdlib `random` module.
for seed_rng in (np.random.seed, random.seed):
    seed_rng(seed)

# Load data

In [None]:
# Read the gzipped audio-keys CSV into the main `snippets_2019_2020` frame,
# indexed by snippet_id.
audio_keys_path = 'data/paper-round-3/radio/paper-round-3-snippets-audio-keys.csv.gz'
with gzip.open(audio_keys_path, mode='rt') as audio_keys_file:
    snippets_2019_2020 = pd.read_csv(audio_keys_file, index_col='snippet_id')

In [None]:
# Read the show/station table (indexed by snippet_id) and copy its show_id
# column onto the main frame; the assignment aligns on the snippet_id index.
show_station_path = 'data/paper-round-3/radio/paper-round-3-snippets-show-station.csv.gz'
with gzip.open(show_station_path, mode='rt') as show_station_file:
    snippets_2019_2020_show_station = pd.read_csv(show_station_file, index_col='snippet_id')

snippets_2019_2020['show_id'] = snippets_2019_2020_show_station['show_id']

# Fail fast if any snippet ended up without a show_id.
assert not snippets_2019_2020['show_id'].isna().any()

In [None]:
# Read the snippet text table (indexed by snippet_id) and copy its content
# column onto the main frame; the assignment aligns on the snippet_id index.
content_path = 'data/paper-round-3/radio/paper-round-3-snippets.csv.gz'
with gzip.open(content_path, mode='rt') as content_file:
    snippets_2019_2020_content = pd.read_csv(content_file, index_col='snippet_id')

snippets_2019_2020['content'] = snippets_2019_2020_content['content']

# Fail fast if any snippet ended up without content.
assert not snippets_2019_2020['content'].isna().any()

In [None]:
# Parse the snippet start/end columns into datetimes.
for dt_col in ('start_dt', 'end_dt'):
    snippets_2019_2020[dt_col] = pd.to_datetime(snippets_2019_2020[dt_col])

# Snippet counts per calendar year of the start time.
snippets_2019_2020['start_dt'].dt.year.value_counts()

In [None]:
# Read the gzipped "new data" CSV into `snippets_2021`; no index column is
# specified, so the frame gets the default positional index.
new_data_path = 'data/paper-round-3/radio/new-data-processed.csv.gz'
with gzip.open(new_data_path, mode='rt') as new_data_file:
    snippets_2021 = pd.read_csv(new_data_file)

In [None]:
# Parse the time columns into datetimes.
snippets_2021['timestamp'] = pd.to_datetime(snippets_2021['timestamp'])
snippets_2021['end_dt'] = pd.to_datetime(snippets_2021['end_dt'])

# Discard snippets whose timestamp is on or after 2022-01-01.
# A boolean mask is used instead of an in-place drop-by-label: it removes
# exactly the matching rows even if index labels were not unique, and does not
# mutate the loaded frame in place. The `>=` test is negated (rather than
# rewritten as `<`) so rows with a missing timestamp (NaT) are still kept,
# as they were by the label-based drop.
from_2022_onward = snippets_2021['timestamp'] >= '2022-01-01'
snippets_2021 = snippets_2021.loc[~from_2022_onward].copy()

# Snippet counts per calendar year of the timestamp.
snippets_2021['timestamp'].dt.year.value_counts()

# Stats

### Amount of audio (hours)

In [None]:
# Per-snippet durations; the 2021 frame uses 'timestamp' where the 2019-2020
# frame uses 'start_dt'.
duration_2019_2020 = snippets_2019_2020['end_dt'] - snippets_2019_2020['start_dt']
duration_2021 = snippets_2021['end_dt'] - snippets_2021['timestamp']

# Combined duration of both frames, converted from seconds to hours.
total_duration = duration_2019_2020.sum() + duration_2021.sum()
total_duration.total_seconds() / 3600

### Number of stations

In [None]:
# Count distinct stations across both frames. After removing the storage prefix,
# the second '/'-separated component of the audio key is taken as the station
# (NOTE(review): assumes that key layout — confirm against the data).
(
    pd.concat([snippets_2019_2020['audio_key'], snippets_2021['audio_key']])
    # regex=False: the prefix is a literal string; passing it explicitly avoids
    # depending on the pandas-version-specific default of `regex`.
    .str.replace('speechbox/stream_out/', '', regex=False)
    .str.split('/')
    .str[1]
    .nunique()
)

### Number of shows

In [None]:
# Distinct show_id values across both frames (missing values are not counted).
(
    pd.concat([snippets_2019_2020['show_id'], snippets_2021['show_id']])
    .nunique()
)

### Number of words (whitespace-separated tokens)

In [None]:
# Total word count across both frames: split each snippet's text on whitespace
# and sum the per-snippet token counts.
(
    pd.concat([snippets_2019_2020['content'], snippets_2021['content']])
    .map(lambda text: len(text.split()))
    .sum()
)