In [None]:
import os
import json
import gzip
import random
import logging

import numpy as np

from tqdm.notebook import tqdm

In [None]:
# Emit INFO-and-above records laid out as "timestamp : level : message".
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

# Logger named after the current module, for use by the cells below.
logger = logging.getLogger(__name__)

In [None]:
# Resolve the checkout under the user's home directory and make it the working
# directory, so the relative data paths used later resolve against it.
repo_dir = os.path.expanduser('~/github/masthesis/')
os.chdir(repo_dir)

In [None]:
# One fixed seed shared by NumPy's global generator and the stdlib `random`
# module, so both produce the same sequence on every run.
seed = 2_969_591_811

np.random.seed(seed)
random.seed(seed)

# Load data

In [None]:
# Basenames of the gzipped JSON-lines dumps under data/raw/radio, and the
# number of lines each one is expected to hold (same order as `files`).
files = ['2019_09', '2019_10', '2020_03', '2020_04', '2021']
counts = [36549587, 35202826, 56103001, 58548815, 32164520]

# zip() would silently drop trailing entries on a length mismatch.
assert len(files) == len(counts), 'files and counts must line up'

# Running corpus totals. 'stations' and 'shows' start out as sets of distinct
# values and are collapsed to their sizes once every file has been read.
tmp = {
    'count': 0,
    'duration': 0,
    'wordcount': 0,
    'has_show': 0,

    'stations': set(),
    'shows': set(),
}

with tqdm(total=sum(counts)) as pbar:
    for file, count in zip(files, counts):
        path = os.path.join('data/raw/radio', f'{file}.json.gz')
        seen_before = tmp['count']

        # Decode explicitly as UTF-8 rather than relying on the platform's
        # default text encoding.
        with gzip.open(path, 'rt', encoding='utf-8') as f:
            for line in f:
                record = json.loads(line)

                tmp['count'] += 1
                tmp['duration'] += (record['segment_end_global'] - record['segment_start_global'])
                tmp['wordcount'] += len(record['content'].split())
                tmp['stations'].add(record['callsign'])

                # 'show_name' is treated as optional: records without it
                # still count towards the totals above.
                if 'show_name' in record:
                    tmp['has_show'] += 1
                    tmp['shows'].add(record['show_name'])

                pbar.update(1)

        # The hard-coded counts only size the progress bar, so report them
        # when they have gone stale instead of failing the run.
        seen = tmp['count'] - seen_before
        if seen != count:
            logger.warning('%s: expected %d records, read %d', path, count, seen)

# Replace the sets with the number of distinct values they hold.
tmp['stations'] = len(tmp['stations'])
tmp['shows'] = len(tmp['shows'])
# Guard the ratio so an empty corpus yields NaN instead of ZeroDivisionError.
tmp['show_frac'] = tmp['has_show'] / tmp['count'] if tmp['count'] else float('nan')

# Stats

In [None]:
# Show the aggregated corpus statistics as this cell's output.
tmp