In [1]:
import os, sys
# Resolve the directory three levels above the working directory and make sure
# it is on sys.path (added once only), so packages located there can be imported.
TOP_DIR = os.path.realpath('../../..')
if TOP_DIR not in sys.path:
    sys.path.append(TOP_DIR)
import pandas as pd

In [2]:
from lib.util.convert import named_cumulative_sum

In [3]:
# Load the community events table, parsing `start_date` as datetimes.
events_csv = '../../../data/metrics/community/events.csv'
data = pd.read_csv(events_csv, parse_dates=['start_date'])

# Rows without a ward code are labelled 'UNKNOWN' so they remain visible in
# per-ward groupings and can be counted as a data-quality issue later on.
data['ward_code'] = data['ward_code'].fillna('UNKNOWN')

Set up output directories

In [4]:
# Directory that receives every file written by this notebook. It is created
# (including any missing parents) if absent; an existing directory is fine.
OUT_DIR = '../../../docs/metrics/community/_data'
os.makedirs(name=OUT_DIR, exist_ok=True)

In [5]:
# Headline figures for the whole dataset plus simple data-quality counters,
# written to headlines.json.
#
# Each counter sums a boolean mask (True counts as 1). The previous
# `.value_counts()[True]` lookup raised KeyError whenever a check found no
# offending rows, i.e. exactly when the data was clean.
pd.Series({
  # Number of rows with a non-null event name.
  'events': data.event_name.count(),
  # Total audience; missing values are skipped by sum().
  'audience': int(data.audience.sum()),
  'errors': {
    # Rows whose ward code was missing and replaced by the 'UNKNOWN' marker.
    'missing_wards': int((data.ward_code == 'UNKNOWN').sum()),
    # Rows with no audience figure at all.
    'missing_audience': int(data.audience.isna().sum()),
    # Rows with an audience below 1; missing values compare False and are
    # therefore not counted here.
    'zero_audience': int((data.audience < 1).sum()),
  }
}).to_json(os.path.join(OUT_DIR, 'headlines.json'), indent=2)

In [6]:
# Per-ward totals: number of events and summed audience for each ward code.
#
# The frame is kept in `by_ward` and written out in a separate step. `to_csv`
# returns None when given a path, so chaining it onto the assignment used to
# leave `by_ward` bound to None instead of the table.
ward_groups = data.groupby('ward_code')
by_ward = pd.DataFrame({
  'events': ward_groups.event_name.count(),
  'audience': ward_groups.audience.sum(),
})
by_ward.to_csv(os.path.join(OUT_DIR, 'by_ward.csv'))

In [7]:
# Event count and audience total per start date, then rolled up into
# month-end bins.
date_groups = data.groupby('start_date')
per_start_date = pd.DataFrame({
    'events': date_groups.event_name.count(),
    'audience': date_groups.audience.sum(),
})
by_date = per_start_date.resample('M').sum()

# Place the monthly figures side by side with the output of
# `named_cumulative_sum` (project helper; presumably running totals under
# distinct column names - confirm against lib.util.convert), cast everything
# to integers and write the result.
monthly_with_cumulative = pd.concat(
    [by_date, named_cumulative_sum(by_date)],
    axis=1,
)
monthly_with_cumulative.astype('int').to_csv(os.path.join(OUT_DIR, 'by_date.csv'))