# Programme data

This notebook prepares data for the Programme theme page.

In [None]:
import json
from datetime import date

from utils.themes.programme import Programme
import petl as etl

In [None]:
from utils.paths import SITE

# Directory that receives the event data files for the Programme theme page.
# It is created (together with any missing parents) if it does not exist yet.
EVENTS = SITE / 'themes' / 'programme' / '_data' / 'events'
EVENTS.mkdir(parents=True, exist_ok=True)

## Schedule events

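Define the aggregation used to summarise event records: the set of source rows, the total event count, and the overall date range.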

In [None]:
# Aggregation spec in the shape petl's aggregate() accepts:
# output field -> (source field, reducer).
event_aggregation = dict([
    ('Records', ('row', set)),
    ('Events', ('Event Count', sum)),
    ('Date From', ('Start Date', min)),
    ('Date To', ('End Date', max)),
])

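Read the combined programme events from the published parquet file, flag invalid rows, and split the data into valid and invalid tables.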

In [None]:
import petl as etl
import pyarrow.parquet as pq
from utils.paths import PUBLISHED

def validation(row):
    """Return the reason a row is rejected, or None when it passes every check.

    The checks run in order and the first one that fails decides the result:
    missing project name, missing month, date later than today, date earlier
    than 2024-01-01.
    """
    # Each check is wrapped in a lambda so it is only evaluated when all the
    # earlier checks have passed.
    checks = (
        ('unknown_project', lambda: row.project_name is None),
        ('blank_month', lambda: row.month is None),
        ('future_dated', lambda: row.date > date.today()),
        ('date_before_2024', lambda: row.date < date.fromisoformat('2024-01-01')),
    )
    return next((reason for reason, failed in checks if failed()), None)

# Lookup from a source spelling of a project name to its canonical name,
# listed here per canonical name.
# The spelling 'Our Patch (formerly Magic Waiting)' (without MASTER) is
# currently not mapped.
canonical_project_name = {
    alias: canonical
    for canonical, aliases in (
        ('RISE', ['Rise (AKA - Opening Event)']),
        ('Our Patch', ['Our Patch (formerly Magic Waiting) MASTER']),
    )
    for alias in aliases
}

# Load the combined programme records, tag every row with a validation result,
# normalise the project names, then split the table in two:
# `events_data` holds the rows that passed validation, `invalid_events_data`
# the rows that were rejected.
events_data, invalid_events_data = (
    etl
    .fromdataframe(pq.read_table(PUBLISHED / 'combined/programme.parquet').to_pandas())
    # Added before the name clean-up below, so it is computed on the raw row.
    .addfield('validation', validation)

    # TODO move to upstream repo
    # A missing project name stays None (such rows are rejected by
    # `validation`) instead of depending on convert()'s error handling to
    # absorb the AttributeError from None.strip().
    .convert('project_name', lambda x: x.strip() if x is not None else None)
    .convert('project_name', canonical_project_name)

    # Identity comparison is the correct test for None.
    .biselect(lambda r: r.validation is None)

    # Disabled: keep only the highest-priority source per project/month/variable.
    # .convert('source', {
    #     'Manual': 1,
    #     'Airtable::Project Hub::Event Reports': 2,
    #     'Airtable::Project Hub::Schedule': 3,
    # })
    # .sort('source')
    # .groupselectfirst(['project_name', 'project_id', 'month', 'variable'])
)

In [None]:
# Key columns identifying one row of the pivoted events table.
dimensions = ['project_name', 'month']

# Total each variable per project and month, then pivot so that every variable
# becomes a column of its own.
events = (
    events_data
    .aggregate([*dimensions, 'variable'], sum, 'value')
    # recast() works out the output columns from a sample of rows (1000 by
    # default). Scan the whole table instead, so a variable that first appears
    # beyond the sample is not silently dropped.
    .recast([*dimensions], samplesize=None)
)

In [None]:
# One row per (project_id, project_name, evaluation_category) carrying the
# earliest start date and the latest end date seen for that combination.
# A row without a start or end date falls back to its own `date` value.
# `programme_category` is not part of the grouping key at present.
project_data = (
    events_data
    .convert(
        ('start_date', 'end_date'),
        lambda value, row: value or row.date,
        pass_row=True,
    )
    .aggregate(
        ['project_id', 'project_name', 'evaluation_category'],
        dict(
            start_date=('start_date', min),
            end_date=('end_date', max),
        ),
    )
)

In [None]:
def _events_for(row):
    """Manual events plus the first truthy of reported, scheduled or projected events."""
    manual = row.manual_events or 0
    recorded = row.event_reports or row.schedule_events or row.projected_events or 0
    return manual + recorded


def _audience_for(row):
    """Event-report audience plus manual audience, treating empty values as zero."""
    reported = row.event_report_audience or 0
    manual = row.manual_audience or 0
    return reported + manual


# Add the derived per-row totals at fixed column positions.
monthly_events = (
    events
    .addfield('events', _events_for, index=3)
    .addfield('audience', _audience_for, index=4)
)

Aggregate the events and audience totals by month

In [None]:
# Write the per-month totals of events and audience, with the month rendered
# through its isoformat() method.
(
    monthly_events
    .aggregate('month', dict(
        Events=('events', sum),
        Audience=('audience', sum),
    ))
    .convert('month', 'isoformat')
    .tocsv(EVENTS / 'total_by_month.csv')
)

Aggregate by Project and by Month, and convert months to columns

In [None]:
# Write one row per project with one column per month (event totals);
# project/month combinations without data are written as 0.
(
    monthly_events
    .aggregate(['project_name', 'month'], sum, 'events')
    # recast() works out the month columns from a sample of rows (1000 by
    # default). Scan the whole table so that a month which first appears
    # beyond the sample still gets its column.
    .recast(key='project_name', variablefield='month', missing=0, samplesize=None)
    .tocsv(EVENTS / 'monthly_by_project.csv')
)

Aggregate by Project and by Month, and convert projects to columns

In [None]:
# Write one row per month with one column per project (event totals);
# month/project combinations without data are written as 0.
(
    monthly_events
    .aggregate(['project_name', 'month'], sum, 'events')
    # recast() works out the project columns from a sample of rows (1000 by
    # default). The aggregated rows are keyed by project name first, so a
    # partial sample would drop the later projects entirely; scan every row.
    .recast(key='month', variablefield='project_name', missing=0, samplesize=None)
    .tocsv(EVENTS / 'monthly_breakdown.csv')
)

## Project summaries

In [None]:
# Per-project totals of every event / audience measure, joined with the
# project metadata (id, evaluation category, date range) from `project_data`.
project_breakdown = (
    monthly_events
    .melt(variables=['events', 'event_reports', 'schedule_events', 'projected_events', 'manual_events', 'audience', 'event_report_audience', 'manual_audience'])
    # Empty measures must not take part in the sums.
    .selectnotnone('value')
    .aggregate(['project_name', 'variable'], sum, 'value')
    # recast() works out the measure columns from a sample of rows (1000 by
    # default). Scan the whole table so that a measure which first appears
    # beyond the sample still gets its column.
    .recast(samplesize=None)
    .leftjoin(project_data)
)

In [None]:
# Preview the per-project breakdown table before it is exported.
project_breakdown

Create a project breakdown

In [None]:
# Export one JSON object per project, keyed by project name and sorted by it.
with open(EVENTS / 'by_project.json', 'w') as f:
    json.dump(
        dict(
            project_breakdown
            .addfield('Details', lambda r: {
                # 'records': r.Records,
                'events': r.events,
                'eventReports': r.event_reports,
                'scheduledEvents': r.schedule_events,
                'projectedEvents': r.projected_events,
                'manual_events': r.manual_events,
                'audience': r.audience,
                'event_reports_audience': r.event_report_audience,
                'manual_audience': r.manual_audience,
                'evaluationCategory': r.evaluation_category,
                # 'programmeCategory': r['Programme Category'],
                # `project_breakdown` has no `date` column, so the former
                # `r.date` fallback could only raise. A missing start date is
                # exported as null, the same way `latestDate` is handled.
                'earliestDate': r.start_date.isoformat() if r.start_date else None,
                'latestDate': r.end_date.isoformat() if r.end_date else None,
            })
            .cut('project_name', 'Details')
            .sort('project_name')
            .records()
        ),
        f,
        indent=2,
    )

Create a summary file

In [None]:
# Export the headline figures: overall totals, counts of excluded events per
# validation reason, and the overall date range.
# NOTE(review): the totals come from `monthly_events`, while the exclusion
# counts and the date range are read from `Programme`; `invalid_events_data`
# is not used here. Confirm this mix of sources is intended.
with open(EVENTS / 'summary.json', 'w') as summary_file:
    totals = dict(
        events=sum(monthly_events.values('events')),
        audience=sum(n for n in monthly_events.values('audience') if n is not None),
    )
    excluded = dict(Programme.excluded_events.aggregate('Validation', len).records())
    date_range = dict(
        earliest=min(Programme.events.values('Start Date')).isoformat(),
        latest=max(Programme.events.values('End Date')).isoformat(),
    )
    json.dump(
        dict(total=totals, excluded=excluded, date=date_range),
        summary_file,
        indent=2,
    )

## Venues

In [None]:
# Venues whose 'Org/Venue Type' contains 'Education Setting'.
education_settings = etl.selectcontains(
    Programme.venues,
    'Org/Venue Type',
    'Education Setting',
)

In [None]:
# Event report counts for the venues whose name contains 'Loading Bay'.
loading_bay = (
    Programme.venues
    .selectcontains('Organisation &/or Venue Name', 'Loading Bay')
    # A venue without linked event reports may hold None here; count it as
    # zero, as the Beacon cell does, rather than failing on len(None).
    .addfield('event_report_count', lambda r: len(r['Event Reports'] or []))
    .aggregate(['Organisation &/or Venue Name', 'id'], sum, 'event_report_count')
    # `id` is only needed to keep same-named venues apart while summing.
    .cutout('id')
)
loading_bay

In [None]:
# Event report counts for the venues whose name contains 'Beacon - '.
# A venue with an empty 'Event Reports' value counts as zero reports.
beacon = (
    etl
    .selectcontains(Programme.venues, 'Organisation &/or Venue Name', 'Beacon - ')
    .addfield('event_report_count', lambda venue: len(venue['Event Reports'] or ()))
    .aggregate(
        key=['Organisation &/or Venue Name', 'id'],
        aggregation=sum,
        value='event_report_count',
    )
    .cutout('id')
)
beacon

In [None]:
# Export the per-venue event report counts.
# The context manager closes (and therefore flushes) the file as soon as the
# JSON has been written, instead of leaving the handle open.
with open(EVENTS / 'by_venue.json', 'w') as f:
    json.dump(
        {
            'loading_bay': dict(loading_bay.records()),
            'beacon': dict(beacon.records()),
        },
        f,
    )