# Volunteering Data

Data extracted from Rosterfy and processed below.

In [1]:
import json
from datetime import date, datetime
from pathlib import Path

import petl as etl
from utils import make_cumulative, make_time_series

In [2]:
# Cut-off dates used when filtering the published outputs, in chronological order.
# `first_event` is the lower bound applied to the weekly shift tables;
# `induction_launch` is the lower bound applied to the weekly checkpoint tables.
first_event = date.fromisoformat('2024-08-16')
induction_launch = date.fromisoformat('2024-09-12')

Set up references to paths

In [3]:
# Input location: published volunteer extracts, relative to the notebook directory.
ROOT = Path('../')
DATA = ROOT.joinpath('data/published/volunteers')

# Output locations: one sub-directory per family of generated files.
_TARGET = ROOT.joinpath('src/themes/volunteers/_data')
PEOPLE = _TARGET.joinpath('people')
SHIFTS = _TARGET.joinpath('shifts')

# Create the output directories up front; existing directories are left untouched.
for output_dir in (PEOPLE, SHIFTS):
    output_dir.mkdir(parents=True, exist_ok=True)

## Checkpoints

Read the checkpoints data from CSV and reshape it so that each checkpoint becomes its own column.

In [4]:
# Load the checkpoint counts, parse each field, then pivot so that every
# distinct `checkpoint` value becomes its own column of counts.
checkpoints = (
    etl.fromcsv(DATA / 'checkpoints.csv')
    .convert('date', etl.dateparser('%Y-%m-%d'))
    .convert('count', int)
    .sort(key=('date', 'checkpoint'))
    .recast(variablefield='checkpoint', valuefield='count')
    # Replace any None cells in the pivoted table with zero.
    .replaceall(None, 0)
)

In [5]:
# Checkpoint stages kept from the updates extract; rows for any other stage are dropped.
tracked_checkpoints = [
    '1. Monitoring & Evaluation',
    '2. Sign Up to Induction',
    '3. Fully Inducted Volunteers',
]

# Same parse-and-pivot treatment as the checkpoints table: one column per stage,
# with None cells replaced by zero.
checkpoint_updates = etl.fromcsv(
    DATA / 'checkpoint-updates.csv'
).selectin(
    'checkpoint', tracked_checkpoints
).convert(
    'date', etl.dateparser('%Y-%m-%d')
).convert(
    'count', int
).recast(
    variablefield='checkpoint', valuefield='count'
).replaceall(
    None, 0
)

Create the `summarise` function. It takes a pandas DataFrame, resamples it at the desired frequency `freq`, adds cumulative counts of each column, converts the result to a PETL table and renames the `date` column to the `title` provided.

In [6]:
def summarise(df, freq, title):
    """Resample a dated frame, add cumulative columns and return a PETL table.

    Note: a later cell in this notebook defines another function called
    `summarise` with a different signature, which replaces this one once run.

    Args:
        df: pandas DataFrame handed to `make_time_series`.
            # presumably it needs a `date` column -- confirm against utils
        freq: pandas resampling frequency string, e.g. 'W-SUN'.
        title: name given to the `date` column in the returned table.

    Returns:
        A PETL table with the `date` column renamed to `title`.
    """
    # Sum every column within each resampling period.
    resampled = make_time_series(df).resample(freq).sum()
    # `make_cumulative` comes from utils; the notebook describes it as adding
    # cumulative counts of each column.
    cumulative = make_cumulative(resampled).reset_index()
    return (
        etl.fromdataframe(cumulative)
        # `datetime` is the class imported at the top of the notebook, so
        # `datetime.date` is the unbound method that reduces a datetime value
        # to a plain date.
        .convert('date', datetime.date)
        .rename({'date': title})
    )

Create a weekly summary of checkpoints passed

In [7]:
# Weekly checkpoint totals, keeping only weeks from the induction launch onwards.
checkpoints_weekly = summarise(
    checkpoints.todataframe(), 'W-SUN', 'Week ending (Sunday)'
)
(
    checkpoints_weekly
    .selectge('Week ending (Sunday)', induction_launch)
    .tocsv(PEOPLE / 'checkpoints_weekly.csv')
)

In [8]:
def _stage_1_inclusive(row):
    """Cumulative stage 1 count plus the cumulative counts of stages 2 and 3."""
    return (
        row['1. Monitoring & Evaluation cumulative']
        + row['2. Sign Up to Induction cumulative']
        + row['3. Fully Inducted Volunteers cumulative']
    )


def _stage_2_inclusive(row):
    """Cumulative stage 2 count plus the cumulative count of stage 3."""
    return (
        row['2. Sign Up to Induction cumulative']
        + row['3. Fully Inducted Volunteers cumulative']
    )


# Weekly checkpoint updates from the induction launch onwards.
updates_weekly = summarise(
    checkpoint_updates.todataframe(), 'W-SUN', 'Week ending (Sunday)'
).selectge('Week ending (Sunday)', induction_launch)

# The "(incl)" columns add the cumulative counts of the later stages to each stage.
(
    updates_weekly
    .addfield('1. Monitoring & Evaluation cumulative (incl)', _stage_1_inclusive)
    .addfield('2. Sign Up to Induction cumulative (incl)', _stage_2_inclusive)
    .tocsv(PEOPLE / 'checkpoints_weekly_updates.csv')
)

## Geography

In [9]:
# Split the geography summary on its `type` column and write one CSV per type.
geo = etl.fromcsv(DATA / 'geo-summary.csv')
geo_by_type = geo.facet('type')
for geography in geo_by_type:
    geo_by_type[geography].tocsv(PEOPLE / f'by_geo_{geography}.csv')

## Demographics

In [10]:
# Only the `age_range` category is kept; extend the list to publish more categories.
demo = etl.fromcsv(DATA / 'demographics.csv').selectin('category', ['age_range'])

# Write one CSV per retained demographic category.
demo_by_category = demo.facet('category')
for category in demo_by_category:
    demo_by_category[category].tocsv(PEOPLE / f'by_demographic_{category}.csv')

## Shifts

In [11]:
# Total attended shifts and hours for each (date, type) pair.
shifts = (
    etl.fromcsv(DATA / 'shifts.csv')
    .convert(('demand', 'attended'), int)
    .convert('hours', float)
    .replaceall(None, 0)
    .aggregate(
        key=['date', 'type'],
        aggregation={
            'attended': ('attended', sum),
            'hours': ('hours', sum),
        },
    )
    # Dates are parsed only after grouping, so rows are grouped on the raw CSV date text.
    .convert('date', date.fromisoformat)
    # Cache the result because the table is reused by several later cells.
    .cache()
)

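Create a summary function for shifts using a combination of PETL and pandas. Note that this redefines `summarise`, replacing the checkpoint version defined earlier in the notebook.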

In [12]:
def summarise(table: etl.Table, column, freq) -> etl.Table:
    """Pivot one shift measure by type and total it per resampling period.

    Note: this reuses the name of the `summarise` function defined earlier in
    the notebook, so that earlier definition is no longer reachable once this
    cell has run.

    Args:
        table: PETL table with `date` and `type` fields plus the field named
            by `column`.
        column: name of the field to total (the notebook passes 'attended'
            and 'hours').
        freq: pandas resampling frequency string, e.g. 'W-SUN'.

    Returns:
        A PETL table sorted by `date`, with one column per `type` value.
    """
    wide = (
        table
        .cut('date', 'type', column)
        # NOTE(review): petl documents `reducers` as a mapping from variable
        # value to reducer function, but a bare `sum` is passed here -- confirm
        # this behaves as intended if a date/type pair ever has several rows.
        .recast(variablefield='type', valuefield=column, reducers=sum)
        .replaceall(None, 0)
        .todataframe()
    )
    # Sum every type column within each resampling period.
    resampled = make_time_series(wide).resample(freq).sum().reset_index()
    return (
        etl.fromdataframe(resampled)
        # `datetime` is the imported class, so `datetime.date` is the unbound
        # method that reduces a datetime value to a plain date.
        .convert('date', datetime.date)
        .sort('date')
    )

In [26]:
# Weekly attended shifts per type, from the first event onwards.
attended_weekly = summarise(shifts, 'attended', 'W-SUN').selectge('date', first_event)
attended_weekly.rename('date', 'week_ending').tocsv(SHIFTS / 'attended_by_week.csv')

# Weekly volunteer hours per type, from the first event onwards.
hours_weekly = summarise(shifts, 'hours', 'W-SUN').selectge('date', first_event)
(
    hours_weekly
    # Melt to long form so a single convert can round every type column,
    # then recast back to one column per type.
    .melt('date')
    .convert('value', lambda f: round(f, 3))
    .recast()
    .rename('date', 'week_ending')
    .tocsv(SHIFTS / 'hours_by_week.csv')
)

In [27]:
# Running total of weekly attended shifts per type. The cumulative sum is taken
# over the whole series before the rows earlier than the first event are dropped.
attended_running = (
    summarise(shifts, 'attended', 'W-SUN')
    .todataframe()
    .set_index('date')
    .cumsum()
    .reset_index()
)
(
    etl.fromdataframe(attended_running)
    .selectge('date', first_event)
    .rename('date', 'week_ending')
    .tocsv(SHIFTS / 'attended_cumulative_by_week.csv')
)

# Running total of weekly volunteer hours per type, built the same way.
hours_running = (
    summarise(shifts, 'hours', 'W-SUN')
    .todataframe()
    .set_index('date')
    .cumsum()
    .reset_index()
)
(
    etl.fromdataframe(hours_running)
    .selectge('date', first_event)
    # Melt to long form so a single convert can round every type column,
    # then recast back to one column per type.
    .melt('date')
    .convert('value', lambda f: round(f, 3))
    .recast()
    .rename('date', 'week_ending')
    .tocsv(SHIFTS / 'hours_cumulative_by_week.csv')
)

### Process summaries

Convert timestamp columns to datetime to work with OI Lume Viz

In [None]:
# NOTE: every line in this cell is commented out, so nothing here runs.
# The names it refers to (checkpoints_monthly, shifts_monthly, shifts_weekly,
# pd, os, OUT_DIR) are not defined anywhere in the visible notebook.
# checkpoints_monthly['Month ending'] = pd.to_datetime(checkpoints_monthly['Month ending']).dt.strftime('%Y-%m-%d')
# shifts_monthly['Month ending'] = pd.to_datetime(shifts_monthly['month ending']).dt.strftime('%Y-%m-%d')
# shifts_weekly['Month ending'] = pd.to_datetime(shifts_weekly['week ending']).dt.strftime('%Y-%m-%d')

# checkpoints_monthly.to_csv(os.path.join(OUT_DIR, 'checkpoints_monthly.csv'), index=False)
# shifts_monthly.to_csv(os.path.join(OUT_DIR, 'shifts_monthly.csv'), index=False)
# shifts_weekly.to_csv(os.path.join(OUT_DIR, 'shifts_weekly.csv'), index=False)

In [None]:
# Sum every stage column (all fields after the first) over the whole table.
aggregation = dict(
    (stage, (stage, sum)) for stage in checkpoint_updates.header()[1:]
)

stage_totals = checkpoint_updates.aggregate(None, aggregation).rename({
    '1. Monitoring & Evaluation': 'Signed up',
    '2. Sign Up to Induction': 'Induction booked',
    '3. Fully Inducted Volunteers': 'Induction completed',
})

# Make the earlier stages inclusive of the later ones.
inclusive_totals = stage_totals.convert(
    'Signed up',
    lambda count, row: count + row['Induction booked'] + row['Induction completed'],
    pass_row=True,
).convert(
    'Induction booked',
    lambda count, row: count + row['Induction completed'],
    pass_row=True,
)

# Transposing yields (label, total) pairs, which `dict` turns into a mapping.
checkpoint_summary = dict(inclusive_totals.transpose())

In [None]:
# Total of each measure (attended, hours) per shift type across all dates.
shift_totals = (
    shifts
    .melt(key=['date', 'type'])
    .aggregate(key=['type', 'variable'], aggregation=sum, value='value')
    .sort('variable')
)

# Relabel the measures and build "<type> <measure>" titles.
titled_totals = (
    shift_totals
    .convert('variable', {
        'attended': 'volunteer shifts',
        'hours': 'volunteer hours',
    })
    .addfield('title', lambda r: f'{r.type} {r.variable}')
    .cut('title', 'value')
)

# Each record is a (title, value) pair, so `dict` maps title -> total.
shifts_summary = dict(titled_totals.records())

In [None]:
# Merge both summaries and write them to `summary.json` in the target data directory.
# The file is opened in a `with` block so the handle is flushed and closed as soon
# as the dump completes, instead of being left open until garbage collection.
with open(_TARGET / 'summary.json', 'w') as summary_file:
    json.dump(
        checkpoint_summary | shifts_summary,
        summary_file,
        indent=2
    )