In [1]:
from utz import *
import plotly.express as px

In [2]:
Bucket = 'ctbk'

from boto3 import client
from botocore import UNSIGNED
from botocore.client import Config

# S3 client built with a default botocore Config.
s3 = client('s3', config=Config())

# A single list_objects_v2 call returns at most 1,000 keys, so walk every page
# of the listing instead of silently truncating it.
pages = s3.get_paginator('list_objects_v2').paginate(Bucket=Bucket)
# 'Contents' is omitted from a response that holds no objects.
objects = [obj for page in pages for obj in page.get('Contents', [])]
contents = pd.DataFrame(objects)
# Fall back to an empty Series so an empty bucket does not fail on a missing 'Key' column.
keys = contents['Key'] if objects else pd.Series([], dtype=str, name='Key')

In [3]:
# Pull year/month out of keys shaped like '[JC-]YYYYMM….parquet'; keys that do not match are dropped.
# Raw string: `\d` and `\.` are regex escapes, not Python string escapes.
months = keys.str.extract(r'^(?:JC-)?(?P<yyyy>\d{4})(?P<mm>\d{2}).*\.parquet').dropna()
# Latest month that has a file in the bucket; the 'YYYY-MM' strings are parsed in one vectorized call.
cur_month = to_dt(months['yyyy'] + '-' + months['mm']).max()
cur_month

Timestamp('2021-02-01 00:00:00')

In [4]:
def sum_by_wd_gender(key):
    """Count the rows of one monthly trip file by year, month, region, gender and weekday.

    Args:
        key: object key of a parquet file inside `Bucket`. Its basename must start
            with an optional 'JC-' prefix followed by a six-digit YYYYMM.

    Returns:
        A Series named 'Count' indexed by ('Start Year', 'Start Month', 'Region',
        'Gender', 'Start Weekday'). Year, month and weekday are taken from the
        'Start Time' column; weekday follows pandas' convention (Monday == 0).

    Raises:
        ValueError: if the basename of `key` does not match the expected pattern.
    """
    print(f'Aggregating {key}')
    # Validate the name before paying for the S3 read.
    m = match(r'(?:(?P<region>JC)-)?(?P<year>\d{4})(?P<month>\d{2})', basename(key))
    if m is None:
        raise ValueError(f'Unrecognized trip-data key: {key!r}')
    # Files without the 'JC' prefix are labelled as NYC.
    region = m['region'] or 'NYC'

    df = read_parquet(f's3://{Bucket}/{key}')
    start = df['Start Time'].dt
    df = df.assign(**{
        'Region': region,
        'Start Year': start.year,
        'Start Month': start.month,
        'Start Weekday': start.weekday,
    })
    # Rows with a missing group key are excluded by groupby, so the group size
    # equals the count of the (non-null) 'Start Year' key.
    return (
        df
        .groupby(['Start Year', 'Start Month', 'Region', 'Gender', 'Start Weekday'])
        .size()
        .rename('Count')
    )

In [5]:
def group_paths(paths):
    """Aggregate several monthly trip files in parallel and stack the results.

    Args:
        paths: iterable of object keys, each passed to `sum_by_wd_gender`.

    Returns:
        A DataFrame with the grouping keys and 'Count' as regular columns, plus a
        'Month' column holding the first day of each row's start year/month.
    """
    p = Parallel(n_jobs=cpu_count())
    df = concat(p(delayed(sum_by_wd_gender)(path) for path in paths))
    df = df.reset_index()
    # Assemble the month-start timestamps from the year/month columns in one
    # vectorized call rather than formatting and parsing a string per row.
    df['Month'] = to_dt(pd.DataFrame({
        'year': df['Start Year'],
        'month': df['Start Month'],
        'day': 1,
    }))
    return df

In [6]:
%%time
grouped_path = 'year-month-region-gender-weekday.parquet'
if exists(grouped_path):
    df = read_parquet(grouped_path)
    last_month = df.apply(lambda r: to_dt('%d-%02d' % (r['Start Year'], r['Start Month'])), axis=1).max()
    if cur_month > last_month:
        new_months = date_range(last_month, cur_month, freq='MS', closed='right')
    new_paths = \
        concat([
            new_months \
            .to_series() \
            .apply(lambda m: '%s%d%02d-citibike-tripdata.parquet' % (region, m.year, m.month))
            for region in ['', 'JC-']
        ])
    print('Aggregating %d new paths:\n\t%s' % (len(new_paths), '\n\t'.join(new_paths)))
    df = concat([df, group_paths(new_paths)])
    df.to_parquet(grouped_path)
    s3.upload_file(grouped_path, Bucket, grouped_path)
else:
    df = group_paths(paths)
    df.to_parquet(grouped_path)

Aggregating 2 new paths:
	202102-citibike-tripdata.parquet
	JC-202102-citibike-tripdata.parquet
CPU times: user 646 ms, sys: 165 ms, total: 812 ms
Wall time: 5.96 s
