In [1]:
from utz import *

In [2]:
# Directory that holds the trip-data zip archives and the parquet files written next to them.
cache = '/cache'

In [3]:
# Name of the S3 bucket whose objects are listed below.
Bucket = 'tripdata'

from boto3 import client
from botocore import UNSIGNED
from botocore.client import Config
# Build the S3 client with request signing disabled (signature_version=UNSIGNED),
# so no AWS credentials are required — presumably the bucket is publicly readable.
s3 = client('s3', config=Config(signature_version=UNSIGNED))

In [4]:
# A single list_objects_v2 call returns at most 1000 keys, so walk every page
# instead of silently truncating the listing if the bucket grows past that.
pages = s3.get_paginator('list_objects_v2').paginate(Bucket=Bucket)
# One row per object; a page without a 'Contents' entry contributes no rows.
contents = pd.DataFrame([ obj for page in pages for obj in page.get('Contents', []) ])
# Keep only the zip archives.
zips = contents[contents.Key.str.endswith('.zip')]
zips

Unnamed: 0,Key,LastModified,ETag,Size,StorageClass
0,201306-citibike-tripdata.zip,2018-04-30 13:18:55+00:00,"""b520a12de58eea58a3586f89bfcfbd9d-2""",16785103,STANDARD
1,201307-201402-citibike-tripdata.zip,2017-01-18 22:23:25+00:00,"""7b3b260b2ab2e5349320121d04bd821c-22""",178262576,STANDARD
2,201307-citibike-tripdata.zip,2017-01-18 22:23:27+00:00,"""dd3e6fd5f91715b31eae72868086c08c-4""",27074629,STANDARD
3,201308-citibike-tripdata.zip,2017-01-18 22:23:27+00:00,"""2f661063576734f614b9f1d6bba0ec59-4""",32090869,STANDARD
4,201309-citibike-tripdata.zip,2017-01-18 22:23:27+00:00,"""a42f947db7bd14e423a7dbfbb11596a1-4""",33155593,STANDARD
...,...,...,...,...,...
155,JC-202010-citibike-tripdata.csv.zip,2020-11-04 14:51:11+00:00,"""148431d3598f7e962338c33da2afddf3""",798066,STANDARD
156,JC-202011-citibike-tripdata.csv.zip,2020-12-04 23:26:04+00:00,"""ab9ee4bbbc03633d610e18319d23fc21""",569245,STANDARD
157,JC-202012-citibike-tripdata.csv.zip,2021-01-05 14:25:45+00:00,"""112033c48cf3fa673b396364a7cc08f6""",315012,STANDARD
158,JC-202101-citibike-tripdata.csv.zip,2021-02-08 15:11:26+00:00,"""02e5da50db92added528f438587bb1e7""",313806,STANDARD


In [5]:
# Pattern for single-month archive names such as '201306-citibike-tripdata.zip' or
# 'JC-202101-citibike-tripdata.csv.zip'. Named groups: optional 'JC-' prefix (JC),
# 4-digit year, 2-digit month; then a space or hyphen, 'citibike-tripdata', and
# optional '.csv' / '.zip' suffixes (csv, zip). Multi-month names such as
# '201307-201402-citibike-tripdata.zip' do not match.
rgx = r'^(?P<JC>JC-)?(?P<year>\d{4})(?P<month>\d{2})[ \-]citibike-tripdata(?P<csv>\.csv)?(?P<zip>\.zip)?$'

In [6]:
# Canonical column names that every trip-data frame is renamed to.
fields = {
    'Trip Duration',
    'Start Time',
    'Stop Time',
    'Start Station ID',
    'Start Station Name',
    'Start Station Latitude',
    'Start Station Longitude',
    'End Station ID',
    'End Station Name',
    'End Station Latitude',
    'End Station Longitude',
    'Bike ID',
    'User Type',
    'Birth Year',
    'Gender',
}


def normalize_field(f):
    """Return `f` lower-cased with every whitespace character removed."""
    lowered = f.lower()
    return sub(r'\s', '', lowered)


# Lookup from the normalized spelling of a column name to its canonical form.
normalize_fields_map = dict(
    (normalize_field(canonical), canonical)
    for canonical in fields
)
normalize_fields_map

{'endstationlatitude': 'End Station Latitude',
 'tripduration': 'Trip Duration',
 'startstationlatitude': 'Start Station Latitude',
 'endstationlongitude': 'End Station Longitude',
 'bikeid': 'Bike ID',
 'usertype': 'User Type',
 'startstationid': 'Start Station ID',
 'startstationlongitude': 'Start Station Longitude',
 'endstationid': 'End Station ID',
 'startstationname': 'Start Station Name',
 'starttime': 'Start Time',
 'gender': 'Gender',
 'endstationname': 'End Station Name',
 'birthyear': 'Birth Year',
 'stoptime': 'Stop Time'}

In [7]:
def normalize_fields(df):
    """Return a copy of `df` whose columns carry their canonical names.

    Each column name is normalized with `normalize_field` and looked up in
    `normalize_fields_map`; a column with no entry there raises `KeyError`.
    """
    renames = {}
    for col in df.columns:
        renames[col] = normalize_fields_map[normalize_field(col)]
    return df.rename(columns=renames)

In [8]:
from zipfile import ZipFile

In [9]:
def to_parquet(zip_path, error='warn', overwrite=False):
    """Convert the single CSV inside a trip-data zip archive to a parquet file.

    The parquet file is written next to the archive, with the `.zip` (and any
    preceding `.csv`) suffix replaced by `.parquet`.

    Parameters
    ----------
    zip_path : str
        Path to the zip archive; its base name must match `rgx`.
    error : str
        With 'warn' (default) an unrecognized name is printed and returned as
        a message; any other value makes an unrecognized name raise.
    overwrite : bool
        When False (default) an existing parquet file is kept and the archive
        is not read at all.

    Returns
    -------
    str
        Status message: 'Unrecognized key: ...', 'Found ...; skipping',
        'Overwrote ...' or 'Wrote ...'.

    Raises
    ------
    Exception
        The base name does not match `rgx` and `error` is not 'warn'.
    ValueError
        The archive does not contain exactly one eligible `.csv` member.
    """
    name = basename(zip_path)
    if not match(rgx, name):
        msg = f'Unrecognized key: {name}'
        if error == 'warn':
            print(msg)
            return msg
        else:
            raise Exception(msg)
    # The pattern treats '.zip' as optional, so check it explicitly.
    assert name.endswith('.zip'), name

    # Strip '.zip', then a remaining '.csv' (for names ending in '.csv.zip').
    base = splitext(zip_path)[0]
    if base.endswith('.csv'):
        base = splitext(base)[0]

    pqt_path = f'{base}.parquet'
    if exists(pqt_path):
        if overwrite:
            msg = f'Overwrote {pqt_path}'
            print(f'Overwriting {pqt_path}')
        else:
            msg = f'Found {pqt_path}; skipping'
            print(msg)
            return msg
    else:
        msg = f'Wrote {pqt_path}'

    # The context manager guarantees the archive handle is closed, even when
    # member selection or CSV parsing fails.
    with ZipFile(zip_path) as z:
        names = z.namelist()
        print(f'{name}: zip names: {names}')
        # Exactly one CSV member is expected; members whose path starts with
        # '_' are ignored. A separate local keeps `name` meaning the archive.
        [ csv_name ] = [ f for f in names if f.endswith('.csv') and not f.startswith('_') ]
        with z.open(csv_name, 'r') as i:
            df = pd.read_csv(i)

    df = normalize_fields(df)
    df = df.astype({'Start Time':'datetime64[ns]','Stop Time':'datetime64[ns]'})
    df.to_parquet(pqt_path)

    return msg

In [10]:
# Every zip archive present in the cache directory, in sorted path order.
zip_pattern = f'{cache}/*.zip'
cached_zips = sorted(glob(zip_pattern))
# Display bare file names only.
list(map(basename, cached_zips))

['201306-citibike-tripdata.zip',
 '201307-201402-citibike-tripdata.zip',
 '201307-citibike-tripdata.zip',
 '201308-citibike-tripdata.zip',
 '201309-citibike-tripdata.zip',
 '201310-citibike-tripdata.zip',
 '201311-citibike-tripdata.zip',
 '201312-citibike-tripdata.zip',
 '201401-citibike-tripdata.zip',
 '201402-citibike-tripdata.zip',
 '201403-citibike-tripdata.zip',
 '201404-citibike-tripdata.zip',
 '201405-citibike-tripdata.zip',
 '201406-citibike-tripdata.zip',
 '201407-citibike-tripdata.zip',
 '201408-citibike-tripdata.zip',
 '201409-citibike-tripdata.zip',
 '201410-citibike-tripdata.zip',
 '201411-citibike-tripdata.zip',
 '201412-citibike-tripdata.zip',
 '201501-citibike-tripdata.zip',
 '201502-citibike-tripdata.zip',
 '201503-citibike-tripdata.zip',
 '201504-citibike-tripdata.zip',
 '201505-citibike-tripdata.zip',
 '201506-citibike-tripdata.zip',
 '201507-citibike-tripdata.zip',
 '201508-citibike-tripdata.zip',
 '201509-citibike-tripdata.zip',
 '201510-citibike-tripdata.zip',
 '2

In [11]:
from joblib import delayed, Parallel
# joblib runner configured with as many jobs as cpu_count() reports.
parallel = Parallel(n_jobs=cpu_count())

In [12]:
# Full archive paths (directory included), as they will be passed to to_parquet.
cached_zips

['/cache/201306-citibike-tripdata.zip',
 '/cache/201307-201402-citibike-tripdata.zip',
 '/cache/201307-citibike-tripdata.zip',
 '/cache/201308-citibike-tripdata.zip',
 '/cache/201309-citibike-tripdata.zip',
 '/cache/201310-citibike-tripdata.zip',
 '/cache/201311-citibike-tripdata.zip',
 '/cache/201312-citibike-tripdata.zip',
 '/cache/201401-citibike-tripdata.zip',
 '/cache/201402-citibike-tripdata.zip',
 '/cache/201403-citibike-tripdata.zip',
 '/cache/201404-citibike-tripdata.zip',
 '/cache/201405-citibike-tripdata.zip',
 '/cache/201406-citibike-tripdata.zip',
 '/cache/201407-citibike-tripdata.zip',
 '/cache/201408-citibike-tripdata.zip',
 '/cache/201409-citibike-tripdata.zip',
 '/cache/201410-citibike-tripdata.zip',
 '/cache/201411-citibike-tripdata.zip',
 '/cache/201412-citibike-tripdata.zip',
 '/cache/201501-citibike-tripdata.zip',
 '/cache/201502-citibike-tripdata.zip',
 '/cache/201503-citibike-tripdata.zip',
 '/cache/201504-citibike-tripdata.zip',
 '/cache/201505-citibike-tripdata

In [13]:
# Run to_parquet once per cached archive through the joblib runner; the displayed
# result is the list of per-archive status messages that to_parquet returns.
parallel(delayed(to_parquet)(f) for f in cached_zips)

['Found /cache/201306-citibike-tripdata.parquet; skipping',
 'Unrecognized key: 201307-201402-citibike-tripdata.zip',
 'Found /cache/201307-citibike-tripdata.parquet; skipping',
 'Found /cache/201308-citibike-tripdata.parquet; skipping',
 'Found /cache/201309-citibike-tripdata.parquet; skipping',
 'Found /cache/201310-citibike-tripdata.parquet; skipping',
 'Found /cache/201311-citibike-tripdata.parquet; skipping',
 'Found /cache/201312-citibike-tripdata.parquet; skipping',
 'Found /cache/201401-citibike-tripdata.parquet; skipping',
 'Found /cache/201402-citibike-tripdata.parquet; skipping',
 'Found /cache/201403-citibike-tripdata.parquet; skipping',
 'Found /cache/201404-citibike-tripdata.parquet; skipping',
 'Found /cache/201405-citibike-tripdata.parquet; skipping',
 'Found /cache/201406-citibike-tripdata.parquet; skipping',
 'Found /cache/201407-citibike-tripdata.parquet; skipping',
 'Found /cache/201408-citibike-tripdata.parquet; skipping',
 'Found /cache/201409-citibike-tripdata.pa

In [14]:
import dask.dataframe as dd

In [15]:
%%time
# Open every parquet file in the cache directory as one dask DataFrame.
dfs = dd.read_parquet(f'{cache}/*.parquet')
# Display the frame (the repr shows column dtypes and the partition count).
dfs

CPU times: user 131 ms, sys: 79.8 ms, total: 211 ms
Wall time: 498 ms


Unnamed: 0_level_0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender
npartitions=157,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
,int64,datetime64[ns],datetime64[ns],int64,object,float64,float64,float64,object,float64,float64,int64,object,float64,int64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [16]:
%%time
# Total number of rows across the whole dask DataFrame.
len(dfs)

CPU times: user 7.37 s, sys: 21.2 s, total: 28.6 s
Wall time: 6.28 s


114249284