In [4]:
import os
import glob
import sys
import pathlib
from itertools import product

import pandas as pd
import pyarrow as pa

In [42]:
# Root directory that get_paths searches (recursively) for raw scan files.
raw_data_path = "../../data/raw"
# Directory into which write_week_data saves one Parquet file per year/week.
save_folder_path = "../../data/parquet/taps"

In [59]:
def get_paths(scan_type, year, week):
    """Return the raw ``.gz`` files for one scan type, year and week.

    The search is rooted at ``raw_data_path`` and matches, at any depth,
    ``Scan<ScanType>Transaction/<year>/<Week>/*.gz``.

    Args:
        scan_type: Tap direction such as ``"on"`` or ``"off"``; it is
            title-cased to build the folder name (``"on"`` -> ``ScanOnTransaction``).
        year: Year folder name (any value that formats to the folder name).
        week: Week folder name such as ``"Week1"``; it is title-cased.

    Returns:
        A list of matching file paths in sorted order (empty if none match).
    """
    pattern = (
        raw_data_path
        + f"/**/Scan{scan_type.title()}Transaction/{year}/{week.title()}/*.gz"
    )
    # glob yields matches in arbitrary, filesystem-dependent order; sorting
    # makes the order in which files are read (and therefore the row order
    # of the combined output) reproducible across runs and machines.
    return sorted(glob.glob(pattern, recursive=True))


def read_data_file(path):
    """Load one gzip-compressed, pipe-delimited tap file as a DataFrame.

    The file is read without a header row; the nine columns are labelled
    with the fixed names listed below, in file order.

    Args:
        path: Location of the ``.gz`` file to read.

    Returns:
        A ``pandas.DataFrame`` with one row per line of the file.
    """
    column_names = [
        'mode',
        'business_date',
        'datetime',
        'card_id',
        'card_type',
        'vehicle_id',
        'parent_route',
        'route_id',
        'stop_id',
    ]
    frame = pd.read_csv(path, sep="|", compression="gzip", names=column_names)
    return frame


def read_week_data(year, week):
    """Combine every tap-on and tap-off file for one week into a DataFrame.

    All "on" files are read first, then all "off" files; each file's rows
    get a ``tap`` column holding ``"on"`` or ``"off"``.

    Args:
        year: Year passed through to ``get_paths``.
        week: Week name passed through to ``get_paths``.

    Returns:
        The concatenated DataFrame. ``reset_index()`` is called without
        ``drop``, so the previous (per-file) row labels are kept as a column.

    Raises:
        ValueError: Raised by ``pd.concat`` when no file matched at all;
            ``convert_taps`` relies on this to skip empty weeks.
    """
    frames = [
        read_data_file(file_path).assign(tap=direction)
        for direction in ("on", "off")
        for file_path in get_paths(direction, year, week)
    ]
    combined = pd.concat(frames)
    return combined.reset_index()


def write_week_data(df, year, week):
    """Write one week's taps to ``<save_folder_path>/<year>_<week>.parquet``.

    Args:
        df: DataFrame to persist.
        year: Year used as the first part of the file name.
        week: Week name; lower-cased for the file name (``"Week1"`` -> ``week1``).

    The file is written with the ``pyarrow`` engine. The output folder is
    created first if it does not exist yet.
    """
    # Writing into a folder that does not exist fails, so make sure the
    # target folder is there; exist_ok keeps repeated calls harmless.
    os.makedirs(save_folder_path, exist_ok=True)
    df.to_parquet(
        save_folder_path + f"/{year}_{week.lower()}.parquet",
        engine="pyarrow",
    )


def convert_taps(year, week):
    """Convert one week of raw tap files to Parquet, reporting progress.

    Prints a status line before reading, and either a skip notice or the
    write start/finish messages afterwards.

    Args:
        year: Year to convert.
        week: Week name to convert (for example ``"Week1"``).
    """
    print(f"Reading data for {year} {week}.")

    try:
        week_frame = read_week_data(year, week)
    except ValueError:
        # read_week_data raises ValueError when there is nothing to
        # concatenate, i.e. no files were found for this week.
        # NOTE(review): pandas parsing errors appear to derive from
        # ValueError too, so a corrupt file may also land here - confirm.
        print("No data to be read, skipping.")
        return

    print("Writing.")
    write_week_data(week_frame, year, week)
    print("Write complete.")

In [60]:
# Week folder names "Week1" .. "Week53" and the years 2015-2018 to convert.
weeks = [f"Week{number}" for number in range(1, 54)]
years = [*range(2015, 2019)]

In [62]:
# Convert every year/week combination: years outermost, weeks in order within each year.
for year in years:
    for week in weeks:
        convert_taps(year, week)

Reading data for 2015 Week1.
No data to be read, skipping.
Reading data for 2015 Week2.
No data to be read, skipping.
Reading data for 2015 Week3.
No data to be read, skipping.
Reading data for 2015 Week4.
No data to be read, skipping.
Reading data for 2015 Week5.
No data to be read, skipping.
Reading data for 2015 Week6.
No data to be read, skipping.
Reading data for 2015 Week7.
No data to be read, skipping.
Reading data for 2015 Week8.
No data to be read, skipping.
Reading data for 2015 Week9.
No data to be read, skipping.
Reading data for 2015 Week10.
No data to be read, skipping.
Reading data for 2015 Week11.
No data to be read, skipping.
Reading data for 2015 Week12.
No data to be read, skipping.
Reading data for 2015 Week13.
No data to be read, skipping.
Reading data for 2015 Week14.
No data to be read, skipping.
Reading data for 2015 Week15.
No data to be read, skipping.
Reading data for 2015 Week16.
No data to be read, skipping.
Reading data for 2015 Week17.
No data to be read,

Writing.
Write complete.
Reading data for 2017 Week43.
Writing.
Write complete.
Reading data for 2017 Week44.
Writing.
Write complete.
Reading data for 2017 Week45.
Writing.
Write complete.
Reading data for 2017 Week46.
Writing.
Write complete.
Reading data for 2017 Week47.
Writing.
Write complete.
Reading data for 2017 Week48.
Writing.
Write complete.
Reading data for 2017 Week49.
Writing.
Write complete.
Reading data for 2017 Week50.
Writing.
Write complete.
Reading data for 2017 Week51.
Writing.
Write complete.
Reading data for 2017 Week52.
Writing.
Write complete.
Reading data for 2017 Week53.
Writing.
Write complete.
Reading data for 2018 Week1.
No data to be read, skipping.
Reading data for 2018 Week2.
Writing.
Write complete.
Reading data for 2018 Week3.
Writing.
Write complete.
Reading data for 2018 Week4.
Writing.
Write complete.
Reading data for 2018 Week5.
Writing.
Write complete.
Reading data for 2018 Week6.
Writing.
Write complete.
Reading data for 2018 Week7.
Writing.
Wri