# Partition Raw Parking Ticket Data
The raw parking ticket data is too large to upload to the shared GitHub repository, so instead we shard it into smaller, gzip-compressed JSON Lines files.

In [None]:
import os

import pandas as pd

# Upper bound, in bytes, for each compressed chunk written by this notebook.
max_file_size = 4 * 1024**2  # 4 MB

In [2]:
# Base name shared by every chunk file; the chunk number and extension are appended later.
out_file_prefix = "parking_tickets"

# Source CSV to partition.
in_file = "../data/raw_data/parking-tickets.csv"

# Directory that receives the chunk files; create it if it does not exist yet.
out_dir = "../data/raw_data/parking_tickets"
os.makedirs(out_dir, exist_ok=True)

In [3]:
# Load the semicolon-delimited raw file. The Python parsing engine is used and
# malformed lines are dropped (on_bad_lines="skip") instead of raising.
df = pd.read_csv(
    in_file,
    sep=";",
    engine="python",
    on_bad_lines="skip",
)

# Preview the first rows.
df.head(5)

Unnamed: 0,Block,Street,EntryDate,Bylaw,Section,Status,InfractionText,Year,BI_ID
0,1100,DAVIE ST,2023-05-01,2952,5(4)(B),IS,PARK IN A METERED SPACE IF THE TIME RECORDED B...,2023,4487040
1,1500,COAL HARBOUR QUAY,2023-05-01,2952,5(4)(A)(ii),IS,PARK IN A METERED SPACE IF THE PARKING METER H...,2023,4487044
2,1500,COAL HARBOUR QUAY,2023-05-01,2952,5(4)(A)(ii),IS,PARK IN A METERED SPACE IF THE PARKING METER H...,2023,4487045
3,1000,ROBSON ST,2023-05-01,2952,5(4)(A)(ii),IS,PARK IN A METERED SPACE IF THE PARKING METER H...,2023,4487049
4,1100,ROBSON ST,2023-05-01,2952,5(4)(A)(ii),IS,PARK IN A METERED SPACE IF THE PARKING METER H...,2023,4487050


In [5]:
# Split `df` into consecutive row ranges and write each range as a
# gzip-compressed JSON Lines file that is at most `max_file_size` bytes.
initial_step = 300000  # starting guess of 300,000 rows per chunk
min_step = 100  # stop shrinking at this size so the inner search always terminates

# The temp name deliberately does not end in ".json.gz", so an interrupted run
# can never leave a file behind that looks like a finished chunk.
temp_path = os.path.join(out_dir, "temp_chunk.json.gz.tmp")

# Remove chunk files left over from a previous run. Otherwise a re-run that
# produces fewer chunks would leave stale shards in the output directory.
chunk_prefix = f"{out_file_prefix}_"
for stale_name in os.listdir(out_dir):
    if stale_name.startswith(chunk_prefix) and stale_name.endswith(".json.gz"):
        os.remove(os.path.join(out_dir, stale_name))

chunk_index = 1
rows_written = 0
try:
    while rows_written < len(df):
        step = initial_step

        # Shrink the candidate chunk by 20% at a time until it fits.
        while True:
            chunk_df = df.iloc[rows_written:rows_written + step]

            # Write to temporary file using pandas
            chunk_df.to_json(temp_path, orient="records", lines=True, compression="gzip")

            file_size = os.path.getsize(temp_path)
            if file_size <= max_file_size or step <= min_step:
                break
            step = int(step * 0.8)  # shrink the chunk size

        if file_size > max_file_size:
            # Only reachable when the minimum step was hit; make it visible
            # instead of silently emitting an oversized file.
            print(
                f"Warning: chunk {chunk_index} is {file_size} bytes, "
                f"above the {max_file_size}-byte limit, even at {len(chunk_df)} rows"
            )

        # Move the temp file to its final name. os.replace overwrites an
        # existing destination on every platform (os.rename fails on Windows).
        final_path = os.path.join(out_dir, f"{out_file_prefix}_{chunk_index}.json.gz")
        os.replace(temp_path, final_path)

        print(f"Wrote {final_path} — {file_size / (1024 * 1024):.2f} MB, {len(chunk_df)} rows")

        rows_written += len(chunk_df)
        chunk_index += 1
finally:
    # If a write failed part-way, do not leave the temp file behind.
    if os.path.exists(temp_path):
        os.remove(temp_path)

Wrote ../data/raw_data/parking_tickets/parking_tickets_1.json.gz — 3.79 MB, 300000 rows
Wrote ../data/raw_data/parking_tickets/parking_tickets_2.json.gz — 3.91 MB, 300000 rows
Wrote ../data/raw_data/parking_tickets/parking_tickets_3.json.gz — 3.89 MB, 300000 rows
Wrote ../data/raw_data/parking_tickets/parking_tickets_4.json.gz — 3.98 MB, 300000 rows
Wrote ../data/raw_data/parking_tickets/parking_tickets_5.json.gz — 3.94 MB, 300000 rows
Wrote ../data/raw_data/parking_tickets/parking_tickets_6.json.gz — 3.59 MB, 300000 rows
Wrote ../data/raw_data/parking_tickets/parking_tickets_7.json.gz — 2.72 MB, 220593 rows


## Sanity Check

In [6]:
# Rebuild the full table from the chunk files and compare it with `df`.
chunk_prefix = f"{out_file_prefix}_"
chunk_suffix = ".json.gz"


def _chunk_number_text(file_name):
    """Return the text between the chunk prefix and the ".json.gz" suffix."""
    return file_name[len(chunk_prefix):-len(chunk_suffix)]


# Only consider files named "<prefix>_<number>.json.gz", and order them by the
# numeric chunk index: a plain string sort would place "_10" before "_2".
chunked_files = sorted(
    (
        f for f in os.listdir(out_dir)
        if f.startswith(chunk_prefix)
        and f.endswith(chunk_suffix)
        and _chunk_number_text(f).isdigit()
    ),
    key=lambda f: int(_chunk_number_text(f)),
)
if not chunked_files:
    raise FileNotFoundError(f"No chunk files found in {out_dir}")

reconstructed_df = pd.concat([
    pd.read_json(os.path.join(out_dir, f), orient='records', lines=True)
    for f in chunked_files
], ignore_index=True)

equal = df.equals(reconstructed_df)

if equal:
    print("Reconstructed data matches original!")
elif df.shape != reconstructed_df.shape or not df.columns.equals(reconstructed_df.columns):
    # Element-wise comparison requires identically-labelled frames, so report
    # the structural difference instead of raising.
    print(
        f"Data mismatch detected (original shape {df.shape}, "
        f"reconstructed shape {reconstructed_df.shape}; "
        f"columns match: {df.columns.equals(reconstructed_df.columns)})."
    )
else:
    # NaN != NaN evaluates to True, so exclude positions missing in both frames.
    both_missing = df.isna() & reconstructed_df.isna()
    diff_count = ((df != reconstructed_df) & ~both_missing).sum().sum()
    print(f"Data mismatch detected ({diff_count} differing values).")
    if diff_count == 0:
        # DataFrame.equals also compares dtypes; values can agree while dtypes differ.
        dtype_diff = df.dtypes[df.dtypes != reconstructed_df.dtypes]
        print(f"Columns whose dtype changed: {list(dtype_diff.index)}")

Reconstructed data matches original!
