# Transform data into a more manageable format

In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa
import sys

from pathlib import Path

import pathhelper

In [2]:
# Record the interpreter and pandas versions this notebook was run with.
print(f"Python: {sys.version}")
print(f"Pandas: {pd.__version__}")

Python: 3.11.10 | packaged by conda-forge | (main, Sep 30 2024, 18:08:57) [GCC 13.3.0]
Pandas: 2.2.3


# Load the data with the PyArrow CSV engine

In [13]:
# Locate the raw counter export below the project's data directory.
data_dir = pathhelper.data_dir(".")
filename = data_dir / "pedestrians_cyclists/Trails_Counters_Pedestrians_Cyclists.csv"

# Parse the CSV with the pyarrow engine, then normalise every column label
# to lower case (labels are replaced on the existing frame, no copy is made).
at_uses_df = pd.read_csv(filename, engine="pyarrow")
at_uses_df.columns = [label.lower() for label in at_uses_df.columns]

In [4]:
# Display the column labels of the loaded frame (already lower-cased).
at_uses_df.columns

Index(['', 'datetime', 'count', 'site_name', 'site_id', 'flow_id', 'flow_name',
       'user_type', 'direction', 'year', 'month', 'day', 'hour', 'minute'],
      dtype='object')

In [5]:
#at_uses_df["count"] = at_uses_df["count"].fillna(0)

# Remove the unnamed first column ("") and the raw "datetime" column; the
# separate year/month/day/hour/minute columns are kept.
# A single non-inplace drop with errors="ignore" keeps this cell idempotent:
# re-running it after the columns are already gone no longer raises KeyError.
at_uses_df = at_uses_df.drop(columns=["", "datetime"], errors="ignore")

In [22]:
# Narrowing columns
def narrow(df, column, type):
    """Return ``df[column]`` cast to the narrower dtype ``type``.

    When ``type`` is an integer dtype (plain NumPy such as ``"int8"`` or a
    pandas nullable one such as ``"Int16"``), the column's minimum and maximum
    are first checked against the representable range of *that* dtype, so a
    cast can never wrap around silently. Other target dtypes are cast without
    a range check.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to convert. It is not modified.
    column : label
        Name of the column to convert.
    type : str or dtype
        Target dtype, in any form accepted by ``Series.astype``.

    Returns
    -------
    pandas.Series
        The converted column.

    Raises
    ------
    OverflowError
        If a value of the column lies outside the range of the integer
        target dtype.
    """
    target = pd.api.types.pandas_dtype(type)
    # Nullable extension dtypes (e.g. "Int16") expose their storage dtype as
    # ``numpy_dtype``; plain NumPy dtypes are used directly.
    storage = getattr(target, "numpy_dtype", target)

    if isinstance(storage, np.dtype) and storage.kind in "iu":
        bounds = np.iinfo(storage)
        lowest, highest = df[column].min(), df[column].max()
        # min()/max() are NA for an empty or all-missing column; there is
        # nothing to range-check then (and comparing NA would not yield a bool).
        if pd.notna(lowest) and pd.notna(highest):
            if lowest < bounds.min or highest > bounds.max:
                raise OverflowError(
                    f"column {column!r} has values in [{lowest}, {highest}], "
                    f"outside the range of {target} [{bounds.min}, {bounds.max}]"
                )

    return df[column].astype(type)


# Target dtype for each numeric column; every column is converted through
# narrow() and written back under its own name.
narrow_dtypes = {
    "count": "Int16",  # Nullable
    "year": "int16",
    "month": "int8",
    "day": "int8",
    "hour": "int8",
    "minute": "int8",
}
for column_name, target_dtype in narrow_dtypes.items():
    at_uses_df[column_name] = narrow(at_uses_df, column_name, target_dtype)

#at_uses_df["site_name"] = at_uses_df["site_name"].astype("category")
#at_uses_df["site_id"] = at_uses_df["site_id"].astype("category")
#at_uses_df["flow_id"] = at_uses_df["flow_id"].astype("category")
#at_uses_df["flow_name"] = at_uses_df["flow_name"].astype("category")
#at_uses_df["user_type"] = at_uses_df["user_type"].astype("category")
#at_uses_df["direction"] = at_uses_df["direction"].astype("category")

In [23]:
# Display the dtype of every column to confirm the conversions took effect.
at_uses_df.dtypes

count         Int16
site_name    object
site_id       int64
flow_id       int64
flow_name    object
user_type    object
direction    object
year          int16
month          int8
day            int8
hour           int8
minute         int8
dtype: object

In [24]:
# Per-column memory footprint in bytes (deep=True also measures the contents
# of object columns), followed by a separator and the total in MB.
mem = at_uses_df.memory_usage(deep=True)
total_mb = int(mem.sum() / 1000000)
print(mem, "-"*25, f"{total_mb} MB", sep="\n")

Index               132
count          37338048
site_name    1030740480
site_id        99568128
flow_id        99568128
flow_name    1057911360
user_type     807062784
direction     744482112
year           24892032
month          12446016
day            12446016
hour           12446016
minute         12446016
dtype: int64
-------------------------
3951 MB


In [25]:
# Sum "count" over all rows sharing the same site/flow/user/direction and
# calendar hour ("minute" is not a key), then turn the group keys back into
# ordinary columns.
hourly_keys = [
    "site_name", "site_id", "flow_id", "flow_name", "user_type", "direction",
    "year", "month", "day", "hour",
]
hourly_df = at_uses_df.groupby(by=hourly_keys)["count"].sum().reset_index()

In [26]:
# Assemble the calendar date once and derive both columns from it, instead of
# running the same to_datetime conversion over the whole frame twice.
hourly_dates = pd.to_datetime(hourly_df[['year', 'month', 'day']])
hourly_df["dayofweek"] = hourly_dates.dt.dayofweek  # Monday=0 ... Sunday=6
# NOTE(review): this is the ISO-8601 week number; around New Year it can
# belong to a different ISO year than the calendar "year" column - confirm
# that consumers never combine "year" and "week" as a key.
hourly_df["week"] = hourly_dates.dt.isocalendar().week

In [27]:
# Persist the hourly aggregate twice under the same stem: as CSV and as Feather.
filename = data_dir / "pedestrians_cyclists/Trails_Counters_Pedestrians_Cyclists-hourly-narrow"
csv_path, feather_path = (filename.with_suffix(ext) for ext in ('.csv', '.feather'))

# NOTE(review): to_csv is called with its defaults, so the frame's index is
# written as an extra unnamed first column - confirm readers expect that.
hourly_df.to_csv(csv_path)
hourly_df.to_feather(feather_path)