In [6]:
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv("/Users/lx/Development/timeseries-modeling/data/bike_day.csv")
# make dteday into a proper datetime
df['dteday'] = pd.to_datetime(df['dteday'])
# Keep only the date plus the `casual` and `registered` counts (the total `cnt`
# and all weather/calendar columns are discarded) and reshape to one row per
# series: `id` is the series name, `timestamp` and `target` are parallel lists.
# melt() only reads the columns named in id_vars/value_vars, so no explicit
# drop of the remaining columns is needed.
df = (
    pd.melt(df, id_vars=['dteday'], value_vars=['casual', 'registered'])
    .groupby('variable')
    .agg(list)
    .reset_index()
    .rename(columns={"variable": "id", "dteday": "timestamp", "value": "target"})
)
print(df)

           id                                          timestamp  \
0      casual  [2011-01-01 00:00:00, 2011-01-02 00:00:00, 201...   
1  registered  [2011-01-01 00:00:00, 2011-01-02 00:00:00, 201...   

                                              target  
0  [331, 131, 120, 108, 82, 88, 148, 68, 54, 41, ...  
1  [654, 670, 1229, 1454, 1518, 1518, 1362, 891, ...  


In [3]:
def extract_target(df: pd.DataFrame, start: str, end: str):
    """Restrict every series in ``df`` to the inclusive window ``[start, end]``.

    Each row of ``df`` holds one series: ``df['timestamp']`` and
    ``df['target']`` are parallel list-like cells. For every row, the
    (timestamp, target) pairs whose timestamp lies within the window are kept
    and sorted in ascending order (by timestamp, ties broken by target).

    Parameters
    ----------
    df : pd.DataFrame
        Frame with list-valued ``timestamp`` and ``target`` columns. Any other
        columns are carried over unchanged. The input is not modified.
    start, end : str
        Window bounds, parsed with ``pd.to_datetime``; both are inclusive.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` whose ``timestamp`` and ``target`` cells contain only
        the in-window observations. A series with no observation inside the
        window gets two empty lists.
    """
    # Parse the bounds once instead of once per observation.
    window_start = pd.to_datetime(start)
    window_end = pd.to_datetime(end)

    kept_timestamps = []
    kept_targets = []
    for timestamps, targets in zip(df['timestamp'], df['target']):
        pairs = sorted(
            (ts, targ)
            for ts, targ in zip(timestamps, targets)
            if window_start <= ts <= window_end
        )
        # Unzip with comprehensions so an empty window yields [] and [].
        kept_timestamps.append([ts for ts, _ in pairs])
        kept_targets.append([targ for _, targ in pairs])

    output_df = df.copy()
    # Build object Series explicitly so each cell holds a whole list; this
    # avoids assigning a list to a single cell through `.loc`.
    output_df['timestamp'] = pd.Series(kept_timestamps, index=output_df.index, dtype=object)
    output_df['target'] = pd.Series(kept_targets, index=output_df.index, dtype=object)

    return output_df

In [11]:
# Named extraction windows; `start` and `end` are passed straight to
# extract_target as keyword arguments.
# NOTE(review): the labels do not obviously match the date ranges ('july'
# spans April through July, 'week10' spans four weeks, 'q4' runs from
# 2011-09-01 to 2012-11-30) — confirm these are the intended windows.
config = {
    'july': {'start': '2011-04-01',
           'end': '2011-07-31'},
    'week10': {'start':'2011-02-14',
             'end':'2011-03-13'},
    'q4': {'start':'2011-09-01',
         'end':'2012-11-30'}
}

for key, dates in config.items():
    window_df = extract_target(df, **dates)

    # Create the output directory up front so the write does not fail when
    # the notebook is run somewhere the directory does not exist yet.
    out_dir = Path(f'bike_day_{key}')
    out_dir.mkdir(parents=True, exist_ok=True)
    window_df.to_parquet(f'bike_day_{key}/bike_day_{key}.parquet', index=False)

    # Additionally write each series to its own sub-directory named after
    # its `id` value (e.g. casual/, registered/).
    for row_label, series_id in window_df['id'].items():
        (out_dir / series_id).mkdir(parents=True, exist_ok=True)
        # NOTE(review): this one-row frame keeps its row label as a
        # non-Range index, which pa.Table.from_pandas presumably stores as an
        # extra `__index_level_0__` column, unlike the combined file written
        # with index=False — confirm whether consumers expect that.
        single_row = pd.DataFrame(window_df.loc[row_label]).T
        pq.write_table(pa.Table.from_pandas(single_row), f'bike_day_{key}/{series_id}/bike_day_{key}.parquet')