## Steps
1. Download all raw data for a given year and month
1. For each raw data file, apply the filter before saving it
1. Transform the saved raw data into time-series (TS) data
1. Convert the TS data into features and targets
1. Save the transformed data


The main objective is to write utility functions for all of these steps so we can reuse them later.

In [1]:
# Reload edited project modules automatically before each cell runs, so changes
# to imported code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
import os

# Add the parent of the current working directory to the Python path so the
# `src` package can be imported from this notebook.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Guard the append: re-running this cell must not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

# Now you can import from src
from src.data_utils import load_and_process_taxi_data

In [None]:
# Every recorded output in this notebook (the download log and all frames shown
# afterwards) is for 2024, so request that year; year=2023 did not match them.
rides = load_and_process_taxi_data(year=2024)

Downloading data for 2024-01...
Successfully downloaded data for 2024-01.
Loading data for 2024-01...
Total records: 2,964,624
Valid records: 2,911,483
Records dropped: 53,141 (1.79%)
Successfully processed data for 2024-01.
Downloading data for 2024-02...
Successfully downloaded data for 2024-02.
Loading data for 2024-02...
Total records: 3,007,526
Valid records: 2,954,709
Records dropped: 52,817 (1.76%)
Successfully processed data for 2024-02.
Downloading data for 2024-03...
Successfully downloaded data for 2024-03.
Loading data for 2024-03...
Total records: 3,582,628
Valid records: 3,518,066
Records dropped: 64,562 (1.80%)
Successfully processed data for 2024-03.
Downloading data for 2024-04...
Successfully downloaded data for 2024-04.
Loading data for 2024-04...
Total records: 3,514,289
Valid records: 3,450,929
Records dropped: 63,360 (1.80%)
Successfully processed data for 2024-04.
Downloading data for 2024-05...
Successfully downloaded data for 2024-05.
Loading data for 2024-05..

In [5]:
# Inspect the loaded rides: one row per trip with its pickup time and location id.
rides

Unnamed: 0,pickup_datetime,pickup_location_id
0,2024-01-01 00:57:55,186
1,2024-01-01 00:03:00,140
2,2024-01-01 00:17:06,236
3,2024-01-01 00:36:38,79
4,2024-01-01 00:46:51,211
...,...,...
40334413,2024-12-31 23:14:53,145
40334414,2024-12-31 23:04:32,37
40334415,2024-12-31 23:03:16,181
40334416,2024-12-31 23:15:33,165


In [6]:
from src.data_utils import transform_raw_data_into_ts_data

# Turn the individual ride records into a time series. Judging by the output,
# the result has one row per (pickup_hour, pickup_location_id) with a `rides`
# count, including hours where the count is 0.
ts_data = transform_raw_data_into_ts_data(rides)
ts_data.head()

Unnamed: 0,pickup_hour,pickup_location_id,rides
0,2024-01-01 00:00:00,2,0
1,2024-01-01 01:00:00,2,0
2,2024-01-01 02:00:00,2,0
3,2024-01-01 03:00:00,2,0
4,2024-01-01 04:00:00,2,0


In [7]:
# (rows, columns) of the time series. The recorded 2,283,840 rows equal
# 8,784 hours (366 days x 24) x 260 -- presumably one full hourly series per
# location; TODO confirm.
ts_data.shape

(2283840, 3)

In [8]:
from src.data_utils import transform_ts_data_info_features_and_target_loop

# window_size=24*28 -> 672 lag columns (rides_t-672 ... rides_t-1), i.e. 28 days of hourly counts.
# step_size=24 -> the displayed rows advance one day at a time per location.
features, targets = transform_ts_data_info_features_and_target_loop(ts_data, window_size=24*28, step_size=24)


In [9]:
# Inspect the feature table: the 672 lag columns plus pickup_hour and pickup_location_id.
features

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-01-29,2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-01-30,2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-01-31,2
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-02-01,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2024-02-02,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87875,21,18,9,7,9,7,23,31,28,47,...,77,81,82,66,59,67,46,34,2024-12-27,263
87876,32,18,20,14,6,4,16,23,39,75,...,85,86,98,75,79,77,68,46,2024-12-28,263
87877,72,43,34,19,10,10,20,24,44,47,...,92,103,96,83,68,71,69,65,2024-12-29,263
87878,6,4,3,2,11,14,59,122,146,125,...,88,74,73,70,70,45,45,37,2024-12-30,263


In [10]:
import numpy as np

# Toy sliding-window parameters for the broadcasting walkthrough.
window_size, step_size, num_windows = 4, 1, 4

# The start offsets alone, step_size * np.arange(num_windows)[:, None],
# evaluate to array([[0], [1], [2], [3]]).
# Broadcasting that column against the row of in-window positions gives one
# row of indices per window.
indices = (
    step_size * np.arange(num_windows)[:, np.newaxis]  # (num_windows, 1) start offsets
    + np.arange(window_size)[np.newaxis, :]            # (1, window_size) positions
)


In [11]:
# Toy sliding-window parameters used by the step-by-step cells that follow.
window_size, step_size, num_windows = 4, 1, 4

In [12]:
import numpy as np
# Base sequence 0 .. num_windows-1: one entry per sliding window.
np.arange(num_windows)

array([0, 1, 2, 3])

In [13]:
# [:, None] adds a trailing axis, turning the 1-D sequence into a (num_windows, 1) column.
np.arange(num_windows)[:, None]

array([[0],
       [1],
       [2],
       [3]])

In [14]:
# Scale the column by step_size to get each window's starting offset.
step_size * np.arange(num_windows)[:, None]

array([[0],
       [1],
       [2],
       [3]])

In [15]:
# Broadcasting a (1, window_size) row against a (num_windows, 1) column yields a
# (num_windows, window_size) grid in which row i holds the positions of window i.
indices = np.arange(window_size)[None, :] + step_size * np.arange(num_windows)[:, None]


In [16]:
# [None, :] adds a leading axis: the positions inside one window as a (1, window_size) row.
np.arange(window_size)[None, :]

array([[0, 1, 2, 3]])

In [17]:
# The broadcast sum of row and column, left as the last expression so the
# resulting index grid is displayed.
np.arange(window_size)[None, :] + step_size * np.arange(num_windows)[:, None]

array([[0, 1, 2, 3],
       [1, 2, 3, 4],
       [2, 3, 4, 5],
       [3, 4, 5, 6]])

In [18]:
# Three 2-row blocks of different widths: 2 columns, 1 column and 1 column.
array1 = np.array([[1, 2], [3, 4]])
array2 = np.array([[5], [6]])
array3 = np.array([[7], [8]])

# hstack joins the blocks side by side (row counts must match), giving a 2x4 array.
np.hstack([array1, array2, array3])

array([[1, 2, 5, 7],
       [3, 4, 6, 8]])

In [19]:
from src.data_utils import transform_ts_data_info_features_and_target_loop

# Full-dataset feature build with the same effective arguments as the earlier
# call (24*28*1 == 672 hourly lags, step of 24 hours).
# NOTE(review): the trailing `*1` looks like a multiplier kept for experimenting -- confirm.
features, targets = transform_ts_data_info_features_and_target_loop(ts_data, window_size=24*28*1, step_size=24)


In [20]:
from src.config import TRANSFORMED_DATA_DIR

# Build the table as a NEW frame. Plain `tabular_data = features` only aliases
# the same object, so adding the column would silently put `target` into
# `features` as well. `assign` returns a copy with the extra column.
tabular_data = features.assign(target=targets)

# Persist features + target together for later use.
tabular_data.to_parquet(TRANSFORMED_DATA_DIR / "tabular_data.parquet", engine="pyarrow")

In [22]:
# Inspect the saved table: the feature columns plus the new `target` column.
tabular_data

Unnamed: 0,rides_t-672,rides_t-671,rides_t-670,rides_t-669,rides_t-668,rides_t-667,rides_t-666,rides_t-665,rides_t-664,rides_t-663,...,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id,target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-01-29,2,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-01-30,2,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-01-31,2,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,2024-02-01,2,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2024-02-02,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87875,21,18,9,7,9,7,23,31,28,47,...,81,82,66,59,67,46,34,2024-12-27,263,21
87876,32,18,20,14,6,4,16,23,39,75,...,86,98,75,79,77,68,46,2024-12-28,263,40
87877,72,43,34,19,10,10,20,24,44,47,...,103,96,83,68,71,69,65,2024-12-29,263,35
87878,6,4,3,2,11,14,59,122,146,125,...,74,73,70,70,45,45,37,2024-12-30,263,13


In [23]:
# Sanity check on a single location (id 43) with a short 12-hour window and a
# 1-hour step, so the lag columns can be compared with the raw series by eye.
features, targets = transform_ts_data_info_features_and_target_loop(
    ts_data.loc[ts_data["pickup_location_id"] == 43],
    window_size=12,
    step_size=1,
)


In [24]:
# Inspect the 12-lag features for location 43; consecutive rows are one hour apart.
features

Unnamed: 0,rides_t-12,rides_t-11,rides_t-10,rides_t-9,rides_t-8,rides_t-7,rides_t-6,rides_t-5,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,161,87,36,14,5,3,4,12,10,14,28,55,2024-01-01 12:00:00,43
1,87,36,14,5,3,4,12,10,14,28,55,48,2024-01-01 13:00:00,43
2,36,14,5,3,4,12,10,14,28,55,48,73,2024-01-01 14:00:00,43
3,14,5,3,4,12,10,14,28,55,48,73,106,2024-01-01 15:00:00,43
4,5,3,4,12,10,14,28,55,48,73,106,123,2024-01-01 16:00:00,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8767,13,20,42,60,89,106,115,149,141,191,96,61,2024-12-31 19:00:00,43
8768,20,42,60,89,106,115,149,141,191,96,61,72,2024-12-31 20:00:00,43
8769,42,60,89,106,115,149,141,191,96,61,72,51,2024-12-31 21:00:00,43
8770,60,89,106,115,149,141,191,96,61,72,51,45,2024-12-31 22:00:00,43


In [25]:
# Raw hourly counts for location 43, for checking the lag features by eye: the
# first values (161, 87, 36, 14, ...) match rides_t-12, rides_t-11, ... of the
# first feature row.
ts_data[ts_data["pickup_location_id"]==43].head(36)

Unnamed: 0,pickup_hour,pickup_location_id,rides
360144,2024-01-01 00:00:00,43,161
360145,2024-01-01 01:00:00,43,87
360146,2024-01-01 02:00:00,43,36
360147,2024-01-01 03:00:00,43,14
360148,2024-01-01 04:00:00,43,5
360149,2024-01-01 05:00:00,43,3
360150,2024-01-01 06:00:00,43,4
360151,2024-01-01 07:00:00,43,12
360152,2024-01-01 08:00:00,43,10
360153,2024-01-01 09:00:00,43,14


In [26]:
from src.data_utils import transform_ts_data_info_features_and_target

# Same location-43 slice, this time through the variant without the `_loop`
# suffix, using a 4-hour window and a 1-hour step.
features, targets = transform_ts_data_info_features_and_target(
    ts_data.loc[ts_data["pickup_location_id"] == 43],
    window_size=4,
    step_size=1,
)


In [27]:
# Inspect the 4-lag features for location 43; consecutive rows are one hour apart.
features

Unnamed: 0,rides_t-4,rides_t-3,rides_t-2,rides_t-1,pickup_hour,pickup_location_id
0,161,87,36,14,2024-01-01 04:00:00,43
1,87,36,14,5,2024-01-01 05:00:00,43
2,36,14,5,3,2024-01-01 06:00:00,43
3,14,5,3,4,2024-01-01 07:00:00,43
4,5,3,4,12,2024-01-01 08:00:00,43
...,...,...,...,...,...,...
8775,141,191,96,61,2024-12-31 19:00:00,43
8776,191,96,61,72,2024-12-31 20:00:00,43
8777,96,61,72,51,2024-12-31 21:00:00,43
8778,61,72,51,45,2024-12-31 22:00:00,43


In [28]:
from src.data_utils import transform_ts_data_info_features_and_target

In [29]:
# Re-run of the location-43 feature build (4-hour window, 1-hour step).
features, targets = transform_ts_data_info_features_and_target(
    ts_data.loc[ts_data["pickup_location_id"] == 43],
    window_size=4,
    step_size=1,
)
