In [1]:
import pandas as pd

ts_data = pd.read_parquet('../data/processed/ts_data_2023_01.parquet')
ts_data.head()

Unnamed: 0,pickup_hour,pickup_location_id,ride_count
0,2023-01-01,4,19
1,2023-01-01,7,3
2,2023-01-01,12,1
3,2023-01-01,13,14
4,2023-01-01,24,20


In [2]:
ts_data_one_location = ts_data.loc[ts_data.pickup_location_id == 43, :].reset_index(drop=True)
ts_data_one_location.head(20)

Unnamed: 0,pickup_hour,pickup_location_id,ride_count
0,2023-01-01 00:00:00,43,93
1,2023-01-01 01:00:00,43,81
2,2023-01-01 02:00:00,43,30
3,2023-01-01 03:00:00,43,15
4,2023-01-01 04:00:00,43,4
5,2023-01-01 05:00:00,43,4
6,2023-01-01 06:00:00,43,4
7,2023-01-01 07:00:00,43,12
8,2023-01-01 08:00:00,43,12
9,2023-01-01 09:00:00,43,23


In [4]:
def get_cutoff_indexes(
        data: pd.DataFrame,
        n_features: int,
        step_size: int
) -> list:
    stop_position = len(data) - 1

    subseq_first_index = 0
    subseq_mid_index = n_features
    subseq_last_index = n_features + 1
    indexes = []

    while subseq_last_index < stop_position:
        indexes.append((subseq_first_index, subseq_mid_index, subseq_last_index))
        subseq_first_index += step_size
        subseq_mid_index += step_size
        subseq_last_index += step_size
    return indexes

In [5]:
n_features = 24
step_size = 1

indexes = get_cutoff_indexes(ts_data_one_location, n_features, step_size)
indexes[:5]

[(0, 24, 25), (1, 25, 26), (2, 26, 27), (3, 27, 28), (4, 28, 29)]

In [7]:
import numpy as np

n_examples = len(indexes)
x = np.ndarray(shape=(n_examples, n_features), dtype=np.float32)
y = np.ndarray(shape=(n_examples), dtype=np.float32)
pickup_hours = []

for i, index in enumerate(indexes):
    x[i, :] = ts_data_one_location.iloc[index[0]:index[1]]['ride_count'].values
    y[i] = ts_data_one_location.iloc[index[1]:index[2]]['ride_count'].values
    pickup_hours.append(ts_data_one_location.iloc[index[1]]['pickup_hour'])

  y[i] = ts_data_one_location.iloc[index[1]:index[2]]['ride_count'].values


In [8]:
print(f'{x.shape=}')
print(f'{x=}')
print(f'{pickup_hours[:5]=}')

x.shape=(670, 24)
x=array([[ 93.,  81.,  30., ...,  41.,  18.,  13.],
       [ 81.,  30.,  15., ...,  18.,  13.,   2.],
       [ 30.,  15.,   4., ...,  13.,   2.,   2.],
       ...,
       [ 99.,  74.,  50., ..., 156., 108.,  88.],
       [ 74.,  50.,  33., ..., 108.,  88.,  81.],
       [ 50.,  33.,  16., ...,  88.,  81.,  49.]], dtype=float32)
pickup_hours[:5]=[Timestamp('2023-01-02 00:00:00'), Timestamp('2023-01-02 02:00:00'), Timestamp('2023-01-02 03:00:00'), Timestamp('2023-01-02 05:00:00'), Timestamp('2023-01-02 06:00:00')]


In [10]:
features_one_location = pd.DataFrame(x, columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(n_features))])
features_one_location

Unnamed: 0,rides_previous_24_hour,rides_previous_23_hour,rides_previous_22_hour,rides_previous_21_hour,rides_previous_20_hour,rides_previous_19_hour,rides_previous_18_hour,rides_previous_17_hour,rides_previous_16_hour,rides_previous_15_hour,...,rides_previous_10_hour,rides_previous_9_hour,rides_previous_8_hour,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour
0,93.0,81.0,30.0,15.0,4.0,4.0,4.0,12.0,12.0,23.0,...,106.0,120.0,104.0,65.0,39.0,35.0,32.0,41.0,18.0,13.0
1,81.0,30.0,15.0,4.0,4.0,4.0,12.0,12.0,23.0,37.0,...,120.0,104.0,65.0,39.0,35.0,32.0,41.0,18.0,13.0,2.0
2,30.0,15.0,4.0,4.0,4.0,12.0,12.0,23.0,37.0,41.0,...,104.0,65.0,39.0,35.0,32.0,41.0,18.0,13.0,2.0,2.0
3,15.0,4.0,4.0,4.0,12.0,12.0,23.0,37.0,41.0,103.0,...,65.0,39.0,35.0,32.0,41.0,18.0,13.0,2.0,2.0,2.0
4,4.0,4.0,4.0,12.0,12.0,23.0,37.0,41.0,103.0,97.0,...,39.0,35.0,32.0,41.0,18.0,13.0,2.0,2.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,146.0,104.0,99.0,74.0,50.0,33.0,16.0,17.0,3.0,1.0,...,69.0,67.0,73.0,97.0,106.0,107.0,109.0,96.0,107.0,156.0
666,104.0,99.0,74.0,50.0,33.0,16.0,17.0,3.0,1.0,1.0,...,67.0,73.0,97.0,106.0,107.0,109.0,96.0,107.0,156.0,108.0
667,99.0,74.0,50.0,33.0,16.0,17.0,3.0,1.0,1.0,1.0,...,73.0,97.0,106.0,107.0,109.0,96.0,107.0,156.0,108.0,88.0
668,74.0,50.0,33.0,16.0,17.0,3.0,1.0,1.0,1.0,2.0,...,97.0,106.0,107.0,109.0,96.0,107.0,156.0,108.0,88.0,81.0


In [11]:
targets_one_location = pd.DataFrame(y, columns=['target_rides_next_hour'])
targets_one_location

Unnamed: 0,target_rides_next_hour
0,2.0
1,2.0
2,2.0
3,1.0
4,5.0
...,...
665,108.0
666,88.0
667,81.0
668,49.0


In [12]:
from tqdm import tqdm

def transform_ts_data_into_features_target(
        ts_data: pd.DataFrame,
        input_seq_len: int,
        step_size: int
) -> pd.DataFrame:
    """
    Slices and transposes data from time-series format into features and target format
    """
    assert set(ts_data.columns) == {'pickup_hour', 'ride_count', 'pickup_location_id'}

    location_ids = ts_data['pickup_location_id'].unique()
    featrues = pd.DataFrame()
    targets = pd.DataFrame()

    for location_id in tqdm(location_ids):
        ts_data_one_location = ts_data.loc[ts_data.pickup_location_id == location_id, ['pickup_hour', 'ride_count']]
        indexes = get_cutoff_indexes(ts_data_one_location, input_seq_len, step_size)
        n_examples = len(indexes)
        x = np.ndarray(shape=(n_examples, input_seq_len), dtype=np.float32)
        y = np.ndarray(shape=(n_examples), dtype=np.float32)
        pickup_hours = []
        for i, index in enumerate(indexes):
            x[i, :] = ts_data_one_location.iloc[index[0]:index[1]]['ride_count'].values
            y[i] = ts_data_one_location.iloc[index[1]:index[2]]['ride_count'].values
            pickup_hours.append(ts_data_one_location.iloc[index[1]]['pickup_hour'])
        
        features_one_location = pd.DataFrame(x, columns=[f'rides_previous_{i+1}_hour' for i in reversed(range(input_seq_len))])
        features_one_location['pickup_hour'] = pickup_hours
        features_one_location['pickup_location_id'] = location_id
        targets_one_location = pd.DataFrame(y, columns=['target_rides_next_hour'])
        featrues = pd.concat([featrues, features_one_location])
        targets = pd.concat([targets, targets_one_location])
    featrues.reset_index(drop=True, inplace=True)
    targets.reset_index(drop=True, inplace=True)
    return featrues, targets['target_rides_next_hour']


In [13]:
features, targets = transform_ts_data_into_features_target(ts_data, input_seq_len=24*7*1, step_size=24)
print(f'{features.shape=}')
print(f'{targets.shape=}')

  y[i] = ts_data_one_location.iloc[index[1]:index[2]]['ride_count'].values
  y[i] = ts_data_one_location.iloc[index[1]:index[2]]['ride_count'].values
  y[i] = ts_data_one_location.iloc[index[1]:index[2]]['ride_count'].values
  y[i] = ts_data_one_location.iloc[index[1]:index[2]]['ride_count'].values
  y[i] = ts_data_one_location.iloc[index[1]:index[2]]['ride_count'].values
  y[i] = ts_data_one_location.iloc[index[1]:index[2]]['ride_count'].values
  y[i] = ts_data_one_location.iloc[index[1]:index[2]]['ride_count'].values
  featrues = pd.concat([featrues, features_one_location])
  y[i] = ts_data_one_location.iloc[index[1]:index[2]]['ride_count'].values
  y[i] = ts_data_one_location.iloc[index[1]:index[2]]['ride_count'].values
  y[i] = ts_data_one_location.iloc[index[1]:index[2]]['ride_count'].values
  y[i] = ts_data_one_location.iloc[index[1]:index[2]]['ride_count'].values
  y[i] = ts_data_one_location.iloc[index[1]:index[2]]['ride_count'].values
  y[i] = ts_data_one_location.iloc[index[1

features.shape=(1817, 170)
targets.shape=(1817,)



