In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.cluster import KMeans
from datetime import datetime
import pickle
from src.commons import generate_data, methods

## Read Data

In [2]:
raw_data_path = methods.file_path('metro-trips-2025-q1.csv', dir_name='data_raw')
df = pd.read_csv(raw_data_path)
print("Data shape:", df.shape)
display(df.head())

Data shape: (95916, 15)


Unnamed: 0,trip_id,duration,start_time,end_time,start_station,start_lat,start_lon,end_station,end_lat,end_lon,bike_id,plan_duration,trip_route_category,passholder_type,bike_type
0,475609834,5,1/1/2025 0:12,1/1/2025 0:17,3030,34.051941,-118.24353,4491,34.04744,-118.24794,13668,30,One Way,Monthly Pass,standard
1,475609846,7,1/1/2025 0:12,1/1/2025 0:19,4558,34.025688,-118.395302,4569,34.02655,-118.408463,30021,30,One Way,Monthly Pass,electric
2,475609903,11,1/1/2025 0:13,1/1/2025 0:24,4212,33.988129,-118.471741,4206,33.998341,-118.461014,14923,30,One Way,Monthly Pass,standard
3,475609904,11,1/1/2025 0:13,1/1/2025 0:24,4212,33.988129,-118.471741,4206,33.998341,-118.461014,26704,30,One Way,Monthly Pass,electric
4,475610048,13,1/1/2025 0:27,1/1/2025 0:40,4472,34.092602,-118.28093,4509,34.101639,-118.309174,14790,30,One Way,Monthly Pass,standard


In [3]:
df.isna().sum()
df = df.dropna(subset=['start_lat', 'start_lon'])

In [4]:
# kmeans clustering

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=7, random_state=0)
df['start_cluster'] = kmeans.fit_predict(df[['start_lat', 'start_lon']])

fig = px.scatter_map(df, lat="start_lat", lon="start_lon", color='start_cluster')
fig.update_layout(mapbox_style="open-street-map")
fig.show()

In [5]:
df = df[df['start_cluster'] == 4]
print(f"Shape after using rows in cluster 4: {df.shape}")
def separte_date_time(datetime_str):
    date, time = datetime_str.split(' ')
    return date, datetime.strptime(time, "%H:%M").time()

df['day'], df['time'] = zip(*df['start_time'].map(separte_date_time))
df.head()

df['day'].unique().size

df = df[df['time'] >= datetime.strptime("12:00", "%H:%M").time()]
# plot histogram of time
import plotly.express as px
df.sort_values(by='time', inplace=True)
fig = px.histogram(df, x='time', nbins=24)
fig.show()

df = df[['day', 'time']]


Shape after using rows in cluster 4: (9761, 16)


In [6]:
df_grouped = df.groupby('day')['time'].apply(list).reset_index()
df_grouped.head()

def times_to_intervals(times: list[datetime.time]):
    intervals = []
    for i in range(1, len(times)):
        delta = times[i].hour * 60 + times[i].minute - (times[i-1].hour * 60 + times[i-1].minute)
        assert delta >= 0, "Times should be sorted"
        if delta == 0 or delta == 1:
            delta = 1.1  # to avoid zero intervals
            continue
        intervals.append(delta)
    return intervals

df_grouped['intervals'] = df_grouped['time'].apply(lambda x: times_to_intervals(sorted(x)))
df_grouped['h'] = 0.5
df_grouped['c'] = 25
df_grouped['travel_time'] = 120
df_grouped['total'] = df_grouped['intervals'].apply(len)
df_grouped.head()
df_grouped = df_grouped[['intervals', 'h', 'c', 'travel_time', 'total']]
df_grouped.head()

Unnamed: 0,intervals,h,c,travel_time,total
0,"[21, 60, 16, 2, 4, 10, 3, 10, 14, 39, 2, 10, 2...",0.5,25,120,27
1,"[12, 12, 20, 4, 10, 3, 3, 5, 9, 2, 24, 25, 6, ...",0.5,25,120,44
2,"[13, 18, 14, 16, 5, 7, 2, 3, 2, 16, 15, 17, 3,...",0.5,25,120,26
3,"[68, 27, 26, 6, 17, 63, 75, 13, 30, 7, 23, 34,...",0.5,25,120,14
4,"[19, 17, 3, 6, 3, 18, 8, 3, 4, 13, 11, 2, 18, ...",0.5,25,120,37


In [7]:
bicycle_data = df_grouped.to_dict(orient='records')
print(bicycle_data)

# validate
for entry in bicycle_data:
    intervals = entry['intervals']
    assert len(intervals) > 5, "Each day should have more than 5 entries"
    assert sum(intervals[3:]) > entry['travel_time'], "Sum of intervals should be greater than travel time"
    assert intervals.count(0) == 0, "Intervals should not contain zero"
    assert intervals.count(1) == 0, "Intervals should not contain one"
    
    
import pickle
bicycle_data_path = methods.file_path('bicycle_data.pkl', dir_name='data/non_gamma_bicyle_test')
with open(bicycle_data_path, 'wb') as f:
    pickle.dump(df_grouped, f)

[{'intervals': [21, 60, 16, 2, 4, 10, 3, 10, 14, 39, 2, 10, 23, 43, 6, 16, 5, 20, 13, 73, 4, 4, 16, 40, 2, 15, 59], 'h': 0.5, 'c': 25, 'travel_time': 120, 'total': 27}, {'intervals': [12, 12, 20, 4, 10, 3, 3, 5, 9, 2, 24, 25, 6, 8, 15, 14, 4, 7, 4, 23, 3, 6, 5, 3, 7, 8, 13, 7, 25, 18, 9, 8, 4, 15, 5, 10, 19, 7, 26, 20, 22, 9, 24, 20], 'h': 0.5, 'c': 25, 'travel_time': 120, 'total': 44}, {'intervals': [13, 18, 14, 16, 5, 7, 2, 3, 2, 16, 15, 17, 3, 28, 68, 24, 41, 5, 8, 11, 15, 40, 16, 23, 32, 120], 'h': 0.5, 'c': 25, 'travel_time': 120, 'total': 26}, {'intervals': [68, 27, 26, 6, 17, 63, 75, 13, 30, 7, 23, 34, 53, 85], 'h': 0.5, 'c': 25, 'travel_time': 120, 'total': 14}, {'intervals': [19, 17, 3, 6, 3, 18, 8, 3, 4, 13, 11, 2, 18, 22, 7, 13, 20, 22, 39, 31, 3, 7, 39, 13, 6, 14, 20, 6, 7, 7, 7, 26, 11, 20, 5, 17, 124], 'h': 0.5, 'c': 25, 'travel_time': 120, 'total': 37}, {'intervals': [10, 25, 2, 6, 20, 28, 5, 29, 4, 16, 2, 3, 11, 6, 10, 24, 10, 6, 32, 29, 31, 15, 35, 12, 7, 29, 183, 31, 