# Cluster Assignment

In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
from itertools import groupby
import matplotlib.pyplot as plt
from custom_functions.gtfs_methods import get_schedule
import ruptures as rpt 
pd.set_option('display.max_columns', None)

In [3]:
# Load the precomputed schedule from a pickle file.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading, so
# this file must come from a trusted source (here: a locally computed artefact).
headway_t = pd.read_pickle("data/computed/theoretical_schedule.pkl")

# Load the GTFS text files of the "gtfs3Sept" feed. Only `stops` is used by the
# visible cells of this notebook; the other tables are loaded but not referenced here.
calendar_dates = pd.read_csv('data/timetables/gtfs3Sept/calendar_dates.txt', sep = ",")
calendar = pd.read_csv('data/timetables/gtfs3Sept/calendar.txt', sep = ",")
trips = pd.read_csv('data/timetables/gtfs3Sept/trips.txt', sep = ",")
stops = pd.read_csv('data/timetables/gtfs3Sept/stops.txt', sep = ",")
stop_times = pd.read_csv('data/timetables/gtfs3Sept/stop_times.txt', sep = ",")
routes = pd.read_csv('data/timetables/gtfs3Sept/routes.txt', sep = ",")

In [4]:
# Build a sample schedule for the stop "GEORGES HENRI" with the project helper
# get_schedule (imported from custom_functions.gtfs_methods).
# NOTE(review): the positional arguments 25 and 1 presumably select the route
# and the direction (the table displayed further down shows route_id 25 and
# direction_id 1) — TODO confirm against get_schedule's signature.
# The result is indexable; its first element is used as the timetable DataFrame
# by the following cells.
sample_test = get_schedule(
    headway_t,
    stops,
    25,
    1,
    stop_name= "GEORGES HENRI"
)

In [5]:
def segmentation_model(serie: pd.Series, penalty, model) -> list:
    """Detect change points in a series with the PELT algorithm from ruptures.

    Args:
        serie: Values to segment. Missing values are dropped before fitting,
            so the returned indices refer to positions in the NaN-free series.
        penalty: Penalty passed to ``predict`` as ``pen``.
        model: Cost model name passed to ``rpt.Pelt`` (e.g. ``"rbf"``).

    Returns:
        The breakpoints returned by ``rpt.Pelt(...).predict``.
    """
    # Work on a plain numpy array without the missing values.
    signal = np.array(serie.dropna().to_list())

    detector = rpt.Pelt(model=model)
    detector.fit(signal)
    return detector.predict(pen=penalty)

def cluster_assgnement(clusters: list[int]) -> list[str]:
    """Expand segment end indices into one ``"cluster_<k>"`` label per sample.

    ``clusters`` is read as a list of increasing, exclusive segment end
    indices: segment ``k`` covers positions ``clusters[k-1]`` (0 for the first
    segment) up to, but not including, ``clusters[k]``.

    The returned list starts with ONE extra ``"cluster_0"`` label that is not
    covered by any segment. ``compute_clusters`` relies on this padding to
    label the first row of the timetable, whose headway is missing and is
    therefore dropped before segmentation.

    Args:
        clusters: Segment end indices (breakpoints).

    Returns:
        A list of ``clusters[-1] + 1`` labels (just ``["cluster_0"]`` when
        ``clusters`` is empty).
    """
    # Leading padding label (see docstring).
    labels = ["cluster_0"]
    segment_start = 0
    for cluster_index, segment_end in enumerate(clusters):
        # One label per sample of the segment [segment_start, segment_end).
        labels.extend(["cluster_" + str(cluster_index)] * (segment_end - segment_start))
        segment_start = segment_end
    return labels

def compute_clusters(timetable:pd.DataFrame, cluster_computation_col_name:str="headway_min", cluster_col_name:str="clusters", penalty:int=3, model:str="rbf") -> pd.DataFrame:
    """Segment a timetable into clusters of similar values and label each row.

    The timetable is sorted by its ``time_seconds`` column, the non-missing
    values of ``cluster_computation_col_name`` are segmented with
    ``segmentation_model`` and every row receives a ``"cluster_<k>"`` label.

    Rows whose value is missing are not part of the segmentation; they take
    the label of the next labelled row (or of the previous one when they are
    at the end). For the usual case of a single missing value on the first
    row this yields ``"cluster_0"``, as before.

    Args:
        timetable: Frame containing ``time_seconds`` and the computation column.
        cluster_computation_col_name: Column whose values are segmented.
        cluster_col_name: Name of the label column to create.
        penalty: Penalty forwarded to ``segmentation_model``.
        model: Cost model name forwarded to ``segmentation_model``.

    Returns:
        A sorted copy of ``timetable`` with the extra label column.

    Raises:
        ValueError: If the number of labels produced does not match the number
            of non-missing values.
    """
    # Sort once and reuse the result for both segmentation and labelling.
    ordered = timetable.sort_values(by="time_seconds")
    values = ordered[cluster_computation_col_name]
    has_value = values.notna().to_list()

    if not any(has_value):
        # Nothing to segment (empty frame or only missing values): keep every
        # row in a single cluster instead of fitting PELT on an empty signal.
        ordered[cluster_col_name] = "cluster_0"
        return ordered

    breakpoints = segmentation_model(values, penalty, model)
    # cluster_assgnement() prepends one padding "cluster_0" label; strip it so
    # the labels line up one-to-one with the non-missing values.
    valid_labels = cluster_assgnement(breakpoints)[1:]
    if len(valid_labels) != sum(has_value):
        raise ValueError(
            f"Expected {sum(has_value)} cluster labels, got {len(valid_labels)}"
        )

    # Put each label on its own row, leaving gaps where the value is missing,
    # then fill the gaps from the neighbouring rows.
    label_iter = iter(valid_labels)
    labels = pd.Series(
        [next(label_iter) if present else None for present in has_value],
        dtype=object,
    )
    # Assign positionally (as a list) so the frame's index plays no role.
    ordered[cluster_col_name] = labels.bfill().ffill().to_list()
    return ordered

In [6]:
def assess_regularity(timetable:pd.DataFrame, threshold:int= 12, aggregation_method:str = "median", cluster_computation_col_name:str="headway_min", cluster_col_name:str="clusters", col_reg_name:str="regularity") -> pd.DataFrame:
    """Flag every row according to the aggregated value of its cluster.

    For each cluster the values of ``cluster_computation_col_name`` are
    aggregated with ``aggregation_method``; the result is stored in the
    ``cluster_agg_value`` column. ``col_reg_name`` is 1 when that aggregate is
    lower than or equal to ``threshold`` and 0 otherwise (including when the
    aggregate is missing).

    The function can safely be re-applied to its own output: pre-existing
    ``cluster_agg_value`` / ``col_reg_name`` columns are recomputed instead of
    being duplicated.

    Args:
        timetable: Frame containing the cluster and computation columns.
        threshold: Upper bound (inclusive) for a cluster to be flagged 1.
        aggregation_method: Any aggregation accepted by ``GroupBy.agg``.
        cluster_computation_col_name: Column that is aggregated per cluster.
        cluster_col_name: Column holding the cluster label.
        col_reg_name: Name of the 0/1 flag column to create.

    Returns:
        A new frame (fresh 0..n-1 index, input left untouched) with the
        ``cluster_agg_value`` and ``col_reg_name`` columns appended.
    """
    agg_col = "cluster_agg_value"

    # Drop results of a previous run so that they are rebuilt rather than
    # duplicated; never drop the columns the computation itself needs.
    needed = {cluster_computation_col_name, cluster_col_name}
    stale = [
        col for col in (agg_col, col_reg_name)
        if col in timetable.columns and col not in needed
    ]
    base = timetable.drop(columns=stale)

    # Name the aggregate up front so the merge creates no "_x"/"_y" suffixes.
    cluster_agg = (
        base.groupby(cluster_col_name)[cluster_computation_col_name]
        .agg(aggregation_method)
        .rename(agg_col)
        .reset_index()
    )
    result = base.merge(cluster_agg, how="left", on=cluster_col_name)

    result[col_reg_name] = np.where(result[agg_col] <= threshold, 1, 0)
    return result

In [7]:
# Run the full pipeline on the sample timetable (first element of sample_test):
# label the rows with clusters, then flag each cluster with assess_regularity.
sample_test_function = assess_regularity(compute_clusters(sample_test[0]))

In [8]:
# Display the labelled timetable (cluster, per-cluster aggregate and regularity flag).
sample_test_function

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,pickup_type,drop_off_type,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,route_id,service_id,trip_headsign,direction_id,block_id,shape_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color,time_seconds,headway_min,clusters,cluster_agg_value,regularity
0,113305249237261000,05:33:12,05:33:12,0811,7,0,0,,GEORGES HENRI,,50.843453,4.405598,,,0,69.0,25,237261000,BOONDAEL GARE,1,8943276,025t0277,25,BOONDAEL GARE - ROGIER,,0,,991F36,FFFFFF,19992,,cluster_0,8.0,1
1,113305269237261000,05:49:12,05:49:12,0811,7,0,0,,GEORGES HENRI,,50.843453,4.405598,,,0,69.0,25,237261000,BOONDAEL GARE,1,8942438,025t0277,25,BOONDAEL GARE - ROGIER,,0,,991F36,FFFFFF,20952,16.0,cluster_0,8.0,1
2,113305200237261000,06:08:12,06:08:12,0811,7,0,0,,GEORGES HENRI,,50.843453,4.405598,,,0,69.0,25,237261000,BOONDAEL GARE,1,8942437,025t0277,25,BOONDAEL GARE - ROGIER,,0,,991F36,FFFFFF,22092,19.0,cluster_0,8.0,1
3,113306701237261000,06:16:12,06:16:12,0811,7,0,0,,GEORGES HENRI,,50.843453,4.405598,,,0,69.0,25,237261000,BOONDAEL GARE,1,8943486,025t0277,25,BOONDAEL GARE - ROGIER,,0,,991F36,FFFFFF,22572,8.0,cluster_0,8.0,1
4,113306702237261000,06:24:12,06:24:12,0811,7,0,0,,GEORGES HENRI,,50.843453,4.405598,,,0,69.0,25,237261000,BOONDAEL GARE,1,8942439,025t0277,25,BOONDAEL GARE - ROGIER,,0,,991F36,FFFFFF,23052,8.0,cluster_0,8.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141,113305274237261000,24:11:12,24:11:12,0811,7,0,0,,GEORGES HENRI,,50.843453,4.405598,,,0,69.0,25,237261000,BOONDAEL GARE,1,8943331,025t0277,25,BOONDAEL GARE - ROGIER,,0,,991F36,FFFFFF,87072,15.0,cluster_5,15.0,0
142,113305367237261000,24:26:12,24:26:12,0811,7,0,0,,GEORGES HENRI,,50.843453,4.405598,,,0,69.0,25,237261000,BOONDAEL GARE,1,8943276,025t0277,25,BOONDAEL GARE - ROGIER,,0,,991F36,FFFFFF,87972,15.0,cluster_5,15.0,0
143,113305262237261000,24:38:12,24:38:12,0811,7,0,0,,GEORGES HENRI,,50.843453,4.405598,,,0,69.0,25,237261000,BUYL,1,8942438,025t0275,25,BOONDAEL GARE - ROGIER,,0,,991F36,FFFFFF,88692,12.0,cluster_5,15.0,0
144,113305368237261000,24:46:12,24:46:12,0811,7,0,0,,GEORGES HENRI,,50.843453,4.405598,,,0,69.0,25,237261000,BOONDAEL GARE,1,8942437,025t0277,25,BOONDAEL GARE - ROGIER,,0,,991F36,FFFFFF,89172,8.0,cluster_5,15.0,0


In [7]:
# Visual check of the segmentation: headway values (missing values dropped) in
# chronological order, with one vertical line per detected change point.
serie_test = np.array(sample_test[0].sort_values(by="time_seconds")["headway_min"].dropna().to_list())
PELT = rpt.Pelt(model="rbf").fit(serie_test)
PELT_res = PELT.predict(pen=3)
fig = px.scatter(y=serie_test)
for bkp in PELT_res:
    # A breakpoint is the exclusive end index of a segment, so the boundary
    # lies between samples bkp-1 and bkp; drawing at bkp itself would put the
    # line on the first sample of the NEXT segment. The last breakpoint equals
    # the signal length (end of data, not a change point), so it is skipped.
    if 0 < bkp < len(serie_test):
        fig.add_vline(x=bkp - 0.5)
fig.update_layout(
    title="PELT change points on headway",
    xaxis_title="Trip rank (chronological, missing headways dropped)",
    yaxis_title="Headway (min)",
)
fig.show()

Les clusters ont l'air corrects dans le tableau, et on peut maintenant retrouver tous nos clusters. En revanche, le graphique fait n'importe quoi : on (avec Abdelmoumen) ne sait pas encore ce qui se passe.

In [1]:
# NOTE(review): this cell must run after the cell that defines
# sample_test_function. The NameError recorded in its output comes from an
# execution with count 1, i.e. on a kernel where that cell had not been run yet.
sample_test_function

NameError: name 'sample_test_function' is not defined

In [46]:
# Headway per arrival, in chronological order: colour encodes the cluster and
# the marker symbol encodes the regularity flag.
fig = px.scatter(
    sample_test_function.sort_values("time_seconds"),
    x="arrival_time",
    y="headway_min",
    color="clusters",
    symbol="regularity",
)
fig.show()