# Frequent Item Mining

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import geopandas as gpd
from tqdm import tqdm
import pandas as pd
import folium
import json
import ast



pd.set_option('display.max_columns', None)
pd.set_option('mode.chained_assignment', None)

In [2]:
res = pd.read_pickle("data/computed/matches_clean_EWT_delay.pkl")
tt = res[["route_short_name","mode"]].drop_duplicates()
del res

In [3]:
data = pd.read_pickle("data/computed/lines_growth_EWT.pkl")
data = data.merge(
    tt,
    "left",
    "route_short_name"
)

In [4]:
data 

Unnamed: 0,route_short_name,stop_name,date_normalized,direction_id,succession,EWT,smoothen_delay,growth,growth_cut,lab_growth,mode
0,1,GARE DE L'OUEST,07-09-2021,0,1.0,3.220386,3.220421,,,,M
1,1,BEEKKANT,07-09-2021,0,2.0,1.203381,1.185877,-2.034545,-2.034545,-1.0,M
2,1,ETANGS NOIRS,07-09-2021,0,3.0,1.071983,1.121525,-0.064352,-0.064352,-1.0,M
3,1,COMTE DE FLANDRE,07-09-2021,0,4.0,1.180626,1.225784,0.104259,0.104259,1.0,M
4,1,SAINTE-CATHERINE,07-09-2021,0,5.0,1.310846,1.040459,-0.185325,-0.185325,-1.0,M
...,...,...,...,...,...,...,...,...,...,...,...
28724,98,DORENT,18-09-2021,1,8.0,1.264277,1.138900,-0.060361,-0.060361,-1.0,B
28725,98,PATHE,18-09-2021,1,9.0,1.207296,1.263165,0.124265,0.124265,1.0,B
28726,98,MOZART,18-09-2021,1,10.0,1.383667,1.400844,0.137679,0.137679,1.0,B
28727,98,BOLLINCKX,18-09-2021,1,11.0,1.248472,1.227721,-0.173123,-0.173123,-1.0,B


In [5]:
data = data.groupby(["route_short_name", "date_normalized", "direction_id", "mode", "lab_growth"], as_index=False).apply(lambda x: pd.Series({"stops" : [x.stop_name.to_list()]}))

## Delay creators stops

In [6]:
gen = [t for x in data[(data.lab_growth == 1)]["stops"].to_list() for t in x ]
metro_del = [t for x in data[(data.lab_growth == 1) & (data["mode"] == "M")]["stops"].to_list() for t in x ]
tram_del = [t for x in data[(data.lab_growth == 1) & (data["mode"] == "T")]["stops"].to_list() for t in x ]
bus_del = [t for x in data[(data.lab_growth == 1) & (data["mode"] == "B")]["stops"].to_list() for t in x ]

In [7]:
config = [{"data" : metro_del, "type" : "M", "sup": 0.45}, {"data" : tram_del, "type" : "T", "sup": 0.05}, {"data" : bus_del, "type" : "B", "sup": 0.03}]


config_gen = [{"data" : gen, "type" : "M", "sup": 0.03}]

In [8]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth

for _ in config:
    print("=============")
    print(_["type"])
    print("transaction encoder")
    te = TransactionEncoder()
    print("fitting the te")
    te_ary = te.fit(_["data"]).transform(_["data"])
    df_late = pd.DataFrame(te_ary, columns=te.columns_)
    print("Computing FP")
    fp_late = fpgrowth(df_late, min_support=_["sup"], use_colnames=True)
    _["stops_patterns"] = np.unique(np.array([t for k in fp_late.itemsets.apply(lambda x: list(x)) for t in k]))



M
transaction encoder
fitting the te
Computing FP
T
transaction encoder
fitting the te
Computing FP
B
transaction encoder
fitting the te
Computing FP


In [9]:
config[0]["stops_patterns"].shape, config[1]["stops_patterns"].shape, config[2]["stops_patterns"].shape

((0,), (62,), (25,))

In [10]:
config_gen[0]["stops_patterns"].shape

KeyError: 'stops_patterns'

In [11]:
plot = []

for item in config:
    plot.extend([{"mode" : item["type"], "stop_name" : x} for x in item["stops_patterns"]])

delay_stops = pd.DataFrame(plot)


### Plot stops

In [12]:
stops = pd.read_csv("data/timetables/gtfs3Sept/stops.txt")
coord = stops[["stop_name", "stop_lat", "stop_lon"]].drop_duplicates("stop_name")

In [13]:
delay_stops = delay_stops.merge(
    coord,
    "left",
    "stop_name"
)

In [14]:
delay_stops

Unnamed: 0,mode,stop_name,stop_lat,stop_lon
0,T,ABBAYE,50.818926,4.370739
1,T,ALBERT,50.821090,4.343888
2,T,ARAUCARIA,50.892871,4.363956
3,T,AZUR,50.866659,4.287128
4,T,BAILLI,50.827127,4.363428
...,...,...,...,...
82,B,STATUAIRES,50.803921,4.349047
83,B,TOUR ET TAXIS,50.864968,4.350239
84,B,TRONE,50.841633,4.364862
85,B,UZ BRUSSEL,50.886340,4.311669


In [15]:
px.set_mapbox_access_token("pk.eyJ1IjoibWpkYW91ZGkiLCJhIjoiY2xibm54OThyMGdyOTNvcnhqeTYyZmRuYiJ9.rfxe3z8triwA5yvV1XZA-A")
fig = px.scatter_mapbox(delay_stops, lat="stop_lat", lon="stop_lon", hover_name="stop_name",
                        color="mode", zoom=10)
fig.update_layout(mapbox_style="light")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



## Delay catcher

In [16]:
gen = [t for x in data[(data.lab_growth == -1)]["stops"].to_list() for t in x ]
metro_cat = [t for x in data[(data.lab_growth == -1) & (data["mode"] == "M")]["stops"].to_list() for t in x ]
tram_cat = [t for x in data[(data.lab_growth == -1) & (data["mode"] == "T")]["stops"].to_list() for t in x ]
bus_cat = [t for x in data[(data.lab_growth == -1) & (data["mode"] == "B")]["stops"].to_list() for t in x ]

In [17]:
config_cat = [{"data" : metro_cat, "type" : "M", "sup": 0.45}, {"data" : tram_cat, "type" : "T", "sup": 0.05}, {"data" : bus_cat, "type" : "B", "sup": 0.03}]


config_gen_cat = [{"data" : gen, "type" : "M", "sup": 0.03}]

In [18]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth

for _ in config_cat:
    print("=============")
    print(_["type"])
    print("transaction encoder")
    te = TransactionEncoder()
    print("fitting the te")
    te_ary = te.fit(_["data"]).transform(_["data"])
    df_late = pd.DataFrame(te_ary, columns=te.columns_)
    print("Computing FP")
    fp_late = fpgrowth(df_late, min_support=_["sup"], use_colnames=True)
    _["stops_patterns"] = np.unique(np.array([t for k in fp_late.itemsets.apply(lambda x: list(x)) for t in k]))

M
transaction encoder
fitting the te
Computing FP
T
transaction encoder
fitting the te
Computing FP
B
transaction encoder
fitting the te
Computing FP


In [19]:
config_cat[0]["stops_patterns"].shape, config_cat[1]["stops_patterns"].shape, config_cat[2]["stops_patterns"].shape

((1,), (37,), (20,))

In [20]:
plot = []

for item in config_cat:
    plot.extend([{"mode" : item["type"], "stop_name" : x} for x in item["stops_patterns"]])

delay_stops_cat = pd.DataFrame(plot)
delay_stops_cat = delay_stops_cat.merge(
    coord,
    "left",
    "stop_name"
)

In [21]:
delay_stops_cat

Unnamed: 0,mode,stop_name,stop_lat,stop_lon
0,M,ARTS-LOI,50.845476,4.369121
1,T,ALBERT,50.82109,4.343888
2,T,BERKENDAEL,50.818079,4.346047
3,T,BOETENDAEL,50.80664,4.343477
4,T,BOSSAERT-BASILIQ.,50.865077,4.320101
5,T,BROUSTIN,50.867911,4.326561
6,T,BUISSONNETS,50.891291,4.368867
7,T,CAMBRE-ETOILE,50.816372,4.376301
8,T,CARREFOUR STALLE,50.79673,4.321064
9,T,CHIEN VERT,50.834697,4.426884


In [22]:
px.set_mapbox_access_token("pk.eyJ1IjoibWpkYW91ZGkiLCJhIjoiY2xibm54OThyMGdyOTNvcnhqeTYyZmRuYiJ9.rfxe3z8triwA5yvV1XZA-A")
fig = px.scatter_mapbox(delay_stops_cat, lat="stop_lat", lon="stop_lon", hover_name="stop_name",
                        color="mode", zoom=10)
fig.update_layout(mapbox_style="light")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



In [23]:
delay_stops_cat["cat"] = "catcher"
delay_stops["cat"] = "delayer"

In [24]:
pd.concat([delay_stops, delay_stops_cat]).to_pickle("data/computed/freq_mining_EWT.pkl")

## Buses

In [25]:
Buses = pd.concat([delay_stops[delay_stops["mode"] == "B"], delay_stops_cat[delay_stops_cat["mode"] == "B"]])
Buses = Buses[~Buses.stop_name.isin(Buses[Buses.duplicated(subset="stop_name")].stop_name)]

In [26]:
px.set_mapbox_access_token("pk.eyJ1IjoibWpkYW91ZGkiLCJhIjoiY2xibm54OThyMGdyOTNvcnhqeTYyZmRuYiJ9.rfxe3z8triwA5yvV1XZA-A")
fig = px.scatter_mapbox(Buses, lat="stop_lat", lon="stop_lon", hover_name="stop_name",
                        color="cat", zoom=10)
fig.update_layout(mapbox_style="light")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



## Tram

In [27]:
tram = pd.concat([delay_stops[delay_stops["mode"] == "T"], delay_stops_cat[delay_stops_cat["mode"] == "T"]])
tram = tram[~tram.stop_name.isin(tram[tram.duplicated(subset="stop_name")].stop_name)]

px.set_mapbox_access_token("pk.eyJ1IjoibWpkYW91ZGkiLCJhIjoiY2xibm54OThyMGdyOTNvcnhqeTYyZmRuYiJ9.rfxe3z8triwA5yvV1XZA-A")
fig = px.scatter_mapbox(tram, lat="stop_lat", lon="stop_lon", hover_name="stop_name",
                        color="cat", zoom=10)
fig.update_layout(mapbox_style="light")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



## Metro

In [28]:
metro = pd.concat([delay_stops[delay_stops["mode"] == "M"], delay_stops_cat[delay_stops_cat["mode"] == "M"]])
metro = metro[~metro.stop_name.isin(metro[metro.duplicated(subset="stop_name")].stop_name)]

px.set_mapbox_access_token("pk.eyJ1IjoibWpkYW91ZGkiLCJhIjoiY2xibm54OThyMGdyOTNvcnhqeTYyZmRuYiJ9.rfxe3z8triwA5yvV1XZA-A")
fig = px.scatter_mapbox(metro, lat="stop_lat", lon="stop_lon", hover_name="stop_name",
                        color="cat", zoom=10)
fig.update_layout(mapbox_style="light")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.

