In [50]:
# ! pip install causalnex

In [3]:
import warnings
warnings.filterwarnings("ignore")  # silence warnings
import pandas as pd
from datetime import datetime,date
from causalnex.structure import StructureModel
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE
from causalnex.network import BayesianNetwork
from causalnex.structure.notears import from_pandas
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer, MinMaxScaler, StandardScaler

from IPython.display import Image
import plotly.express as px
import pickle

In [4]:
import sys
sys.path.append("../scripts/")
from utils import label_encoder
# cleaner = DataCleaner()

In [8]:
# Load the merged dataset from the project's data directory.
data = pd.read_csv("../data/df_merged.csv")
# clean_df = pd.read_csv('../data/clean_merged_df.csv')

In [9]:
# Work on a copy so the frame loaded from disk (`data`) is left untouched.
df=data.copy()

In [19]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1555718 entries, 0 to 1555717
Data columns (total 22 columns):
 #   Column                Non-Null Count    Dtype         
---  ------                --------------    -----         
 0   order_id              1555718 non-null  int64         
 1   driver_id             1555718 non-null  int64         
 2   driver_action         1555718 non-null  object        
 3   lat                   1555718 non-null  float64       
 4   lng                   1555718 non-null  float64       
 5   Trip Start Time       1555718 non-null  datetime64[ns]
 6   Trip End Time         1555718 non-null  datetime64[ns]
 7   Date                  1555718 non-null  object        
 8   Year                  1555718 non-null  int64         
 9   Month                 1555718 non-null  int64         
 10  Day                   1555718 non-null  int64         
 11  WeekOfYear            1555718 non-null  int64         
 12  Day of Week           1555718 non-null  in

In [11]:
# Parse the trip start and end columns into pandas datetime values.
for time_col in ("Trip Start Time", "Trip End Time"):
    df[time_col] = pd.to_datetime(df[time_col])

In [82]:
# df.columns

In [18]:
# Extract the hour of day at which each trip started.
# The vectorised `.dt.hour` accessor replaces a per-row `apply` that called
# `pd.to_datetime` on every single value, which is needlessly slow on the
# ~1.5M rows reported by `df.info()`.
# `pd.to_datetime` is kept around the column so the cell still works when the
# column holds strings rather than datetimes.
df["hour"] = pd.to_datetime(df["Trip Start Time"]).dt.hour

In [26]:
# Count the rows whose trip lasted at least 10 minutes.
duration_df = df.loc[df["Duration_Minutes"] >= 10]
duration_df["Duration_Minutes"].count()

1476804

In [28]:
# Count the rows whose trip lasted less than 10 minutes.
duration_df_less_10 = df.loc[df["Duration_Minutes"] < 10]
duration_df_less_10["Duration_Minutes"].count()

78914

In [29]:
# Flag a row as fulfilled when the driver action is "accepted" and the trip
# lasted at least 10 minutes.
df["fulfilled"] = df["driver_action"].eq("accepted") & df["Duration_Minutes"].ge(10)

In [59]:
# !  pip install plotly==5.10.0

`Label-encode non-numeric features`

In [31]:
# `label_encoder` is imported from the project's `utils` module (../scripts/).
# The displayed result shows the text and datetime columns as integer codes.
# NOTE(review): the return value is not assigned, so this presumably mutates
# `df` in place — confirm against the helper's implementation.
label_encoder(df)

Unnamed: 0,order_id,driver_id,driver_action,lat,lng,Trip Start Time,Trip End Time,Date,Year,Month,...,Is Weekend,dayofweek,Duration_Minutes,Trip_Origin_lat,Trip_Origin_lng,Trip_Destination_lat,Trip_Destination_lng,Distance,hour,fulfilled
0,392001,243828,0,6.602207,3.270465,275,123,123,2021,7,...,0,4,4.0,6.601042,3.276634,6.450107,3.391615,20.984319,9,0
1,392001,243588,1,6.592097,3.287445,275,123,123,2021,7,...,0,4,4.0,6.601042,3.276634,6.450107,3.391615,20.984319,9,0
2,392001,243830,1,6.596133,3.281784,275,123,123,2021,7,...,0,4,4.0,6.601042,3.276634,6.450107,3.391615,20.984319,9,0
3,392001,243539,1,6.596142,3.280526,275,123,123,2021,7,...,0,4,4.0,6.601042,3.276634,6.450107,3.391615,20.984319,9,0
4,392001,171653,1,6.609232,3.288800,275,123,123,2021,7,...,0,4,4.0,6.601042,3.276634,6.450107,3.391615,20.984319,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1555713,517948,243774,1,6.469036,3.566877,25150,25133,25133,2021,9,...,0,0,103.0,6.443374,3.552312,6.500608,3.598194,8.112927,11,0
1555714,517948,245447,1,6.425431,3.527358,25150,25133,25133,2021,9,...,0,0,103.0,6.443374,3.552312,6.500608,3.598194,8.112927,11,0
1555715,517948,239866,1,6.440013,3.525378,25150,25133,25133,2021,9,...,0,0,103.0,6.443374,3.552312,6.500608,3.598194,8.112927,11,0
1555716,517948,243774,1,6.469036,3.566877,25150,25133,25133,2021,9,...,0,0,103.0,6.443374,3.552312,6.500608,3.598194,8.112927,11,0


` Visualizations`

`Plot the 100 most frequent trip origins`

In [58]:
# Write the current frame to CSV, omitting the index column.
df.to_csv('../data/df_merged_clean.csv',index=False)

In [7]:
# Reload the frame from the cleaned CSV file, replacing the in-memory `df`.
df= pd.read_csv('../data/df_merged_clean.csv')

In [81]:
# df.head(1)

In [8]:
# Count orders per distinct origin coordinate pair, most frequent first.
df_trip_origin = (
    df.groupby(["Trip_Origin_lng", "Trip_Origin_lat"], as_index=False)
    .agg(Count=("order_id", "count"))
    .sort_values(by="Count", ascending=False)
)


In [10]:
# Show the single most frequent origin; `reset_index()` exposes the pre-sort
# row label as an `index` column next to the coordinates and the count.
df_trip_origin.reset_index().head(1)

Unnamed: 0,index,Trip_Origin_lng,Trip_Origin_lat,Count
0,7210,3.368881,6.52744,7080


In [12]:
# ! pip install --upgrade nbformat

In [11]:

# Map the 100 most frequent trip origins; marker size and colour both encode
# the order count for that location.
fig = px.scatter_mapbox(
    df_trip_origin.head(100),
    lat="Trip_Origin_lat",
    lon="Trip_Origin_lng",
    color="Count",
    size="Count",
    hover_name="Count",
    height=600,
)

# NOTE(review): Stamen-hosted tile styles may need a Stadia Maps token on
# recent plotly releases — confirm this still renders in your environment.
fig.update_layout(
    mapbox_style="stamen-terrain",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},  # no padding around the map
)

fig.show()

`Plot the 500 most frequent trip destinations`

In [14]:
# Count orders per distinct destination coordinate pair, most frequent first.
df_trip_dest = (
    df.groupby(["Trip_Destination_lng", "Trip_Destination_lat"], as_index=False)
    .agg(Count=("order_id", "count"))
    .sort_values(by="Count", ascending=False)
)

In [17]:
# Map the 500 most frequent trip destinations; marker size and colour both
# encode the order count for that location.
fig = px.scatter_mapbox(
    df_trip_dest.head(500),
    lat="Trip_Destination_lat",
    lon="Trip_Destination_lng",
    color="Count",
    size="Count",
    hover_name="Count",
    height=600,
)

# NOTE(review): Stamen-hosted tile styles may need a Stadia Maps token on
# recent plotly releases — confirm this still renders in your environment.
fig.update_layout(
    mapbox_style="stamen-terrain",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},  # no padding around the map
)

fig.show()

In [34]:
# Draw a 500-row subset to keep structure learning tractable.
# A fixed `random_state` makes the sample, and therefore the structure learned
# from it, reproducible across kernel restarts.
sample_df = df.sample(n=500, random_state=42)

In [62]:
# Check that every column in the sample is numeric before structure learning.
sample_df.dtypes

order_id                  int64
driver_id                 int64
driver_action             int64
lat                     float64
lng                     float64
Trip Start Time           int64
Trip End Time             int64
Date                      int64
Year                      int64
Month                     int64
Day                       int64
WeekOfYear                int64
Day of Week               int64
Is Weekend                int64
dayofweek                 int64
Duration_Minutes        float64
Trip_Origin_lat         float64
Trip_Origin_lng         float64
Trip_Destination_lat    float64
Trip_Destination_lng    float64
Distance                float64
hour                      int64
fulfilled                 int64
dtype: object

In [41]:
# Learn a structure from the 500-row sample using `from_pandas`
# (imported from `causalnex.structure.notears`).
# No edge-weight threshold or tabu constraints are passed here.
sm = from_pandas(sample_df)

In [42]:

# Inspect the learned edges; the listing contains edges in both directions
# between some columns (e.g. order_id -> driver_id and driver_id -> order_id).
sm.edges

OutEdgeView([('order_id', 'driver_id'), ('order_id', 'driver_action'), ('order_id', 'lat'), ('order_id', 'lng'), ('order_id', 'Trip Start Time'), ('order_id', 'Trip End Time'), ('order_id', 'Date'), ('order_id', 'Year'), ('order_id', 'Month'), ('order_id', 'Day'), ('order_id', 'WeekOfYear'), ('order_id', 'Day of Week'), ('order_id', 'Is Weekend'), ('order_id', 'dayofweek'), ('order_id', 'Duration_Minutes'), ('order_id', 'Trip_Origin_lat'), ('order_id', 'Trip_Origin_lng'), ('order_id', 'Trip_Destination_lat'), ('order_id', 'Trip_Destination_lng'), ('order_id', 'Distance'), ('order_id', 'hour'), ('order_id', 'fulfilled'), ('driver_id', 'order_id'), ('driver_id', 'driver_action'), ('driver_id', 'lat'), ('driver_id', 'lng'), ('driver_id', 'Trip Start Time'), ('driver_id', 'Trip End Time'), ('driver_id', 'Date'), ('driver_id', 'Year'), ('driver_id', 'Month'), ('driver_id', 'Day'), ('driver_id', 'WeekOfYear'), ('driver_id', 'Day of Week'), ('driver_id', 'Is Weekend'), ('driver_id', 'dayofwee

In [85]:
# Render the unconstrained structure with the `fdp` layout and the library's
# WEAK node/edge styling, then display it as a PNG.
# The recorded output shows this depends on pygraphviz being installed.
viz = plot_structure(
    sm,
    prog="fdp",
    graph_attributes={"scale": "0.5"},
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)
Image(viz.draw(format="png"))

Warning: 
            Pygraphviz not installed. Also make sure you have the system-level
            ``graphviz`` requirement installed.

            Alternatively, you can visualise your graph using the networkx.draw
            functionality:
            >>> sm = StructureModel()
            >>> fig, ax = plt.subplots()
            >>> nx.draw_circular(sm, ax=ax)
            >>> fig.show()
            

In [None]:
# Re-learn the structure with domain constraints and an edge-weight threshold.
# Every tabu name must be a column of `df`. The names "distance_diff",
# "driver_loc" and "date" do not appear among its columns, so they are mapped
# to the closest existing ones ("Distance", "lat"/"lng", "Date").
# NOTE(review): confirm this mapping matches the intended constraints.
# NOTE(review): this fits on the full frame, not `sample_df`, and may be slow.
sm_enh = from_pandas(
    df,
    # Forbid edges from the distance to the driver's coordinates.
    tabu_edges=[("Distance", "lat"), ("Distance", "lng")],
    # The driver's decision may not be a parent of any other variable.
    tabu_parent_nodes=["driver_action"],
    # Calendar/time features may not be children of other variables.
    tabu_child_nodes=["Date", "hour"],
    # Discard edges whose learned weight falls below this threshold.
    w_threshold=0.7,
)


In [None]:
# Render the constrained structure with the `fdp` layout and the library's
# WEAK node/edge styling, then display it as a PNG.
viz = plot_structure(
    sm_enh,
    prog="fdp",
    graph_attributes={"scale": "0.5"},
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)
Image(viz.draw(format="png"))

In [None]:
# Add expert-knowledge edges (parent -> child) to the constrained structure.
# NOTE(review): several node names here (e.g. "holiday", "request_loc",
# "driver_loc", "distance_diff", "date") are not among the column names listed
# by `sample_df.dtypes` — confirm they are the intended nodes.
manual_edges = [
    ("distance_diff", "driver_action"),
    ("date", "driver_action"),
    ("date", "holiday"),
    ("hour", "driver_action"),
    ("request_loc", "distance_diff"),
    ("driver_loc", "distance_diff"),
    ("distance_diff", "driver_action"),  # duplicate of the first entry
]
for parent, child in manual_edges:
    sm_enh.add_edge(parent, child)

In [None]:
# Render the structure again after the manual edges were added, using the
# `fdp` layout and the library's WEAK node/edge styling.
viz = plot_structure(
    sm_enh,
    prog="fdp",
    graph_attributes={"scale": "0.5"},
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)
Image(viz.draw(format="png"))