In [1]:
import pandas as pd
import numpy as np
from causalnex.structure import StructureModel



In [2]:
from causalnex.plots import plot_structure
from causalnex.network import BayesianNetwork

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from sklearn.model_selection import train_test_split
from causalnex.plots import plot_structure, NODE_STYLE, EDGE_STYLE

import matplotlib.pyplot as plt
from causalnex.structure.notears import from_pandas
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [4]:
df = pd.read_csv('safe.csv')
df.head(3)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,order_id,driver_id,driver_action,lat,lng,Origin Lat,Origin Lng,trip_distance,driver_to_order_distance,Trip Start Time,Trip End Time,time_taken,speed,weekday_or_weekend
0,0,0,392001,243828,accepted,6.602207,3.270465,6.601042,3.276634,20.984319,0.694264,2021-07-01 09:30:59,2021-07-01,217.0,0.096702,Weekday
1,1,1,392001,243588,rejected,6.592097,3.287445,6.601042,3.276634,0.0,1.551694,1970-01-01 00:00:00,1970-01-01,0.0,,Weekday
2,2,2,392001,243830,rejected,6.596133,3.281784,6.601042,3.276634,0.0,0.786777,1970-01-01 00:00:00,1970-01-01,0.0,,Weekday


In [5]:
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0', 'Trip Start Time', 'Trip End Time'], errors='ignore')

df.head(2)
# df =df.fillna(df.mean())
# df = df.replace([float('inf'), float('-inf')], 1e10)


Unnamed: 0,order_id,driver_id,driver_action,lat,lng,Origin Lat,Origin Lng,trip_distance,driver_to_order_distance,time_taken,speed,weekday_or_weekend
0,392001,243828,accepted,6.602207,3.270465,6.601042,3.276634,20.984319,0.694264,217.0,0.096702,Weekday
1,392001,243588,rejected,6.592097,3.287445,6.601042,3.276634,0.0,1.551694,0.0,,Weekday


In [6]:
label_encoder = LabelEncoder()
df['driver_action'] = label_encoder.fit_transform(df['driver_action'])
df['weekday_or_weekend'] = label_encoder.fit_transform(df['weekday_or_weekend'])

In [7]:
scaler = MinMaxScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
train_df, holdout_df = train_test_split(df, test_size=0.2, random_state=42)

scaled_train_df = pd.DataFrame(scaler.fit_transform(train_df), columns=train_df.columns)
scaled_holdout_df = pd.DataFrame(scaler.transform(holdout_df), columns=holdout_df.columns)


In [8]:
driver_action_relations =[
    ('weekday_or_weekend', 'driver_action'),
    ('driver_to_order_distance', 'driver_action'),
    ('trip_distance', 'driver_action'),
    ('driver_to_order_distance', 'trip_distance')

]

In [22]:
scaled_train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1244574 entries, 0 to 1244573
Data columns (total 12 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   order_id                  1244574 non-null  float64
 1   driver_id                 1244574 non-null  float64
 2   driver_action             1244574 non-null  float64
 3   lat                       1244574 non-null  float64
 4   lng                       1244574 non-null  float64
 5   Origin Lat                1244574 non-null  float64
 6   Origin Lng                1244574 non-null  float64
 7   trip_distance             1244574 non-null  float64
 8   driver_to_order_distance  1244574 non-null  float64
 9   time_taken                1244574 non-null  float64
 10  speed                     20670 non-null    float64
 11  weekday_or_weekend        1244574 non-null  float64
dtypes: float64(12)
memory usage: 113.9 MB


In [21]:
scaled_holdout_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311144 entries, 0 to 311143
Data columns (total 12 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   order_id                  311144 non-null  float64
 1   driver_id                 311144 non-null  float64
 2   driver_action             311144 non-null  float64
 3   lat                       311144 non-null  float64
 4   lng                       311144 non-null  float64
 5   Origin Lat                311144 non-null  float64
 6   Origin Lng                311144 non-null  float64
 7   trip_distance             311144 non-null  float64
 8   driver_to_order_distance  311144 non-null  float64
 9   time_taken                311144 non-null  float64
 10  speed                     5201 non-null    float64
 11  weekday_or_weekend        311144 non-null  float64
dtypes: float64(12)
memory usage: 28.5 MB


In [9]:
sm = StructureModel()

sm.add_edges_from(driver_action_relations)


In [11]:

viz = plot_structure(
    sm,
    # graph_attributes={"scale": "0.5"},
    all_node_attributes=NODE_STYLE.WEAK,
    all_edge_attributes=EDGE_STYLE.WEAK,
)

viz.show('b.html')




b.html


In [12]:
from causalnex.network import BayesianNetwork
from causalnex.inference import InferenceEngine


In [13]:
bn = BayesianNetwork(sm)


In [14]:
bn.nodes

['weekday_or_weekend',
 'driver_action',
 'driver_to_order_distance',
 'trip_distance']

In [42]:
scaled_df.head(3)


Unnamed: 0,order_id,driver_id,driver_action,lat,lng,Origin Lat,Origin Lng,trip_distance,driver_to_order_distance,time_taken,weekday_or_weekend
0,0.0,0.967839,0.0,0.149144,0.035652,0.775034,0.302652,0.030856,0.001058,0.000665,0.0
1,0.0,0.965932,1.0,0.141327,0.038774,0.775034,0.302652,0.0,0.002365,0.0,0.0
2,0.0,0.967854,1.0,0.144448,0.037733,0.775034,0.302652,0.0,0.001199,0.0,0.0


In [76]:
train_df, holdout_df = train_test_split(scaled_df, test_size=0.2, random_state=42)

# print(train_df.tail(3))
train_df.describe()



Unnamed: 0,driver_action,trip_distance,driver_to_order_distance,weekday_or_weekend
count,800.0,800.0,800.0,800.0
mean,0.98,0.00048,0.002443,0.0
std,0.140088,0.003994,0.000976,0.0
min,0.0,0.0,0.000175,0.0
25%,1.0,0.0,0.001807,0.0
50%,1.0,0.0,0.002436,0.0
75%,1.0,0.0,0.002907,0.0
max,1.0,0.051793,0.004559,0.0


In [61]:
bn = bn.fit_node_states_and_cpds(train_df)


In [46]:
ie = InferenceEngine(bn.fit_node_states_and_cpds(scaled_df))


In [95]:
for i in range(0,1):
  ie.do_intervention('weekday_or_weekend',i)
  print(f"predicted_outcome_{i} = {ie.query()['driver_action']}")

  ie.reset_do("weekday_or_weekend")

predicted_outcome_0 = {0.0: 0.01800000000000001, 1.0: 0.9820000000000008}


In [96]:
for i in range(0,1):
  ie.do_intervention('trip_distance',i)
  print(f"predicted_outcome_{i} = {ie.query()['driver_action']}")

ie.reset_do("trip_distance")

predicted_outcome_0 = {0.0: 0.005500000000000001, 1.0: 0.9945000000000006}


In [98]:
for i in range(0,10):
  ie.do_intervention('driver_to_order_distance',i)
  print(f"predicted_outcome_{i} = {ie.query()['driver_action']}")

  ie.reset_do("driver_to_order_distance")

ValueError: The cpd for the provided observation must sum to 1