In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import holidays
import requests
from geopy.distance import geodesic
from geopy.geocoders import Nominatim

In [2]:
completed_df = pd.read_csv('../data/nb.csv')
df = pd.read_csv('../data/driver_locations_during_request.csv')

In [3]:
completed_df.columns

Index(['Trip ID', 'Trip Origin', 'Trip Destination', 'Trip Start Time',
       'Trip End Time'],
      dtype='object')

In [4]:
df.columns

Index(['id', 'order_id', 'driver_id', 'driver_action', 'lat', 'lng',
       'created_at', 'updated_at'],
      dtype='object')

In [5]:
df = df.drop(['updated_at', 'created_at'], axis=1)

Feature Extraction

In [6]:
# Convert timestamp to datetime
completed_df['datetime'] = pd.to_datetime(completed_df['Trip Start Time'])

In [7]:
# Create hour of day
completed_df['hour'] = completed_df['datetime'].dt.hour

In [8]:
# Create weekend vs. weekday feature
completed_df['is_weekend'] = completed_df['datetime'].dt.weekday >= 5

In [9]:
# Add holiday feature
nigeria_holidays = holidays.CountryHoliday('NG')

In [10]:
# Create a function to check for holidays
def is_holiday(date):
    return date in nigeria_holidays

In [11]:
completed_df

Unnamed: 0,Trip ID,Trip Origin,Trip Destination,Trip Start Time,Trip End Time,datetime,hour,is_weekend
0,391996,"6.508813001668548,3.37740316890347","6.650969799999999,3.3450307",2021-07-01 07:28:04,2021-07-01 07:29:37,2021-07-01 07:28:04,7.0,False
1,391997,"6.4316714,3.4555375","6.4280814653326,3.4721885847586",2021-07-01 06:38:04,2021-07-01 07:07:28,2021-07-01 06:38:04,6.0,False
2,391998,"6.631679399999999,3.3388976","6.508324099999999,3.3590397",2021-07-01 06:21:02,2021-07-01 07:02:23,2021-07-01 06:21:02,6.0,False
3,391999,"6.572757200000001,3.3677082","6.584881099999999,3.3614073",2021-07-01 07:16:07,2021-07-01 07:29:42,2021-07-01 07:16:07,7.0,False
4,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,2021-07-01 09:30:59,9.0,False
...,...,...,...,...,...,...,...,...
536015,1637696,"6.448218499999999,3.4772075","6.437787399999999,3.481670199999999",2021-12-30 20:35:06,2021-12-30 21:02:59,2021-12-30 20:35:06,20.0,False
536016,1637702,"6.442320899999999,3.4736868","6.436589333407897,3.5559738188407835",2021-12-30 20:48:13,2021-12-30 21:43:49,2021-12-30 20:48:13,20.0,False
536017,1637704,"6.4281982,3.492248","6.448088500000001,3.4775747",2021-12-30 20:51:45,2021-12-30 21:41:32,2021-12-30 20:51:45,20.0,False
536018,1637705,"6.5869296,3.3632966","6.637906899999999,3.3339515",2021-12-30 20:48:50,2021-12-30 21:08:28,2021-12-30 20:48:50,20.0,False


In [12]:
merged_df = pd.merge(completed_df, df, left_on='Trip ID', right_on='order_id')

In [14]:
merged_df = merged_df.drop(['id', 'order_id'], axis=1)

In [15]:
merged_df.head()

Unnamed: 0,Trip ID,Trip Origin,Trip Destination,Trip Start Time,Trip End Time,datetime,hour,is_weekend,driver_id,driver_action,lat,lng
0,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,2021-07-01 09:30:59,9.0,False,243828,accepted,6.602207,3.270465
1,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,2021-07-01 09:30:59,9.0,False,243588,rejected,6.592097,3.287445
2,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,2021-07-01 09:30:59,9.0,False,243830,rejected,6.596133,3.281784
3,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,2021-07-01 09:30:59,9.0,False,243539,rejected,6.596142,3.280526
4,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,2021-07-01 09:30:59,9.0,False,171653,rejected,6.609232,3.2888


In [16]:
merged_df['driver_action'].unique()

array(['accepted', 'rejected'], dtype=object)

In [17]:
merged_df['is_weekend'].unique()

array([False,  True])

In [18]:
merged_df['Driver Location'] = merged_df.apply(lambda row: f"{row['lat']}, {row['lng']}", axis=1)

In [19]:
merged_df['driver_clientdistance'] = merged_df.apply(lambda row: geodesic(
    (row['Driver Location']),
    (row['Trip Origin'])
).kilometers, axis=1)

In [20]:
merged_df = merged_df.drop(['lat', 'lng'], axis=1)

In [21]:
completed_df['Trip distance_km'] = completed_df.apply(lambda row: geodesic(
    (row['Trip Origin']),
    (row['Trip Destination'])
).kilometers, axis=1)

In [22]:
completed_df['Trip Start Time'] = pd.to_datetime(completed_df['Trip Start Time'])
completed_df['Trip End Time'] = pd.to_datetime(completed_df['Trip End Time'])
completed_df['duration_hours'] = completed_df.apply(lambda row: (row['Trip End Time'] - row['Trip Start Time']).total_seconds() / 3600, axis=1)

completed_df['speed_kmh'] = completed_df['Trip distance_km'] / completed_df['duration_hours']

In [23]:
completed_df.head()

Unnamed: 0,Trip ID,Trip Origin,Trip Destination,Trip Start Time,Trip End Time,datetime,hour,is_weekend,Trip distance_km,duration_hours,speed_kmh
0,391996,"6.508813001668548,3.37740316890347","6.650969799999999,3.3450307",2021-07-01 07:28:04,2021-07-01 07:29:37,2021-07-01 07:28:04,7.0,False,16.123451,0.025833,624.13359
1,391997,"6.4316714,3.4555375","6.4280814653326,3.4721885847586",2021-07-01 06:38:04,2021-07-01 07:07:28,2021-07-01 06:38:04,6.0,False,1.884305,0.49,3.845521
2,391998,"6.631679399999999,3.3388976","6.508324099999999,3.3590397",2021-07-01 06:21:02,2021-07-01 07:02:23,2021-07-01 06:21:02,6.0,False,13.822393,0.689167,20.056676
3,391999,"6.572757200000001,3.3677082","6.584881099999999,3.3614073",2021-07-01 07:16:07,2021-07-01 07:29:42,2021-07-01 07:16:07,7.0,False,1.511034,0.226389,6.674507
4,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,2021-07-01 09:30:59,9.0,False,20.984319,0.060278,348.126952


In [24]:
merged_df.head()

Unnamed: 0,Trip ID,Trip Origin,Trip Destination,Trip Start Time,Trip End Time,datetime,hour,is_weekend,driver_id,driver_action,Driver Location,driver_clientdistance
0,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,2021-07-01 09:30:59,9.0,False,243828,accepted,"6.6022066, 3.2704649",0.694264
1,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,2021-07-01 09:30:59,9.0,False,243588,rejected,"6.5920972, 3.2874447",1.551694
2,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,2021-07-01 09:30:59,9.0,False,243830,rejected,"6.5961334, 3.2817841",0.786777
3,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,2021-07-01 09:30:59,9.0,False,243539,rejected,"6.5961416, 3.2805263",0.692054
4,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36,2021-07-01 09:30:59,9.0,False,171653,rejected,"6.6092317, 3.2887999",1.621848


In [25]:
import os
from dotenv import load_dotenv

In [26]:
api_key = os.getenv("WEATHER_API_KEY")

In [27]:
# Function to get weather data 
def get_weather(lat, lng, datetime):
    API_KEY = 'WEATHER_API_KEY'
    url = f'https://api.weather.com/v1/geocode/{lat}/{lng}/observations/historical.json?apiKey={API_KEY}&startDate={datetime.strftime("%Y%m%d")}&endDate={datetime.strftime("%Y%m%d")}&units=e'
    response = requests.get(url)
    data = response.json()
    # Check if it was raining
    rain = any(obs['wx_phrase'].lower() == 'rain' for obs in data['observations'])
    return rain

In [30]:
completed_df['speed_kmh'].describe()

  sqr = _ensure_numeric((avg - values) ** 2)


count    5.343680e+05
mean              inf
std               NaN
min      0.000000e+00
25%      6.876905e+00
50%      1.113760e+01
75%      1.681571e+01
max               inf
Name: speed_kmh, dtype: float64