In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Render every float with two decimals in DataFrame/Series output.
pd.options.display.float_format = '{:.2f}'.format

# Drivers

In [None]:
# Load the raw drivers table (path is relative to the notebook's working
# directory) and preview its first rows.
driver_df = pd.read_csv('data/raw/drivers.csv')
driver_df.head()

In [None]:
# Show the dtype pandas assigned to each drivers column.
driver_df.dtypes

In [None]:
# Check the proportion of each vehicle type
# (normalize=True yields fractions of rows rather than raw counts).
vehicle_type_counts = (
    driver_df['vehicle_type']
    .value_counts(normalize=True)
)
print("Vehicle Type Proportions:", vehicle_type_counts, sep="\n")

In [None]:
# Parse joined_date into pandas datetimes, then summarise the column
# with describe().
driver_df['joined_date'] = pd.to_datetime(driver_df['joined_date'])
joined_date_stats = driver_df['joined_date'].describe()
print("\nJoined Date Statistics:", joined_date_stats, sep="\n")

# Orders

In [None]:
# Load the raw orders table (path is relative to the notebook's working
# directory) and preview its first rows.
order_df = pd.read_csv('data/raw/orders.csv')
order_df.head()

In [None]:
# Show the dtype pandas assigned to each orders column.
order_df.dtypes

In [None]:
# Convert created_at to pandas datetimes (the CSV was read without date
# parsing); the .dt accessor used for hour_of_day in a later cell needs this.
order_df['created_at'] = pd.to_datetime(order_df['created_at'])

In [None]:
# Derive the hour each order was created, then count orders per hour,
# ordered by hour rather than by frequency.
order_df['hour_of_day'] = order_df['created_at'].dt.hour
hour_of_day_counts = (
    order_df['hour_of_day']
    .value_counts()
    .sort_index()
)
print("\nHour of Day Distribution:", hour_of_day_counts, sep="\n")

In [None]:
# Summary statistics restricted to the object-dtype columns of the orders table.
order_df.describe(include=['object'])

In [None]:
# Summary statistics using describe()'s default column selection.
# NOTE(review): created_at is already datetime at this point, and recent pandas
# versions may include datetime columns in the default output — confirm.
order_df.describe()

In [None]:
# Histogram of cod_amount (30 bins, KDE overlay) on a 10x6 figure.
plt.figure(figsize=(10, 6))
sns.histplot(x=order_df['cod_amount'], bins=30, kde=True)
plt.gca().set(
    title='Distribution of Order Amounts',
    xlabel='Amount',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Histogram of shipping_fee (30 bins, KDE overlay) on a 10x6 figure.
plt.figure(figsize=(10, 6))
sns.histplot(x=order_df['shipping_fee'], bins=30, kde=True)
plt.gca().set(
    title='Distribution of Shipping Fees',
    xlabel='Shipping Fee',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Histogram of distance_km (30 bins, KDE overlay) on a 10x6 figure.
plt.figure(figsize=(10, 6))
sns.histplot(x=order_df['distance_km'], bins=30, kde=True)
plt.gca().set(
    title='Distribution of Distances',
    xlabel='Distance (km)',
    ylabel='Frequency',
)
plt.show()

# Interactions

In [None]:
# Load the raw interaction logs (path is relative to the notebook's working
# directory) and preview the first rows.
interaction_df = pd.read_csv('data/raw/interaction_logs.csv')
interaction_df.head()

In [None]:
# Show the dtype pandas assigned to each interaction-log column.
interaction_df.dtypes

In [None]:
# Convert offered_at to pandas datetimes (the CSV was read without date parsing).
interaction_df['offered_at'] = pd.to_datetime(interaction_df['offered_at'])

In [None]:
# Count how many logged offers fall under each is_accepted value.
offer_status_counts = (
    interaction_df['is_accepted']
    .value_counts()
)
print("\nOffer Status Counts:", offer_status_counts, sep="\n")

In [None]:
# Histogram of driver_distance_to_pickup (30 bins, KDE overlay) on a 10x6 figure.
plt.figure(figsize=(10, 6))
sns.histplot(x=interaction_df['driver_distance_to_pickup'], bins=30, kde=True)
plt.gca().set(
    title='Distribution of Driver Distance to Pickup',
    xlabel='Distance to Pickup (km)',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Histogram of driver_fatigue_index (30 bins, KDE overlay) on a 10x6 figure.
plt.figure(figsize=(10, 6))
sns.histplot(x=interaction_df['driver_fatigue_index'], bins=30, kde=True)
plt.gca().set(
    title='Distribution of Driver Fatigue Index',
    xlabel='Fatigue Index',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Attach order attributes to every offer row with a left join on order_id,
# so offers whose order_id has no match in order_df are still kept.
#
# order_df is intentionally NOT re-read from disk here. The Orders section
# above already loaded it, converted created_at to datetime and added
# hour_of_day; reloading the CSV would silently throw that work away and
# put string-typed timestamps into merged_df (and into every later use of
# order_df).
merged_df = interaction_df.merge(order_df, on='order_id', how='left')
merged_df.head()

In [None]:
# Acceptance rate per service type: the mean of is_accepted within each group.
service_type_acceptance = (
    merged_df
    .groupby('service_type')['is_accepted']
    .mean()
)
print("\nAcceptance Rate by Service Type:", service_type_acceptance, sep="\n")

In [None]:
# Summary statistics of cod_amount, split by whether the offer was accepted.
cod_amount_comparison = (
    merged_df
    .groupby('is_accepted')['cod_amount']
    .describe()
)
print("\nCOD Amount Comparison by Offer Status:")
# Bare last expression so the notebook renders the table richly.
cod_amount_comparison

In [None]:
# Acceptance rate per requested vehicle type: the mean of is_accepted within each group.
vehicle_type_acceptance = (
    merged_df
    .groupby('requested_vehicle_type')['is_accepted']
    .mean()
)
print("\nAcceptance Rate by Requested Vehicle Type:", vehicle_type_acceptance, sep="\n")

In [None]:
# Find orders whose offers were all rejected: the per-order maximum of
# is_accepted is 0 only when no offer for that order was accepted.
rejected_orders = (
    merged_df
    .groupby('order_id')['is_accepted']
    .max()
)
only_rejected_orders = rejected_orders.loc[rejected_orders == 0].index.tolist()
only_rejected_df = merged_df.loc[merged_df['order_id'].isin(only_rejected_orders)]
# Number of distinct orders that never received an accepted offer.
len(only_rejected_df['order_id'].unique())

In [None]:
# Find orders that never appear in the interaction logs: take the set
# difference of the order ids, then filter order_df down to those ids.
all_order_ids = set(order_df['order_id'].unique())
interacted_order_ids = set(interaction_df['order_id'].unique())
non_interacted_order_ids = all_order_ids.difference(interacted_order_ids)
non_interacted_orders_df = order_df.loc[
    order_df['order_id'].isin(non_interacted_order_ids)
]
non_interacted_orders_df.head()

In [None]:
# Show the dtype of each column in the interactions-plus-orders merge result.
merged_df.dtypes

# Crafted Features

In [None]:
# Load the processed feature table and preview its first rows.
# NOTE(review): presumably written by a separate feature-engineering step that
# is not part of this notebook — confirm which script produces this file.
df = pd.read_csv('data/processed/feature_data.csv')
df.head()

In [None]:
# Summary statistics of the feature table using describe()'s default column selection.
df.describe()

In [None]:
# Summary statistics restricted to the object-dtype columns of the feature table.
df.describe(include=['object'])

In [None]:
# List every column name in the feature table.
df.columns