In [1]:
# import libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

In [2]:
#load data 
df=pd.read_csv("dynamic_pricing.csv")
df.head(2)

Unnamed: 0,Number_of_Riders,Number_of_Drivers,Location_Category,Customer_Loyalty_Status,Number_of_Past_Rides,Average_Ratings,Time_of_Booking,Vehicle_Type,Expected_Ride_Duration,Historical_Cost_of_Ride
0,90,45,Urban,Silver,13,4.47,Night,Premium,90,284.257273
1,58,39,Suburban,Silver,72,4.06,Evening,Economy,43,173.874753


## EDA

In [3]:
df.describe

<bound method NDFrame.describe of      Number_of_Riders  Number_of_Drivers Location_Category  \
0                  90                 45             Urban   
1                  58                 39          Suburban   
2                  42                 31             Rural   
3                  89                 28             Rural   
4                  78                 22             Rural   
..                ...                ...               ...   
995                33                 23             Urban   
996                84                 29             Urban   
997                44                  6          Suburban   
998                53                 27          Suburban   
999                78                 63             Rural   

    Customer_Loyalty_Status  Number_of_Past_Rides  Average_Ratings  \
0                    Silver                    13             4.47   
1                    Silver                    72             4.06   
2          

In [4]:
fig= px.scatter(df, x='Expected_Ride_Duration',
                y='Historical_Cost_of_Ride',
                title="Expected_Ride_Duration vs Historical_Cost_of_Ride",
                trendline='ols')
fig.show()

#### historical cost of rides based on the vehicle type

In [5]:
fig=px.box(df, x='Vehicle_Type',
           y='Historical_Cost_of_Ride',
           title="Historical Cost of Rides Distribution based on Vehicle Type")
fig.show()

Reading off the median values (middle lines in the box) the cost of a premium vehicle (about 400) is higher than the cost of the economy vehicle (about 330), this is also expected considering factors such as comfort and number of seats the car. (Some ride hailing apps cap the maximum number of riders that can seat a car based on vehicle type).

### Historical Cost of Rides Distribution based on Vehicle Type

In [6]:
fig=px.box(df, x='Time_of_Booking',
           y='Historical_Cost_of_Ride',
           title="Historical Cost of Rides Distribution based on Vehicle Type")
fig.show()

Reading off the median values (middle lines in the box) the historical cost of a ride is high in the morning (about 380) and highest in the afternoon (about 390) but gets lower in the evening (about 360) and is lowest in the night(about 330), this is expected considering factors such as amount of vehicle traffic during the day: heavy vehicle traffic in the morning and afternoon hours but less traffic in the evening and night time.

### Heatmap

### Surge Pricing Model

In [7]:
high_demand_percentile=75
low_demand_percentile=25

df['demand_multiplier']=np.where(df['Number_of_Riders']>np.percentile(df['Number_of_Riders'], high_demand_percentile),
                                 df['Number_of_Riders']/np.percentile(df['Number_of_Riders'], high_demand_percentile),
                                 df['Number_of_Riders']>np.percentile(df['Number_of_Riders'], low_demand_percentile),
                                 )
#calculating supply_multiplier based on percentile fir high and low mileage
high_supply_percentile=75
low_supply_percentile=25

df['supply_multiplier']=np.where(df['Number_of_Riders']>np.percentile(df['Number_of_Riders'], high_supply_percentile),
                                 np.percentile(df['Number_of_Riders'], high_supply_percentile)/df['Number_of_Riders'],
                                 np.percentile(df['Number_of_Riders'], low_supply_percentile)/df['Number_of_Riders'],
                                 )
demand_threshold_high=1.2
demand_threshold_low=0.8
supply_threshold_high=0.8
supply_threshold_low=1.2

#calculate adjusted_ride_cost for dynamic pricing
df['adjusted_ride_cost']=df['Historical_Cost_of_Ride']*(
    np.maximum(df['demand_multiplier'], demand_threshold_low)*
    np.maximum(df['supply_multiplier'], demand_threshold_high)
)

We start by defining percentiles for high and low demand levels. We then calculate the demand multiplier by comparing the number of riders to these percentiles. If riders exceed the high-demand percentile, the multiplier is the number of riders divided by this percentile. If they fall below the low-demand percentile, the multiplier is calculated similarly.

Next, we establish percentiles for high and low supply levels. If the number of drivers exceeds the low-supply percentile, the supply multiplier is the high-supply percentile divided by the number of drivers. If drivers fall below the low-supply percentile, the multiplier is the low-supply percentile divided by the number of drivers.

Finally, we calculate the adjusted ride cost for dynamic pricing by multiplying the historical cost by the greater of the demand multiplier or a lower threshold, and by the greater of the supply multiplier or an upper threshold. This method ensures the adjusted ride cost reflects both demand and supply effects while using thresholds to control price changes.

Now we can calculate the profit percentage with this dynamic pricing strategy.

In [8]:
df['profit_percentage']=((df['adjusted_ride_cost']-df['Historical_Cost_of_Ride'])/df['Historical_Cost_of_Ride']) *100
#profitable rides
profitable_ride=df[df['profit_percentage']>0]
#loss rides
loss_rides=df[df['profit_percentage']<0]

import plotly.graph_objects

#counts
profitable_count=len(profitable_ride)
loss_count=len(loss_rides)

labels=['Profitable Rides','Loss Rides']
values=[profitable_count,loss_count]

fig=go.Figure(data=[go.Pie(labels=labels, values=values, hole=0.6)])
fig.update_layout(title='Profitability of Rides(Dynamic Pricing vs Historic Pricing)')
fig.show()

From the donut chart above, we can see a resounding 89.7% of all rides implementing dynamic pricing strategy turned out to be profitable rides.

We can now look at the relationship between the expected ride duration and the adjusted cost of the ride based on the dynamic pricing strategy:

In [9]:
# Relationship between Expected Ride Duration VS Cost_Ride_Cost
fig=px.scatter(df, x='Expected_Ride_Duration',
               y='adjusted_ride_cost',
               title='Expected Ride Duration VS Cost_Ride_Cost',
               trendline='ols')
fig.show()

The scatter plot above shows a wider distribution of ride costs for certain durations, indicating that multiple variables are influencing the ride pricing.

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def data_preprocessing_pipeline(df):
    # Identifying numeric and categorical features
    numeric_features = df.select_dtypes(include=['float', 'int']).columns
    categorical_features = df.select_dtypes(include=['object']).columns

    # Handling missing values in numeric features
    df[numeric_features] = df[numeric_features].fillna(df[numeric_features].mean())

    # Handling outliers using IQR method
    for feature in numeric_features:
        Q1 = df[feature].quantile(0.25)
        Q3 = df[feature].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - (1.5 * IQR)
        upper_bound = Q3 + (1.5 * IQR)

        df[feature] = np.where((df[feature] < lower_bound) | (df[feature] > upper_bound),
                               df[feature].mean(), df[feature])

    # Handling missing values in categorical features
    df[categorical_features] = df[categorical_features].fillna(df[categorical_features].mode().iloc[0])

    return df


In [15]:
#Mapping vehicle type to a numerical feature
df['Vehicle_Type']=df['Vehicle_Type'].map({"Premium":1, "Economy":0})

In [16]:
#Mapping time_of_booking to a numerical feature
df["Time_of_Booking"]=df["Time_of_Booking"].map({"Afternoon":0, "Evening":1,"Morning":2,"Night":3})

Training the model to predict the cost of a ride

In [None]:
#spliting data to train the model
from sklearn.model_selection import train_test_split
x=np.array