## PROJECT - Deployment ##

# Getaround Delay Analysis 🚗 #

In [13]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


In [None]:
DELAY_ANALYSIS_XL_FILE_PATH = "data/get_around_delay_analysis.xlsx"

chart_default_fillcolor='#d6a62b'

### Utility methods ###

In [15]:
# Constants
DECIMAL_FORMAT_2 = "%.2f"
def format_2_decimals(number) -> str:
    return (DECIMAL_FORMAT_2 % number)

def decimal_format_str(nb_decimals:int) -> str:
    return "{:."+str(nb_decimals)+"f}"

def format_decimals(number, nb_decimals:int) ->str:
    return decimal_format_str(nb_decimals).format(number)

In [16]:
delay_analysis_dtf = pd.read_excel(DELAY_ANALYSIS_XL_FILE_PATH)

In [17]:
delay_analysis_dtf.head(10)

Unnamed: 0,rental_id,car_id,checkin_type,state,delay_at_checkout_in_minutes,previous_ended_rental_id,time_delta_with_previous_rental_in_minutes
0,505000,363965,mobile,canceled,,,
1,507750,269550,mobile,ended,-81.0,,
2,508131,359049,connect,ended,70.0,,
3,508865,299063,connect,canceled,,,
4,511440,313932,mobile,ended,,,
5,511626,398802,mobile,ended,-203.0,,
6,511639,370585,connect,ended,-15.0,563782.0,570.0
7,512303,371242,mobile,ended,-44.0,,
8,512475,322502,mobile,canceled,,,
9,513434,256528,connect,ended,23.0,,


In [18]:
print(f'number of rows {delay_analysis_dtf.shape[0]}')

number of rows 21310


### Computing some figures like checkin_type distribution, distribution of rental states, rentals with previous rental,...

In [19]:
dtf = delay_analysis_dtf.groupby(['checkin_type']).size()*100/len(delay_analysis_dtf)
dtf = dtf.reset_index()
dtf.columns = ['checkin_type', 'proportion']
dtf

Unnamed: 0,checkin_type,proportion
0,connect,20.211168
1,mobile,79.788832


In [20]:
dtf = (
    delay_analysis_dtf['checkin_type']
    .value_counts(normalize=True)
    .mul(100)
    .round(2)
    .reset_index()
)
dtf

Unnamed: 0,checkin_type,proportion
0,mobile,79.79
1,connect,20.21


In [21]:
dtf = delay_analysis_dtf.groupby(['state']).size()*100/len(delay_analysis_dtf)
dtf = dtf.reset_index()
dtf.columns = ['state', 'proportion']
dtf

Unnamed: 0,state,proportion
0,canceled,15.321445
1,ended,84.678555


In [22]:
delay_analysis_delta_with_previous_dtf = delay_analysis_dtf.query("time_delta_with_previous_rental_in_minutes.notnull()")
print(delay_analysis_delta_with_previous_dtf.shape[0])

1841


In [23]:

print(delay_analysis_dtf.query("previous_ended_rental_id.notnull()").shape[0])

1841


### Which share of our owner’s revenue would potentially be affected by the feature? ###

The Proportion of rentals with a previous rental and with a delay < 12h with previous rentals helps answer this question

time_delta_with_previous_rental_in_minutes column holds this information

Difference in minutes between this rental planned start time and the previous rental planned end time (when lower than 12 hours, NULL if higher)

Statisticaly (based on history / dataset), for a given threshold (ie a minimum delay between two rentals ), the drivers who rented a car with a delay with previous rental planned checkout time inferior of the threshold
would not have been able to rent the car hence this proportion of rentals represents a potential drop of revenue.


In [24]:
delay_analysis_delta_with_previous_dtf = delay_analysis_dtf.query("time_delta_with_previous_rental_in_minutes.notnull()")

In [25]:
proportion_delay_analysis_delta_with_previous = delay_analysis_delta_with_previous_dtf.shape[0]/delay_analysis_dtf.shape[0]
print(f'proportion (%) of rentals potentially** affected by a minimum delay between two rentals : {format_decimals(100*proportion_delay_analysis_delta_with_previous,2)}')

proportion (%) of rentals potentially** affected by a minimum delay between two rentals : 8.64


### How many rentals would be affected by the feature depending on the threshold and scope we choose? ###

Vizualise the distribution of rentals based on time_delta_with_previous_rental_in_minutes (regardless scope)

Rentals affected by the threshold : if delay < threshold. 

Hence if delay2 < delay1 < threshold => rentals with delay2 also concerned, so cumulative histogram makes sense

#### 1. How many rentals would be affected by the feature depending on the threshold (regardless the scope) we choose? ####

In [26]:
n = 12  # Number of values
step = 60
custom_bins = np.arange(step, (n + 1) * step, step)

print(custom_bins)

hist, bin_edges = np.histogram(delay_analysis_delta_with_previous_dtf['time_delta_with_previous_rental_in_minutes'], bins=custom_bins, density=False)
cumulative_counts = np.cumsum(hist)

print("cumulative_counts")
print(cumulative_counts)


proportions = cumulative_counts / cumulative_counts[-1]  # Normalize to 1
# Use midpoints for x values
bin_midpoints = 0.5 * (np.array(bin_edges[:-1]) + np.array(bin_edges[1:]))

print("bin mid_points")
print(bin_midpoints)
#base = delay_analysis_delta_with_previous_dtf.shape[0]
#counts = proportions * base
#print("counts")
#print(counts)

fig = go.Figure()

# Primary Y-Axis (Proportion)
fig.add_trace(go.Scatter(
    x=bin_midpoints,
    y=proportions,
    name="Cumulative Proportion",
    yaxis="y1",
    mode="lines+markers"
))

# Secondary Y-Axis (Proportion × constant)
fig.add_trace(go.Scatter(
    x=bin_midpoints,
    y=cumulative_counts,
    name="Cumulative Count",
    yaxis="y2",
    mode="lines+markers",
    line=dict(dash='dot')
))

# Layout with dual y-axes
fig.update_layout(
    title="Cumulative Distribution of rentals affected by Threshold",
    xaxis=dict(title="Threshold - Time delta with previous rental (minutes)"),
    yaxis=dict(title="Proportion", tickformat=".0%"),
    yaxis2=dict(
        title="Count",
        overlaying="y",
        side="right"
    ),
    width=800,
    height=500,
    legend=dict(x=0.5, y=1.1, xanchor="center", orientation="h")
)

fig.show()

[ 60 120 180 240 300 360 420 480 540 600 660 720]
cumulative_counts
[ 265  469  600  705  769  825  889  967 1048 1173 1440]
bin mid_points
[ 90. 150. 210. 270. 330. 390. 450. 510. 570. 630. 690.]


With a threshold of 210 minutes, approximatively 600 rentals are concerned. It represents 40% of the rentals with a previous rental, and 40% * 8.64% of the whole population.

#### 2. How many rentals would be affected by the feature depending on the threshold and the scope we choose? ####

In [27]:
from plotly.subplots import make_subplots

n = 12  # Number of values
step = 60
custom_bins = np.arange(step, (n + 1) * step, step)
bin_midpoints = 0.5 * (custom_bins[:-1] + custom_bins[1:])

# Prepare subplot layout
scopes = sorted(delay_analysis_dtf["checkin_type"].unique())
n_rows = len(scopes)

fig = go.Figure()

# Add traces
for i, scope in enumerate(scopes):
    data = delay_analysis_delta_with_previous_dtf[delay_analysis_delta_with_previous_dtf["checkin_type"] == scope]["time_delta_with_previous_rental_in_minutes"]
    #counts, _ = np.histogram(data, bins=custom_bins, density=False)
    hist, _ = np.histogram(data, bins=custom_bins, density=False)
    #proportions = counts / counts.sum()
    #cumulative = np.cumsum(proportions)
    cumulative_counts = np.cumsum(hist)
    proportions = cumulative_counts / cumulative_counts[-1]  # Normalize to 1

    print("cumulative_counts")
    print(cumulative_counts)

    print("proportions")
    print(proportions)
    fig.add_trace(
        go.Scatter(
            x=bin_midpoints,
            y=proportions,
            mode='lines+markers',
            name=f"{scope} (Proportion)",
            yaxis="y1",
            showlegend=True,
        )
    )

    # Secondary Y-Axis (Proportion × constant)
    fig.add_trace(go.Scatter(
        x=bin_midpoints,
        y=cumulative_counts,
        name=f"{scope} (Count)",
        yaxis="y2",
        mode="lines+markers",
        line=dict(dash='dot')
    ),
    )

# Layout
fig.update_layout(
    height= 300 * n_rows,
    width = 800,
    yaxis=dict(title="Proportion", tickformat=".0%", range=[0, 1], side='left'),
    yaxis2=dict(
        title="Cumulative count",
        overlaying="y",
        side="right", showgrid=False
    ),
    title_text="Cumulative Distribution of rentals affected by Threshold",
    xaxis_title="Threshold - Time delta with previous rental (minutes)",
    yaxis_title="Cumulative Proportion",
    template="plotly_white",
    legend=dict(x=0.01, y=0.99),
)

fig.show()

cumulative_counts
[114 191 249 297 319 341 369 408 451 511 632]
proportions
[0.18037975 0.30221519 0.39398734 0.46993671 0.50474684 0.53955696
 0.58386076 0.64556962 0.71360759 0.8085443  1.        ]
cumulative_counts
[151 278 351 408 450 484 520 559 597 662 808]
proportions
[0.18688119 0.34405941 0.43440594 0.5049505  0.55693069 0.5990099
 0.64356436 0.69183168 0.73886139 0.81930693 1.        ]


In [28]:
from plotly.subplots import make_subplots

n = 12  # Number of values
step = 60
custom_bins = np.arange(step, (n + 1) * step, step)
bin_midpoints = 0.5 * (custom_bins[:-1] + custom_bins[1:])

# Prepare subplot layout
scopes = sorted(delay_analysis_dtf["checkin_type"].unique())
n_rows = len(scopes)

fig = make_subplots(
    rows=n_rows,
    cols=1,
    shared_xaxes=True,
    subplot_titles=[scope for scope in scopes]
)

# Add traces
for i, scope in enumerate(scopes):
    data = delay_analysis_delta_with_previous_dtf[delay_analysis_delta_with_previous_dtf["checkin_type"] == scope]["time_delta_with_previous_rental_in_minutes"]
    hist, _ = np.histogram(data, bins=custom_bins, density=False)
    cumulative_counts = np.cumsum(hist)
    proportions = cumulative_counts / cumulative_counts[-1]  # Normalize to 1

    fig.add_trace(
        go.Scatter(
            x=bin_midpoints,
            y=cumulative_counts,
            mode='lines+markers',
            name=scope,
            showlegend=True
        ),
        row=i+1,
        col=1
    )

# Layout
fig.update_layout(
    height=300 * n_rows,
    width=800,
    title_text="Cumulative Distribution of rentals affected by Threshold by scope",
    xaxis_title="Threshold - Time delta with previous rental (minutes)",
    yaxis_title="Cumulative count",
    template="plotly_white"
)

fig.show()


### How often are drivers late for the next check-in? How does it impact the next driver? ###


If we take a rental with previous_ended_rental_id not null, then if we look-up rental with rental_id equals to previous_ended_rental_id and with delay_at_checkout_in_minutes > 0, it means previous driver completed the check-out
after the planned check-out time. But it does not mean he was late for next check-in, it depends on time_delta_with_previous_rental_in_minutes (for the initial rental).
If delay_at_checkout_in_minutes (for previous rental) > 0  and delay_at_checkout_in_minutes (for previous rental) > time_delta_with_previous_rental_in_minutes (for the previous next rental), it means driver of previous rental was late for next check-in.


#### 1. How often are drivers late for the next check-in? ####

We are computing the proportion of drivers late for next check-in (number of drivers late for next-checkin divided by the number of drivers with a next check-in)

In [29]:
rentals_with_previous_rental = delay_analysis_dtf.loc[delay_analysis_dtf['previous_ended_rental_id'].notnull()]
print(f'Count of rentals with previous rental : {rentals_with_previous_rental.shape[0]}')
print(f'Proportion (%) of rentals with previous rental : {format_decimals(100*rentals_with_previous_rental.shape[0]/delay_analysis_dtf.shape[0],2)}')
rentals_with_delay_checkout_dtf = delay_analysis_dtf.loc[delay_analysis_dtf['delay_at_checkout_in_minutes'] > 0 ]

drivers_late_next_checking_dtf = pd.merge(rentals_with_delay_checkout_dtf,rentals_with_previous_rental, left_on=['rental_id'], right_on=['previous_ended_rental_id'], how='inner', suffixes=['_previous', '_next'])
drivers_late_next_checking_dtf = drivers_late_next_checking_dtf.loc[drivers_late_next_checking_dtf['delay_at_checkout_in_minutes_previous'] > drivers_late_next_checking_dtf['time_delta_with_previous_rental_in_minutes_next']]
print(f'proportion (%) of drivers late for next check-in : {format_decimals(100*drivers_late_next_checking_dtf.shape[0]/rentals_with_previous_rental.shape[0],2)}')
drivers_late_next_checking_dtf.head(5)

Count of rentals with previous rental : 1841
Proportion (%) of rentals with previous rental : 8.64
proportion (%) of drivers late for next check-in : 11.84


Unnamed: 0,rental_id_previous,car_id_previous,checkin_type_previous,state_previous,delay_at_checkout_in_minutes_previous,previous_ended_rental_id_previous,time_delta_with_previous_rental_in_minutes_previous,rental_id_next,car_id_next,checkin_type_next,state_next,delay_at_checkout_in_minutes_next,previous_ended_rental_id_next,time_delta_with_previous_rental_in_minutes_next
2,536343,369230,mobile,ended,780.0,,,536315,369230,mobile,ended,-191.0,536343.0,720.0
3,536978,378931,mobile,ended,271.0,,,544473,378931,mobile,ended,-4.0,536978.0,60.0
6,541444,397413,mobile,ended,901.0,,,543665,397413,mobile,ended,7.0,541444.0,30.0
7,544443,337047,mobile,ended,56.0,,,547047,337047,mobile,ended,-53.0,544443.0,30.0
8,545064,194343,connect,ended,74.0,,,547783,194343,connect,ended,-61.0,545064.0,30.0


#### 2.How does it impact the next driver? ####

When drivers are late for next check-in, do next drivers tend to reduce their rental duration / check-out in advance ? What are the proportion of next drivers cancelling their rental ?
Are scope / checkin types equally reprensented within drivers with late check-in ?
Does it impact next next rental ?
Next, we calculate some figures to help answer this questions comparing with the same figures calculated over the dataset of rentals with a previous/next rental

In [30]:
# 1. rental with Driver late for next_checkin 2. next rental 3. next next rental
next_drivers_with_next_drivers_dtf = pd.merge(drivers_late_next_checking_dtf, rentals_with_previous_rental, left_on='rental_id_next', right_on='previous_ended_rental_id', how='inner', suffixes=['','_next_next'])
print(next_drivers_with_next_drivers_dtf.shape[0])
#next_drivers_with_next_drivers_dtf.head(5)
next_driver_late_next_checkin_dtf = next_drivers_with_next_drivers_dtf.loc[(next_drivers_with_next_drivers_dtf['delay_at_checkout_in_minutes_next'] > 0) & (next_drivers_with_next_drivers_dtf['delay_at_checkout_in_minutes_next'] > next_drivers_with_next_drivers_dtf['time_delta_with_previous_rental_in_minutes'])]
next_driver_late_next_checkin_dtf.shape[0]

21


3

In [31]:
print('Within late drivers for next check-in')
print('-------'*10)
print(f'proportion (%) of next drivers returning car in advance : {format_decimals(100*drivers_late_next_checking_dtf.query("delay_at_checkout_in_minutes_next < 0").shape[0]/drivers_late_next_checking_dtf.shape[0],2)}')
print(f'proportion (%) of next drivers returning car late : {format_decimals(100*drivers_late_next_checking_dtf.query("delay_at_checkout_in_minutes_next > 0").shape[0]/drivers_late_next_checking_dtf.shape[0],2)}')
print(f'proportion (%) of next drivers cancelling their rental : {format_decimals(100*drivers_late_next_checking_dtf.query('state_next == "canceled"').shape[0]/drivers_late_next_checking_dtf.shape[0],2)}')
print()
grouped = drivers_late_next_checking_dtf.groupby(['checkin_type_previous']).size()*100/len(drivers_late_next_checking_dtf)
grouped = grouped.reset_index()
grouped.columns = ['scope', 'proportion']
#grouped.columns = ['proportion']
grouped.set_index(['scope'], inplace=True)
grouped

Within late drivers for next check-in
----------------------------------------------------------------------
proportion (%) of next drivers returning car in advance : 24.31
proportion (%) of next drivers returning car late : 53.21
proportion (%) of next drivers cancelling their rental : 16.97



Unnamed: 0_level_0,proportion
scope,Unnamed: 1_level_1
connect,31.651376
mobile,68.348624


In [32]:
next_drivers_returning_late = (
    drivers_late_next_checking_dtf.loc[drivers_late_next_checking_dtf['delay_at_checkout_in_minutes_next'] > 0]['checkin_type_previous']
    .value_counts(normalize=True)
    .mul(100).round(2).reset_index()
)
next_drivers_returning_late

Unnamed: 0,checkin_type_previous,proportion
0,mobile,79.31
1,connect,20.69


In [35]:
next_drivers_canceling_rental = (
    drivers_late_next_checking_dtf.loc[drivers_late_next_checking_dtf['state_next'] == 'canceled']['checkin_type_previous']
    .value_counts(normalize=True)
    .mul(100).round(2).reset_index()
)
next_drivers_returning_late

Unnamed: 0,checkin_type_previous,proportion
0,mobile,79.31
1,connect,20.69


In [36]:
print('Within all rentals with previous rentals')
print('-------'*10)
dtf = delay_analysis_dtf.loc[(delay_analysis_dtf['previous_ended_rental_id'].notnull())]
print("dtf len : ", len(dtf))
#& delay_analysis_dtf['delay_at_checkout_in_minutes'] < 0
print(f'proportion (%) of next drivers returning car in advance : {format_decimals(100*dtf.query("delay_at_checkout_in_minutes < 0").shape[0]/dtf.shape[0],2)}')
print(f'proportion (%) of next drivers returning car late : {format_decimals(100*dtf.query("delay_at_checkout_in_minutes > 0").shape[0]/dtf.shape[0],2)}')
print(f'proportion (%) of next drivers cancelling their rental : {format_decimals(100*dtf.query('state == "canceled"').shape[0]/dtf.shape[0],2)}')
print()
grouped = dtf.groupby(['checkin_type']).size()*100/len(dtf)
grouped = grouped.reset_index()
grouped.columns = ['scope', 'proportion']
grouped

Within all rentals with previous rentals
----------------------------------------------------------------------
dtf len :  1841
proportion (%) of next drivers returning car in advance : 37.97
proportion (%) of next drivers returning car late : 43.56
proportion (%) of next drivers cancelling their rental : 12.44



Unnamed: 0,scope,proportion
0,connect,44.160782
1,mobile,55.839218


#### How many problematic cases will it solve depending on the chosen threshold and scope? ####

We compute min_delay_between_rentals = delay_at_checkout_in_minutes_previous for drivers_late_next_checking_dtf dataset 
(time_delta_with_previous_rental_in_minutes_next + (delay_at_checkout_in_minutes_previous-time_delta_with_previous_rental_in_minutes_next))

Thus with this delay / threshold it solves this problematic case.

Then by computing cumulative distribution function based on this new field / threshold, it gives the number of problematic cases solved by threshold

In [44]:
drivers_late_next_checking_dtf['min_delay_between_rentals'] = drivers_late_next_checking_dtf['time_delta_with_previous_rental_in_minutes_next'] + (drivers_late_next_checking_dtf['delay_at_checkout_in_minutes_previous']-drivers_late_next_checking_dtf['time_delta_with_previous_rental_in_minutes_next'])
drivers_late_next_checking_dtf.head(10)

Unnamed: 0,rental_id_previous,car_id_previous,checkin_type_previous,state_previous,delay_at_checkout_in_minutes_previous,previous_ended_rental_id_previous,time_delta_with_previous_rental_in_minutes_previous,rental_id_next,car_id_next,checkin_type_next,state_next,delay_at_checkout_in_minutes_next,previous_ended_rental_id_next,time_delta_with_previous_rental_in_minutes_next,min_delay_between_rentals
2,536343,369230,mobile,ended,780.0,,,536315,369230,mobile,ended,-191.0,536343.0,720.0,780.0
3,536978,378931,mobile,ended,271.0,,,544473,378931,mobile,ended,-4.0,536978.0,60.0,271.0
6,541444,397413,mobile,ended,901.0,,,543665,397413,mobile,ended,7.0,541444.0,30.0,901.0
7,544443,337047,mobile,ended,56.0,,,547047,337047,mobile,ended,-53.0,544443.0,30.0,56.0
8,545064,194343,connect,ended,74.0,,,547783,194343,connect,ended,-61.0,545064.0,30.0,74.0
11,548646,282893,mobile,ended,201.0,,,539151,282893,mobile,canceled,,548646.0,30.0,201.0
15,557514,405433,mobile,ended,47.0,,,557845,405433,mobile,ended,-11.0,557514.0,0.0,47.0
16,558583,326578,mobile,ended,3.0,,,555821,326578,mobile,ended,1143.0,558583.0,0.0,3.0
18,562173,394024,mobile,ended,73.0,,,554062,394024,mobile,ended,-48.0,562173.0,0.0,73.0
19,562174,381499,connect,ended,73.0,,,568224,381499,connect,ended,-93.0,562174.0,0.0,73.0


In [50]:
drivers_late_next_checking_threshold_dtf = drivers_late_next_checking_dtf.loc[:,['min_delay_between_rentals', 'checkin_type_previous' ]].copy()
drivers_late_next_checking_threshold_dtf.head(5)
drivers_late_next_checking_threshold_dtf.rename(columns={'checkin_type_previous':'checkin_type'}, inplace=True)

In [51]:



n = 12  # Number of values
step = 60
custom_bins = np.arange(step, (n + 1) * step, step)
bin_midpoints = 0.5 * (custom_bins[:-1] + custom_bins[1:])

# Prepare subplot layout
scopes = sorted(drivers_late_next_checking_threshold_dtf["checkin_type"].unique())
n_rows = len(scopes)

fig = make_subplots(
    rows=n_rows,
    cols=1,
    shared_xaxes=True,
    subplot_titles=[scope for scope in scopes]
)

# Add traces
for i, scope in enumerate(scopes):
    data = drivers_late_next_checking_threshold_dtf[drivers_late_next_checking_threshold_dtf["checkin_type"] == scope]["min_delay_between_rentals"]
    hist, _ = np.histogram(data, bins=custom_bins, density=False)
    cumulative_counts = np.cumsum(hist)
    proportions = cumulative_counts / cumulative_counts[-1]  # Normalize to 1

    fig.add_trace(
        go.Scatter(
            x=bin_midpoints,
            y=cumulative_counts,
            mode='lines+markers',
            name=scope,
            showlegend=True
        ),
        row=i+1,
        col=1
    )

# Layout
fig.update_layout(
    height=300 * n_rows,
    width=800,
    title_text="Cumulative Distribution of rentals potentialy solved by Threshold",
    xaxis_title="Threshold - Time delta with previous rental (minutes)",
    yaxis_title="Cumulative count",
    template="plotly_white"
)

fig.show()