In [1]:
# Core numerical / tabular stack.
import numpy as np

import pandas as pd

# Static plotting (matplotlib + seaborn) with a shared dark-grid theme.
import matplotlib.pyplot as plt

import seaborn as sns

sns.set_style('darkgrid')

sns.set_context('paper', font_scale = 1.4)

# Interactive plotting; plotly's default template is set to ggplot2.
from plotly import express as exp, graph_objects as go, io as pio

pio.templates.default = 'ggplot2'

from plotly.subplots import make_subplots

# NOTE(review): wildcard import from a project-local helper module. This notebook
# calls show_nan and stacked_bar_chart_ci_2, which are presumably defined there
# — TODO confirm and import them by name.
from pandas_utils.pandas_utils_2 import *

import ipywidgets as widgets

from IPython.display import display

import warnings


# Silence deprecation / future warnings for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning)

warnings.filterwarnings("ignore", category=FutureWarning)
    

In [2]:
# Read the offers and orders tables from the local datasets folder.
offers_df, orders_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./datasets/data_offers.csv", "./datasets/data_orders.csv")
)

# (rows, columns) of each table, offers first.
tuple(frame.shape for frame in (offers_df, orders_df))

((334363, 2), (10716, 8))

In [3]:
# Peek at the first two offer rows.
offers_df.head(n=2)

Unnamed: 0,order_gk,offer_id
0,3000579625629,300050936206
1,3000627306450,300052064651


In [4]:
# Peek at the first two order rows.
orders_df.head(n=2)

Unnamed: 0,order_datetime,origin_longitude,origin_latitude,m_order_eta,order_gk,order_status_key,is_driver_assigned_key,cancellations_time_in_seconds
0,18:08:07,-0.978916,51.456173,60.0,3000583041974,4,1,198.0
1,20:57:32,-0.950385,51.456843,,3000583116437,4,0,128.0


In [5]:
# orders_df = pd.merge(orders_df, offers_df, on = 'order_gk', how = 'left')

In [6]:
# Keep the order-level columns only. offers_df is not merged into this frame,
# so there is no offer_id column to carry along.
orders_df = orders_df.loc[
    :,
    [
        'order_datetime',
        'origin_longitude',
        'origin_latitude',
        'm_order_eta',
        'order_gk',
        'order_status_key',
        'is_driver_assigned_key',
        'cancellations_time_in_seconds',
    ],
]

In [7]:
# Fix the column name: "cancellations_..." -> "cancellation_...".
orders_df = orders_df.rename(
    columns = {"cancellations_time_in_seconds": "cancellation_time_in_seconds"}
)

In [8]:
# show_nan is not defined in this notebook; presumably it comes from the
# pandas_utils wildcard import and summarises missing values — TODO confirm.
# NOTE(review): the recorded output is Streamlit's "streamlit run" usage hint
# rather than a table, which suggests the helper module pulls in streamlit.
show_nan(orders_df)

2024-01-12 19:41:09.389 
  command:

    streamlit run D:\py_projects\realty_rental_analysis\venv\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]


In [9]:
# Label each order with who cancelled it: key 4 is treated as a client
# cancellation and every other key as a system cancellation. Same rule as the
# previous per-row lambda, evaluated on the whole column at once.
orders_df['order_status'] = np.where(
    orders_df.order_status_key == 4,
    "cancelled_by_client",
    "cancelled_by_system",
)

In [10]:
# Share of all orders per (status key, status label) pair; also shows which
# key ended up under which label.
orders_df[['order_status_key', 'order_status']].value_counts().div(len(orders_df)).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
order_status_key,order_status,Unnamed: 2_level_1
4,cancelled_by_client,0.681878
9,cancelled_by_system,0.318122


In [11]:
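# Offers are not applied on 9% of the total cancelled rides.
# NOTE(review): the is_no_offer breakdown later in this notebook gives
# 0.149496 + 0.140351 ≈ 29% of orders without any offer — reconcile the two figures.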

In [12]:
# Cancellation time in seconds --> Cancellation time in seconds by client.

In [13]:
# Flag rows with a missing ETA (1 = missing, 0 = present). isna() tests for
# missing values directly instead of searching for "nan" in each value's text,
# which would miss None / pd.NA and could match unrelated strings.
orders_df['m_order_eta_is_null'] = orders_df.m_order_eta.isna().astype('int64')

In [14]:
# orders_df.is_driver_assigned_key = orders_df.is_driver_assigned_key.apply(lambda x: str(x))

In [15]:
# Share of all orders per (ETA missing flag, driver assigned flag) pair.
orders_df[['m_order_eta_is_null', 'is_driver_assigned_key']].value_counts().to_frame().div(len(orders_df))

Unnamed: 0_level_0,Unnamed: 1_level_0,count
m_order_eta_is_null,is_driver_assigned_key,Unnamed: 2_level_1
1,0,0.737402
0,1,0.262598


In [16]:
# m_order_eta is populated only when a driver is assigned: in the table above a
# missing ETA always coincides with is_driver_assigned_key == 0.

In [17]:
# The numeric status key is no longer needed now that order_status holds the label.
orders_df = orders_df.drop(labels = ['order_status_key'], axis = 'columns')

In [18]:
# Column inventory after dropping the status key.
orders_df.columns.tolist()

['order_datetime',
 'origin_longitude',
 'origin_latitude',
 'm_order_eta',
 'order_gk',
 'is_driver_assigned_key',
 'cancellation_time_in_seconds',
 'order_status',
 'm_order_eta_is_null']

In [19]:
# Share of all orders per (ETA missing flag, cancellation status) pair.
orders_df[['m_order_eta_is_null', 'order_status']].value_counts().div(len(orders_df)).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
m_order_eta_is_null,order_status,Unnamed: 2_level_1
1,cancelled_by_client,0.41956
1,cancelled_by_system,0.317842
0,cancelled_by_client,0.262318
0,cancelled_by_system,0.00028


In [20]:
# stacked_bar_chart_ci_2 is not defined in this notebook (presumably from the
# pandas_utils wildcard import — TODO confirm); called here with the ETA-missing
# flag and the cancellation status columns.
stacked_bar_chart_ci_2(orders_df, 'm_order_eta_is_null', 'order_status')

In [21]:
# When a driver is assigned, most of the ride cancellations are done by the client.

In [22]:
# Correlation matrix between cancellation time and ETA.
orders_df.loc[:, ['cancellation_time_in_seconds', "m_order_eta"]].corr()

Unnamed: 0,cancellation_time_in_seconds,m_order_eta
cancellation_time_in_seconds,1.0,-0.038853
m_order_eta,-0.038853,1.0


In [23]:
# Column inventory before adding the offer flag.
orders_df.columns.tolist()

['order_datetime',
 'origin_longitude',
 'origin_latitude',
 'm_order_eta',
 'order_gk',
 'is_driver_assigned_key',
 'cancellation_time_in_seconds',
 'order_status',
 'm_order_eta_is_null']

In [24]:
# Order ids that appear at least once in the offers table.
unique_offer_orders = offers_df.order_gk.unique()

# 1 when the order never appears in offers_df, 0 otherwise. isin() does one
# hashed membership test for the whole column, whereas `x not in array` inside
# apply() scanned the full offers array once per order.
orders_df['is_no_offer'] = (~orders_df.order_gk.isin(unique_offer_orders)).astype('int64')

In [25]:
# Share of all orders per (cancellation status, no-offer flag) pair.
orders_df[['order_status', 'is_no_offer']].value_counts().div(len(orders_df)).to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
order_status,is_no_offer,Unnamed: 2_level_1
cancelled_by_client,0,0.532381
cancelled_by_system,0,0.177772
cancelled_by_client,1,0.149496
cancelled_by_system,1,0.140351


In [26]:
# stacked_bar_chart_ci_2 is not defined in this notebook (presumably from the
# pandas_utils wildcard import — TODO confirm); called here with the
# cancellation status and the no-offer flag columns.
stacked_bar_chart_ci_2(orders_df, 'order_status', 'is_no_offer')

In [27]:
# Column inventory now that is_no_offer has been added.
orders_df.columns.tolist()

['order_datetime',
 'origin_longitude',
 'origin_latitude',
 'm_order_eta',
 'order_gk',
 'is_driver_assigned_key',
 'cancellation_time_in_seconds',
 'order_status',
 'm_order_eta_is_null',
 'is_no_offer']

In [28]:
# Planning notes kept as a bare string literal (so the cell echoes it back as
# its output): the filters and the distribution plots intended for the
# interactive view.
'''

Filters:

order_datetime (slider)

is_no_offer,

is_driver_assigned_key,

order_status


Calcs:

m_order_eta kdeplot

cancellation_time_in_seconds kdeplot



'''

'\n\nFilters:\n\norder_datetime (slider)\n\nis_no_offer,\n\nis_driver_assigned_key,\n\norder_status\n\n\nCalcs:\n\nm_order_eta kdeplot\n\ncancellation_time_in_seconds kdeplot\n\n\n\n'

In [29]:
# Findings kept as a bare string literal (echoed back as the cell output).
# NOTE(review): these percentages do not match the m_order_eta_is_null x
# order_status table computed earlier in this notebook. That table gives, without
# a driver, 0.41956 / 0.737402 ≈ 57% client vs ≈ 43% system, and with a driver,
# 0.262318 / 0.262598 ≈ 99.89% client vs ≈ 0.11% system. Re-derive before
# relying on the figures below.
'''

When a driver is not assigned:


Cancelled by client: 53 percent


Cancelled by system: 47 percent



When a driver is assigned:


Cancelled by client: 99.94 percent


Cancelled by system: 0.06 percent


'''

'\n\nWhen a driver is not assigned:\n\n\nCancelled by client: 53 percent\n\n\nCancelled by system: 47 percent\n\n\n\nWhen a driver is assigned:\n\n\nCancelled by client: 99.94 percent\n\n\nCancelled by system: 0.06 percent\n\n\n'

In [30]:
# Seconds --> Minutes
# Plain column arithmetic replaces the per-row lambdas; results are identical.
orders_df['cancellation_time_in_minutes'] = orders_df.cancellation_time_in_seconds / 60

# NOTE: m_order_eta is overwritten in place, so from here on it holds the
# value divided by 60, and re-running this cell divides it by 60 again.
orders_df['m_order_eta'] = orders_df.m_order_eta / 60

# Hour of day: the leading "HH" field of the "HH:MM:SS" order_datetime strings.
orders_df['hour'] = orders_df.order_datetime.str.split(":").str[0].astype('int64')



In [31]:
# Inspect the raw time-of-day strings the hour column was derived from.
orders_df['order_datetime']

0        18:08:07
1        20:57:32
2        12:07:50
3        13:50:20
4        21:24:45
           ...   
10711    13:11:35
10712    13:13:55
10713    13:17:21
10714    13:16:28
10715    11:49:35
Name: order_datetime, Length: 10716, dtype: object