In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from pyhive import presto
from pymongo import MongoClient
import warnings
warnings.filterwarnings("ignore")
import my_func

import os

# Presto connection shared by every query in this notebook.
# Host, port and username can be overridden through the PRESTO_HOST,
# PRESTO_PORT and PRESTO_USERNAME environment variables so the notebook can be
# run by other users without editing code; the fallbacks are the values that
# were previously hard-coded here.
presto_conn = presto.connect(
    host=os.environ.get('PRESTO_HOST', 'bi-presto.serving.data.production.internal'),
    port=int(os.environ.get('PRESTO_PORT', '80')),
    protocol='http',
    catalog='hive',
    username=os.environ.get('PRESTO_USERNAME', 'mayank.jha@rapido.bike'),
)

In [3]:
# Inclusive analysis window as 'YYYY-MM-DD' strings. The rides cell below
# issues one weekly query for every Monday that falls inside this window.
start_date, end_date = '2022-01-03', '2022-04-03'

In [5]:
# Rides

def get_rides(start_date, end_date):
    """Fetch per-captain, per-day dropped-order counts from Presto.

    Runs a single query against ``orders.order_logs_snapshot`` through the
    notebook-level ``presto_conn`` for ``order_date`` between ``start_date``
    and ``end_date`` (both inclusive; the values are interpolated as-is into
    the SQL text).

    The query keeps rows with ``order_status = 'dropped'`` whose
    ``spd_fraud_flag`` is not true, drops anything whose service name or
    order type contains "auto", and is restricted to the cities listed in
    the SQL.

    Returns:
        DataFrame with one row per (captain_id, order_date) and the columns
        ``link_orders``, ``delivery_orders`` and ``total_orders``;
        ``order_date`` is a ``YYYYMMDD`` string.
    """
    print("fetching rides data for : ", start_date, " to ", end_date)

    sql_template = """select captain_id,
                            date_format(date_parse(order_date,'%Y-%m-%d'),'%Y%m%d') as order_date,
                            sum(case when service_obj_service_name = 'Link' then 1 else 0 end) as link_orders,
                            sum(case when service_obj_service_name in ('Delivery', 'Zomato') then 1 else 0 end) as delivery_orders,
                            count(*) as total_orders
                        from orders.order_logs_snapshot
                        where order_date >= '{sd}'
                            and order_date <= '{ed}'
                            and order_status = 'dropped'
                            and (spd_fraud_flag != true or spd_fraud_flag is null)
                            and lower(service_obj_service_name) not like '%auto%'
                            and lower(order_type) not like '%auto%'
                            and service_obj_city_display_name in ('Hyderabad','Chennai','Bangalore','Delhi','Jaipur','Kolkata')
                        group by 1,2 """
    window_rides = pd.read_sql(
        sql_template.format(sd=start_date, ed=end_date),
        presto_conn,
    )

    # The number printed is the row count of the result, i.e. one per
    # (captain_id, order_date) pair returned by the query.
    print(" fetched rides for captains", len(window_rides))
    return window_rides


# Weekly rides pull: one Presto query per Monday-to-Sunday week.

# Every Monday inside [start_date, end_date], as 'YYYY-MM-DD' strings.
date_list = [
    monday.strftime('%Y-%m-%d')
    for monday in pd.date_range(start_date, end_date, freq='W-MON')
]

# Collect the weekly frames and concatenate once at the end instead of
# re-copying the growing frame on every iteration.
weekly_frames = []
for day in date_list:
    week_start = pd.to_datetime(day)
    week_end = week_start + timedelta(days=6)
    rides = get_rides(day, week_end.strftime('%Y-%m-%d'))
    # %G is the ISO year that belongs to the ISO week number %V. Pairing %V
    # with the calendar year (%Y) mislabels a week that starts in late
    # December but is ISO week 01 of the following year.
    rides['week'] = week_start.strftime('%G-%V')
    weekly_frames.append(rides)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# window contains no Monday. The per-week row index is kept as-is, so index
# labels repeat across weeks.
df_rides = pd.concat(weekly_frames) if weekly_frames else pd.DataFrame()

fetching rides data for :  2022-01-03  to  2022-01-09
 fetched rides for captains 243894
fetching rides data for :  2022-01-10  to  2022-01-16
 fetched rides for captains 225385
fetching rides data for :  2022-01-17  to  2022-01-23
 fetched rides for captains 240580
fetching rides data for :  2022-01-24  to  2022-01-30
 fetched rides for captains 257151
fetching rides data for :  2022-01-31  to  2022-02-06
 fetched rides for captains 251084
fetching rides data for :  2022-02-07  to  2022-02-13
 fetched rides for captains 250377
fetching rides data for :  2022-02-14  to  2022-02-20
 fetched rides for captains 264740
fetching rides data for :  2022-02-21  to  2022-02-27
 fetched rides for captains 286067
fetching rides data for :  2022-02-28  to  2022-03-06
 fetched rides for captains 296931
fetching rides data for :  2022-03-07  to  2022-03-13
 fetched rides for captains 321787
fetching rides data for :  2022-03-14  to  2022-03-20
 fetched rides for captains 291968
fetching rides data f

In [6]:
# Preview the first five rows of the combined weekly rides frame.
df_rides.head(n=5)

Unnamed: 0,captain_id,order_date,link_orders,delivery_orders,total_orders,week
0,5f79d21f1e87063a612a8a60,20220103,0,5,5,2022-01
1,60d6a29e569b2651e96a49cc,20220103,3,0,3,2022-01
2,5e0e28d3a10a71228f8b249d,20220108,0,20,20,2022-01
3,5dc7d146907d93538e4017d0,20220108,3,2,5,2022-01
4,5d6654dc35769811839657fd,20220108,0,1,1,2022-01


In [None]:
# The query returns order_date as a 'YYYYMMDD' string; convert it to datetime
# so the date-window filters in the cells below compare real dates.
parsed_order_dates = pd.to_datetime(df_rides['order_date'], format='%Y%m%d')
df_rides['order_date'] = parsed_order_dates

In [83]:
# Retention of absent captains
# ----------------------------
# Base cohort : captains with at least one ride in the 7 days ending on
#               ref_date (2022-01-24 .. 2022-01-30).
# Absent      : base captains with no ride from ref_date+1 (31 Jan) through
#               ref_date+days_absent.
# Turn-up %   : share of those absent captains with at least one ride in the
#               following `lookout` days
#               (ref_date+days_absent+1 .. ref_date+days_absent+lookout).
#
# NOTE(review): lookout windows that end after the last order_date loaded into
# df_rides are silently cut short, so the longest lookouts can under-count
# returning captains -- confirm the fetch window covers ref_date + 20 + 70 days.

period = 21  # days_absent runs over 1 .. period-1 (the range end is exclusive)
lookout = [28, 35, 42, 49, 56, 63, 70]
ref_date = pd.to_datetime('2022-01-30', format='%Y-%m-%d')


def _captains_active_between(rides, first_day, last_day):
    """Return the distinct captain_ids with a row dated in [first_day, last_day]."""
    in_window = (rides['order_date'] >= first_day) & (rides['order_date'] <= last_day)
    return rides.loc[in_window, 'captain_id'].drop_duplicates()


# The base cohort does not depend on either loop variable: compute it once.
base_captains = _captains_active_between(
    df_rides, ref_date - timedelta(days=6), ref_date
)

# The absent set depends only on days_absent, not on the lookout length, so it
# is computed once per day and reused for every lookout.
absent_by_day = {}
for days_absent in range(1, period):
    active_now = _captains_active_between(
        df_rides,
        ref_date + timedelta(days=1),
        ref_date + timedelta(days=days_absent),
    )
    absent_by_day[days_absent] = base_captains[~base_captains.isin(active_now)]

# Build plain records and create the frame once (no per-row concat).
retention_records = []
for lookout_days in lookout:
    for days_absent in range(1, period):
        absent_captains = absent_by_day[days_absent]
        returned = _captains_active_between(
            df_rides,
            ref_date + timedelta(days=days_absent + 1),
            ref_date + timedelta(days=days_absent + lookout_days),
        )
        n_absent = len(absent_captains)
        n_retained = int(absent_captains.isin(returned).sum())
        # Keep the percentage numeric (it used to be stored as a string) so it
        # can be aggregated and plotted; NaN marks an empty absent cohort
        # instead of raising ZeroDivisionError.
        turn_up_percent = (
            round(n_retained / n_absent * 100.0, 2) if n_absent else float('nan')
        )
        retention_records.append({
            "days_absent": days_absent,
            "turn_up_percent": turn_up_percent,
            "lookout": lookout_days,
        })

df_ret = pd.DataFrame(
    retention_records, columns=["days_absent", "turn_up_percent", "lookout"]
)

In [82]:
# Sanity check: distinct captains with at least one ride in the 7-day base
# window that ends on ref_date.
base_window_start = ref_date - timedelta(days=6)
in_base_window = (
    (df_rides['order_date'] >= base_window_start)
    & (df_rides['order_date'] <= ref_date)
)
df_rides.loc[in_base_window, ['captain_id']].drop_duplicates()

Unnamed: 0,captain_id
0,5b628c3a5036cb268f629a84
1,61823bbb16ec525759d38b8f
2,60210629a6cc790a9722e5cf
3,5d3dbd25bcfa850af7c46e4c
4,5c63c950f2edc73367578119
...,...
257124,5fb529fde4116213a822d1c1
257126,5d94a2ad941a7d1c51d0a934
257128,5e0476fa8669fa330a3d2813
257135,5d0ced06c2a56b449f9e079b


In [59]:
# Rows: days absent; columns: lookout length in days; cells: turn-up percent.
# df_ret holds one row per (days_absent, lookout) pair, so 'sum' only
# reshapes the values rather than combining several of them.
df_ret.pivot_table(
    values='turn_up_percent',
    index='days_absent',
    columns='lookout',
    aggfunc='sum',
)

lookout,28,35,42,49,56,63,70
days_absent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,75.92,77.79,79.13,80.19,81.04,81.68,81.68
2,67.57,70.06,71.85,73.2,74.38,75.06,75.06
3,61.58,64.42,66.49,68.08,69.45,70.09,70.09
4,57.33,60.37,62.61,64.35,65.82,66.36,66.36
5,53.76,56.89,59.24,61.16,62.73,63.13,63.13
6,50.25,53.5,55.95,58.02,59.67,59.89,59.89
7,47.38,50.79,53.27,55.36,57.12,57.12,57.12
8,44.79,48.12,50.75,52.88,54.45,54.45,54.45
9,42.61,46.04,48.62,50.9,52.2,52.2,52.2
10,40.57,44.03,46.69,48.97,50.04,50.04,50.04


In [60]:
# Manual cohort split for a single scenario: 10 days absent, 28-day lookout.
ride_day = df_rides['order_date']

# Ride rows in the base week, the 10-day absence window and the 28-day
# lookout window. All columns and all captain-day rows are kept here.
df_base = df_rides[(ride_day >= '2022-01-24') & (ride_day <= '2022-01-30')]
current = df_rides[(ride_day >= '2022-01-31') & (ride_day <= '2022-02-09')]
df_lookout = df_rides[(ride_day >= '2022-02-10') & (ride_day <= '2022-03-09')]

active_in_absence_window = current['captain_id'].unique()
active_in_lookout = df_lookout['captain_id'].unique()

# Base-week rows of captains with no ride during the absence window.
absent = df_base[~df_base['captain_id'].isin(active_in_absence_window)]

# Split the absent rows by whether the captain rode again during the lookout.
returned_mask = absent['captain_id'].isin(active_in_lookout)
absent_retained = absent[returned_mask]
absent_absent = absent[~returned_mask]

In [61]:
# Base-week (2022-01-24 .. 2022-01-30) order totals per captain, described for
# the captains who were absent and did not ride again in the lookout window.
x = df_rides[(df_rides['order_date']>='2022-01-24') & (df_rides['order_date']<='2022-01-30')]
# Sum only the numeric order-count columns. Summing every column (the former
# `.agg(sum)`) also hits the datetime `order_date` and the string `week`
# columns, which pandas 2.x refuses to sum.
x = x.groupby('captain_id')[['link_orders', 'delivery_orders', 'total_orders']].sum().reset_index()
x[x['captain_id'].isin(absent_absent['captain_id'].unique())]['total_orders'].describe()

count    15783.000000
mean         6.099031
std          8.039592
min          1.000000
25%          1.000000
50%          3.000000
75%          7.000000
max        107.000000
Name: total_orders, dtype: float64

In [62]:
# Base-week (2022-01-24 .. 2022-01-30) order totals per captain, described for
# the captains who were absent but rode again in the lookout window.
x = df_rides[(df_rides['order_date']>='2022-01-24') & (df_rides['order_date']<='2022-01-30')]
# Sum only the numeric order-count columns. Summing every column (the former
# `.agg(sum)`) also hits the datetime `order_date` and the string `week`
# columns, which pandas 2.x refuses to sum.
x = x.groupby('captain_id')[['link_orders', 'delivery_orders', 'total_orders']].sum().reset_index()
x[x['captain_id'].isin(absent_retained['captain_id'].unique())]['total_orders'].describe()

count    9313.000000
mean        7.225921
std         9.637416
min         1.000000
25%         2.000000
50%         4.000000
75%         9.000000
max        94.000000
Name: total_orders, dtype: float64

In [63]:
# `absent` is filtered from the base-week ride rows without de-duplicating
# captain_id, so the row count here is captain-day rows, not distinct captains.
absent.shape

(46329, 6)