In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from pyhive import presto
from pymongo import MongoClient
import warnings
warnings.filterwarnings("ignore")
import my_func

# Shared Presto connection; every pd.read_sql call in this notebook goes through it.
# NOTE(review): host and username are hard-coded here. Consider reading them from
# environment variables before this notebook is shared or committed.
presto_conn = presto.connect(
    host='bi-presto.serving.data.production.internal',
    port=80,
    protocol='http',
    catalog='hive',
    username='mayank.jha@rapido.bike',
    # requests_kwargs=req_kw,
)

In [12]:
# Analysis window ('YYYY-MM-DD', both ends inclusive) and city handed to the
# query helpers below.
# NOTE(review): several stored cell outputs show 2022 dates, so these values may
# not be the ones used for every output in the notebook - confirm before re-running.
start_date = '2021-08-30'
end_date = '2021-09-26'
city = 'Hyderabad'

In [6]:
# POC Segment

def get_poc(dt, city):
    """Fetch the captain quality-segment snapshot for one day and one city.

    Parameters
    ----------
    dt : str
        Snapshot date as 'YYYY-MM-DD'; the dashes are stripped so the value
        matches the table's ``yyyymmdd`` filter.
    city : str
        Compared as-is against ``city_name`` (no case folding is applied).

    Returns
    -------
    pandas.DataFrame
        Columns captain_id, segment, recency, performance, opportunity,
        consistency, rpr and active_days, as selected by the query.
    """
    print("fetching poc segments for : ",dt)

    partition = dt.replace('-','')
    query_template = """   select captain_id,
                                segment,
                                substr(recency_segment, 1, strpos(recency_segment, '_')-1) as recency,
                                performance,
                                opportunity,
                                consistency,
                                rpr,
                                active_days
                            from
                                datasets_internal.captain_quality_segments_test
                                where yyyymmdd = '{sd}' 
                                    and city_name = '{ct}'
               """
    poc_query = query_template.format(sd = partition, ct = city)

    # Runs on the notebook-level Presto connection.
    segments = pd.read_sql(poc_query, presto_conn)
    print(" fetched segment data : ", len(segments))
    return segments

# Pull the snapshot for the last day of the window and keep it as the POC frame.
segment = get_poc(end_date, city)
poc_segment = pd.concat([pd.DataFrame(), segment])

# Preview the first rows.
poc_segment.head()

fetching poc segments for :  2021-09-26
 fetched segment data :  32909


Unnamed: 0,captain_id,segment,recency,performance,opportunity,consistency,rpr,active_days
0,5e36778926b694cbd6c6445e,LP_D_LO,active,1.5,3.0,daily,1.5,4
1,5e46b011d76b096be590fd36,LP_Intra_LO,active,3.0,4.0,intra_week,3.0,9
2,5e4b6dd5117d0a6fcfd45b72,ZP,churn,0.0,3.0,inter_week,0.0,0
3,5e5b3f599dc8058865a27a5a,MP_Intra_MO,dormant,6.0,7.0,intra_week,6.0,5
4,5e6347ad6ad99b403f5f0b24,LP_D_MO,active,1.5,5.5,daily,3.5,11


In [20]:
# Build df_segment by running the external Segments_updated.py script for the
# 28-day window ending on each date in date_list and reading back
# 'Captain_Segment.csv' after each run.
date_list = [end_date]
df_segment = pd.DataFrame()
for dt in date_list:
        # Window start is 27 days before dt, so [sd, ed] covers 28 days inclusive.
        sd = (pd.to_datetime(dt) - timedelta(27)).strftime('%Y%m%d')
        ed = dt.replace('-', '')
        # NOTE(review): city is hard-coded here rather than taken from `city` - confirm intent.
        ct = 'Hyderabad'
        print(f'fetching segments for {ct} from {sd} to {ed}')
        # presumably the script writes Captain_Segment.csv; verify against Segments_updated.py
        %run Segments_updated.py '{sd}' '{ed}' '{ct}'
        segment = pd.read_csv('Captain_Segment.csv')
        segment['city'] = ct
        # ISO week number of dt, stored as a number.
        segment['week'] = pd.to_datetime(dt).strftime('%V')
        segment['week'] = segment['week'].apply(pd.to_numeric)
        df_segment = pd.concat([df_segment, segment])
        print(f'done')

fetching segments for Hyderabad from 20220131 to 20220227
done


In [11]:
# Pings

def get_pings(start_date, end_date, city):
    """Fetch per-captain, per-day ping counts from ``orders.order_logs_immutable``.

    Parameters
    ----------
    start_date, end_date : str
        Inclusive date range as 'YYYY-MM-DD'; dashes are stripped before being
        compared with the ``yyyymmdd`` column.
    city : str
        City name; lower-cased and matched against
        ``lower(service_obj_city_display_name)``.

    Returns
    -------
    pandas.DataFrame
        One row per (captain_id, yyyymmdd) with:
        - Total_Pings: number of log rows whose event_type is accepted,
          rider_busy or rider_reject.
        - True_total_pings: sum of distinct-order counts for dropped,
          rider_busy, rider_reject, rider_cancelled, and customer_cancelled
          with one of the listed cancel reasons.
        - accepted_pings: distinct orders with an accepted event.
        - dropped_orders: distinct orders with a dropped event.

    Only rows with service name 'Link' and a non-auto order_type are counted.
    The True_total_first_pings column is commented out inside the SQL and is
    therefore not returned.
    """
    print("fetching pings data from : ", start_date," to ", end_date)
    orders_query = """   select 
        captain_id,
        yyyymmdd,
        (sum(case when event_type in ('accepted') then 1 else 0 end)
            + sum(case when event_type in ('rider_busy') then 1 else 0 end)
            + sum(case when event_type in ('rider_reject') then 1 else 0 end)) as Total_Pings,
            
        (count(distinct(case when event_type='dropped' then order_id end))+
            (count(distinct(case when event_type='rider_busy' then order_id end)))+
            (count(distinct(case when event_type='rider_reject' then order_id end)))+
            (count(distinct(case when event_type='rider_cancelled' then order_id end)))+
            (count(distinct(case when event_type='customer_cancelled' and cancel_reason in ('Asked to change payment mode','Asked to pay extra','Drop location denied','Asked to cancel and take offline ride','Taking longer than expected') then order_id end)))) as True_total_pings,
        
        (count(distinct(case when event_type='accepted' then order_id end))) as accepted_pings,
        
        (count(distinct(case when event_type='dropped' then order_id end))) as dropped_orders
        
        --(count(distinct(case when event_type='dropped' and split(replace(replace(replace(map_riders,'"'),'['),']'), ',')[1] = captain_id then order_id end))+
        --    (count(distinct(case when event_type='rider_busy' and split(replace(replace(replace(map_riders,'"'),'['),']'), ',')[1] = captain_id then order_id end)))+
        --    (count(distinct(case when event_type='rider_reject' and split(replace(replace(replace(map_riders,'"'),'['),']'), ',')[1] = captain_id then order_id end)))+
        --    (count(distinct(case when event_type='rider_cancelled' and split(replace(replace(replace(map_riders,'"'),'['),']'), ',')[1] = captain_id then order_id end)))+
        --    (count(distinct(case when event_type='customer_cancelled' and cancel_reason in ('Asked to change payment mode','Asked to pay extra','Drop location denied','Asked to cancel and take offline ride','Taking longer than expected') and split(replace(replace(replace(map_riders,'"'),'['),']'), ',')[1] = captain_id then order_id end)))) as True_total_first_pings

    from orders.order_logs_immutable as od
    where yyyymmdd between '{sd}' and '{ed}'
        and service_obj_service_name = 'Link'
        and service_obj_service_name not like '%Auto%'
        and order_type not like '%auto%'
        and lower(service_obj_city_display_name) = '{ct}'
    group by 1,2     
               """.format(sd = start_date.replace('-',''), ed = end_date.replace('-',''), ct = city.lower())
    #print(orders_query)
    # Executes on the notebook-level presto_conn connection.
    df_lh = pd.read_sql(orders_query, presto_conn)
    
    print(" fetched pings data : ", len(df_lh))

    return df_lh


# Fetch pings one Monday-to-Sunday week at a time.
#
# df_pings is rebuilt from scratch on every run. Previously its initialisation
# was commented out, so this cell raised NameError on a fresh kernel and, on a
# warm kernel, silently appended to whatever df_pings already held.

# Every Monday inside [start_date, end_date] starts one weekly query.
window_start = pd.to_datetime(start_date)
span_days = (pd.to_datetime(end_date) - window_start).days + 1
date_list = [
    candidate.strftime('%Y-%m-%d')
    for candidate in (window_start + timedelta(days=offset) for offset in range(span_days))
    if candidate.weekday() == 0
]

weekly_pings = []
for day in date_list:
    # Each query covers the Monday plus the following six days.
    week_end = (pd.to_datetime(day) + timedelta(6)).strftime('%Y-%m-%d')
    pings = get_pings(day, week_end, city)
    weekly_pings.append(pings)

# Single concat instead of growing the frame inside the loop; an empty window
# yields an empty frame instead of an error.
df_pings = pd.concat(weekly_pings) if weekly_pings else pd.DataFrame()

df_pings.head()

fetching pings data from :  2021-09-06  to  2021-09-12
 fetched pings data :  53130
fetching pings data from :  2021-09-13  to  2021-09-19
 fetched pings data :  59292
fetching pings data from :  2021-09-20  to  2021-09-26
 fetched pings data :  55767


Unnamed: 0,captain_id,yyyymmdd,Total_Pings,True_total_pings,accepted_pings,dropped_orders
0,5cc0584954bc7263ff4bbfb4,20210902,13,11,8,6
1,601103c573a5453f791670f0,20210830,10,10,4,4
2,5d4688e5f3dbe16ba31e9533,20210904,37,34,13,9
3,60fe3d00a0ea0d109574a05e,20210830,14,14,14,13
4,5e5c6978b46fc53973f5e573,20210901,21,16,21,15


In [22]:
# Login hours

def get_login_hours(start_date, end_date, city):
    """Fetch per-captain, per-day login durations from ``hive.datasets.captain_login_hours``.

    Parameters
    ----------
    start_date, end_date : str
        Inclusive date range as 'YYYY-MM-DD'; dashes are stripped before being
        compared with the ``yyyymmdd`` column.
    city : str
        City name; lower-cased and used to restrict captains to those whose
        registeredcity or lastridecity (lower-cased) equals it and whose
        activationdate is not null.

    Returns
    -------
    pandas.DataFrame
        One row per (rider, yyyymmdd, day_of_week) with login_hours plus
        MORNING_DURATION (quarter_hour 0600-1159), AFTERNOON_DURATION
        (1200-1659) and EVENING_DURATION (1700-2359). Each value is the summed
        ``duration`` divided by 3,600,000 (presumably milliseconds to hours -
        confirm against the table definition). Only the status codes listed in
        the query are included.
    """
    print("fetching Login hours for : ",start_date," to ", end_date)
    orders_query = """   SELECT 
            userid as rider, 
            yyyymmdd,
            day_of_week(DATE(date_parse(yyyymmdd,'%Y%m%d'))) as day_of_week,
            cast(sum(duration) as double)/cast((60*60*1000) as double) as login_hours,
            SUM(CASE WHEN quarter_hour >= '0600' and quarter_hour <= '1159' then DURATION ELSE 0 END)/cast((60*60*1000) as double) AS MORNING_DURATION,
            SUM(CASE WHEN quarter_hour >= '1200' and quarter_hour <= '1659' then DURATION ELSE 0 END)/cast((60*60*1000) as double) AS AFTERNOON_DURATION,
            SUM(CASE WHEN quarter_hour >= '1700' and quarter_hour <= '2359' then DURATION ELSE 0 END)/cast((60*60*1000) as double) AS EVENING_DURATION
        FROM hive.datasets.captain_login_hours
        WHERE yyyymmdd >= '{sd}' AND yyyymmdd <= '{ed}'
        AND status in ('2','3','6','7','8','10')
        AND userid in (SELECT captainId from datasets.captain_single_view WHERE (lower(registeredcity) in ('{ct}') or lower(lastridecity) in ('{ct}')) AND activationdate is not null)
        GROUP BY 1,2,3      
               """.format(sd = start_date.replace('-',''), ed = end_date.replace('-',''), ct = city.lower())
    #print(orders_query)
    # Executes on the notebook-level presto_conn connection.
    df_lh = pd.read_sql(orders_query, presto_conn)
    
    print(" fetched Login hours", len(df_lh))

    return df_lh

# Fetch login hours one Monday-to-Sunday week at a time and tag each batch with
# its ISO week number.

# Every Monday inside [start_date, end_date] starts one weekly query.
window_start = pd.to_datetime(start_date)
span_days = (pd.to_datetime(end_date) - window_start).days + 1
date_list = [
    candidate.strftime('%Y-%m-%d')
    for candidate in (window_start + timedelta(days=offset) for offset in range(span_days))
    if candidate.weekday() == 0
]

weekly_login_hours = []
for day in date_list:
    week_end = (pd.to_datetime(day) + timedelta(6)).strftime('%Y-%m-%d')
    login_hours = get_login_hours(day, week_end, city)
    # '%V' is the ISO week of the Monday; convert once instead of running
    # pd.to_numeric on every row of a constant column.
    login_hours['week'] = int(pd.to_datetime(day).strftime('%V'))
    weekly_login_hours.append(login_hours)

# Single concat instead of repeatedly concatenating onto a growing frame that
# was seeded with an empty DataFrame.
df_login_hours = pd.concat(weekly_login_hours) if weekly_login_hours else pd.DataFrame()

df_login_hours.head()

fetching Login hours for :  2022-01-31  to  2022-02-06
 fetched Login hours 237125
fetching Login hours for :  2022-02-07  to  2022-02-13
 fetched Login hours 222421
fetching Login hours for :  2022-02-14  to  2022-02-20
 fetched Login hours 221197
fetching Login hours for :  2022-02-21  to  2022-02-27
 fetched Login hours 235104


Unnamed: 0,rider,yyyymmdd,day_of_week,login_hours,MORNING_DURATION,AFTERNOON_DURATION,EVENING_DURATION,week
0,605b588e9203457ddfb3af7f,20220202,3,1.998889,0.200278,1.798611,0.0,5
1,605b588e9203457ddfb3af7f,20220203,4,3.700278,1.979722,1.720556,0.0,5
2,605b588e9203457ddfb3af7f,20220204,5,5.401111,2.5075,2.893611,0.0,5
3,605b588e9203457ddfb3af7f,20220205,6,0.893333,0.016667,0.876667,0.0,5
4,605b588e9203457ddfb3af7f,20220206,7,0.06,0.06,0.0,0.0,5


In [23]:
# df_segment.to_csv('segment_for_total_ping_model_analysis.csv', index=False)
# df_pings.to_csv('pings_for_total_ping_model_analysis.csv', index=False)
# df_login_hours.to_csv('login hours_for_total_ping_model_analysis.csv', index=False)

In [69]:
# Reload the cached extracts instead of re-querying Presto.
#
# yyyymmdd is forced to str: the Presto frames carry it as text and the later
# cells compare it with string literals and merge on it. Letting read_csv infer
# int64 makes those comparisons and merges fail after a reload.
df_segment = pd.read_csv('segment_for_total_ping_model_analysis.csv')
df_pings = pd.read_csv('pings_for_total_ping_model_analysis.csv', dtype={'yyyymmdd': str})
df_login_hours = pd.read_csv('login hours_for_total_ping_model_analysis.csv', dtype={'yyyymmdd': str})

In [46]:
# Daily ping funnel from 2022-02-21 onwards: pings per captain-day joined with
# the captain's segment and that day's login hours, then totalled per day.

# Normalise yyyymmdd to text on both sides. After a CSV reload the column can be
# int64, which breaks the string comparison below and the merge on
# ['RIDER', 'yyyymmdd'].
pings_daily = df_pings.assign(yyyymmdd=df_pings['yyyymmdd'].astype(str))
df_tmp = pings_daily[
    (pings_daily['yyyymmdd'] >= '20220221')
    & pings_daily['captain_id'].notna()
    & (pings_daily['captain_id'] != '')
]

# .copy() so the column assignment does not write into a slice of df_login_hours.
df_lh = df_login_hours[['rider','yyyymmdd','login_hours']].copy()
df_lh['yyyymmdd'] = df_lh['yyyymmdd'].astype(str)

# The outer join keeps captains that appear in only one of segment / pings;
# login hours are attached where a matching captain-day exists.
df_tmp = (
    df_segment[['RIDER','Segment']]
    .merge(df_tmp.rename(columns={'captain_id':'RIDER'}), how = 'outer', on = 'RIDER')
    .merge(df_lh.rename(columns={'rider':'RIDER'}), how = 'left', on = ['RIDER','yyyymmdd'])
)

# Per-day totals: distinct captains and summed ping / accept / drop counts.
x = df_tmp.pivot_table(index='yyyymmdd', aggfunc = {'RIDER':'nunique',
                                                'Total_Pings':'sum',
                                                'accepted_pings':'sum',
                                                'dropped_orders':'sum'}).reset_index().rename(columns={'RIDER':'total_captains'})

In [47]:
# Captain-days on which no ping was accepted: how many captains, and how many
# pings they received.
zero_accept_daily = df_tmp.loc[df_tmp['accepted_pings'] == 0]
y = (
    zero_accept_daily
    .pivot_table(index='yyyymmdd', aggfunc={'RIDER': 'nunique', 'Total_Pings': 'sum'})
    .reset_index()
    .rename(columns={'RIDER': 'captains_not_accepted_pings',
                     'Total_Pings': 'pings_wasted'})
)

In [48]:
# Join the daily totals with the zero-accept counts and express the latter as
# percentages of the former.
z = pd.merge(x, y, on='yyyymmdd', how='inner')
z['percent_pings_wasted'] = (z['pings_wasted'] / z['Total_Pings'] * 100.0).round(2)
z['percent_captains_not_accepted_pings'] = (
    z['captains_not_accepted_pings'] / z['total_captains'] * 100.0
).round(2)
z

Unnamed: 0,yyyymmdd,total_captains,Total_Pings,accepted_pings,dropped_orders,captains_not_accepted_pings,pings_wasted,percent_pings_wasted,percent_captains_not_accepted_pings
0,20220221,12343,173019.0,105351.0,68863.0,2090,9619.0,5.56,16.93
1,20220222,12825,140240.0,96630.0,66915.0,1771,6428.0,4.58,13.81
2,20220223,13161,139604.0,101258.0,70371.0,1790,5316.0,3.81,13.6
3,20220224,13303,137302.0,101497.0,70839.0,1658,4613.0,3.36,12.46
4,20220225,13180,156454.0,112318.0,76429.0,1696,5169.0,3.3,12.87
5,20220226,13822,162189.0,119385.0,81768.0,1795,5948.0,3.67,12.99
6,20220227,11743,146216.0,97108.0,63392.0,1946,9610.0,6.57,16.57


In [27]:
# Average pings received and login hours on captain-days with zero accepted pings.
(
    df_tmp.loc[df_tmp['accepted_pings'] == 0]
    .pivot_table(index=['yyyymmdd'], values=['Total_Pings', 'login_hours'], aggfunc='mean')
    .reset_index()
)

Unnamed: 0,yyyymmdd,Total_Pings,login_hours
0,20220221,4.602392,0.609089
1,20220222,3.629588,0.59291
2,20220223,2.969832,0.590486
3,20220224,2.782268,0.597547
4,20220225,3.047759,0.680592
5,20220226,3.313649,0.7129
6,20220227,4.938335,0.728661


In [57]:
# Zero-accept captains per segment and day.
# .copy() plus plain assignment replaces the chained inplace fillna on a
# filtered slice, which triggers SettingWithCopyWarning and is a no-op under
# pandas Copy-on-Write.
pivot = df_tmp[df_tmp['accepted_pings']==0].copy()
pivot['Segment'] = pivot['Segment'].fillna('ZP')  # captains without a segment count as 'ZP'

# Older pandas returned the grouping keys as extra value columns for 'nunique';
# errors='ignore' drops them when present and avoids a KeyError when they are not.
(
    pivot
    .pivot_table(index=['Segment'], columns = ['yyyymmdd'], values = ['RIDER'], aggfunc = 'nunique')
    .drop(['Segment','yyyymmdd'], axis=1, errors='ignore')
    .reset_index()
)

Unnamed: 0_level_0,Segment,RIDER,RIDER,RIDER,RIDER,RIDER,RIDER,RIDER
yyyymmdd,Unnamed: 1_level_1,20220221,20220222,20220223,20220224,20220225,20220226,20220227
0,HP_D_HO,38.0,35.0,31.0,19.0,24.0,18.0,42.0
1,HP_D_UHO,46.0,31.0,26.0,29.0,30.0,33.0,62.0
2,HP_Inter_HO,7.0,5.0,4.0,7.0,10.0,4.0,4.0
3,HP_Inter_UHO,3.0,2.0,1.0,3.0,5.0,5.0,3.0
4,HP_Intra_HO,8.0,10.0,9.0,7.0,16.0,9.0,3.0
5,HP_Intra_UHO,5.0,7.0,8.0,3.0,6.0,5.0,9.0
6,LP_D_HO,71.0,66.0,51.0,66.0,55.0,63.0,62.0
7,LP_D_LO,96.0,84.0,70.0,73.0,72.0,83.0,75.0
8,LP_D_MO,110.0,100.0,116.0,100.0,94.0,98.0,112.0
9,LP_D_UHO,37.0,37.0,29.0,35.0,31.0,46.0,42.0


In [58]:
# Weekly ping funnel: pings per captain-week joined with segment and weekly
# login hours, then totalled per ISO week.

# .copy() so the 'week' assignment does not write into a slice of df_pings.
df_tmp = df_pings[df_pings['captain_id'].notna() & (df_pings['captain_id'] != '')].copy()
# astype(str) makes the parse work whether yyyymmdd is text (Presto) or int (CSV reload).
df_tmp['week'] = pd.to_datetime(df_tmp['yyyymmdd'].astype(str), format='%Y%m%d').dt.strftime('%V')
# Drop yyyymmdd before summing: adding up (or string-concatenating) dates is meaningless.
df_tmp = (
    df_tmp.drop(columns=['yyyymmdd'])
    .groupby(['captain_id','week'])
    .sum(numeric_only=True)
    .reset_index()
)

df_lh = df_login_hours[['rider','week','login_hours']]
df_lh = df_lh.groupby(['rider','week']).agg('sum').reset_index()
# Zero-pad to two digits so the key matches the '%V' strings above. A plain
# astype(str) produced '5' against '05', so single-digit weeks never joined.
df_lh['week'] = df_lh['week'].astype(int).map('{:02d}'.format)

df_tmp = (
    df_segment[['RIDER','Segment']]
    .merge(df_tmp.rename(columns={'captain_id':'RIDER'}), how = 'outer', on = 'RIDER')
    .merge(df_lh.rename(columns={'rider':'RIDER'}), how = 'left', on = ['RIDER','week'])
)

# Per-week totals: distinct captains and summed ping / accept / drop counts.
x = df_tmp.pivot_table(index='week', aggfunc = {'RIDER':'nunique',
                                                'Total_Pings':'sum',
                                                'accepted_pings':'sum',
                                                'dropped_orders':'sum'}).reset_index().rename(columns={'RIDER':'total_captains'})

In [59]:
# Captain-weeks in which no ping was accepted: how many captains, and how many
# pings they received.
zero_accept_weekly = df_tmp.loc[df_tmp['accepted_pings'] == 0]
y = (
    zero_accept_weekly
    .pivot_table(index='week', aggfunc={'RIDER': 'nunique', 'Total_Pings': 'sum'})
    .reset_index()
    .rename(columns={'RIDER': 'captains_not_accepted_pings',
                     'Total_Pings': 'pings_wasted'})
)

In [60]:
# Join the weekly totals with the zero-accept counts and express the latter as
# percentages of the former.
z = pd.merge(x, y, on='week', how='inner')
z['percent_pings_wasted'] = (z['pings_wasted'] / z['Total_Pings'] * 100.0).round(2)
z['percent_captains_not_accepted_pings'] = (
    z['captains_not_accepted_pings'] / z['total_captains'] * 100.0
).round(2)
z

Unnamed: 0,week,total_captains,Total_Pings,accepted_pings,dropped_orders,captains_not_accepted_pings,pings_wasted,percent_pings_wasted,percent_captains_not_accepted_pings
0,5,27358,870545.0,540487.0,357416.0,4905,28281.0,3.25,17.93
1,6,27317,1023107.0,612497.0,392205.0,4894,31985.0,3.13,17.92
2,7,27907,1077911.0,682557.0,447211.0,4846,27975.0,2.6,17.36
3,8,29619,1055024.0,733547.0,498577.0,4635,21923.0,2.08,15.65


In [62]:
# Zero-accept captains per segment and week.
# .copy() plus plain assignment replaces the chained inplace fillna on a
# filtered slice, which triggers SettingWithCopyWarning and is a no-op under
# pandas Copy-on-Write.
pivot = df_tmp[df_tmp['accepted_pings']==0].copy()
pivot['Segment'] = pivot['Segment'].fillna('ZP')  # captains without a segment count as 'ZP'

# Older pandas returned the grouping keys as extra value columns for 'nunique';
# errors='ignore' drops them when present and avoids a KeyError when they are not.
(
    pivot
    .pivot_table(index=['Segment'], columns = ['week'], values = ['RIDER'], aggfunc = 'nunique')
    .drop(['Segment','week'], axis=1, errors='ignore')
    .reset_index()
)

Unnamed: 0_level_0,Segment,RIDER,RIDER,RIDER,RIDER
week,Unnamed: 1_level_1,05,06,07,08
0,HP_D_HO,49.0,49.0,40.0,18.0
1,HP_D_UHO,41.0,43.0,38.0,20.0
2,HP_Inter_HO,16.0,19.0,20.0,12.0
3,HP_Inter_UHO,10.0,7.0,8.0,9.0
4,HP_Intra_HO,13.0,16.0,11.0,6.0
5,HP_Intra_UHO,10.0,5.0,6.0,6.0
6,LP_D_HO,43.0,44.0,60.0,64.0
7,LP_D_LO,99.0,135.0,119.0,95.0
8,LP_D_MO,77.0,94.0,112.0,87.0
9,LP_D_UHO,16.0,20.0,30.0,37.0


In [73]:
# Display df_pings to check its columns after the CSV reload.
df_pings

Unnamed: 0,captain_id,yyyymmdd,Total_Pings,True_total_pings,accepted_pings,dropped_orders,True_total_first_pings,week
0,5f3cd451979c255e1d019063,20220206,5,4,5,2,2,5
1,61404ae6c27efcc486198637,20220205,46,43,19,15,17,5
2,619f8d7817dd6a63c0013674,20220205,23,18,9,5,8,5
3,5ad5653b11131d700d941715,20220205,13,12,7,6,10,5
4,60752278cde8c65af3c01bae,20220203,8,8,1,1,0,5
...,...,...,...,...,...,...,...,...
90386,5e2ea445c2db07e56f292671,20220225,19,13,13,6,2,8
90387,5d71dff0d0286d106d7a7e77,20220227,3,2,1,0,0,8
90388,5c2d85f844742d49fd46b073,20220221,1,1,1,0,0,8
90389,5d51804c95e84053d7622505,20220224,14,9,14,9,1,8


In [108]:
# Per-captain totals over the whole window, including the number of days on
# which the captain received pings but accepted none.

# .copy() so the new column is not written into a slice of df_pings.
df_tmp = df_pings[df_pings['captain_id'].notna() & (df_pings['captain_id'] != '')].copy()
# Vectorised flag: 1 on days with zero accepted pings, else 0.
df_tmp['zero_accepted_days'] = (df_tmp['accepted_pings'] == 0).astype(int)
# NOTE(review): 'True_total_first_pings' is only present when df_pings comes
# from the cached CSV; the SQL in get_pings has that column commented out.
df_tmp = (
    df_tmp.groupby('captain_id')
    .agg({'yyyymmdd':'nunique',
          'Total_Pings':'sum',
          'True_total_pings':'sum',
          'accepted_pings':'sum',
          'dropped_orders':'sum',
          'True_total_first_pings':'sum',
          'zero_accepted_days':'sum'})
    .reset_index()
    .rename(columns={'yyyymmdd':'active_days'})
)

# The outer join keeps captains that appear in only one of segment / pings.
df_tmp = df_segment[['RIDER','Segment']].merge(df_tmp.rename(columns={'captain_id':'RIDER'}), how = 'outer', on = 'RIDER')
# Plain assignment instead of chained inplace fillna, which does not update the
# frame under pandas Copy-on-Write.
df_tmp['Segment'] = df_tmp['Segment'].fillna('ZP')
# 1 when the captain had at least one zero-accept day; NaN compares False and maps to 0.
df_tmp['zero_accepted_captain'] = (df_tmp['zero_accepted_days'] > 0).astype(int)
df_tmp['active_days_bucket'] = pd.cut(df_tmp['active_days'], [0,5,10,15,20,28], include_lowest=True)
df_tmp

Unnamed: 0,RIDER,Segment,active_days,Total_Pings,True_total_pings,accepted_pings,dropped_orders,True_total_first_pings,zero_accepted_days,zero_accepted_captain,active_days_bucket
0,5a1d47d8e882417bb6d90f2b,HP_D_UHO,6.0,166.0,140.0,97.0,76.0,64.0,0.0,0,"(5.0, 10.0]"
1,5a676f5cc58c8d4bbc4a7335,LP_Inter_LO,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0,"(-0.001, 5.0]"
2,5b1b074a96262203e801ba63,LP_D_MO,9.0,63.0,54.0,40.0,20.0,23.0,2.0,1,"(5.0, 10.0]"
3,5b785d3f62d1065209b08944,MP_D_MO,12.0,111.0,94.0,103.0,76.0,60.0,0.0,0,"(10.0, 15.0]"
4,5b9299cc1d99d4286ba2223f,MP_D_HO,8.0,101.0,87.0,79.0,57.0,59.0,0.0,0,"(5.0, 10.0]"
...,...,...,...,...,...,...,...,...,...,...,...
52691,621b5598ac1f97f0dfaeb228,ZP,1.0,2.0,2.0,0.0,0.0,0.0,1.0,1,"(-0.001, 5.0]"
52692,621b5a9fac1f9773e9aeb762,ZP,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1,"(-0.001, 5.0]"
52693,621b5e9df0b26be1a45c31cc,ZP,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1,"(-0.001, 5.0]"
52694,621b7ab4c1a4e42bc65f284b,ZP,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1,"(-0.001, 5.0]"


In [109]:
# Decile summary of active days and zero-accept days per captain.
decile_cuts = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
df_tmp[['active_days', 'zero_accepted_days']].describe(percentiles=decile_cuts)

Unnamed: 0,active_days,zero_accepted_days
count,52610.0,52610.0
mean,6.320699,0.994374
std,6.563755,1.700449
min,1.0,0.0
10%,1.0,0.0
20%,1.0,0.0
30%,2.0,0.0
40%,2.0,0.0
50%,4.0,1.0
60%,5.0,1.0


In [138]:
# Distribution of zero_accepted_days within each (Segment, active_days_bucket):
# describe() runs per group, reset_index() exposes the statistic name as the
# 'level_2' column, and the selected statistics are pivoted into columns.
x = df_tmp[['RIDER','Segment','zero_accepted_days','active_days_bucket']].groupby(['Segment','active_days_bucket']).apply(lambda x : x.describe([0.25,0.5,0.75,0.80,0.9,0.95])).reset_index()
# NOTE(review): other cells in this notebook also write to 'x.csv', so this export is overwritten when they run.
x[x.level_2.isin(['count','25%','50%', '75%','80%','90%', '95%'])].pivot_table(index=['Segment','active_days_bucket'], columns='level_2', values = 'zero_accepted_days').reset_index().to_csv('x.csv', index=False)

In [139]:
# Number of captains with at least one zero-accept day, per segment and
# active-days bucket, written to x.csv.
zero_accept_counts = (
    df_tmp
    .pivot_table(index=['Segment', 'active_days_bucket'], values='zero_accepted_captain', aggfunc='sum')
    .reset_index()
)
zero_accept_counts.to_csv('x.csv', index=False)

In [125]:
# Dump the current df_tmp to x.csv (the same file name other export cells write to).
df_tmp.to_csv('x.csv', index=False)

In [140]:
# Display df_segment to check its columns.
df_segment

Unnamed: 0,RIDER,MEDIAN_DBR,ADBR,captain_id,active_days,Performance,Opportunity,Actual_Captain_cancellations,Rider_cancellation,Captain_induced_CC,RPR,TrueDPR,CCR_Rate,Consistency,Segment,city,week
0,5a1d47d8e882417bb6d90f2b,0.0,0.400000,5a1d47d8e882417bb6d90f2b,6,10.5,22.0,0.0,0.0,0.0,10.5,0.669737,0.000000,daily,HP_D_UHO,Hyderabad,8
1,5a676f5cc58c8d4bbc4a7335,30.0,30.000000,5a676f5cc58c8d4bbc4a7335,1,0.0,1.0,1.0,1.0,0.0,0.0,0.000000,1.000000,inter_week,LP_Inter_LO,Hyderabad,8
2,5b1b074a96262203e801ba63,0.0,1.500000,5b1b074a96262203e801ba63,7,2.0,6.0,2.0,2.0,0.0,2.0,0.357143,0.285714,daily,LP_D_MO,Hyderabad,8
3,5b785d3f62d1065209b08944,0.0,0.818182,5b785d3f62d1065209b08944,12,6.0,7.5,1.0,0.0,0.0,6.0,0.875000,0.115132,daily,MP_D_MO,Hyderabad,8
4,5b9299cc1d99d4286ba2223f,0.0,0.142857,5b9299cc1d99d4286ba2223f,8,7.0,8.5,1.0,0.0,1.0,7.0,0.788889,0.129167,daily,MP_D_HO,Hyderabad,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43363,61bc5cde7d498253ecbcd75f,1.0,2.333333,61bc5cde7d498253ecbcd75f,4,2.5,6.0,0.0,0.0,0.0,2.5,0.666667,0.000000,intra_week,LP_Intra_MO,Hyderabad,8
43364,61bfe71617dd6aec7818fb74,0.0,0.130435,61bfe71617dd6aec7818fb74,24,18.0,23.0,1.0,0.0,1.0,18.0,0.787138,0.075499,daily,UHP_D_UHO,Hyderabad,8
43365,61d1821dcc7f113d6ed99fda,1.0,1.000000,61d1821dcc7f113d6ed99fda,5,4.0,7.0,0.0,0.0,0.0,4.0,0.428571,0.000000,intra_week,MP_Intra_MO,Hyderabad,8
43366,61d3af4d6fe6416bb1f0fbb6,0.0,0.200000,61d3af4d6fe6416bb1f0fbb6,21,8.0,16.0,2.0,1.0,1.0,8.0,0.538462,0.166667,daily,MP_D_HO,Hyderabad,8


In [148]:
# Quick check of med() on a single-element list.
# NOTE(review): med is defined in a later cell of this notebook, so this cell
# fails with NameError on a fresh top-to-bottom run.
x = [0]
med(x)

0

In [155]:
def med(lst):
    """Return the lower median of ``lst``.

    For an odd number of values this is the middle value; for an even number
    it is the smaller of the two middle values (no averaging), so the result is
    always one of the inputs.

    Parameters
    ----------
    lst : iterable of numbers
        Values to summarise. The input is not modified. Float NaN entries are
        ignored, because sorting a list that contains NaN gives an
        unpredictable order.

    Returns
    -------
    number
        The lower median, or ``float('nan')`` when there is no non-NaN value
        (including an empty input).
    """
    import math

    # sorted() works on a copy; the caller's list keeps its original order.
    values = sorted(
        v for v in lst
        if not (isinstance(v, float) and math.isnan(v))
    )
    if not values:
        return float('nan')
    # (n - 1) // 2 is the middle index for odd n and the lower-middle for even n.
    return values[(len(values) - 1) // 2]
    

In [282]:
# Recompute performance / opportunity per captain from the daily pings.
valid_pings = df_pings[df_pings['captain_id'].notna() & (df_pings['captain_id'] != '')]
perf_opp = valid_pings.sort_values(['captain_id','yyyymmdd']).reset_index(drop=True)

# Days without any accepted ping are blanked to NaN so the medians below skip
# them. Series.where replaces the row-wise apply calls and the np.NaN alias,
# which was removed in NumPy 2.0.
has_accept = perf_opp['accepted_pings'] != 0
perf_opp['accepted_pings_calc'] = perf_opp['accepted_pings'].where(has_accept)
perf_opp['dropped_orders_calc'] = perf_opp['dropped_orders'].where(has_accept)
perf_opp['True_total_pings_calc'] = perf_opp['True_total_pings'].where(has_accept)
# A day counts as active only when at least one order was dropped.
perf_opp['active_days_calc'] = perf_opp['dropped_orders'].where(perf_opp['dropped_orders'] != 0)

# Medians ignore the NaN days; 'count' gives the number of active days.
# (An earlier version computed these with the custom med() helper.)
df_perf_opp = perf_opp.pivot_table(index='captain_id', aggfunc = {'True_total_pings_calc':'median',
                                                                'dropped_orders_calc':'median',
                                                                'active_days_calc':'count'}).reset_index().rename(columns={'True_total_pings_calc':'opportunity_calc',
                                                                                                                          'dropped_orders_calc':'performance_calc'})

In [257]:
# Consistency: median number of idle days between consecutive days on which a
# captain accepted at least one ping.
valid_pings = df_pings[df_pings['captain_id'].notna() & (df_pings['captain_id'] != '')]
consistency = valid_pings.sort_values(['captain_id','yyyymmdd']).reset_index(drop=True)
# .copy() so the new columns are not written into a filtered slice.
consistency = consistency[consistency['accepted_pings']>0].copy()

consistency['yyyymmdd_shift'] = consistency['yyyymmdd'].shift(1)
consistency['captain_id_shift'] = consistency['captain_id'].shift(1)

# Parse once with an explicit format. Per-row pd.to_datetime without a format
# is slow and reads an integer yyyymmdd as nanoseconds since the epoch.
ride_date = pd.to_datetime(consistency['yyyymmdd'].astype(str), format='%Y%m%d')
same_captain = consistency['captain_id'] == consistency['captain_id_shift']
# Gap to the previous accepted day minus one (0 = consecutive days); NaN on
# each captain's first row.
consistency['dbr'] = ((ride_date - ride_date.shift(1)).dt.days - 1).where(same_captain)

df_consistency = consistency.pivot_table(index='captain_id', values = 'dbr', aggfunc = 'median').reset_index().rename(columns={'dbr':'median_dbr'})


In [283]:
def _volume_tier(value):
    """Map a daily median count to 'L', 'M', 'H' or 'UH'; NaN falls into 'L'."""
    if value < 4 or pd.isna(value):
        return 'L'
    if value < 8:
        return 'M'
    if value < 16:
        return 'H'
    return 'UH'


def _consistency_tier(gap):
    """Map a median day gap to 'D' (< 1), 'Intra' (<= 7) or 'Inter' (otherwise, incl. NaN)."""
    if gap < 1:
        return 'D'
    if gap <= 7:
        return 'Intra'
    return 'Inter'


# Attach the recomputed metrics to the POC snapshot; captains without pings keep NaN.
captain_metrics = poc_segment.merge(df_perf_opp, how = 'left', on = 'captain_id')
df_captain = captain_metrics.merge(df_consistency, how = 'left', on = 'captain_id')

# Performance / consistency / opportunity codes, joined into one label.
df_captain['p'] = df_captain['performance_calc'].apply(lambda v : _volume_tier(v) + 'P')
df_captain['c'] = df_captain['median_dbr'].apply(_consistency_tier)
df_captain['o'] = df_captain['opportunity_calc'].apply(lambda v : _volume_tier(v) + 'O')
df_captain['segment_calc'] = (
    df_captain['p'] + '_' + df_captain['c'] + '_' + df_captain['o']
)

In [284]:
# Captains outside the 'ZP' segment whose recomputed performance differs from
# the stored value.
performance_differs = df_captain['performance'] != df_captain['performance_calc']
not_zero_segment = df_captain['segment'] != 'ZP'
df_captain[performance_differs & not_zero_segment]

Unnamed: 0,captain_id,segment,recency,performance,opportunity,consistency,rpr,active_days,opportunity_calc,active_days_calc,performance_calc,median_dbr,p,c,o,segment_calc
44,6063715b03f2a49e30238d55,MP_Intra_MO,dormant,4.0,6.0,intra_week,4,5,8.0,5,5.0,1.0,MP,Intra,HO,MP_Intra_HO
48,609d2a9a635553ee0e73558d,MP_D_MO,active,3.5,7.5,daily,,1,8.0,1,4.0,0.0,MP,D,HO,MP_D_HO
50,60b1b423270b9c9c1fdd28fb,LP_Inter_LO,inactive,3.0,4.0,inter_week,3,1,6.0,1,5.0,,MP,Inter,MO,MP_Inter_MO
54,60eaf36cac6ff43343619dff,MP_Intra_HO,active,8.0,12.0,intra_week,8,9,12.0,9,9.0,1.0,HP,Intra,HO,HP_Intra_HO
83,5c3e4d539359a37c0db65d20,MP_D_HO,active,6.5,9.0,daily,6.5,16,9.0,16,7.0,0.0,MP,D,HO,MP_D_HO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32843,5fb17f897e483f273cfadb32,LP_Intra_LO,dormant,0.0,2.0,intra_week,,1,2.5,2,0.5,4.0,LP,Intra,LO,LP_Intra_LO
32850,5fd8b44962c3aa55056cd7d2,LP_Intra_MO,active,3.0,6.0,intra_week,6,6,8.0,8,5.0,1.0,MP,Intra,HO,MP_Intra_HO
32866,608aab8927231836ed1e6d83,MP_Intra_HO,dormant,8.0,14.0,intra_week,8,3,15.0,3,9.0,0.5,HP,D,HO,HP_D_HO
32883,611fa76234b93d4a402d41be,MP_D_UHO,active,8.0,17.5,daily,12,7,17.5,7,8.5,0.0,HP,D,UHO,HP_D_UHO


In [286]:
# Inspect the daily rows of a single captain.
perf_opp[perf_opp['captain_id']=='609d2a9a635553ee0e73558d']

Unnamed: 0,captain_id,yyyymmdd,Total_Pings,True_total_pings,accepted_pings,dropped_orders,accepted_pings_calc,dropped_orders_calc,True_total_pings_calc,active_days_calc
187247,609d2a9a635553ee0e73558d,20210925,20,15,16,8,16.0,8.0,15.0,8.0
187248,609d2a9a635553ee0e73558d,20210926,3,1,2,0,2.0,0.0,1.0,


In [265]:
# Inspect the recomputed metrics of a single captain.
df_perf_opp[df_perf_opp['captain_id']=='5a13002edad1196ec60d2e36']

Unnamed: 0,captain_id,opportunity_calc,performance_calc,active_days_calc
413,5a13002edad1196ec60d2e36,,,1


In [234]:
# Captains whose recomputed active-day count differs from the stored value.
active_days_differs = df_captain['active_days'].ne(df_captain['active_days_calc'])
df_captain.loc[active_days_differs]

Unnamed: 0,captain_id,segment,recency,performance,opportunity,consistency,rpr,active_days,opportunity_calc,performance_calc,active_days_calc,median_dbr,p,c,o,segment_calc
161,5e512e5fd001c111c21ee2af,ZP,churn,0.0,0.5,intra_week,0,0,1.0,1.0,1,3.0,LP,Intra,LO,LP_Intra_LO
196,6069c9e0b26846bed0dc53b1,MP_D_HO,dormant,5.0,10.0,daily,7,3,9.0,4.0,4,0.0,MP,D,HO,MP_D_HO
291,5cdfdc27377155163cfda8c4,ZP,churn,0.0,5.0,inter_week,0,0,6.0,1.0,1,,LP,Inter,MO,LP_Inter_MO
375,5fe49ed84319a7e3b5b52dfe,LP_D_HO,active,1.0,9.0,daily,4,18,9.0,2.0,22,0.0,LP,D,HO,LP_D_HO
388,60694405be0bde054a5ed91a,LP_Intra_LO,active,1.0,4.0,intra_week,,1,3.0,1.0,2,1.0,LP,Intra,LO,LP_Intra_LO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32707,60c9545e83f1e2829a3b7281,LP_D_HO,active,1.0,9.0,daily,1,8,,,10,0.0,LP,D,LO,LP_D_LO
32755,5c360b034a267149c76acfc7,LP_D_MO,active,1.0,7.5,daily,12,12,9.0,1.0,15,0.0,LP,D,HO,LP_D_HO
32770,5cc1ba0b3d65ca5e2562cdc7,MP_D_HO,active,6.0,12.5,daily,8,12,11.0,5.0,13,0.0,MP,D,HO,MP_D_HO
32843,5fb17f897e483f273cfadb32,LP_Intra_LO,dormant,0.0,2.0,intra_week,,1,2.0,0.0,2,4.0,LP,Intra,LO,LP_Intra_LO


In [263]:
# Recompute the three metrics for a single captain to cross-check df_perf_opp.
captain_rows = perf_opp.loc[perf_opp['captain_id'] == '5a13002edad1196ec60d2e36']
metric_aggs = {
    'dropped_orders_calc': 'median',
    'True_total_pings_calc': 'median',
    'active_days_calc': 'count',
}
captain_rows.pivot_table(index='captain_id', aggfunc=metric_aggs).reset_index()

Unnamed: 0,captain_id,True_total_pings_calc,active_days_calc,dropped_orders_calc
0,5a13002edad1196ec60d2e36,12.0,1,5.0


In [253]:
# Attach every daily ping row to its captain's POC segment; captains without
# pings are kept.
x = pd.merge(poc_segment, df_pings, on='captain_id', how='left')
x.head()

Unnamed: 0,captain_id,segment,recency,performance,opportunity,consistency,rpr,active_days,yyyymmdd,Total_Pings,True_total_pings,accepted_pings,dropped_orders
0,5e36778926b694cbd6c6445e,LP_D_LO,active,1.5,3.0,daily,1.5,4,20210918,3,2,3,1
1,5e36778926b694cbd6c6445e,LP_D_LO,active,1.5,3.0,daily,1.5,4,20210919,3,3,1,1
2,5e36778926b694cbd6c6445e,LP_D_LO,active,1.5,3.0,daily,1.5,4,20210925,5,3,5,2
3,5e36778926b694cbd6c6445e,LP_D_LO,active,1.5,3.0,daily,1.5,4,20210924,8,6,6,3
4,5e46b011d76b096be590fd36,LP_Intra_LO,active,3.0,4.0,intra_week,3.0,9,20210831,6,6,4,3


In [255]:
# Write the current x frame to x.csv (the same file name other export cells write to).
x.to_csv('x.csv', index=False)