```Levels with monthly cycles - pull the distribution for various metrics based on rides.
Levels with weekly cycles -  pull the distribution for various metrics based on rides.
Calculate captain levels in Feb/Mar and measure the movement of captains across levels for both weekly and monthly cycles ```

In [3]:
import pandas as pd
import pandasql as ps
from datetime import datetime, timedelta
from pyhive import presto
presto_conn = presto.connect(
    host='presto.processing.yoda.run',
    port=80,
    protocol='http',
    catalog='hive',
    username='mayank.jha@rapido.bike',
    # requests_kwargs=req_kw,
)

In [4]:
dstart =  pd.to_datetime('2021-01-01')
dend =  pd.to_datetime('2021-03-31')

date_list = [dstart + timedelta(days=x) for x in range((dend-dstart).days + 1)
        if (dstart + timedelta(days=x)).weekday() == 6]

date_list = [x.strftime('%Y-%m-%d') for x in date_list]
date_list

['2021-01-03',
 '2021-01-10',
 '2021-01-17',
 '2021-01-24',
 '2021-01-31',
 '2021-02-07',
 '2021-02-14',
 '2021-02-21',
 '2021-02-28',
 '2021-03-07',
 '2021-03-14',
 '2021-03-21',
 '2021-03-28']

In [5]:

def get_rf_segments(start):
    
    print("fetching rf Segments data for",start)

    orders_query = """SELECT captainid as rider, day, frequency_segment, recency_segment
                      from datasets.captain_link_immutable
                      where day = date('{sd}')
                      AND segment = 'RF'                      
               """.format( sd = start)
    
#     print(orders_query)

    df_orders_data = pd.read_sql(orders_query, presto_conn)
    
    print(" fetching completed for rf segments data", len(df_orders_data))

    return df_orders_data

rf_segments = pd.DataFrame()

for day in date_list : 

    week_rf = get_rf_segments(day)
    rf_segments = pd.concat([rf_segments,week_rf])
    


rf_segments['recency_segment'] = rf_segments['recency_segment'].apply(lambda x : x.title())
rf_segments['frequency_segment'] = rf_segments['frequency_segment'].apply(lambda x : x.title())
rf_segments['Segment'] = rf_segments['recency_segment'] +'-'+ rf_segments['frequency_segment']
rf_segments['week'] = pd.to_datetime(rf_segments['day']).apply(lambda x : (x + timedelta(1)).strftime('%Y-%V'))
rf_segments


fetching rf Segments data for 2021-01-03
 fetching completed for rf segments data 178253
fetching rf Segments data for 2021-01-10
 fetching completed for rf segments data 178239
fetching rf Segments data for 2021-01-17
 fetching completed for rf segments data 179151
fetching rf Segments data for 2021-01-24
 fetching completed for rf segments data 181670
fetching rf Segments data for 2021-01-31
 fetching completed for rf segments data 183587
fetching rf Segments data for 2021-02-07
 fetching completed for rf segments data 186024
fetching rf Segments data for 2021-02-14
 fetching completed for rf segments data 189475
fetching rf Segments data for 2021-02-21
 fetching completed for rf segments data 191620
fetching rf Segments data for 2021-02-28
 fetching completed for rf segments data 194546
fetching rf Segments data for 2021-03-07
 fetching completed for rf segments data 195506
fetching rf Segments data for 2021-03-14
 fetching completed for rf segments data 195922
fetching rf Segments 

Unnamed: 0,rider,day,frequency_segment,recency_segment,Segment,week
0,5a0cf5e2727a4a3ef3d10e1d,2021-01-03,Midperforming,Recent,Recent-Midperforming,2021-01
1,5b45932a7ea5462554cc77cf,2021-01-03,Underperforming,Recent,Recent-Underperforming,2021-01
2,5b47501be6982857b8ba0566,2021-01-03,Underperforming,Recent,Recent-Underperforming,2021-01
3,5bac9c0ffee607761aaf936e,2021-01-03,Underperforming,Recent,Recent-Underperforming,2021-01
4,5bea64b4e9a4e0588b3d3d7b,2021-01-03,Highperforming,Recent,Recent-Highperforming,2021-01
...,...,...,...,...,...,...
198214,5c835fc98c352421eae0da52,2021-03-28,Midperforming,Dormant,Dormant-Midperforming,2021-13
198215,5cc828d93d65ca5e2566b5b0,2021-03-28,Midperforming,Dormant,Dormant-Midperforming,2021-13
198216,5cd78ac6377155163cf79832,2021-03-28,Midperforming,Dormant,Dormant-Midperforming,2021-13
198217,5dd7658753a0f569a718be7b,2021-03-28,Underperforming,Inactive,Inactive-Underperforming,2021-13


In [6]:
rf_segments = rf_segments[['rider','week','Segment']]
rf_segments
rf_segments.to_csv('rf_segments.csv',index=False)

In [7]:
rides_data = pd.read_csv('captain_weekly_rides_rating.csv')

rides_data['week'] = pd.to_datetime(rides_data['order_week'])
rides_data['week'] = rides_data['week'].apply(lambda x : x.strftime('%Y-%V'))
rides_data.head()

Unnamed: 0,rider,order_week,wdays,all_service_rides,rating,week
0,5dd3ecd81f60ef2568f1e1d8,2021-03-22,1,4,0.0,2021-12
1,6054406203f2a4e7e21adcda,2021-03-15,2,9,4.5,2021-11
2,5facd90c90d77918ccb94b67,2021-02-22,2,2,5.0,2021-08
3,5e351b799fa82fa185ca47a9,2021-02-01,6,73,4.2,2021-05
4,5af800ff1d97d8126f26cf94,2021-01-18,3,22,4.4,2021-03


In [8]:
rides_data.groupby(['week'])['all_service_rides'].describe(percentiles=[0.60,0.70,0.80,0.85,0.90,0.95,0.99])

Unnamed: 0_level_0,count,mean,std,min,50%,60%,70%,80%,85%,90%,95%,99%,max
week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2020-53,65605.0,7.229647,8.068107,1.0,4.0,6.0,8.0,11.0,14.0,17.0,24.0,38.0,97.0
2021-01,107724.0,12.787624,16.516995,1.0,6.0,9.0,13.0,20.0,25.0,33.0,46.0,79.0,209.0
2021-02,101711.0,12.534544,16.082226,1.0,6.0,9.0,13.0,20.0,25.0,32.0,46.0,78.0,205.0
2021-03,112013.0,12.995384,16.679967,1.0,7.0,10.0,14.0,20.0,25.0,33.0,47.0,80.0,216.0
2021-04,112604.0,12.691698,16.130445,1.0,7.0,9.0,13.0,20.0,25.0,32.0,45.0,77.0,199.0
2021-05,116352.0,13.100187,16.997746,1.0,7.0,9.0,14.0,20.0,26.0,34.0,47.0,82.0,217.0
2021-06,117490.0,13.451494,17.347674,1.0,7.0,10.0,14.0,21.0,26.0,34.0,49.0,83.0,207.0
2021-07,115410.0,13.202721,16.927506,1.0,7.0,10.0,14.0,21.0,26.0,34.0,48.0,81.0,196.0
2021-08,117788.0,13.425841,17.22613,1.0,7.0,10.0,14.0,21.0,26.0,34.0,49.0,83.0,224.0
2021-09,116206.0,13.797498,17.659609,1.0,7.0,10.0,14.0,22.0,27.0,35.0,50.0,85.0,222.0


In [9]:
rider_cancelled = pd.read_csv('Rider_Pings.csv')
rider_cancelled.head()

Unnamed: 0,captainid,yyyymmdd,rider_cancelled_pings,net_rides,rider_rejected_pings,rider_busy_pings,accepted_pings
0,5d660d83a24c96105e297bbf,20210309,0,12,0,4,14
1,5d84973f486b0b21476fd033,20210308,0,4,0,0,6
2,5c6f8bd5ba8dd579bec7b5c1,20210312,2,2,0,1,17
3,5f50ed735af9d2385c4f454a,20210331,1,8,0,2,11
4,5cc060843d65ca5e2561da7b,20210327,0,3,0,0,5


In [None]:
#captain_ltr_query = """   SELECT  captainid, day, sum(lifetimenetrides) as LTR
#    from 
#    datasets_internal.captain_servicedetail_ridebehavior_immutable_v1 
#    WHERE 
#    day in (date '2021-01-03', date '2021-01-10', date '2021-01-17',  date '2021-01-31', date '2021-02-07', date '2021-02-14', date '2021-02-21', date '2021-02-28', date '2021-03-07', date '2021-03-14', date '2021-03-21', date '2021-03-28')
#    group by 1, 2"""

#print(captain_ltr_query)
#captain_ltr = pd.read_sql(captain_ltr_query, presto_conn)
#captain_ltr.head()

In [None]:
#captain_ltr[captain_ltr['captainid']=='5d8376c1f354c15d38c49b69']

In [None]:
#len(captain_ltr)

In [None]:
#captain_ltr['week'] = pd.to_datetime(captain_ltr['day'])
#captain_ltr['week'] = captain_ltr['week'] + timedelta(1)
#captain_ltr.head()

In [None]:
#captain_ltr['week'] = captain_ltr['week'].apply(lambda x : x.strftime('%Y-%V'))
#captain_ltr.head()

In [None]:
#captain_ltr.to_csv('Captain_LTR_weekly.csv',index=False)

In [10]:
captain_ltr = pd.read_csv('Captain_LTR_weekly.csv')

In [11]:
### Captain Weekly LTR

captain_ltr['ltr_bucket'] = captain_ltr['LTR'].apply(lambda x : '01-05' if x <= 5 else 
                                                   ('6-50' if (x>=6) and (x<=50) 
                                                    else('50-100' if (x>50) and (x<=100) else
                                                        ('100-250' if (x>100) and (x<=250) else
                                                        ('250-550' if (x>250) and (x<=550) else 
                                                        (('550-800' if (x>550) and (x<=800) else 
                                                          (('800-1200' if (x>800) and (x<=1200) else 
                                                          '1200+')))))))))
print(captain_ltr.head())
     
captain_ltr = captain_ltr[['captainid','week','LTR','ltr_bucket']]
captain_ltr

                  captainid         day  LTR     week ltr_bucket
0  5d79bd31d0286d106d872f32  2021-03-14   20  2021-11       6-50
1  5dd58872cb7247271319dd5a  2021-03-14  168  2021-11    100-250
2  5c892d5a8c352421eae88b27  2021-03-14  694  2021-11    550-800
3  5d44f302f3dbe16ba31c7102  2021-03-14   14  2021-11       6-50
4  5dfcdb4e9a6d72727b189c49  2021-03-14   24  2021-11       6-50


Unnamed: 0,captainid,week,LTR,ltr_bucket
0,5d79bd31d0286d106d872f32,2021-11,20,6-50
1,5dd58872cb7247271319dd5a,2021-11,168,100-250
2,5c892d5a8c352421eae88b27,2021-11,694,550-800
3,5d44f302f3dbe16ba31c7102,2021-11,14,6-50
4,5dfcdb4e9a6d72727b189c49,2021-11,24,6-50
...,...,...,...,...
14407705,5d218b58668011467e2a61f2,2021-01,88,50-100
14407706,5dc66a9b079aed53a602e29d,2021-01,44,6-50
14407707,5ca9af1c194e920db09aac45,2021-01,11,6-50
14407708,5c1fa3c6d9bacb2f6f9d63dc,2021-01,5,01-05


In [12]:
### DPR Data

#dpr = pd.read_csv('DPR.csv')
#dpr['yyyymmdd'] = dpr['yyyymmdd'].apply(str)
rider_cancelled['yyyymmdd'] = rider_cancelled['yyyymmdd'].apply(str)


#dpr = dpr.merge(rider_cancelled, how='left', on=['captainid','yyyymmdd'])

#dpr['week'] = dpr['yyyymmdd'].apply(lambda x : datetime.strptime(x, '%Y%m%d').strftime('%Y-%V'))

#dpr_week = dpr.groupby(['captainid','week'],as_index=False).agg({'net_rides':'sum',
#                                                                  'accepted_pings':'sum',
#                                                                  'rider_busy_pings':'sum',
#                                                                  'rider_reject_pings':'sum',
#                                                                  'rider_cancelled_pings':'sum'})

rider_cancelled['week'] = rider_cancelled['yyyymmdd'].apply(lambda x : datetime.strptime(x, '%Y%m%d').strftime('%Y-%V'))

rider_cancelled_week = rider_cancelled.groupby(['captainid','week'],as_index=False).agg({'net_rides':'sum',
                                                                  'accepted_pings':'sum',
                                                                  'rider_busy_pings':'sum',
                                                                  'rider_rejected_pings':'sum',
                                                                  'rider_cancelled_pings':'sum'})


rider_cancelled_week.head()

Unnamed: 0,captainid,week,net_rides,accepted_pings,rider_busy_pings,rider_rejected_pings,rider_cancelled_pings
0,5737c6aeddbec2203f733176,2021-01,0,1,0,0,0
1,5737c6aeddbec2203f733176,2021-03,10,11,0,0,0
2,5737c6bfddbec2203f73320c,2021-01,4,9,1,0,3
3,5737c6bfddbec2203f73320c,2021-02,0,4,0,0,3
4,5737c6bfddbec2203f73320c,2021-03,0,2,1,0,2


In [13]:
### Cancel Rate Data

cancel_rate_1 = pd.read_csv('Cancel_Jan4.csv')
cancel_rate_1['week'] = '2021-01'

cancel_rate_2 = pd.read_csv('Cancel_Jan11.csv')
cancel_rate_2['week'] = '2021-02'

cancel_rate_3 = pd.read_csv('Cancel_Jan18.csv')
cancel_rate_3['week'] = '2021-03'

cancel_rate_4 = pd.read_csv('Cancel_Jan25.csv')
cancel_rate_4['week'] = '2021-04'

cancel_rate_5 = pd.read_csv('Cancel_Feb1.csv')
cancel_rate_5['week'] = '2021-05'

cancel_rate_6 = pd.read_csv('Cancel_Feb8.csv')
cancel_rate_6['week'] = '2021-06'

cancel_rate_7 = pd.read_csv('Cancel_Feb15.csv')
cancel_rate_7['week'] = '2021-07'

cancel_rate_8 = pd.read_csv('Cancel_Feb22.csv')
cancel_rate_8['week'] = '2021-08'

cancel_rate_9 = pd.read_csv('Cancel_Mar1.csv')
cancel_rate_9['week'] = '2021-09'

cancel_rate_10 = pd.read_csv('Cancel_Mar8.csv')
cancel_rate_10['week'] = '2021-10'

cancel_rate_11 = pd.read_csv('Cancel_Mar15.csv')
cancel_rate_11['week'] = '2021-11'

cancel_rate_12 = pd.read_csv('Cancel_Mar22.csv')
cancel_rate_12['week'] = '2021-12'

cancel_rate = pd.concat([cancel_rate_1,cancel_rate_2,cancel_rate_3,cancel_rate_4,cancel_rate_5,cancel_rate_6,
                        cancel_rate_7,cancel_rate_8,cancel_rate_9,cancel_rate_10,cancel_rate_11,cancel_rate_12])

cancel_rate = cancel_rate.groupby(['captain_id','week'],as_index=False).agg({'Net':'sum',
                                                                              'Captain-induced CC Pings':'sum'
                                                                             })
cancel_rate = cancel_rate[['captain_id','Net','Captain-induced CC Pings','week']]
cancel_rate

Unnamed: 0,captain_id,Net,Captain-induced CC Pings,week
0,5737c6aeddbec2203f733176,0,0,2021-01
1,5737c6aeddbec2203f733176,10,0,2021-03
2,5737c6bfddbec2203f73320c,4,0,2021-01
3,5737c6bfddbec2203f73320c,0,0,2021-02
4,5737c6bfddbec2203f73320c,0,0,2021-03
...,...,...,...,...
1348690,60601bf6193b5c265c0322c4,2,0,2021-12
1348691,606025ae2bda295488da5909,0,0,2021-12
1348692,60602aa2193b5c9820032b32,0,0,2021-12
1348693,60602be559675b800780edc8,0,0,2021-12


In [14]:
final_data = rides_data.merge(rider_cancelled_week, how='left', left_on=['rider','week'], right_on=['captainid','week'])
final_data = final_data.merge(cancel_rate, how='left', left_on=['rider','week'], right_on=['captain_id','week'])
final_data = final_data.merge(captain_ltr, how='left', left_on=['rider','week'], right_on=['captainid','week'])

final_data['cancel'] = (final_data['Captain-induced CC Pings']+final_data['rider_cancelled_pings'])/final_data['accepted_pings'] 
final_data['cancel'] = final_data['cancel'].fillna(0)

final_data['total_pings'] = final_data['accepted_pings']+final_data['rider_busy_pings']+final_data['rider_rejected_pings']
final_data['APR'] = final_data['accepted_pings']/final_data['total_pings']

final_data

Unnamed: 0,rider,order_week,wdays,all_service_rides,rating,week,captainid_x,net_rides,accepted_pings,rider_busy_pings,...,rider_cancelled_pings,captain_id,Net,Captain-induced CC Pings,captainid_y,LTR,ltr_bucket,cancel,total_pings,APR
0,5dd3ecd81f60ef2568f1e1d8,2021-03-22,1,4,0.0,2021-12,5dd3ecd81f60ef2568f1e1d8,4.0,5.0,5.0,...,0.0,,,,5dd3ecd81f60ef2568f1e1d8,1964.0,1200+,0.00000,64.0,0.078125
1,6054406203f2a4e7e21adcda,2021-03-15,2,9,4.5,2021-11,6054406203f2a4e7e21adcda,9.0,15.0,29.0,...,4.0,,,,,,,0.00000,57.0,0.263158
2,5facd90c90d77918ccb94b67,2021-02-22,2,2,5.0,2021-08,5facd90c90d77918ccb94b67,2.0,8.0,73.0,...,2.0,,,,5facd90c90d77918ccb94b67,21.0,6-50,0.00000,114.0,0.070175
3,5e351b799fa82fa185ca47a9,2021-02-01,6,73,4.2,2021-05,5e351b799fa82fa185ca47a9,73.0,84.0,63.0,...,1.0,5e351b799fa82fa185ca47a9,25.0,1.0,5e351b799fa82fa185ca47a9,399.0,250-550,0.02381,174.0,0.482759
4,5af800ff1d97d8126f26cf94,2021-01-18,3,22,4.4,2021-03,5af800ff1d97d8126f26cf94,22.0,25.0,21.0,...,1.0,5af800ff1d97d8126f26cf94,22.0,0.0,5af800ff1d97d8126f26cf94,153.0,100-250,0.04000,46.0,0.543478
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1513670,5dd01c120856f14618329e31,2021-02-22,1,1,0.0,2021-08,5dd01c120856f14618329e31,1.0,2.0,18.0,...,0.0,,,,5dd01c120856f14618329e31,70.0,50-100,0.00000,30.0,0.066667
1513671,5fec6307ee2e9c7d5f81f417,2021-02-08,1,1,0.0,2021-06,5fec6307ee2e9c7d5f81f417,1.0,1.0,0.0,...,0.0,5fec6307ee2e9c7d5f81f417,0.0,0.0,,,,0.00000,2.0,0.500000
1513672,5cc49d443d65ca5e2564b4c3,2021-01-04,1,1,5.0,2021-01,5cc49d443d65ca5e2564b4c3,1.0,1.0,4.0,...,0.0,,,,5cc49d443d65ca5e2564b4c3,16.0,6-50,0.00000,12.0,0.083333
1513673,5f430d70979c25f68c039f5f,2021-02-15,1,1,0.0,2021-07,5f430d70979c25f68c039f5f,1.0,2.0,18.0,...,0.0,,,,5f430d70979c25f68c039f5f,43.0,6-50,0.00000,21.0,0.095238


In [None]:
#final_data = final_data[final_data['week']>='2021-06']

In [None]:
#final_data[final_data['rider']=='5c5072cb976854537ab1bb81']

In [15]:
final_data.isna().sum()

rider                            0
order_week                       0
wdays                            0
all_service_rides                0
rating                           0
week                             0
captainid_x                  65605
net_rides                    65605
accepted_pings               65605
rider_busy_pings             65605
rider_rejected_pings         65605
rider_cancelled_pings        65605
captain_id                  462112
Net                         462112
Captain-induced CC Pings    462112
captainid_y                 173623
LTR                         173623
ltr_bucket                  173623
cancel                           0
total_pings                  65605
APR                          65605
dtype: int64

In [16]:
final_data['ltr_bucket'] = final_data['ltr_bucket'].fillna('01-05')
final_data.isna().sum()

rider                            0
order_week                       0
wdays                            0
all_service_rides                0
rating                           0
week                             0
captainid_x                  65605
net_rides                    65605
accepted_pings               65605
rider_busy_pings             65605
rider_rejected_pings         65605
rider_cancelled_pings        65605
captain_id                  462112
Net                         462112
Captain-induced CC Pings    462112
captainid_y                 173623
LTR                         173623
ltr_bucket                       0
cancel                           0
total_pings                  65605
APR                          65605
dtype: int64

In [17]:
final_data.to_csv('Captain_Levels_Weekly_Data.csv',index=False)

In [18]:
final_data.week.unique()

array(['2021-12', '2021-11', '2021-08', '2021-05', '2021-03', '2021-10',
       '2021-04', '2021-06', '2021-07', '2021-02', '2021-13', '2021-09',
       '2021-01', '2020-53'], dtype=object)

In [None]:
#final_data['cancel_count'] = final_data['rider_cancelled_pings']+final_data['Captain-induced CC Pings']
#final_data

In [34]:
#final_data[final_data['week']=='2021-12'].groupby(['ltr_bucket'])['cancel_count'].describe(percentiles=[0.60,0.70,0.80,0.85,0.90,0.95,0.99])

In [None]:
### Monthly Data Levels :

In [19]:
final_data = pd.read_csv('Captain_Levels_Weekly_Data.csv')

In [20]:
input_data = pd.read_csv('captain_levels_input_weekly.csv')
input_data

Unnamed: 0,ltr_bucket,rating,all_service_rides,APR,cancel,active_days,priority,Captain Level
0,01-05,0.0,0,0.0,1.0,0,8,Grey
1,6-50,4.0,3,0.5,0.2,5,7,Blue
2,50-100,4.1,6,0.55,0.14,10,6,Bronze
3,100-250,4.2,15,0.6,0.17,15,5,Silver
4,250-550,4.3,25,0.65,0.15,18,4,Gold
5,550-800,4.4,35,0.7,0.12,20,3,Diamond
6,800-1200,4.5,50,0.75,0.1,22,2,Platinum
7,1200+,4.6,75,0.8,0.1,25,1,Elite


In [21]:
### Including all parameters :


data_sql_query = ps.sqldf("select lt.*, ft.priority from final_data lt left join input_data ft on lt.all_service_rides >= ft.all_service_rides and lt.rating >= ft.rating and lt.APR >= ft.APR and lt.cancel <= ft.cancel")

max_p_check =  data_sql_query.merge(input_data[['ltr_bucket','priority']].rename(columns={'priority':'max_priority'}), how='left', on=['ltr_bucket'])

max_p_check['priority_check'] = max_p_check['priority'] >= max_p_check['max_priority']

max_p_check = max_p_check[max_p_check['priority_check']==True]

priority_table = max_p_check.groupby(['rider','ltr_bucket', 'all_service_rides', 'rating', 'APR','cancel', 'week']).agg({'priority':'min'})
priority_table = priority_table.reset_index()
priority_table['priority'] = priority_table['priority'].fillna(0)
priority_table


Unnamed: 0,rider,ltr_bucket,all_service_rides,rating,APR,cancel,week,priority
0,5737c6aeddbec2203f733176,1200+,10,3.9,1.000000,0.000000,2021-03,8.0
1,5737c6bfddbec2203f73320c,1200+,4,5.0,0.900000,0.333333,2021-01,8.0
2,5737c6bfddbec2203f73320c,1200+,11,0.0,0.880000,0.318182,2021-08,8.0
3,5737c6dbddbec2203f7332fc,1200+,1,0.0,0.500000,0.000000,2021-08,8.0
4,5737c6dbddbec2203f7332fc,1200+,2,5.0,0.666667,0.000000,2021-06,8.0
...,...,...,...,...,...,...,...,...
1448055,606458dd57c969c832947194,01-05,1,5.0,1.000000,0.000000,2021-13,8.0
1448056,6064638f87371d68e3deec88,01-05,3,5.0,0.666667,0.000000,2021-13,8.0
1448057,60646c8557c96908af947d95,01-05,7,3.0,1.000000,0.000000,2021-13,8.0
1448058,6064701d87371d77fedef463,01-05,4,5.0,1.000000,0.000000,2021-13,8.0


In [22]:
#priority_table = priority_table[priority_table['week']=='2021-12']
priority_table.to_csv('priority_table.csv',index=False)

In [39]:
final_data.groupby(['week']).agg({'all_service_rides':'sum','rider':'nunique'})


Unnamed: 0_level_0,all_service_rides,rider
week,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-53,474301,65605
2021-01,1377534,107724
2021-02,1274901,101711
2021-03,1455652,112013
2021-04,1429136,112604
2021-05,1524233,116352
2021-06,1580416,117490
2021-07,1523726,115410
2021-08,1581403,117788
2021-09,1603352,116206


In [40]:
priority_table.groupby(['week']).agg({'all_service_rides':'sum','rider':'nunique'})


Unnamed: 0_level_0,all_service_rides,rider
week,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01,1377534,107724
2021-02,1274900,101710
2021-03,1455652,112013
2021-04,1429136,112604
2021-05,1524231,116350
2021-06,1580407,117489
2021-07,1523724,115408
2021-08,1581402,117787
2021-09,1603350,116205
2021-10,1552930,113225


In [41]:
captains_count = priority_table.groupby(['priority','week'],as_index=False).agg({'rider':'nunique','all_service_rides':'sum'})
captains_count = captains_count.rename(columns={'rider':'captains_count'})
captains_count

Unnamed: 0,priority,week,captains_count,all_service_rides
0,1.0,2021-01,173,17649
1,1.0,2021-02,133,13074
2,1.0,2021-03,179,18337
3,1.0,2021-04,159,16252
4,1.0,2021-05,182,18298
...,...,...,...,...
99,8.0,2021-09,75649,609908
100,8.0,2021-10,73896,602564
101,8.0,2021-11,77584,635904
102,8.0,2021-12,78710,637949


In [42]:
priority_table.rider.value_counts()

5fd3e8d39ffc6a754f1101fe    13
5d68da4da24c96105e2dd148    13
5d596cac55fbf50d45fc4b4f    13
5c061549d76a02356e94133c    13
5fd58d2269bf68b24cc88512    13
                            ..
5c0e8e521e5c197a867342db     1
5d27249b16609737fa47692f     1
60376488f5d36b3fd1f7e13d     1
5f3fbc087e96724791006fa8     1
5da6f980941a7d1c51e8ec2b     1
Name: rider, Length: 342867, dtype: int64

In [None]:
priority_table[priority_table['week']=='2021-07']['ltr_bucket'].nunique()

In [43]:
captain_levels

NameError: name 'captain_levels' is not defined

In [None]:

captain_levels =  captains_count.merge(input_data[['priority','Captain Level']], how='left', on='priority')
captain_levels.pivot_table(index=['priority','Captain Level'], columns='week', values='captains_count',aggfunc='sum')

In [None]:
captain_levels.pivot_table(index=['priority','Captain Level'], columns='week', values='all_service_rides',aggfunc='sum')


In [None]:
priority_table[priority_table['week']=='2021-12'].groupby(['priority'])['cancel'].describe(percentiles=[0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.95])


In [None]:
input_data

In [None]:
priority_table['rides_bucket'] = priority_table['all_service_rides'].apply(lambda x : '0' if x <= 2 else 
                                                   ('3' if (x>=3) and (x<=6) 
                                                    else('6' if (x>7) and (x<=15) else
                                                        ('15' if (x>16) and (x<=25) else
                                                        ('25' if (x>26) and (x<=35) else 
                                                        (('35' if (x>36) and (x<=50) else 
                                                          (('50' if (x>51) and (x<=75) else 
                                                          '75')))))))))
print(priority_table.head())

In [None]:
temp =  priority_table.merge(final_data[['rider','week','cancel_count']], how='left', on=['rider','week']).fillna(0)
temp

In [None]:
temp[temp['week']=='2021-12'].groupby(['rides_bucket'])['cancel_count'].describe(percentiles=[0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.95])


In [None]:
priority_table_jan = priority_table[priority_table['week']=='2021-05']
priority_table_feb = priority_table[priority_table['week']=='2021-06']
priority_table_mar = priority_table[priority_table['week']=='2021-07']

In [None]:
priority_table = priority_table.sort_values(by='week')

In [None]:
week_list = priority_table[priority_table['week']>='2021-05']['week'].unique().tolist()
week_list

In [None]:
level_movement = pd.DataFrame()

for i in range(len(week_list)):
    print(week_list[i])
    print(week_list[i+1])
    temp_table = priority_table[priority_table['week']==week_list[i]].merge(priority_table[priority_table['week']==week_list[i+1]][['rider','priority']].rename(columns={'priority':'next_priority'}), how='left', on='rider').groupby(['priority','next_priority']).agg({'rider':'nunique'})
    temp_table = temp_table.reset_index()
    
    temp_table =  temp_table.pivot_table(index='priority', columns='next_priority', values='rider', aggfunc='sum').reset_index()
    temp_table['to_week'] = week_list[i]
    temp_table['from_week'] = week_list[i+1]
    
    level_movement = pd.concat([level_movement,temp_table])
    

In [None]:
level_movement.to_csv('weekly_movement.csv',index=False)

In [None]:
priority_table_feb[priority_table_feb['rider']=='5b65599cf6b6ea3b10e08892']

In [None]:
jan_feb =  priority_table_jan.merge(priority_table_feb[['rider','priority']].rename(columns={'priority':'next_priority'}), how='left', on='rider').groupby(['priority','next_priority']).agg({'rider':'nunique'})

jan_feb = jan_feb.sort_values(by=['priority','next_priority'], ascending = False)
jan_feb = jan_feb.reset_index()

jan_feb.pivot_table(index='priority', columns='next_priority', values='rider', aggfunc='sum').reset_index()

In [None]:
temp = priority_table_jan.merge(priority_table_feb[['rider','priority']].rename(columns={'priority':'next_priority'}), how='left', on='rider')
temp[(temp['next_priority']==7) & (temp['priority']==1)]

In [None]:
jan_feb =  priority_table_feb.merge(priority_table_mar[['rider','priority']].rename(columns={'priority':'next_priority'}), how='left', on='rider').groupby(['priority','next_priority']).agg({'rider':'nunique'})

jan_feb = jan_feb.reset_index()

jan_feb.pivot_table(index='priority', columns='next_priority', values='rider', aggfunc='sum')

In [None]:
temp = priority_table_feb[['rider','priority']].merge(priority_table_mar.rename(columns={'priority':'next_priority'}), how='left', on='rider')
temp[(temp['next_priority']==3) & (temp['priority']==1)]

In [None]:
temp[(temp['next_priority']==3) & (temp['priority']==1)].describe([0.1,0.2,0.25,0.75])

In [None]:
temp[(temp['next_priority']==1) & (temp['priority']==1)].describe([0.1,0.2,0.25,0.75])

In [None]:
### Auto Captain Levels

auto_query = """select captainid, shift, lastridecity
            from datasets.captain_single_view
            where  lower(shift) like '%auto%'"""

auto_captains = pd.read_sql(auto_query, presto_conn)
auto_captains.head()

In [None]:
auto_captains['captainid'].nunique()

In [None]:
len(auto_captains)

In [None]:
auto_captain_rides = auto_captains.merge(rides_data, how='left', left_on='captainid', right_on='rider')
auto_captain_rides.head()

In [None]:
auto_captain_rides.groupby(['week'])['all_service_rides'].describe(percentiles=[0.8,0.85,0.9,0.95,0.99])

In [None]:
auto_captain_ltr = auto_captains.merge(captain_ltr, how='left', on=['captainid'])

auto_captain_ltr.head()

In [None]:
auto_captain_ltr.groupby(['week'])['LTR'].describe(percentiles=[0.8,0.85,0.9,0.95,0.99])

In [None]:
level_movement = pd.DataFrame()

for i in range(len(week_list)):
    print(week_list[i])
    print(week_list[i+1])
    temp_table = priority_table[priority_table['week']==week_list[i]].merge(priority_table[priority_table['week']==week_list[i+1]][['rider','priority']].rename(columns={'priority':'next_priority'}), how='left', on='rider')
    temp_table = temp_table.merge(priority_table[priority_table['week']==week_list[i+2]][['rider','priority']].rename(columns={'priority':'priority_3'}), how='left', on='rider')
    temp_table = temp_table.reset_index()
    temp_table['to_week'] = week_list[i]
    
    level_movement = pd.concat([level_movement,temp_table])
    

In [None]:
temp_df = level_movement[(level_movement['to_week']=='2021-10')]
# temp_df['next_priority'] = temp_df['next_priority'].fillna(8)
temp_df['priority_3'] = temp_df['priority_3'].fillna(8)
temp_df['priority'] = temp_df['priority'].apply(str)
temp_df['next_priority'] = temp_df['next_priority'].apply(str)
temp_df['p1'] = temp_df['priority']  +'_'+ temp_df['next_priority']

temp_df.pivot_table(index='p1', columns='priority_3', values='rider', aggfunc='nunique').reset_index().fillna(0).to_csv('level_movement.csv')


In [None]:
temp_df.ltr_bucket.unique()

In [None]:
temp[(temp['week']=='2021-12')]

In [None]:
temp[(temp['week']=='2021-12')].groupby(['rides_bucket'])['cancel_count'].describe(percentiles=[0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.95])


In [None]:
priority_table[(priority_table['week']=='2021-12') & (priority_table['ltr_bucket']=='01-05')]

In [None]:
priority_table[priority_table['rider']=='605888fe581f3783d6e2ccc0']

In [None]:
priority_table[(priority_table['week']=='2021-12') & (priority_table['priority']==1)].describe(percentiles=[0.8,0.85,0.9,0.95])

In [None]:
priority_table.corr()

In [None]:
rf_levels =  temp[temp['week']=='2021-12'].merge(rf_segments[rf_segments['week']=='2021-13'][['rider','Segment']], how='left', on=['rider'])
rf_levels.isna().sum()

In [None]:
rf_levels = rf_levels.fillna('NEW+ZT+B2B')

In [None]:
rf_levels[rf_levels['week']=='2021-12'].pivot_table(index=['Segment'], columns='priority', values='rider', aggfunc='nunique')


In [None]:
rf_levels

In [None]:
temp[temp['week']=='2021-12'].groupby(['priority'])['all_service_rides'].describe(percentiles=[0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.95])