In [107]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from pyhive import presto
from pymongo import MongoClient
import warnings
warnings.filterwarnings("ignore")
import my_func

presto_conn = presto.connect(
    host='bi-presto.serving.data.production.internal',
    port=80,
    protocol='http',
    catalog='hive',
    username='mayank.jha@rapido.bike',
    # requests_kwargs=req_kw,
)

In [108]:
start_date = '2022-01-03'
end_date = '2022-01-30'
city = 'Hyderabad'

In [109]:
date_list = [pd.to_datetime(start_date) + timedelta(days=x) for x in range((pd.to_datetime(end_date)-pd.to_datetime(start_date)).days + 1)
        if (pd.to_datetime(start_date) + timedelta(days=x)).weekday() == 6]
date_list = [x.strftime('%Y-%m-%d') for x in date_list]
date_list

['2022-01-09', '2022-01-16', '2022-01-23', '2022-01-30']

In [110]:
date_list = ['2022-01-30']
df_segment = pd.DataFrame()
for dt in date_list:
        sd = (pd.to_datetime(dt) - timedelta(27)).strftime('%Y%m%d')
        ed = dt.replace('-', '')
        ct = city
        print(f'fetching segments for {ct} from {sd} to {ed}')
        %run Segments_updated.py '{sd}' '{ed}' '{ct}'
        segment = pd.read_csv('Captain_Segment.csv')
        segment['city'] = ct
        segment['week'] = pd.to_datetime(dt).strftime('%V')
        segment['week'] = segment['week'].apply(pd.to_numeric)
        df_segment = pd.concat([df_segment, segment])
        print(f'done')

fetching segments for Hyderabad from 20220103 to 20220130
done


In [114]:
df_segment.shape

(40681, 17)

In [111]:
df_segment['Segment'].value_counts()

LP_Inter_LO      8257
LP_Intra_LO      5002
MP_D_MO          4141
LP_D_LO          4095
MP_D_HO          3005
LP_D_MO          2438
MP_Intra_MO      1822
HP_D_HO          1818
MP_Inter_MO      1557
LP_Intra_MO      1340
LP_Inter_MO      1212
LP_D_HO           685
MP_Intra_HO       682
MP_Inter_HO       621
HP_D_UHO          613
LP_Inter_HO       395
LP_Intra_HO       324
MP_D_LO           323
HP_Inter_HO       299
HP_Intra_HO       271
MP_Intra_LO       266
MP_Inter_LO       263
MP_D_UHO          240
UHP_D_UHO         211
LP_Inter_UHO      198
LP_D_UHO          186
LP_Intra_UHO      126
MP_Inter_UHO       93
HP_Inter_UHO       92
HP_Intra_UHO       39
MP_Intra_UHO       38
UHP_Intra_UHO      18
UHP_Inter_UHO      11
Name: Segment, dtype: int64

In [113]:
# rf segment

def get_rf(dt):
    
    print("fetching rf segments for : ", dt)
    orders_query = """   select captainid, 
        recency_segment as recency
    from datasets.captain_cu_immutable
    where day = date('{dt}')
               """.format(dt = dt)
    #print(orders_query)
    df_lh = pd.read_sql(orders_query, presto_conn)
    
    print(" fetched segment data : ", len(df_lh))

    return df_lh

rf_segment = pd.DataFrame()

# date_list = [pd.to_datetime(start_date) + timedelta(days=x) for x in range((pd.to_datetime(end_date)-pd.to_datetime(start_date)).days + 1)
#         if (pd.to_datetime(start_date) + timedelta(days=x)).weekday() == 6]
# date_list = [x.strftime('%Y-%m-%d') for x in date_list]

date_list = ['2022-01-30']

for day in date_list : 
    segment = get_rf(day)
    segment['week'] = pd.to_datetime(day).strftime('%V')
    segment['week'] = segment['week'].apply(pd.to_numeric)
    rf_segment = pd.concat([rf_segment,segment])

rf_segment.head()

fetching rf segments for :  2022-01-30
 fetched segment data :  1420819


Unnamed: 0,captainid,recency,week
0,60430ece6a8a765ab887f943,RECENT,4
1,5e6096c22d58e2a0cf8e7e2f,INACTIVE,4
2,5d7385be9f130346e2ca145b,INACTIVE,4
3,5c0ff86ba24a6608213a46cb,INACTIVE,4
4,61531575f2602c56502db2fc,INACTIVE,4


In [179]:
# Rides

def get_rides(start_date, end_date, city):
    
    print("fetching rides data for : ", start_date, " to : ", end_date)
    rides_query = """SELECT captain_id,
                            cast(weekday as integer) as weekday,
                            cast(hour as integer) as hour,
                            date_format(date_parse(order_date,'%Y-%m-%d'),'%Y%m%d') as order_date,
                            sum(case when service_obj_service_name = 'Link' then 1 else 0 end) as link_orders,
                            sum(case when service_obj_service_name in ('Delivery', 'Zomato') then 1 else 0 end) as delivery_orders,
                            count(*) as total_orders,
                            avg(customer_feedback_rating) as rating
                        from orders.order_logs_snapshot
                        where order_date >= '{sd}'
                            and order_date <= '{ed}'
                            and order_status = 'dropped'
                            and spd_fraud_flag != true
                            and lower(service_obj_service_name) not like '%auto%'
                            and lower(order_type) not like '%auto%'
                            and service_obj_city_display_name in ('{ct}')
                        group by 1,2,3,4 """.format(sd=start_date, ed= end_date, ct = city)
    
    #print(rides_query)
    tmp = pd.read_sql(rides_query, presto_conn)
    
    print(" fetched rides for captains", len(tmp))

    return tmp

df_rides = pd.DataFrame()

date_list = [pd.to_datetime(start_date) + timedelta(days=x) for x in range((pd.to_datetime(end_date)-pd.to_datetime(start_date)).days + 1)
        if (pd.to_datetime(start_date) + timedelta(days=x)).weekday() == 0]
date_list = [x.strftime('%Y-%m-%d') for x in date_list]

for day in date_list : 
    rides = get_rides(day, (pd.to_datetime(day) + timedelta(6)).strftime('%Y-%m-%d'), city)
    rides['week'] = pd.to_datetime(day).strftime('%V')
    rides['week'] = rides['week'].apply(pd.to_numeric)
    df_rides = pd.concat([df_rides, rides])

df_rides.head()

fetching rides data for :  2022-01-03  to :  2022-01-09
 fetched rides for captains 247902
fetching rides data for :  2022-01-10  to :  2022-01-16
 fetched rides for captains 212057
fetching rides data for :  2022-01-17  to :  2022-01-23
 fetched rides for captains 222155
fetching rides data for :  2022-01-24  to :  2022-01-30
 fetched rides for captains 239939


Unnamed: 0,captain_id,weekday,hour,order_date,link_orders,delivery_orders,total_orders,rating,week
0,61b0d5d30a53221097f8a913,3.0,1,20220105,1,0,1,,1
1,616b1f07fbce6601e59a6e7f,6.0,2,20220108,1,0,1,,1
2,6168fcfefbce66e6bc98e1b6,6.0,2,20220108,1,0,1,5.0,1
3,5f0d38c7f894c129526c79ef,6.0,1,20220108,2,0,2,5.0,1
4,5cf965bcca6e29211174cd23,7.0,5,20220109,2,0,2,5.0,1


In [141]:
# Login hours

def get_login_hours(start_date, end_date, city):
    print("fetching Login hours for : ",start_date," to ", end_date)
    orders_query = """   SELECT 
            userid as rider, 
            yyyymmdd,
            day_of_week(DATE(date_parse(yyyymmdd,'%Y%m%d'))) as day_of_week,
            cast(sum(duration) as double)/cast((60*60*1000) as double) as login_hours,
            SUM(CASE WHEN quarter_hour >= '0600' and quarter_hour <= '1159' then DURATION ELSE 0 END)/cast((60*60*1000) as double) AS MORNING_DURATION,
            SUM(CASE WHEN quarter_hour >= '1200' and quarter_hour <= '1659' then DURATION ELSE 0 END)/cast((60*60*1000) as double) AS AFTERNOON_DURATION,
            SUM(CASE WHEN quarter_hour >= '1700' and quarter_hour <= '2359' then DURATION ELSE 0 END)/cast((60*60*1000) as double) AS EVENING_DURATION
        FROM hive.datasets.captain_login_hours
        WHERE yyyymmdd >= '{sd}' AND yyyymmdd <= '{ed}'
        AND status in ('2','3','6','7','8','10')
        AND userid in (SELECT captainId from datasets.captain_single_view WHERE (lower(registeredcity) in ('{ct}') or lower(lastridecity) in ('{ct}')) AND activationdate is not null)
        GROUP BY 1,2,3      
               """.format(sd = start_date.replace('-',''), ed = end_date.replace('-',''), ct = city.lower())
    #print(orders_query)
    df_lh = pd.read_sql(orders_query, presto_conn)
    
    print(" fetched Login hours", len(df_lh))

    return df_lh

df_login_hours = pd.DataFrame()

date_list = [pd.to_datetime(start_date) + timedelta(days=x) for x in range((pd.to_datetime(end_date)-pd.to_datetime(start_date)).days + 1)
        if (pd.to_datetime(start_date) + timedelta(days=x)).weekday() == 0]
date_list = [x.strftime('%Y-%m-%d') for x in date_list]

for day in date_list : 
    login_hours = get_login_hours(day, (pd.to_datetime(day) + timedelta(6)).strftime('%Y-%m-%d'), city)
    x = pd.to_datetime(day).strftime('%V')
    login_hours['week'] = x
    login_hours['week'] = login_hours['week'].apply(pd.to_numeric)
    df_login_hours = pd.concat([df_login_hours,login_hours])

df_login_hours.head()

fetching Login hours for :  2022-01-03  to  2022-01-09
 fetched Login hours 226356
fetching Login hours for :  2022-01-10  to  2022-01-16
 fetched Login hours 192632
fetching Login hours for :  2022-01-17  to  2022-01-23
 fetched Login hours 220258
fetching Login hours for :  2022-01-24  to  2022-01-30
 fetched Login hours 246314


Unnamed: 0,rider,yyyymmdd,day_of_week,login_hours,MORNING_DURATION,AFTERNOON_DURATION,EVENING_DURATION,week
0,5f44c9b07f36b6b5232a53ff,20220105,3,6.286111,0.0,2.86,3.343333,1
1,5f44c9b07f36b6b5232a53ff,20220104,2,1.744167,0.663889,0.077778,1.0025,1
2,5f44c9b07f36b6b5232a53ff,20220106,4,7.177222,1.207778,3.3775,2.591944,1
3,5f44c9b07f36b6b5232a53ff,20220107,5,1.478056,0.0,1.478056,0.0,1
4,5f44c9b07f36b6b5232a53ff,20220108,6,3.895,0.699167,1.14,2.055833,1


In [191]:
# LTR and Activation Date

def get_ltr(city):
    print("fetching LTR for : ",city)
    orders_query = """   SELECT captainId,
                                registrationdate, 
                                activationdate,
                                date_diff('day',date(activationdate),current_date) as days_on_platform,
                                lastridedate,
                                lifetimerides as ltr,
                                preferredtimebucket,
                                case when weekdaycaptain = true then 'weekday'
                                    when weekendcaptain = true then 'weekend'
                                    else 'undecided' end as prefferedday
                            from datasets.captain_single_view 
                            WHERE (lower(registeredcity) in ('{ct}') or lower(lastridecity) in ('{ct}')) 
                                AND activationdate is not null
           
               """.format(ct = city.lower())
    #print(orders_query)
    df_lh = pd.read_sql(orders_query, presto_conn)
    
    print(" fetched Login hours", len(df_lh))

    return df_lh


df_ltr = get_ltr(city)

df_ltr.head()

fetching LTR for :  Hyderabad
 fetched Login hours 432632


Unnamed: 0,captainId,registrationdate,activationdate,days_on_platform,ltr,preferredtimebucket,prefferedday
0,5bfe710fd76a02356e923fb7,2018-11-28,2018-11-30,1169,3,EVENING,undecided
1,609e8774b3eb983d77a734dd,2021-11-14,2021-11-14,89,11,EVENING,weekend
2,5cdfe31925ee3218d4cde492,2020-12-05,2020-12-08,430,7,EVENING,weekend
3,5d68c6a2a24c96105e2da230,2021-08-27,2021-09-16,148,149,AFTERNOON,weekend
4,5d3dcbf93860423ec785bbb8,2019-07-28,2022-01-23,19,0,MORNING,undecided


In [181]:
# Pings

def get_pings():
    #print("fetching poc segments for : ",dt)
    orders_query = """   select 
        captain_id,
        yyyymmdd,
        (sum(case when event_type in ('accepted') then 1 else 0 end)
            + sum(case when event_type in ('rider_busy') then 1 else 0 end)
            + sum(case when event_type in ('rider_reject') then 1 else 0 end)) as Total_Pings,
        (count(distinct(case when event_type='dropped' then order_id end))+
            (count(distinct(case when event_type='rider_busy' then order_id end)))+
            (count(distinct(case when event_type='rider_reject' then order_id end)))+
            (count(distinct(case when event_type='rider_cancelled' then order_id end)))+
            (count(distinct(case when event_type='customer_cancelled' and cancel_reason in ('Asked to change payment mode','Asked to pay extra','Drop location denied','Asked to cancel and take offline ride','Taking longer than expected') then order_id end)))) as True_total_pings,
        (count(distinct(case when event_type='accepted' then order_id end))) as accepted_pings,
        (count(distinct(case when event_type='dropped' then order_id end))) as dropped_orders,
        
        (sum(case when service_obj_service_name = 'Link' and event_type in ('accepted') then 1 else 0 end)
            + sum(case when service_obj_service_name = 'Link' and event_type in ('rider_busy') then 1 else 0 end)
            + sum(case when service_obj_service_name = 'Link' and event_type in ('rider_reject') then 1 else 0 end)) as Total_Pings_Link,
        (count(distinct(case when service_obj_service_name = 'Link' and event_type='dropped' then order_id end))+
            (count(distinct(case when service_obj_service_name = 'Link' and event_type='rider_busy' then order_id end)))+
            (count(distinct(case when service_obj_service_name = 'Link' and event_type='rider_reject' then order_id end)))+
            (count(distinct(case when service_obj_service_name = 'Link' and event_type='rider_cancelled' then order_id end)))+
            (count(distinct(case when service_obj_service_name = 'Link' and event_type='customer_cancelled' and cancel_reason in ('Asked to change payment mode','Asked to pay extra','Drop location denied','Asked to cancel and take offline ride','Taking longer than expected') then order_id end)))) as True_total_pings_link,
        (count(distinct(case when service_obj_service_name = 'Link' and event_type='accepted' then order_id end))) as accepted_pings_link,
        (count(distinct(case when service_obj_service_name = 'Link' and event_type='dropped' then order_id end))) as dropped_orders_link,
        
        (sum(case when service_obj_service_name in ('Delivery', 'Zomato') and event_type in ('accepted') then 1 else 0 end)
            + sum(case when service_obj_service_name in ('Delivery', 'Zomato') and event_type in ('rider_busy') then 1 else 0 end)
            + sum(case when service_obj_service_name in ('Delivery', 'Zomato') and event_type in ('rider_reject') then 1 else 0 end)) as Total_Pings_Delivery,
        (count(distinct(case when service_obj_service_name in ('Delivery', 'Zomato') and event_type='dropped' then order_id end))+
            (count(distinct(case when service_obj_service_name in ('Delivery', 'Zomato') and event_type='rider_busy' then order_id end)))+
            (count(distinct(case when service_obj_service_name in ('Delivery', 'Zomato') and event_type='rider_reject' then order_id end)))+
            (count(distinct(case when service_obj_service_name in ('Delivery', 'Zomato') and event_type='rider_cancelled' then order_id end)))+
            (count(distinct(case when service_obj_service_name in ('Delivery', 'Zomato') and event_type='customer_cancelled' and cancel_reason in ('Asked to change payment mode','Asked to pay extra','Drop location denied','Asked to cancel and take offline ride','Taking longer than expected') then order_id end)))) as True_total_pings_delivery,
        (count(distinct(case when service_obj_service_name in ('Delivery', 'Zomato') and event_type='accepted' then order_id end))) as accepted_pings_delivery,
        (count(distinct(case when service_obj_service_name in ('Delivery', 'Zomato') and event_type='dropped' then order_id end))) as dropped_orders_delivery

    from orders.order_logs_immutable as od
    where yyyymmdd between '20220103' and '20220130'
        and service_obj_service_name not like '%Auto%'
        and order_type not like '%auto%'
        and service_obj_city_display_name in ('Hyderabad')
    group by 1,2     
               """.format(sd = dt.replace('-',''))
    #print(orders_query)
    df_lh = pd.read_sql(orders_query, presto_conn)
    
    print(" fetched segment data : ", len(df_lh))

    return df_lh

df_pings = get_pings()

df_pings.head()

 fetched segment data :  318520


Unnamed: 0,captain_id,yyyymmdd,Total_Pings,True_total_pings,accepted_pings,dropped_orders,Total_Pings_Link,True_total_pings_link,accepted_pings_link,dropped_orders_link,Total_Pings_Delivery,True_total_pings_delivery,accepted_pings_delivery,dropped_orders_delivery
0,619cc3a3b6f39d1ee2bcfa34,20220130,17,14,15,11,17,14,15,11,0,0,0,0
1,61a387a9b44d8f66ae650a82,20220126,21,19,18,11,21,19,18,11,0,0,0,0
2,5c5d475ff2edc7336751b4e9,20220128,7,6,6,5,6,5,5,4,1,1,1,1
3,5d352ac3f8fc6575c0895fd9,20220126,35,34,9,4,35,34,9,4,0,0,0,0
4,5bbe14617e7bc30a383a5284,20220124,43,43,6,6,0,0,0,0,43,43,6,6


In [304]:
def get_daily_weekly_incentive_details(sd, ed, city):
    
    query="""
        select 
            riderid, yyyymmdd,
            sum(case when transactiontype = 'specialIncentive' then cast(amount as double) else 0.0 end) as total_incentive,
            sum(case when transactiontype = 'specialIncentive' and incentiveType='Weekly Fixed' then cast(amount as double) else 0.0 end) as weekly_incentive,
            sum(case when transactiontype = 'specialIncentive' and incentiveType='Daily' then cast(amount as double) else 0.0 end) as daily_incentive,
            --sum(case when serviceType = '572e29b0116b5db3057bd821' then cast(orderSlabData_orderEarning as  double) else  0.0 end) as  orderSlabData_orderEarning,
            sum(case when serviceType = '572e29b0116b5db3057bd821' then cast(totalEarning as double) else  0.0 end) as order_earning
            from(
                select
                    riderid, city,
                    date_trunc('week', date_parse(yyyymmdd, '%Y%m%d')) as week,
                    yyyymmdd,
                    incentiveData_startDate, incentiveName,
                    orderSlabData_orderEarning, totalEarning,
                    tincentiveidl as incentiveid, incentivestage, 
                    amount, incentiveData_ordersIncentiveSlab,
                    subincentiveid, transactiontype, incentiveType,
                    servicetype,
                    row_number() over(partition by raw_tbl._id order by updated_epoch desc) as row
                from raw.mongodb_rapidopayroll_riderspaymentnew_immutable as raw_tbl
                where yyyymmdd >= '{sd}'
                    and yyyymmdd <= '{ed}'
                    --and transactiontype ='specialIncentive'
                    and status ='success'
                    and city = '{ct}'
                    --and servicedetailid in ('5bed473f1278885df4ea9d57', --'574013f14fdf4798208bba26',  '57370b61a6855d70057417d1')
                )
            where row=1
            group by 1, 2
    """.format(sd=sd.replace('-', ''), ed=ed.replace('-', ''), ct=city)

    incentive_df = pd.read_sql_query(query, presto_conn)

    return incentive_df

date_list = [pd.to_datetime(start_date) + timedelta(days=x) for x in range((pd.to_datetime(end_date)-pd.to_datetime(start_date)).days + 1)
        if (pd.to_datetime(start_date) + timedelta(days=x)).weekday() == 6]
date_list = [x.strftime('%Y-%m-%d') for x in date_list]

df_incentive = pd.DataFrame()

for day in date_list : 
    incentive = get_daily_weekly_incentive_details(day, (pd.to_datetime(day) + timedelta(6)).strftime('%Y-%m-%d'), city)    
    df_incentive = pd.concat([df_incentive,incentive])

df_incentive.head()

Unnamed: 0,riderid,yyyymmdd,total_incentive,weekly_incentive,daily_incentive,order_earning
0,5fce7b570141472d021187ef,20220115,70.0,0.0,70.0,844.8985
1,5bdf2b6a65784d23919903b1,20220112,215.0,140.0,75.0,462.3598
2,61082e7adf98a5540c88cda3,20220110,170.0,0.0,170.0,0.0
3,5f715365207b1e350746e34d,20220109,0.0,0.0,0.0,0.0
4,5c5980f3f2edc733674e4e6a,20220110,0.0,0.0,0.0,330.3355


In [308]:
# df_segment.to_csv('segment_for_persona.csv', index=False)
# rf_segment.to_csv('rf_segment_for_persona.csv', index=False)
# df_rides.to_csv('rides_for_persona.csv', index=False)
# df_pings.to_csv('pings_for_persona.csv', index=False)
# df_login_hours.to_csv('login_hours_for_persona.csv', index=False)
# df_ltr.to_csv('ltr_for_persona.csv', index=False)
# df_incentive.to_csv('incentive_for_persona.csv', index=False)

In [4]:
df_segment = pd.read_csv('segment_for_persona.csv')
rf_segment = pd.read_csv('rf_segment_for_persona.csv')
df_rides = pd.read_csv('rides_for_persona.csv')
df_login_hours = pd.read_csv('login_hours_for_persona.csv')
df_ltr = pd.read_csv('ltr_for_persona.csv')
df_incentive = pd.read_csv('incentive_for_persona.csv')

# Rating

In [117]:
# PERCENTILE DISTRIBUTION OF RATING

df_tmp = df_rides.drop(['week','weekday','hour','order_date','link_orders','delivery_orders','total_orders'], axis=1)
df_tmp['rating'] = df_tmp['rating'].fillna(5)
df_tmp = df_tmp.groupby('captain_id').agg('mean').reset_index()
df_tmp = df_segment[['RIDER','Segment']].merge(rf_segment.drop('week', axis=1).rename(columns = {"captainid":"RIDER"}), how = 'left', on = 'RIDER').merge(
                                                df_tmp.rename(columns={"captain_id":"RIDER"}), how = 'inner', on = 'RIDER')
df_tmp['rating'].describe([0.25, 0.5, 0.60, 0.75, 0.80, 0.9])

count    36606.000000
mean         4.526142
std          0.514927
min          0.000000
25%          4.363636
50%          4.592105
60%          4.680556
75%          4.888889
80%          5.000000
90%          5.000000
max          5.000000
Name: rating, dtype: float64

In [65]:
percentile_75 = np.percentile(df_tmp['rating'], 75)
percentile_50 = np.percentile(df_tmp['rating'], 50)
percentile_25 = np.percentile(df_tmp['rating'], 25)
df_tmp['rating_category'] = df_tmp['rating'].apply(lambda x : 'Perfectionist' if x > percentile_75
                                                        else 'Above Average' if x > percentile_50
                                                        else 'Average' if x > percentile_25
                                                        else 'Below Average')
df_tmp['rating_category'].value_counts()

Below Average    7899
Average          7899
Above Average    7896
Perfectionist    7888
Name: rating_category, dtype: int64

In [66]:
x = df_tmp.pivot_table(index='Segment', columns = 'rating_category', values = 'RIDER', aggfunc = 'nunique').reset_index()
y = df_tmp.pivot_table(index='Segment', values = 'RIDER', aggfunc = 'nunique').drop('Segment', axis=1).reset_index()
z = x.merge(y, how = 'inner', on = 'Segment')
z['Above Average'] = round(z['Above Average']/z['RIDER']*100.0,1)
z['Average'] = round(z['Average']/z['RIDER']*100.0,1)
z['Below Average'] = round(z['Below Average']/z['RIDER']*100.0,1)
z['Perfectionist'] = round(z['Perfectionist']/z['RIDER']*100.0,1)
z

Unnamed: 0,Segment,Above Average,Average,Below Average,Perfectionist,RIDER
0,HP_D_HO,26.8,45.1,23.0,5.1,2118
1,HP_D_UHO,25.6,48.3,19.9,6.2,720
2,LP_D_HO,29.7,18.3,22.7,29.3,1034
3,LP_D_LO,24.6,18.3,25.3,31.8,7188
4,LP_D_MO,25.9,23.8,26.2,24.1,3757
5,LP_D_UHO,22.8,12.9,10.8,53.5,372
6,LP_Inter_HO,1.5,6.0,23.9,68.7,67
7,LP_Inter_LO,5.6,4.5,22.5,67.4,2333
8,LP_Inter_MO,7.6,6.7,23.7,62.1,224
9,LP_Inter_UHO,,,13.0,87.0,23


In [123]:
x = df_tmp.groupby('Segment').apply(lambda x : x.describe([0.50,0.75,0.95])).reset_index()
x[x.level_1.isin(['mean','50%', '75%', '95%'])].pivot(index='Segment', columns='level_1')

Unnamed: 0_level_0,rating,rating,rating,rating
level_1,50%,75%,95%,mean
Segment,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
HP_D_HO,4.416667,4.541667,4.8,4.399197
HP_D_UHO,4.422779,4.546327,4.840045,4.423069
LP_D_HO,4.4375,4.728147,5.0,4.343573
LP_D_LO,4.5,4.857143,5.0,4.397706
LP_D_MO,4.433333,4.714286,5.0,4.379059
LP_D_UHO,4.5,5.0,5.0,4.393179
LP_Inter_HO,5.0,5.0,5.0,4.244509
LP_Inter_LO,5.0,5.0,5.0,4.39241
LP_Inter_MO,5.0,5.0,5.0,4.262689
LP_Inter_UHO,5.0,5.0,5.0,4.663399


In [206]:
# Rating calculation based on rides percentile distribution

df_tmp = df_rides.drop(['week','weekday','hour','order_date','link_orders','delivery_orders'], axis=1)
#df_tmp['rating'] = df_tmp['rating'].fillna(5)
df_tmp = df_tmp.groupby('captain_id').agg({'rating':'mean','total_orders':'sum'}).reset_index()
df_tmp = df_segment[['RIDER','Segment']].merge(rf_segment.drop('week', axis=1).rename(columns = {"captainid":"RIDER"}), how = 'left', on = 'RIDER').merge(
                                                df_tmp.rename(columns={"captain_id":"RIDER"}), how = 'inner', on = 'RIDER')
df_tmp = df_tmp.sort_values(by='rating', ascending = True)
df_tmp['cum_percent'] = 100*(df_tmp.total_orders.cumsum() / df_tmp.total_orders.sum())
percentile_25 = df_tmp[df_tmp['cum_percent']<=25]['rating'].mean()
percentile_50 = df_tmp[(df_tmp['cum_percent']>25) & (df_tmp['cum_percent']<=50)]['rating'].mean()
percentile_75 = df_tmp[(df_tmp['cum_percent']>50) & (df_tmp['cum_percent']<=75)]['rating'].mean()
df_tmp['rating_category'] = df_tmp['rating'].apply(lambda x : 'perfectionist' if x > percentile_75
                                                            else 'above average' if x > percentile_50
                                                            else 'average' if x > percentile_25
                                                            else 'below average')
x = df_tmp.pivot_table(index='Segment', columns = 'rating_category', values = 'RIDER', aggfunc = 'nunique')
x = x.div(x.sum(axis=1), axis=0)*100.0
#x.reset_index(inplace=True)
thresh = 50   #percent
x['max_value'] = x.max(axis = 1)
x['rating_category'] = x.idxmax(axis = 1)
# x['final_affinity'] = x.apply(lambda x : x.rating_category if x.max_value > thresh
#                                      else 'undecided', axis = 1)
# x[['final_affinity']]
x['rating_category']

Segment
HP_D_HO          perfectionist
HP_D_UHO         above average
HP_Inter_HO      perfectionist
HP_Inter_UHO     perfectionist
HP_Intra_HO      perfectionist
HP_Intra_UHO     perfectionist
LP_D_HO          perfectionist
LP_D_LO          perfectionist
LP_D_MO          perfectionist
LP_D_UHO         perfectionist
LP_Inter_HO      perfectionist
LP_Inter_LO      perfectionist
LP_Inter_MO      perfectionist
LP_Inter_UHO     below average
LP_Intra_HO      perfectionist
LP_Intra_LO      perfectionist
LP_Intra_MO      perfectionist
LP_Intra_UHO     perfectionist
MP_D_HO                average
MP_D_LO          perfectionist
MP_D_MO          perfectionist
MP_D_UHO               average
MP_Inter_HO      perfectionist
MP_Inter_LO      perfectionist
MP_Inter_MO      perfectionist
MP_Inter_UHO     perfectionist
MP_Intra_HO      perfectionist
MP_Intra_LO      perfectionist
MP_Intra_MO      perfectionist
MP_Intra_UHO     perfectionist
UHP_D_UHO        above average
UHP_Inter_UHO    perfectionist


# CU AFFINITY

In [142]:
# CU affinity

df_tmp = df_rides.drop(['rating','week','weekday','hour','order_date'], axis=1)
df_tmp = df_tmp.groupby('captain_id').agg('sum').reset_index()
df_tmp['cu_affinity'] = df_tmp.apply(lambda x : 'others' if (x.link_orders+x.delivery_orders) == 0
                                            else 'link' if x.link_orders/(x.link_orders+x.delivery_orders) > 0.67
                                            else 'delivery' if x.delivery_orders/(x.link_orders+x.delivery_orders) > 0.67
                                            else 'cu', axis=1)
df_tmp.shape

(39609, 5)

In [143]:
df_tmp['cu_affinity'].value_counts()

link        33890
delivery     2809
cu           1654
others       1256
Name: cu_affinity, dtype: int64

In [144]:
df_tmp[df_tmp['cu_affinity']=='cu']

Unnamed: 0,captain_id,link_orders,delivery_orders,total_orders,cu_affinity
28,585a607008b1b926573edeaa,79,72,182,cu
55,592682e6e8f538611ab2aca1,19,19,43,cu
84,59cf79a3b3d1703d0fd4d8db,1,2,3,cu
88,59d119191c44ce883776dfb4,5,5,11,cu
91,59d25abeab2d7b83375b880b,14,12,32,cu
...,...,...,...,...,...
39396,61f129d36916b81fc7abd293,8,8,23,cu
39397,61f12bcbc536c67bcc6e2e80,12,6,19,cu
39435,61f19f506916b85b54ac2df1,4,6,10,cu
39447,61f22bcbebef827ab22690a0,12,7,20,cu


# Time Bucket Login Affinity

In [128]:
df_login_hours.head()

Unnamed: 0,rider,yyyymmdd,day_of_week,login_hours,MORNING_DURATION,AFTERNOON_DURATION,EVENING_DURATION,week
0,61503275f2602cdbbc2bb188,20220104,2,0.131111,0.131111,0.0,0.0,1
1,5fd9cc2b08e6158003caa6c8,20220104,2,1.070833,0.011944,0.405556,0.653333,1
2,5fd9cc2b08e6158003caa6c8,20220105,3,1.594444,0.0,0.309167,1.285278,1
3,5fd9cc2b08e6158003caa6c8,20220106,4,3.575,2.170278,0.0,0.0,1
4,5fd9cc2b08e6158003caa6c8,20220108,6,2.567222,0.0,1.548611,1.018611,1


In [284]:
df_tmp = df_login_hours.drop(['yyyymmdd','day_of_week','week'], axis = 1)
df_tmp['dummy'] = 1
df_tmp = df_tmp.groupby(['rider','dummy']).agg('sum').reset_index()
#df_tmp = df_tmp.merge(df_rides[['captain_id']].drop_duplicates(), how = 'inner', left_on = 'rider', right_on = 'captain_id').drop('captain_id', axis=1)
df_tmp.groupby('dummy').apply(lambda x : x.describe([0.50,0.75,0.95])).drop('dummy', axis=1).reset_index()
#df_tmp[df_tmp.level_1.isin(['mean','50%', '75%', '95%'])].pivot(index='dummy', columns='level_1')

Unnamed: 0,dummy,level_1,login_hours,MORNING_DURATION,AFTERNOON_DURATION,EVENING_DURATION
0,1,count,96181.0,96181.0,96181.0,96181.0
1,1,mean,36.458212,9.98597,15.307875,9.763828
2,1,std,53.827946,18.302317,23.965573,16.165745
3,1,min,0.000278,0.0,0.0,0.0
4,1,50%,10.396944,1.113611,3.425,2.365
5,1,75%,50.775833,10.878611,20.378056,12.285833
6,1,95%,157.468333,51.918889,70.483611,45.050833
7,1,max,523.053611,164.241667,160.065,150.688056


In [155]:
df_r = df_rides[['captain_id','hour','order_date','total_orders']]
df_r['time_bucket'] = df_r['hour'].apply(lambda x : 'morning' if x >= 6 and x <= 11
                                     else 'afternoon' if x >= 12 and x <= 17
                                     else 'evening' if x >= 18 and x <= 23
                                     else 'others')
df_r = df_r.groupby(['captain_id','order_date','time_bucket']).agg('sum').drop('hour', axis = 1).reset_index()
#df_r.pivot_table(index=['captain_id','order_date'], columns = 'time_bucket', values = 'total_orders', aggfunc = 'sum').reset_index()
df_r['order_date'] = df_r['order_date'].replace("-","")

df_tmp = df_segment[['RIDER','Segment']].merge(rf_segment.drop('week', axis=1).rename(columns = {"captainid":"RIDER"}), how = 'left', on = 'RIDER').merge(
                                                df_login_hours.drop(['day_of_week','week'], axis = 1).rename(columns={"rider":"RIDER"}), how = 'inner', on = 'RIDER')

#Morning
df_tmp = df_tmp.merge(df_r[df_r['time_bucket']=='morning'].rename(columns={"captain_id":"RIDER","order_date":"yyyymmdd","time_bucket":"morning_time_bucket"}), 
                              how = 'inner', on = ['RIDER','yyyymmdd']).drop(['total_orders'], axis=1).merge(
                    df_r[df_r['time_bucket']=='afternoon'].rename(columns={"captain_id":"RIDER","order_date":"yyyymmdd","time_bucket":"afternoon_time_bucket"}), 
                              how = 'inner', on = ['RIDER','yyyymmdd']).drop(['total_orders'], axis=1).merge(
                    df_r[df_r['time_bucket']=='evening'].rename(columns={"captain_id":"RIDER","order_date":"yyyymmdd","time_bucket":"evening_time_bucket"}), 
                              how = 'inner', on = ['RIDER','yyyymmdd']).drop(['total_orders'], axis=1)

df_tmp['MORNING_DURATION'] = df_tmp.apply(lambda x : 0 if pd.isna(x.morning_time_bucket) else x.MORNING_DURATION, axis = 1)
df_tmp['AFTERNOON_DURATION'] = df_tmp.apply(lambda x : 0 if pd.isna(x.afternoon_time_bucket) else x.AFTERNOON_DURATION, axis = 1)
df_tmp['EVENING_DURATION'] = df_tmp.apply(lambda x : 0 if pd.isna(x.evening_time_bucket) else x.EVENING_DURATION, axis = 1)
df_tmp.drop(['morning_time_bucket','afternoon_time_bucket','evening_time_bucket'], axis = 1, inplace=True)
df_tmp

Unnamed: 0,RIDER,Segment,recency,yyyymmdd,login_hours,MORNING_DURATION,AFTERNOON_DURATION,EVENING_DURATION
0,5b4a15e3e6982857b8ba24ef,MP_D_MO,RECENT,20220105,8.733889,0.585000,4.095278,4.053611
1,5b4a15e3e6982857b8ba24ef,MP_D_MO,RECENT,20220120,7.988333,1.030000,4.514722,2.443611
2,5b4a15e3e6982857b8ba24ef,MP_D_MO,RECENT,20220122,8.363889,0.326111,4.336667,3.701111
3,5b4a15e3e6982857b8ba24ef,MP_D_MO,RECENT,20220125,3.807500,0.281389,2.885556,0.640556
4,5b4a15e3e6982857b8ba24ef,MP_D_MO,RECENT,20220126,6.323889,0.984722,3.118611,2.123056
...,...,...,...,...,...,...,...,...
28907,61cccee29ba37216482e3ed2,MP_D_HO,RECENT,20220129,7.131667,2.742500,1.611111,2.778056
28908,61deedc1d80f2b3194a5e064,MP_D_HO,RECENT,20220127,16.538056,2.841111,4.800833,6.847222
28909,61e57d357fb84ea9b66673f3,MP_D_LO,RECENT,20220126,5.174722,1.932222,1.609444,1.633056
28910,61ebf585ebef82d0c1228bd9,LP_D_LO,RECENT,20220124,5.682778,0.863889,3.243056,1.569167


In [156]:
# Time Bucket login affinity

df_tmp = df_tmp.drop(['yyyymmdd','recency'], axis = 1)
df_tmp = df_tmp.groupby(['RIDER','Segment']).agg('sum').reset_index()
df_tmp['total'] = df_tmp['MORNING_DURATION'].fillna(0) + df_tmp['AFTERNOON_DURATION'].fillna(0) + df_tmp['EVENING_DURATION'].fillna(0)
df_tmp['time_bucket_affinity'] = df_tmp.apply(lambda x : 'night' if x.total == 0
                                                     else 'morning' if x.MORNING_DURATION/x.total > 0.67
                                                     else 'afternoon' if x.AFTERNOON_DURATION/x.total > 0.67
                                                     else 'evening' if x.EVENING_DURATION/x.total > 0.67
                                                     else 'morning_afternoon' if (x.MORNING_DURATION+x.AFTERNOON_DURATION)/x.total > 0.67 and np.argmax([x.MORNING_DURATION,x.AFTERNOON_DURATION,x.EVENING_DURATION]) in (0,1) and np.argmax([x.MORNING_DURATION,x.EVENING_DURATION]) == 0 and np.argmax([x.AFTERNOON_DURATION,x.EVENING_DURATION]) == 0
                                                     else 'afternoon_evening' if (x.AFTERNOON_DURATION+x.EVENING_DURATION)/x.total > 0.67 and np.argmax([x.MORNING_DURATION,x.AFTERNOON_DURATION,x.EVENING_DURATION]) in (1,2) and np.argmax([x.MORNING_DURATION,x.AFTERNOON_DURATION]) == 1 and np.argmax([x.MORNING_DURATION,x.EVENING_DURATION]) == 1
                                                     else 'morning_evening' if (x.MORNING_DURATION+x.EVENING_DURATION)/x.total > 0.67 and np.argmax([x.MORNING_DURATION,x.AFTERNOON_DURATION,x.EVENING_DURATION]) in (0,2) and np.argmax([x.MORNING_DURATION,x.AFTERNOON_DURATION]) == 0 and np.argmax([x.AFTERNOON_DURATION,x.EVENING_DURATION]) == 1
                                                     else 'full_day', axis=1)
df_tmp['time_bucket_affinity'].value_counts()

afternoon_evening    5401
morning_evening      2433
morning_afternoon    1193
evening               238
afternoon              93
morning                42
full_day                4
Name: time_bucket_affinity, dtype: int64

In [178]:
x = df_tmp.pivot_table(index='Segment', columns= 'time_bucket_affinity', values = 'RIDER', aggfunc = 'nunique', fill_value = 0)
x = x.div(x.sum(axis=1), axis=0)*100.0
#x.reset_index(inplace=True)
thresh = 50   #percent
x['max_value'] = x.max(axis = 1)
x['time_bucket_affinity'] = x.idxmax(axis = 1)
x['final_affinity'] = x.apply(lambda x : x.time_bucket_affinity if x.max_value > thresh
                                     else 'undecided', axis = 1)
x[['final_affinity']]

time_bucket_affinity,final_affinity
Segment,Unnamed: 1_level_1
HP_D_HO,afternoon_evening
HP_D_UHO,afternoon_evening
HP_Inter_HO,afternoon_evening
HP_Inter_UHO,afternoon_evening
HP_Intra_HO,afternoon_evening
HP_Intra_UHO,undecided
LP_D_HO,afternoon_evening
LP_D_LO,afternoon_evening
LP_D_MO,afternoon_evening
LP_D_UHO,afternoon_evening


In [143]:
# Time Bucket login unavailability

df_tmp = df_login_hours.drop(['yyyymmdd','day_of_week','week'], axis = 1)
df_tmp = df_tmp.groupby('rider').agg('sum').reset_index()
df_tmp['time_bucket_unavailability'] = df_tmp.apply(lambda x : 'full_day' if x.login_hours == 0
                                                     else 'morning_afternoon' if x.MORNING_DURATION == x.AFTERNOON_DURATION == 0
                                                     else 'afternoon_evening' if x.AFTERNOON_DURATION == x.EVENING_DURATION == 0
                                                     else 'morning_evening' if x.MORNING_DURATION == x.EVENING_DURATION == 0
                                                     else 'morning' if x.MORNING_DURATION == 0
                                                     else 'afternoon' if x.AFTERNOON_DURATION == 0
                                                     else 'evening', axis=1)
df_tmp['time_bucket_unavailability'].value_counts()

evening              62932
morning              11156
morning_afternoon     8764
morning_evening       6899
afternoon_evening     4124
afternoon             2306
Name: time_bucket_unavailability, dtype: int64

In [146]:
df_tmp = df_segment[['RIDER','Segment']].merge(rf_segment.drop('week', axis=1).rename(columns = {"captainid":"RIDER"}), how = 'left', on = 'RIDER').merge(
                                                df_tmp[['rider','time_bucket_unavailability']].rename(columns={"rider":"RIDER"}), how = 'left', on = 'RIDER')
df_tmp.pivot_table(index='Segment', columns= 'time_bucket_unavailability', values = 'RIDER', aggfunc = 'nunique').reset

time_bucket_unavailability,afternoon,afternoon_evening,evening,morning,morning_afternoon,morning_evening
Segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HP_D_HO,2.0,7.0,1395.0,106.0,8.0,9.0
HP_D_UHO,1.0,,480.0,10.0,2.0,5.0
LP_D_HO,9.0,1.0,545.0,51.0,8.0,1.0
LP_D_LO,92.0,54.0,3067.0,817.0,204.0,76.0
LP_D_MO,46.0,9.0,2093.0,272.0,68.0,12.0
LP_D_UHO,2.0,,141.0,14.0,,
LP_Inter_HO,,,4.0,2.0,,
LP_Inter_LO,48.0,49.0,379.0,172.0,85.0,85.0
LP_Inter_MO,4.0,1.0,41.0,16.0,3.0,2.0
LP_Inter_UHO,,1.0,1.0,,,


# Weekday / Weekend Affinity

In [16]:
df_tmp = df_login_hours.drop(['week','MORNING_DURATION','AFTERNOON_DURATION','EVENING_DURATION'], axis = 1)
df_r = df_rides[['captain_id','order_date','total_orders']].groupby(['captain_id','order_date']).agg('sum').reset_index()
df_tmp = df_tmp.merge(df_r.rename(columns= {"captain_id":"rider","order_date":"yyyymmdd"}), how = 'inner', on = ['rider','yyyymmdd'])
df_tmp = df_tmp.drop(['yyyymmdd','total_orders'], axis=1).groupby(['rider','day_of_week']).agg('sum').reset_index()
df_tmp = df_tmp.pivot(index='rider', columns = 'day_of_week', values = 'login_hours').reset_index()
df_tmp['weekday_normal'] = (df_tmp[1].fillna(0)+df_tmp[2].fillna(0)+df_tmp[3].fillna(0)+df_tmp[4].fillna(0)+df_tmp[5].fillna(0))/5
df_tmp['weekend_normal'] = (df_tmp[6].fillna(0)+df_tmp[7].fillna(0))/2
df_tmp['weekly_affinity'] = df_tmp.apply(lambda x : 'weekday' if x.weekday_normal/(x.weekday_normal+x.weekend_normal) > 0.75
                                                          else 'weekend' if x.weekend_normal/(x.weekday_normal+x.weekend_normal) > 0.75
                                                          else 'equally_affine', axis=1)
df_tmp.head()

day_of_week,rider,1,2,3,4,5,6,7,weekday_normal,weekend_normal,weekly_affinity
0,573f290d9b0ffc2836775a02,1.532778,1.283333,0.632778,,,,1.36,0.689778,0.68,equally_affine
1,573f292f9b0ffc283677d10b,3.026389,,,,,,,0.605278,0.0,weekday
2,573f29349b0ffc283677d9c1,,2.778056,0.571111,,,,,0.669833,0.0,weekday
3,5769691fdc61a1a753f86007,1.3275,,,,,2.063333,10.549722,0.2655,6.306528,weekend
4,577a8c8c3b01c336155f1c3d,1.879167,,,,1.987778,,,0.773389,0.0,weekday


In [17]:
df_tmp = df_segment[['RIDER','Segment']].merge(rf_segment.drop('week', axis=1).rename(columns = {"captainid":"RIDER"}), how = 'left', on = 'RIDER').merge(
                                                df_tmp[['rider','weekly_affinity']].rename(columns={"rider":"RIDER"}), how = 'left', on = 'RIDER')
df_tmp.pivot_table(index='Segment', columns= 'weekly_affinity', values = 'RIDER', aggfunc = 'nunique').reset_index()

weekly_affinity,Segment,equally_affine,weekday,weekend
0,HP_D_HO,1449.0,452.0,215.0
1,HP_D_UHO,533.0,124.0,62.0
2,LP_D_HO,606.0,242.0,188.0
3,LP_D_LO,2996.0,2984.0,1283.0
4,LP_D_MO,2012.0,1165.0,585.0
5,LP_D_UHO,264.0,60.0,55.0
6,LP_Inter_HO,12.0,38.0,17.0
7,LP_Inter_LO,231.0,1578.0,526.0
8,LP_Inter_MO,29.0,150.0,45.0
9,LP_Inter_UHO,1.0,18.0,4.0


In [225]:
df_tmp = df_login_hours.drop(['week','login_hours','MORNING_DURATION','AFTERNOON_DURATION','EVENING_DURATION'], axis = 1)
df_tmp = df_tmp.groupby(['rider','day_of_week']).agg('count').reset_index()
df_tmp = df_tmp.pivot(index='rider', columns = 'day_of_week', values = 'yyyymmdd').reset_index()
# x['weekday_normal'] = (x[1].fillna(0)+x[2].fillna(0)+x[3].fillna(0)+x[4].fillna(0)+x[5].fillna(0))/5
# x['weekend_normal'] = (x[6].fillna(0)+x[7].fillna(0))/2
df_tmp['weekly_unavailability'] = df_tmp.apply(lambda x : 'weekday' if pd.isna(x[1]) and pd.isna(x[2]) and pd.isna(x[3]) and pd.isna(x[4]) and pd.isna(x[5])
                                                          else 'weekend' if pd.isna(x[6]) and pd.isna(x[7])
                                                          else 'full_week_unavailable' if pd.isna(x[1]) and pd.isna(x[2]) and pd.isna(x[3]) and pd.isna(x[4]) and pd.isna(x[5]) and pd.isna(x[6]) and pd.isna(x[7])
                                                          else 'full_week_available', axis=1)
df_tmp.head()

day_of_week,rider,1,2,3,4,5,6,7,weekly_unavailability
0,573f290a9b0ffc28367746af,,,1.0,,,,,weekend
1,573f290d9b0ffc2836775a02,1.0,1.0,1.0,,,,1.0,full_week_available
2,573f292f9b0ffc283677d10b,2.0,1.0,1.0,,,,,weekend
3,573f29349b0ffc283677d9c1,,1.0,1.0,,,,,weekend
4,573f29359b0ffc283677df3a,,,,,,,1.0,weekday


In [226]:
df_tmp = df_segment[['RIDER','Segment']].merge(rf_segment.drop('week', axis=1).rename(columns = {"captainid":"RIDER"}), how = 'left', on = 'RIDER').merge(
                                                df_tmp[['rider','weekly_unavailability']].rename(columns={"rider":"RIDER"}), how = 'left', on = 'RIDER')
df_tmp.pivot_table(index='Segment', columns= 'weekly_unavailability', values = 'RIDER', aggfunc = 'nunique').reset_index()

weekly_unavailability,Segment,full_week_available,weekday,weekend
0,HP_D_HO,1828.0,98.0,282.0
1,HP_D_UHO,660.0,32.0,77.0
2,LP_D_HO,932.0,48.0,123.0
3,LP_D_LO,5631.0,365.0,1594.0
4,LP_D_MO,3283.0,143.0,503.0
5,LP_D_UHO,357.0,14.0,19.0
6,LP_Inter_HO,57.0,8.0,18.0
7,LP_Inter_LO,1428.0,226.0,1007.0
8,LP_Inter_MO,151.0,19.0,95.0
9,LP_Inter_UHO,17.0,,11.0


# Loyalty

In [275]:
df_ltr.describe([0.3,0.5,0.70,0.95])

Unnamed: 0,days_on_platform,ltr
count,432632.0,432632.0
mean,596.255157,96.773542
std,341.860333,238.980784
min,1.0,0.0
30%,369.0,3.0
50%,713.0,19.0
70%,828.0,66.0
95%,1075.0,435.0
max,1920.0,14887.0


In [207]:
df_tmp = df_segment[['RIDER','Segment']].merge(rf_segment.drop('week', axis=1).rename(columns = {"captainid":"RIDER"}), how = 'left', on = 'RIDER').merge(
                                                df_ltr.rename(columns={"captainId":"RIDER"}), how = 'left', on = 'RIDER')

p1 = np.percentile(df_ltr['ltr'], 80)
p2 = np.percentile(df_ltr['ltr'], 50)

df_tmp['loyalty'] = df_tmp['ltr'].apply(lambda x : 'long_timer' if x > p1
                                                   else 'seasoned' if x > p2
                                                   else 'newbie')
df_tmp['loyalty'].value_counts()

long_timer    17359
seasoned      15109
newbie         8213
Name: loyalty, dtype: int64

In [208]:
df_tmp.pivot_table(index='Segment', columns= 'loyalty', values = 'RIDER', aggfunc = 'nunique').reset_index()

loyalty,Segment,long_timer,newbie,seasoned
0,HP_D_HO,1427.0,30.0,361.0
1,HP_D_UHO,543.0,1.0,69.0
2,HP_Inter_HO,81.0,91.0,127.0
3,HP_Inter_UHO,28.0,30.0,34.0
4,HP_Intra_HO,171.0,4.0,96.0
5,HP_Intra_UHO,22.0,,17.0
6,LP_D_HO,315.0,95.0,275.0
7,LP_D_LO,1688.0,828.0,1579.0
8,LP_D_MO,1111.0,348.0,979.0
9,LP_D_UHO,97.0,32.0,57.0


In [201]:
df_ltr

Unnamed: 0,captainId,registrationdate,activationdate,days_on_platform,ltr,preferredtimebucket,prefferedday
0,5bfe710fd76a02356e923fb7,2018-11-28,2018-11-30,1169,3,EVENING,undecided
1,609e8774b3eb983d77a734dd,2021-11-14,2021-11-14,89,11,EVENING,weekend
2,5cdfe31925ee3218d4cde492,2020-12-05,2020-12-08,430,7,EVENING,weekend
3,5d68c6a2a24c96105e2da230,2021-08-27,2021-09-16,148,149,AFTERNOON,weekend
4,5d3dcbf93860423ec785bbb8,2019-07-28,2022-01-23,19,0,MORNING,undecided
...,...,...,...,...,...,...,...
432627,5e5a1f1acf3bb61ddee79649,2020-02-29,2020-02-29,713,0,MORNING,undecided
432628,5c70a06a5e042733c9bc4832,2019-02-23,2021-05-21,266,21,EVENING,undecided
432629,5e6252e968c265c3a6af99de,2020-08-23,2020-08-28,532,136,AFTERNOON,weekday
432630,5c7f43d48c352421eadb8a3c,2019-06-23,2019-07-01,956,0,MORNING,undecided


In [None]:
def create_weights(df):
    df[df.columns[0]] =  1/df[df.columns[0]]
    df["temp_total"] = df[df.columns[0]]
    for i in range(1, len(df.columns)-1):
        df[df.columns[i]] = 1/df[df.columns[i]]
        df["temp_total"] += df[df.columns[i]]
    for i in range(0, len(df.columns)):
        df[df.columns[i]] = df[df.columns[i]] / df["temp_total"]  
    df = df.drop(columns = ["temp_total"])
    return df

In [223]:
df_tmp = df_rides[['captain_id','order_date','link_orders','delivery_orders','total_orders']]
df_tmp['other_orders'] = df_tmp['total_orders']-df_tmp['link_orders']-df_tmp['delivery_orders']
df_tmp = df_tmp.groupby(['captain_id','order_date']).agg('sum').reset_index().groupby('captain_id').agg('sum').reset_index()
df_weights = create_weights(pd.DataFrame([[df_tmp['link_orders'].sum(), df_tmp['delivery_orders'].sum(), df_tmp['other_orders'].sum()]], columns = ['link','delivery','others']))
df_weights

Unnamed: 0,link,delivery,others
0,0.017683,0.110113,0.872204


In [238]:
df_tmp = df_rides[['captain_id','order_date','link_orders','delivery_orders','total_orders']]
df_tmp['other_orders'] = df_tmp['total_orders']-df_tmp['link_orders']-df_tmp['delivery_orders']
df_tmp = df_tmp.groupby(['captain_id']).agg({'link_orders':'sum', 'delivery_orders':'sum', 'other_orders':'sum','order_date':'nunique'}).reset_index()
df_tmp['link_orders'] = df_tmp['link_orders']*df_weights.loc[0,'link']/df_tmp['order_date']
df_tmp['delivery_orders'] = df_tmp['delivery_orders']*df_weights.loc[0,'delivery']/df_tmp['order_date']
df_tmp['other_orders'] = df_tmp['other_orders']*df_weights.loc[0,'others']/df_tmp['order_date']
df_tmp['affinity_percent'] = df_tmp.apply(lambda x : max(x['link_orders'], x['delivery_orders'], x['other_orders'])/(x['link_orders']+x['delivery_orders']+x['other_orders'])*100, axis=1)
threshold = np.percentile(df_tmp['affinity_percent'], 75)
df_tmp

Unnamed: 0,captain_id,link_orders,delivery_orders,other_orders,order_date,affinity_percent
0,573f290d9b0ffc2836775a02,0.022103,0.0,0.0,4,100.0
1,573f292f9b0ffc283677d10b,0.070731,0.0,0.0,1,100.0
2,573f29349b0ffc283677d9c1,0.061889,0.0,0.0,2,100.0
3,577a8c8c3b01c336155f1c3d,0.035365,0.0,0.0,2,100.0
4,5796702bd1810d961888b1bd,0.053048,0.0,0.0,3,100.0
...,...,...,...,...,...,...
39604,61f64451ebef8210e429217a,0.017683,0.0,0.0,1,100.0
39605,61f6483aebef8275ce29246f,0.017683,0.0,0.0,1,100.0
39606,61f65b5bd0c48f4c6c4b8fc7,0.053048,0.0,0.0,1,100.0
39607,61f661336916b8d10faf0b5a,0.017683,0.0,0.0,1,100.0


In [241]:
df_tmp['affinity_percent'].describe([0.3, 0.4, 0.5, 0.75, 0.8])

count    39609.000000
mean        92.742749
std         13.703721
min         35.380114
30%        100.000000
40%        100.000000
50%        100.000000
75%        100.000000
80%        100.000000
max        100.000000
Name: affinity_percent, dtype: float64

Unnamed: 0,captain_id,link_orders,delivery_orders,other_orders,order_date,affinity_percent
14,584d2e3bd543f0ca39ac0429,0.047154,0.018352,0.000000,18,71.984007
15,5850a40cd543f0ca39ac1ca0,0.035365,0.110113,0.000000,2,75.690273
17,5850c0ccd543f0ca39ac1e7c,0.077804,0.000000,0.174441,5,69.155427
21,5853a1207fd3046c0efc2ef4,0.177711,0.005506,0.000000,20,96.995008
22,5853c80a7fd3046c0efc30c5,0.036839,0.100937,0.000000,12,73.261676
...,...,...,...,...,...,...
39492,61f29878ebef8239e726ec1d,0.123779,0.082585,0.000000,4,59.980963
39512,61f2d6d0ebef822fad271a9d,0.079572,0.055056,0.000000,2,59.104918
39514,61f2da516916b8134bacf21b,0.053048,0.110113,0.000000,2,67.487285
39541,61f3c29d93f6aa47b9f0d0f3,0.135567,0.256930,0.000000,3,65.460339


3

In [193]:
df_tmp = df_pings[df_pings['captain_id']!=''].groupby('captain_id').agg('sum').reset_index()
df_tmp['link_conversion'] = round(df_tmp['dropped_orders_link']/df_tmp['True_total_pings_link']*100,2)
df_tmp['delivery_conversion'] = round(df_tmp['dropped_orders_delivery']/df_tmp['True_total_pings_delivery']*100,2)
df_tmp['cu_affinity'] = df_tmp.apply(lambda x : 'link' if pd.isna(x.delivery_conversion)
                                            else 'delivery' if pd.isna(x.link_conversion)
                                            else 'link' if x.link_conversion > 75
                                            else 'delivery' if x.delivery_conversion > 75
                                            else 'cu', axis = 1)
df_tmp[(df_tmp['dropped_orders_link']>0) & (df_tmp['dropped_orders_delivery']>0)]

Unnamed: 0,captain_id,Total_Pings,True_total_pings,accepted_pings,dropped_orders,Total_Pings_Link,True_total_pings_link,accepted_pings_link,dropped_orders_link,Total_Pings_Delivery,True_total_pings_delivery,accepted_pings_delivery,dropped_orders_delivery,link_conversion,delivery_conversion,cu_affinity
24,584d2e3bd543f0ca39ac0429,1021,902,101,54,466,391,98,51,555,511,3,3,13.04,0.59,cu
28,5850a40cd543f0ca39ac1ca0,20,18,7,6,12,10,5,4,8,8,2,2,40.00,25.00,cu
37,5853a1207fd3046c0efc2ef4,385,300,321,203,383,298,320,202,2,2,1,1,67.79,50.00,cu
38,5853c80a7fd3046c0efc30c5,81,74,51,36,60,53,40,25,21,21,11,11,47.17,52.38,cu
47,585a607008b1b926573edeaa,479,422,241,184,173,133,138,81,301,284,99,99,60.90,34.86,cu
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52941,61f29878ebef8239e726ec1d,44,39,37,31,40,35,34,28,4,4,3,3,80.00,75.00,link
52967,61f2d6d0ebef822fad271a9d,40,37,14,10,29,27,13,9,11,10,1,1,33.33,10.00,cu
52969,61f2da516916b8134bacf21b,31,30,9,8,22,21,7,6,9,9,2,2,28.57,22.22,cu
53013,61f3c29d93f6aa47b9f0d0f3,38,36,34,30,29,27,27,23,9,9,7,7,85.19,77.78,link


Unnamed: 0,city,time_period,link,delivery,zomato
0,Hyderabad,morning,1,2,3
1,Hyderabad,afternoon,3,2,1


'city'

In [147]:
df_captain = df_segment.merge(rf_segment.rename(columns={"captainid":"RIDER"}), how = 'left', on = ['RIDER','week'])

In [148]:
df_captain.pivot_table(index='Segment', columns = 'recency', values = 'RIDER', aggfunc = 'nunique')

recency,DORMANT,INACTIVE,RECENT
Segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HP_D_HO,542.0,207.0,1529.0
HP_D_UHO,205.0,95.0,499.0
LP_D_HO,350.0,141.0,616.0
LP_D_LO,2578.0,770.0,4313.0
LP_D_MO,1153.0,338.0,2502.0
LP_D_UHO,127.0,107.0,158.0
LP_Inter_HO,48.0,33.0,6.0
LP_Inter_LO,1361.0,630.0,819.0
LP_Inter_MO,142.0,70.0,67.0
LP_Inter_UHO,18.0,8.0,2.0


In [10]:
df_captain = df_captain[['RIDER','Segment','city','week','recency']]

In [11]:
df_final = df_captain[['RIDER','recency','Segment']].merge(df_rides.drop('week', axis=1), 
                                                            how = 'left', 
                                                            left_on = 'RIDER',
                                                            right_on = 'captain_id').merge(
                                                                                df_login_hours.drop('week',axis=1), 
                                                                                how = 'left', 
                                                                                left_on = ['RIDER','order_date'],
                                                                                right_on = ['rider','yyyymmdd'])
df_final.head()

Unnamed: 0,RIDER,recency,Segment,captain_id,order_date,link_orders,delivery_orders,total_orders,rider,yyyymmdd,day_of_week,login_hours,MORNING_DURATION,AFTERNOON_DURATION,EVENING_DURATION
0,5a12c3feeb731533e3d07a48,RECENT,HP_D_HO,,,,,,,,,,,,
1,5b1d7c19c674183a8722e139,RECENT,LP_D_MO,5b1d7c19c674183a8722e139,20220126.0,2.0,0.0,2.0,5b1d7c19c674183a8722e139,20220126.0,3.0,1.848889,0.0,0.0,1.848889
2,5b1d7c19c674183a8722e139,RECENT,LP_D_MO,5b1d7c19c674183a8722e139,20220130.0,10.0,0.0,10.0,5b1d7c19c674183a8722e139,20220130.0,7.0,7.839722,0.0,4.211111,3.628611
3,5b4224e76198ea7caab5a1bc,RECENT,LP_D_LO,5b4224e76198ea7caab5a1bc,20220129.0,4.0,0.0,4.0,5b4224e76198ea7caab5a1bc,20220129.0,6.0,3.535278,0.0,2.39,1.145278
4,5b4224e76198ea7caab5a1bc,RECENT,LP_D_LO,5b4224e76198ea7caab5a1bc,20220127.0,2.0,0.0,2.0,5b4224e76198ea7caab5a1bc,20220127.0,4.0,1.760833,0.033889,1.379167,0.347778


In [15]:
df_final.pivot_table(index='Segment', columns = 'day_of_week', values = 'RIDER', aggfunc = 'nunique')

day_of_week,1.0,2.0,3.0,4.0,5.0,6.0,7.0
Segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
HP_D_HO,42.0,55.0,50.0,58.0,64.0,77.0,72.0
HP_D_UHO,18.0,23.0,22.0,17.0,27.0,26.0,31.0
LP_D_HO,67.0,80.0,80.0,58.0,64.0,99.0,99.0
LP_D_LO,221.0,230.0,288.0,214.0,211.0,277.0,293.0
LP_D_MO,155.0,146.0,180.0,125.0,169.0,149.0,202.0
LP_D_UHO,51.0,58.0,55.0,44.0,47.0,59.0,75.0
LP_Inter_LO,4.0,1.0,5.0,1.0,,4.0,3.0
LP_Inter_MO,,,,1.0,,,
LP_Inter_UHO,1.0,,,,,,
LP_Intra_HO,1.0,1.0,1.0,1.0,4.0,,1.0


In [24]:
df_tmp = df_final[df_final['recency']=='RECENT'].groupby(['RIDER','recency','Segment'])['login_hours','MORNING_DURATION','AFTERNOON_DURATION','EVENING_DURATION'].agg('sum').reset_index()
df_tmp['morning_pct'] = df_tmp['MORNING_DURATION'].fillna(0)/df_tmp['login_hours']*100
df_tmp['afternoon_pct'] = df_tmp['AFTERNOON_DURATION'].fillna(0)/df_tmp['login_hours']*100
df_tmp['evening_pct'] = df_tmp['EVENING_DURATION'].fillna(0)/df_tmp['login_hours']*100
df_tmp['time_bucket_affinity'] = df_tmp.apply(lambda x : 'morning' if x.morning_pct > (x.afternoon_pct+x.evening_pct)
                                                        else 'afternoon' if x.afternoon_pct > (x.morning_pct+x.evening_pct)
                                                         else 'evening' if x.evening_pct > (x.morning_pct+x.afternoon_pct)
                                                         else 'undecided', axis = 1)
df_tmp

Unnamed: 0,RIDER,recency,Segment,login_hours,MORNING_DURATION,AFTERNOON_DURATION,EVENING_DURATION,morning_pct,afternoon_pct,evening_pct,time_bucket_affinity
0,573f29349b0ffc283677d9c1,RECENT,MP_D_LO,2.778056,0.037778,0.000000,2.740278,1.359864,0.000000,98.640136,evening
1,57c9bc24ef0afc1a264e8a12,RECENT,MP_D_HO,5.524167,2.246389,3.277778,0.000000,40.664756,59.335244,0.000000,afternoon
2,57cb0546b43bd2ee62b3a57b,RECENT,LP_D_MO,8.205000,2.213333,5.991667,0.000000,26.975421,73.024579,0.000000,afternoon
3,57d797e935b0b6ed4cdf4677,RECENT,MP_D_MO,22.815833,4.358611,15.536111,2.921111,19.103449,68.093551,12.803000,afternoon
4,584d2e3bd543f0ca39ac0429,RECENT,LP_D_UHO,7.974167,0.451944,0.000000,0.768889,5.667607,0.000000,9.642248,evening
...,...,...,...,...,...,...,...,...,...,...,...
19807,61f64451ebef8210e429217a,RECENT,LP_Inter_LO,0.096389,0.000000,0.096389,0.000000,0.000000,100.000000,0.000000,afternoon
19808,61f6483aebef8275ce29246f,RECENT,LP_Intra_UHO,1.820278,0.000000,0.182778,1.637500,0.000000,10.041203,89.958797,evening
19809,61f65b5bd0c48f4c6c4b8fc7,RECENT,MP_D_HO,3.357500,0.000000,1.010000,2.347500,0.000000,30.081906,69.918094,evening
19810,61f661336916b8d10faf0b5a,RECENT,LP_Inter_LO,0.418889,0.000000,0.418889,0.000000,0.000000,100.000000,0.000000,afternoon


In [29]:
df_tmp.pivot_table(index='Segment', columns='time_bucket_affinity', values = 'RIDER', aggfunc = 'nunique')

time_bucket_affinity,afternoon,evening,morning,undecided
Segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HP_D_HO,620.0,246.0,200.0,463.0
HP_D_UHO,128.0,65.0,51.0,255.0
LP_D_HO,176.0,218.0,70.0,152.0
LP_D_LO,1578.0,1550.0,630.0,555.0
LP_D_MO,839.0,867.0,334.0,462.0
LP_D_UHO,37.0,63.0,15.0,43.0
LP_Inter_HO,3.0,,1.0,2.0
LP_Inter_LO,261.0,231.0,163.0,164.0
LP_Inter_MO,19.0,17.0,12.0,19.0
LP_Inter_UHO,1.0,,1.0,
