<a href="https://colab.research.google.com/github/makingthefuturehappy/bonus/blob/main/bonus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<H1>Bonus</H1>

Drivers' segments description
https://docs.google.com/spreadsheets/d/11KIaZaywoBq3MymrCd8dmWJdBhFO989SNsrrSKYiB3Y/edit#gid=0

**Настраиваем переменные расчета**

In [139]:
path = "testdataset.csv"
cohorts = 5           # quantity of cohorts
commission = 0.095    # indriver commission
win_rate = 0.15       # percent of drivers who moves to the upper cohort
period_duration = 0.5 # period to forecast prospective rides from the total period

## Загружаем данные и анализируем кол-во поездок для каждого водителя

In [140]:
import pandas as pd
import numpy as np
import math
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")

In [141]:
# percentile level calc
percentile_level = {}
percentile = 0
for i in range (1, cohorts):
    percentile = 100 / cohorts + percentile
    percentile_level[i] = int(percentile)

<h3>Data upload and pre-calc</h3>

In [142]:
df = pd.read_csv(path)
df['gmv'] = pd.to_numeric(df['gmv'], errors='coerce')
dates = list(df.columns)[2:]
print("total drivers:", df.shape[0])
df.sample(n=5)

total drivers: 19687


Unnamed: 0,id,gmv,28.04.22,29.04.22,30.04.22,01.05.22,02.05.22,03.05.22,04.05.22,05.05.22,...,16.05.22,17.05.22,18.05.22,19.05.22,20.05.22,21.05.22,22.05.22,23.05.22,24.05.22,25.05.22
16174,50621028,135.0,4,2,0,0,0,1,2,4,...,0,0,0,0,0,0,0,2,0,0
3017,38377040,235.0,12,11,0,0,0,7,9,13,...,0,8,2,0,0,0,0,0,0,0
16188,71822633,11.0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
11559,14863312,199.0,9,10,9,0,0,1,1,1,...,0,1,1,2,1,2,0,0,1,1
6959,30387397,79.0,0,0,0,0,5,3,0,2,...,0,0,1,0,0,0,0,0,0,0


<h3>retention & avr. check calc</h3>

In [143]:
# date split by weekends and workdays 
fridays = []
saturdays = []
sundays = []
workdays = []

dates = list(filter(lambda x: "." in x, df.columns))
for weekday in dates:
    daynum = datetime.strptime(weekday, '%d.%m.%y').isoweekday()
    if daynum == 5: fridays.append(weekday)
    elif daynum == 6: saturdays.append(weekday)
    elif daynum == 7: sundays.append(weekday)
    else: workdays.append(weekday)

In [144]:
# split the period by 2 equal "dates" parts 
if len(dates)%2 == 0:
    index_2nd = len(dates)//2
    index_1st = 0
else:
    index_2nd = len(dates)//2+1
    index_1st = 1
    
first_half = dates[index_1st : index_2nd]
second_half = dates[index_2nd : len(dates)]
total_days = len(first_half) + len(second_half)


# retention & avr_check cals
df['first_half'] = df[first_half].sum(axis=1)
df['second_half'] = df[second_half].sum(axis=1)

def retention_calc(first_half, second_half):
    try:
        retention = second_half / first_half
    except:
        retention = 0
    retention = min (1, retention)
    return retention

df['retention'] = df.apply(lambda x: retention_calc(x['first_half'], x['second_half']), axis =  1)
df['total_rides'] = df['first_half'] + df['second_half']
df['avr_check'] = df['gmv'] / df['total_rides']
df['WK_rides'] = df[fridays + saturdays + sundays].sum(axis=1) # weekends rides
df['WD_rides'] = df[workdays].sum(axis=1) # weekends rides


key_fields = ['id', 'gmv','first_half','second_half','total_rides', 'WD_rides','WK_rides', 'avr_check', 'retention'] 
 
df = df[np.isfinite(df).all(1)] # infinity values drop (comes from a division by zero)
df = df.round({'gmv': 0, 'retention': 2, 'avr_check': 1})

## Результат расчета кол-ва поездок


**id** - id водителя<br>
**gmv** - gmv водителя<br>
**first_half** - количество поездок в первую половину периода<br>
**second_half** - количество поездок в первую половину периода<br>
**total_rides** - всего поездок за период
**WD_rides** - количество поездок в WorkDays<br>
**WK_rides** - количество поездок в WeeKends<br>
**avr_check** - средний чек<br>



In [145]:
print('total drivers:', df.shape[0])
print('\nвыгрузка рандомных водителей')
df[key_fields].sample(n=5)

total drivers: 19177

выгрузка рандомных водителей


Unnamed: 0,id,gmv,first_half,second_half,total_rides,WD_rides,WK_rides,avr_check,retention
19573,25362133,7.0,2,0,2,2,0,3.5,0.0
9955,85575043,33.0,9,0,9,0,9,3.7,0.0
15625,20604143,342.0,54,45,99,69,30,3.5,0.83
3710,108142261,177.0,50,0,50,24,26,3.5,0.0
11325,111791909,14.0,4,0,4,0,4,3.5,0.0


## Расчитываем размеры когорт

In [146]:
# exclude IDs which have zero rides during the first half of the period
df_activer_drivers = df.loc[df['first_half'] != 0]
    
all_cohorts = {}
range_start = 0

for percentile in percentile_level.keys():
    
    max_value = np.percentile(np.array(df_activer_drivers.total_rides.tolist()), percentile_level[percentile]) # percentile value
    all_cohorts[percentile] = range(int(range_start), int(max_value))
    range_start = max_value

last_percentile = int(max(list(percentile_level.keys())))+1
all_cohorts[last_percentile] = range(int(max_value),
                                     int(1e8)) # set a large number to keep the cohort range            

# cohort distribution for all IDs
def cohort_check(x, all_cohorts):
    for cohort in all_cohorts.keys():
        if x in all_cohorts[cohort]:
            return cohort
        else:
            pass      
df['cohort'] = df.apply(lambda x: cohort_check(x['total_rides'], all_cohorts), axis =  1)

In [147]:
# shuffle data within cohorts
def shuffle(df,      # dataframe to shuffle data 
            cohorts): # cohorts to shuffle data within
    shuffle_df = pd.DataFrame()
    for cohort in set(cohorts):
        shuffle_df = shuffle_df.append(df[df.cohort == cohort].sample(frac=1))
    return shuffle_df

df = shuffle(df, df['cohort'].tolist())
df.reset_index(drop=True, inplace = True)
df.sample(n=5)

Unnamed: 0,id,gmv,28.04.22,29.04.22,30.04.22,01.05.22,02.05.22,03.05.22,04.05.22,05.05.22,...,24.05.22,25.05.22,first_half,second_half,retention,total_rides,avr_check,WK_rides,WD_rides,cohort
17783,14593380,466.0,4,8,9,13,7,4,3,2,...,1,1,75,49,0.65,124,3.8,68,56,5
5879,16641508,94.0,0,0,1,0,0,0,0,1,...,1,0,12,14,1.0,26,3.6,21,5,2
13402,18463180,319.0,5,2,4,2,3,4,3,1,...,4,4,38,48,1.0,86,3.7,28,58,4
8352,64174555,79.0,0,1,6,4,1,0,0,2,...,0,0,18,4,0.22,22,3.6,17,5,2
1498,50040123,22.0,0,0,0,0,3,0,0,0,...,0,0,5,1,0.2,6,3.7,2,4,1


<H3>cohorts features calc</H3>

In [148]:
avr_check = {}
for cohort in range(1, cohorts+1):
    avr_check[cohort] = df['avr_check'].loc[df['cohort'] == cohort].mean()

avr_rides = {}
for cohort in range(1, cohorts+1):
    avr_rides[cohort] = df['total_rides'].loc[df['cohort'] == cohort].mean()
    
avr_wd_rides = {}
for cohort in range(1, cohorts+1):
    avr_wd_rides[cohort] = df['WD_rides'].loc[df['cohort'] == cohort].mean()
    
avr_wk_rides = {}
for cohort in range(1, cohorts+1):
    avr_wk_rides[cohort] = df['WK_rides'].loc[df['cohort'] == cohort].mean()

drivers = {}
for cohort in range(1, cohorts+1):
    drivers[cohort] = df['id'].loc[df['cohort'] == cohort].count()

avr_retention = {}
for cohort in range(1, cohorts+1):
    avr_retention[cohort] = df['retention'].loc[df['cohort'] == cohort].mean()

cohort_df = pd.DataFrame(list(avr_check.items()),
                   columns=['cohort', 'avr_check'])
cohort_df['drivers'] = list(drivers.values())
cohort_df['avr_rides'] = list(avr_rides.values())
cohort_df['WK_all_rides'] = list(avr_wk_rides.values()) # quantity of rides on weekends 
cohort_df['WD_all_rides'] = list(avr_wd_rides.values()) # quantity of rides on workdays 


# 'WK_avr_rides' - # avr. rides on a particular weekend
weekends_days = np.mean([len(fridays), len(saturdays), len(sundays)])
cohort_df['WK_avr_rides'] = cohort_df['WK_all_rides'] / weekends_days

# 'WD_avr_rides' - # avr. rides on a particular weekend
workdays_days = len(workdays)
cohort_df['WD_avr_rides'] = cohort_df['WD_all_rides'] / workdays_days

# retention calc
cohort_df = pd.merge(df.groupby('cohort')['retention'].mean(),
                     cohort_df,
                     on="cohort")

cohort_df['avr_rev_total'] = (cohort_df['avr_check'] * cohort_df['avr_rides'] * commission).astype(int)
cohort_df = cohort_df.round({'retention': 2,
                             'avr_check': 1,
                             'avr_rides': 1,
                             'WK_all_rides': 1,
                             'WD_all_rides': 1,
                             'WK_avr_rides': 1,
                             'WD_avr_rides': 1,
                            })

## Описание когорт 

In [149]:
print('\ncohorts and range of rides:')
all_cohorts


cohorts and range of rides:


{1: range(0, 10),
 2: range(10, 28),
 3: range(28, 57),
 4: range(57, 110),
 5: range(110, 100000000)}

In [150]:
print("описание когорт:")
cohort_df

описание когорт:


Unnamed: 0,cohort,retention,avr_check,drivers,avr_rides,WK_all_rides,WD_all_rides,WK_avr_rides,WD_avr_rides,avr_rev_total
0,1,0.57,3.7,4827,3.9,1.8,2.1,0.4,0.1,1
1,2,0.69,3.6,3819,17.5,8.4,9.1,2.1,0.6,5
2,3,0.77,3.6,3636,40.7,19.3,21.4,4.8,1.3,13
3,4,0.82,3.6,3481,80.4,38.0,42.5,9.5,2.7,27
4,5,0.88,3.6,3414,166.2,77.3,88.8,19.3,5.6,56


<h3>Target Rides and Incentive</h3>

In [151]:
# bonus to be given as an inventive by groups, % from average check of the cohort  
# 1 equals 100% from the GMV of the target rides
# example: gmv = $10, 0.35 = $3.5 as the bonus

bonuses = {           
    1: 0.35,
    2: 0.35,
    3: 0.35,
    4: 0.35,
    5: 0}

## Расчитываем цели и вознаграждения

In [152]:
# cohort_rides - maximum rides out of workdays & weekends rides
#cohort_df['cohort_rides'] = cohort_df[['WK_avr_rides', 'WD_avr_rides']].max(axis=1)

# rounding up all rides
cohort_df['cohort_rides'] = cohort_df['WK_avr_rides'].apply(lambda x: math.ceil(x))

# calc the target rides
new_rides = {}
for cohort in range(1, cohorts):
    new_rides[cohort] = cohort_df.loc[cohort,'cohort_rides'] - cohort_df.loc[cohort-1,'cohort_rides']

new_rides[cohorts] = 0 # set zero for the last upper cohort as there is no insentive

# 'WK_rides_extra' - Weekend Extra Rides to be done
cohort_df['WK_rides_extra'] = cohort_df['cohort'].map(new_rides) + 1
# add 1 additional ride to overachieve cohort border 
    
# 'rides_tbd' - all weekends rides to get the bonus
cohort_df['rides_tbd'] = cohort_df['cohort_rides'] + cohort_df['WK_rides_extra']
cohort_df.loc[cohorts-1,'rides_tbd'] = 0 # the target for the senior cohort shall be zero

# incentive for drivers
cohort_df['bonus, %'] = cohort_df['cohort'].map(bonuses)
cohort_df['driver_bonus'] = cohort_df['WK_rides_extra'] * cohort_df['avr_check'] * cohort_df['bonus, %']
cohort_df = cohort_df.round({'driver_bonus': 0})

In [153]:
cohort_df

Unnamed: 0,cohort,retention,avr_check,drivers,avr_rides,WK_all_rides,WD_all_rides,WK_avr_rides,WD_avr_rides,avr_rev_total,cohort_rides,WK_rides_extra,rides_tbd,"bonus, %",driver_bonus
0,1,0.57,3.7,4827,3.9,1.8,2.1,0.4,0.1,1,1,3,4,0.35,4.0
1,2,0.69,3.6,3819,17.5,8.4,9.1,2.1,0.6,5,3,3,6,0.35,4.0
2,3,0.77,3.6,3636,40.7,19.3,21.4,4.8,1.3,13,5,6,11,0.35,8.0
3,4,0.82,3.6,3481,80.4,38.0,42.5,9.5,2.7,27,10,11,21,0.35,14.0
4,5,0.88,3.6,3414,166.2,77.3,88.8,19.3,5.6,56,20,1,0,0.0,0.0


## Bonus adjustments (if required)
 

pre-calc bonuses based on % of the Average Check<br>

In [154]:
cohort_df[['cohort', 'avr_check', 'bonus, %', 'driver_bonus']]

Unnamed: 0,cohort,avr_check,"bonus, %",driver_bonus
0,1,3.7,0.35,4.0
1,2,3.6,0.35,4.0
2,3,3.6,0.35,8.0
3,4,3.6,0.35,14.0
4,5,3.6,0.0,0.0


below set new bonus for each cohort (absolut value in USD)

In [155]:
driver_bonus = {           
    1: 5.0,
    2: 4.0,
    3: 8.0,
    4: 17.0,
    5: 0.0}

cohort_df['driver_bonus'] = cohort_df['cohort'].map(driver_bonus)

## Результат расчетов

**WD_avr_rides**    - среднее кол-во поездок в workdays<br> 
**WK_avr_rides**    - среднее кол-во поездок в weekends<br>
**cohort_rides**    - среднее кол-во поездок в когорте<br>
**WK_rides_extra**  - на сколько больше надо сделать поездок в выходные<br>
**rides_tbd**       - сколько в итоге надо сделать поездок в выходые всего<br>
**driver_bonus**    - сколько бонусов предлагаем водителю за выполнение цели<br>
**avr_check**       - средний чек водителя (для сравнения)<br>

In [156]:
cohort_df[['cohort',
           'WD_avr_rides',
           'WK_avr_rides',
           'cohort_rides',
           'WK_rides_extra',
           'rides_tbd',
           'driver_bonus',
           'avr_check']]

Unnamed: 0,cohort,WD_avr_rides,WK_avr_rides,cohort_rides,WK_rides_extra,rides_tbd,driver_bonus,avr_check
0,1,0.1,0.4,1,3,4,5.0,3.7
1,2,0.6,2.1,3,3,6,4.0,3.6
2,3,1.3,4.8,5,6,11,8.0,3.6
3,4,2.7,9.5,10,11,21,17.0,3.6
4,5,5.6,19.3,20,1,0,0.0,3.6


## Расчитываем окупаемость

<h3>Instant revenue and costs</h3>

In [157]:
# instant revenue
cohort_df['win_rate'] = win_rate

# costs
cohort_df['costs'] = cohort_df['driver_bonus'] * \
    cohort_df['drivers'] * win_rate

# extra revenue for the company
cohort_df['ID_extra_rev'] = cohort_df['WK_rides_extra'] \
    * cohort_df['avr_check'] \
    * cohort_df['drivers'] \
    * win_rate \
    * commission

instant_rev = cohort_df['ID_extra_rev'].sum()
instant_cost = cohort_df['costs'].sum()

print("instant revenue:", int(instant_rev))
print("instant costs  :", int(instant_cost))
print("instant balance:", int(instant_rev - instant_cost))

instant revenue: 4609
instant costs  : 19151
instant balance: -14541


<h2>Perspective revenue projection</h2>

<h3>Target is not fullfilled</h3>

In [158]:
# target IS NOT fulfilled
# 'P: not FF: rev' - Perspective Not Fulfilled: Revenue

cohort_df['P: not FF: rev'] = cohort_df['avr_rides'] \
                                * cohort_df['avr_check'] \
                                * cohort_df['retention'] \
                                * period_duration \
                                * commission
cohort_df.loc[cohorts-1,'P: not FF: rev'] = 0 #keep the last cohort zero as no an insentive for the cohort

<h3>Target is fullfilled</h3>

In [159]:

# target IS fulfilled
FF_rides = {}
for i in range (1, cohorts):
    FF_rides[i] = avr_rides[i+1] * period_duration
FF_rides[cohorts] = 0
cohort_df['P: FF: rides'] = list(FF_rides.values())

FF_gmv = {}
for i in range (1, cohorts):
    FF_gmv[i] = int(FF_rides[i] * avr_check[i+1])
FF_gmv[cohorts] = 0
cohort_df['P: FF: gmv'] = list(FF_gmv.values())

FF_rev = {}
for i in range (1, cohorts):
    FF_rev[i] = int(FF_gmv[i] * commission * avr_retention[i+1])
FF_rev[cohorts] = 0
cohort_df['P: FF: rev'] = list(FF_rev.values())

cohort_df['P: rev_up'] = (cohort_df['P: FF: rev'] - cohort_df['P: not FF: rev']) \
                                * win_rate \
                                * cohort_df['drivers']

persp_rev_up = (cohort_df['P: rev_up'].sum())
revenue_total = cohort_df['ID_extra_rev'].sum() + persp_rev_up

In [160]:
cohort_df

Unnamed: 0,cohort,retention,avr_check,drivers,avr_rides,WK_all_rides,WD_all_rides,WK_avr_rides,WD_avr_rides,avr_rev_total,...,"bonus, %",driver_bonus,win_rate,costs,ID_extra_rev,P: not FF: rev,P: FF: rides,P: FF: gmv,P: FF: rev,P: rev_up
0,1,0.57,3.7,4827,3.9,1.8,2.1,0.4,0.1,1,...,0.35,5.0,0.15,3620.25,763.510725,0.390692,8.745614,31,2,1165.219276
1,2,0.69,3.6,3819,17.5,8.4,9.1,2.1,0.6,5,...,0.35,4.0,0.15,2291.4,587.7441,2.064825,20.359323,72,5,1681.414999
2,3,0.77,3.6,3636,40.7,19.3,21.4,4.8,1.3,13,...,0.35,8.0,0.15,4363.2,1119.1608,5.358969,40.216174,143,11,3076.618307
3,4,0.82,3.6,3481,80.4,38.0,42.5,9.5,2.7,27,...,0.35,17.0,0.15,8876.55,1964.3283,11.273688,83.093292,297,24,6645.043811
4,5,0.88,3.6,3414,166.2,77.3,88.8,19.3,5.6,56,...,0.0,0.0,0.15,0.0,175.1382,0.0,0.0,0,0,0.0


## Результат расчета

In [161]:
print('instant revenue      :', int(instant_rev))
print('perspective revenue  :', int(persp_rev_up))
print('revenue total        :', int(revenue_total))
print('costs:               :', int(instant_cost))
print('profit:              :', int(revenue_total - instant_cost))

instant revenue      : 4609
perspective revenue  : 12568
revenue total        : 17178
costs:               : 19151
profit:              : -1973


## Распределяем водителей по группам для тестов разного размера бонусов

распределяем водителей по группа для теста: конрольная и 3 тестовые

In [162]:
group_split = {
    "control": 0.25, # 25% of drivers go the control group
    "a": 0.25,
    "b": 0.25,
    "c": 0.25}

if sum(group_split.values()) != 1: print("split in test/control groups is NOT correct") 

распределяем долю бонусов от расчитанных, которые мы отдаем до тестов<br>
**control** - не даем бонусы<br>
**остальные** - 1: отдаем 100% расчитанных бонусов, 0.5% отдаем 50% и т.д.

In [163]:
bonuses_split = {
    "control": 0,
    "a": 1,       # 100% of target bonuses will be given to drivers
    "b": 0.8,     # 
    "c": 0.6}     # 

## Разбиваем водителей группы для теста

In [164]:
df_groups = pd.DataFrame(columns = ['id', 'groups'])
for cohort in range (1, cohorts+1):
    group_length = df[df['cohort'] == cohort].shape[0]
    control = ['control' for i in range(1, int(group_length * group_split['control']))]
    a = ['a' for i in range(1, int(group_length * group_split['a']))]
    b = ['b' for i in range(1, int(group_length * group_split['b']))]
    c = ['c' for i in range (1, group_length - len(control) - len(a) - len(b) + 1)]

    df_group = df[df['cohort'] == cohort]
    df_group['groups'] = control + a + b + c
    df_group = df_group[['id', 'groups']]
    df_groups = pd.concat([df_groups, df_group])
    
df = df.merge(df_groups, how = 'left', on = 'id')

result_df = df.merge(cohort_df, how='inner', on='cohort')[['id','cohort','driver_bonus','groups', 'rides_tbd']]

result_df['groups_coeff'] = result_df['groups'].apply(lambda x: bonuses_split[x])
result_df['bonus_to_offer'] = result_df['groups_coeff'] * result_df['driver_bonus']
result_df['bonus_to_offer'] = result_df['bonus_to_offer'].apply(lambda x: round(x,0))
result_df.sample(5)

Unnamed: 0,id,cohort,driver_bonus,groups,rides_tbd,groups_coeff,bonus_to_offer
13509,59775265,4,17.0,a,21,1.0,17.0
227,53204152,1,5.0,control,4,0.0,0.0
5695,79510735,2,4.0,control,6,0.0,0.0
10754,100742101,3,8.0,b,11,0.8,6.0
19002,64015032,5,0.0,c,0,0.6,0.0


## Смотрим на распределение водителеям по тестовым группам

In [165]:
print("РАЗБИВКА ВОДИТЕЛЕЙ ПО ГРУППАМ:")

print("\ntotal drivers:", df.shape[0])
for cohort in range (1, cohorts + 1):
    print(" cohort", cohort, ":", df[df['cohort'] == cohort].shape[0])
    for group in list(group_split.keys()):
        print("  ", group, ":", df[(df['cohort'] == cohort) & (df['groups'] == group)].shape[0])

print('\nПРИМЕР РАЗБИВКИ ПО КОГОРТАМ')
for cohort in range(1, cohorts):
    print('\ncohort:', cohort)
    print(result_df[result_df.cohort == cohort].groupby(['groups']).mean())

РАЗБИВКА ВОДИТЕЛЕЙ ПО ГРУППАМ:

total drivers: 19177
 cohort 1 : 4827
   control : 1205
   a : 1205
   b : 1205
   c : 1212
 cohort 2 : 3819
   control : 953
   a : 953
   b : 953
   c : 960
 cohort 3 : 3636
   control : 908
   a : 908
   b : 908
   c : 912
 cohort 4 : 3481
   control : 869
   a : 869
   b : 869
   c : 874
 cohort 5 : 3414
   control : 852
   a : 852
   b : 852
   c : 858

ПРИМЕР РАЗБИВКИ ПО КОГОРТАМ

cohort: 1
         cohort  driver_bonus  rides_tbd  groups_coeff  bonus_to_offer
groups                                                                
a           1.0           5.0        4.0           1.0             5.0
b           1.0           5.0        4.0           0.8             4.0
c           1.0           5.0        4.0           0.6             3.0
control     1.0           5.0        4.0           0.0             0.0

cohort: 2
         cohort  driver_bonus  rides_tbd  groups_coeff  bonus_to_offer
groups                                                      

## Сохранаяем расчеты в файл

название файл будет в формате "bonus_table_день_месяц_час_минуты"

In [166]:
file_name = "bonus_table_" + datetime.today().strftime('%d%m_%H%M') + '.csv'
result_df.to_csv(file_name, index=False)