In [112]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, date
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from pyhive import presto
from pymongo import MongoClient
import warnings
warnings.filterwarnings("ignore")
import my_func

# Presto connection used by the query helpers in this notebook.
# NOTE(review): host and username are hard-coded here.
presto_settings = {
    'host': 'bi-presto.serving.data.production.internal',
    'port': 80,
    'protocol': 'http',
    'catalog': 'hive',
    'username': 'mayank.jha@rapido.bike',
    # 'requests_kwargs': req_kw,
}
presto_conn = presto.connect(**presto_settings)

In [113]:
# Anchor date for the notebook: Monday of the current week as 'YYYY-MM-DD'.
today = date.today()
dt = (today - timedelta(days=today.weekday())).strftime('%Y-%m-%d')
dt

'2022-05-09'

In [118]:
# Fetch Subscription Data

def get_subscribed_captains(start_date, end_date):
    """Load captain subscriptions whose start date lies in a date window.

    The subscription start date is taken as the day after ``startTime``
    (see the ``date_add('day', 1, ...)`` in the query) and compared, as a
    'YYYY-MM-DD' string, against ``start_date`` and ``end_date`` inclusive.

    Parameters
    ----------
    start_date, end_date : str
        Window bounds formatted as 'YYYY-MM-DD'.

    Returns
    -------
    pandas.DataFrame
        Columns ``captain_id``, ``city``, ``purchase_date``, ``amount``,
        ``start_time`` and ``end_time``.
    """
    print("fetching subscribed captains for : ", start_date, 'to ', end_date)

    subscription_sql = """   select s.userId as captain_id,
                                c.displayname as city,
                                s.yyyymmdd as purchase_date,
                                s.amount,
                                date_format(date_add('day', 1, cast(from_unixtime(cast(s.startTime as bigint)) as date)),'%Y-%m-%d') as start_time,
                                date_format(from_unixtime(cast(s.endTime as bigint)),'%Y-%m-%d') as end_time
                            from raw.kafka_captain_subscriptions_v1_immutable s
                                left join legacy.cities c
                                    on s.city = c._id
                            where date_format(date_add('day', 1, cast(from_unixtime(cast(s.startTime as bigint)) as date)),'%Y-%m-%d') between '{sd}' and '{ed}'
               """.format(sd=start_date, ed=end_date)

    subscriptions = pd.read_sql(subscription_sql, presto_conn)
    row_count = len(subscriptions)
    print(" fetched subscription data : ", row_count)

    return subscriptions

# Subscriptions starting within the week that begins on `dt` (7 days, inclusive).
week_end = (pd.to_datetime(dt) + timedelta(6)).strftime('%Y-%m-%d')
subscribed_captains = get_subscribed_captains(dt, week_end)

# Label the week as '<ISO year>-<ISO week>'. The ISO year is used on purpose:
# pairing the calendar year with the ISO week number mislabels weeks that
# straddle a year boundary (e.g. Monday 2024-12-30 belongs to ISO week 2025-01).
iso_week = pd.to_datetime(dt).isocalendar()
subscribed_captains['week'] = '{}-{:02d}'.format(iso_week[0], iso_week[1])

subscribed_captains.head()

fetching subscribed captains for :  2022-05-09 to  2022-05-15
 fetched subscription data :  313


Unnamed: 0,captain_id,city,purchase_date,amount,start_time,end_time,week
0,5c2a54074a267149c76226df,Kolkata,20220507,1,2022-05-09,2022-05-15,2022-19
1,5f4487dc7f36b68c8a2a36a0,Delhi,20220508,59,2022-05-09,2022-05-15,2022-19
2,5d6fe5526012fb46f2f3e55b,Kolkata,20220508,100,2022-05-09,2022-05-15,2022-19
3,5d3aa12c2f470b5bbc1fa9f4,Kolkata,20220508,1,2022-05-09,2022-05-15,2022-19
4,5d48382c55fbf50d45e49276,Kolkata,20220508,1,2022-05-09,2022-05-15,2022-19


In [119]:
# Distinct cities that have at least one subscription this week.
# Null cities (no match in the left join to legacy.cities) are dropped so they
# never end up in the SQL IN-lists built from this tuple.
city = tuple(subscribed_captains['city'].dropna().unique())
city

('Kolkata', 'Delhi', 'Ahmedabad')

In [120]:
# Fetch POC Segments

def get_poc(start_date, city):
    """Fetch captain segment rows for one snapshot date and a set of cities.

    Reads ``datasets.captain_quality_segments_snapshot_immutable``. The
    ``recency`` column is the part of ``recency_segment`` before its first
    underscore, falling back to 'churn' when that prefix is empty or null.

    Parameters
    ----------
    start_date : str
        Snapshot date as 'YYYY-MM-DD'; dashes are stripped to match the
        ``yyyymmdd`` filter.
    city : iterable of str, or str
        City names to keep. An iterable is rendered as a quoted SQL IN-list
        (null entries skipped, single quotes escaped), so a one-element tuple
        works. A plain string is inserted into the query verbatim.

    Returns
    -------
    pandas.DataFrame
        Columns ``captain_id``, ``recency`` and ``segment``.

    Raises
    ------
    ValueError
        If ``city`` is an iterable with no non-null entry.
    """
    print("fetching poc for : ", start_date)

    if isinstance(city, str):
        city_filter = city
    else:
        # Formatting a Python tuple directly yields "('Kolkata',)" for a single
        # city, which is not valid SQL, so the list is rendered explicitly.
        names = [str(name).replace("'", "''") for name in city if pd.notna(name)]
        if not names:
            raise ValueError("city must contain at least one non-null city name")
        city_filter = "({})".format(", ".join("'{}'".format(name) for name in names))

    orders_query = """
        select captain_id,
               coalesce(nullif(substr(recency_segment, 1, strpos(recency_segment, '_')-1),''),'churn') as recency,
               segment
        from datasets.captain_quality_segments_snapshot_immutable
        where yyyymmdd = '{sd}'
            and city_name in {ct}
    """.format(sd=start_date.replace('-', ''), ct=city_filter)

    tmp = pd.read_sql(orders_query, presto_conn)
    print(" fetched poc data : ", len(tmp))

    return tmp


# Segments are read from the snapshot dated one day before the week starts.
poc_snapshot_date = (pd.to_datetime(dt) - timedelta(days=1)).strftime('%Y-%m-%d')
poc_segment = get_poc(poc_snapshot_date, city)

poc_segment.head()

fetching poc for :  2022-05-08
 fetched poc data :  127160


Unnamed: 0,captain_id,recency,segment
0,5c1cf751d9bacb2f6f9baeed,inactive,LP_Inter_HO
1,5c2cec5b4a267149c763c27c,churn,ZP
2,5c2f614944742d49fd47913b,active,LP_D_UHO
3,5c4ea94f9359a37c0dbc42ef,inactive,LP_Intra_LO
4,5c5d7f7af2edc7336752096c,churn,ZP


In [132]:
# Fetch Captains Performance

def get_captains_performance(start_date, end_date, city):
    """Aggregate per-captain KPIs for the 'Link' service over a date window.

    Reads ``datasets.captain_svo_daily_kpi`` and groups by captain and city.
    ``active_days`` is the number of rows per group (presumably one row per
    captain per day - verify against the table definition).

    Parameters
    ----------
    start_date, end_date : str
        Inclusive window bounds as 'YYYY-MM-DD'; dashes are stripped to match
        the ``yyyymmdd`` filter.
    city : iterable of str, or str
        City names to keep. An iterable is rendered as a quoted SQL IN-list
        (null entries skipped, single quotes escaped), so a one-element tuple
        works. A plain string is inserted into the query verbatim.

    Returns
    -------
    pandas.DataFrame
        One row per (captain_id, city) with ``active_days``, ``orders``,
        ``total_pings``, ``accepted_pings``, ``cancelled_pings``,
        ``login_hours``, ``order_earnings``, ``ocara_customer_cancelled``,
        ``ocara_rider_cancelled`` and ``cicc_pings``.

    Raises
    ------
    ValueError
        If ``city`` is an iterable with no non-null entry.
    """
    print("fetching captains performance for : ", start_date, 'to ', end_date)

    if isinstance(city, str):
        city_filter = city
    else:
        # Formatting a Python tuple directly yields "('Kolkata',)" for a single
        # city, which is not valid SQL, so the list is rendered explicitly.
        names = [str(name).replace("'", "''") for name in city if pd.notna(name)]
        if not names:
            raise ValueError("city must contain at least one non-null city name")
        city_filter = "({})".format(", ".join("'{}'".format(name) for name in names))

    orders_query = """
        select captainid as captain_id,
               city,
               count(*) as active_days,
               sum(net_orders) as orders,
               sum(accepted_pings + riderbusy_pings + riderrejected_pings) as total_pings,
               sum(accepted_pings) as accepted_pings,
               sum(riderrejected_pings  + cicc_pings) as cancelled_pings,
               sum(total_login_hr) as login_hours,
               sum(order_earning) as order_earnings,
               sum(ocara_customer_cancelled) as ocara_customer_cancelled,
               sum(ocara_rider_cancelled) as ocara_rider_cancelled,
               sum(cicc_pings) as cicc_pings
        from datasets.captain_svo_daily_kpi
        where service_name = 'Link'
            and city in {ct}
            and yyyymmdd between '{sd}' and '{ed}'
        group by 1,2
    """.format(sd=start_date.replace('-', ''), ed=end_date.replace('-', ''), ct=city_filter)

    tmp = pd.read_sql(orders_query, presto_conn)
    print(" fetched captains performance data : ", len(tmp))

    return tmp


# KPIs for the 7-day window that starts on `dt`.
performance_end = (pd.to_datetime(dt) + timedelta(days=6)).strftime('%Y-%m-%d')
captains_performance = get_captains_performance(dt, performance_end, city)

captains_performance.head()

fetching captains performance for :  2022-05-09 to  2022-05-15
 fetched captains performance data :  53162


Unnamed: 0,captain_id,city,active_days,orders,total_pings,accepted_pings,cancelled_pings,login_hours,order_earnings,ocara_customer_cancelled,ocara_rider_cancelled,cicc_pings
0,601d7ee27143239930c910a6,Delhi,2,0,2,1,0,1.063056,0.0,0,1,0
1,5d1778d493686945e70a799d,Delhi,3,26,116,33,31,16.645555,1116.9595,8,3,0
2,5e421f806bbfd46517dfaf15,Delhi,3,9,26,14,11,22.633611,446.9555,4,1,1
3,5f6d7fe54df9b4034f417d79,Delhi,2,0,5,2,0,1.286111,0.0,1,0,0
4,5c5c148c03a62d32bb1de8a3,Kolkata,3,39,64,52,17,17.179167,1820.968,11,1,6


In [133]:
# Per-ping ratios: accepted (apr), net orders (dpr) and cancelled (ccr) over total pings.
captains_performance['apr'] = captains_performance['accepted_pings']/captains_performance['total_pings']
captains_performance['dpr'] = captains_performance['orders']/captains_performance['total_pings']
captains_performance['ccr'] = captains_performance['cancelled_pings']/captains_performance['total_pings']

# A captain with zero total pings would get +/-inf here, and a single inf turns
# the group mean computed later into inf. Mark those ratios as missing instead.
no_pings = captains_performance['total_pings'] == 0
captains_performance.loc[no_pings, ['apr', 'dpr', 'ccr']] = float('nan')

In [134]:
# Attach segment and subscription details to every captain's KPIs. Both are
# left joins, so captains without a segment or subscription are kept.
# `city` is removed from the subscription frame because the KPI frame already has it.
subscription_details = subscribed_captains.drop(columns='city')
df_captains = (
    captains_performance
    .merge(poc_segment, how='left', on='captain_id')
    .merge(subscription_details, how='left', on='captain_id')
)

In [135]:
# Keep only captains with at least one net order. The explicit copy makes
# df_captains an independent frame, so the column assignments in the following
# cells never write to a view of the merged frame.
df_captains = df_captains.loc[df_captains['orders'] > 0].copy()

In [136]:
def _subscription_group(amount):
    """Return 'non_subscribed' for a missing amount, else 'subscription_<amount>'."""
    if pd.isna(amount):
        return 'non_subscribed'
    # After the left merge a numeric amount column is upcast to float;
    # render 59.0 as '59' so labels match the string form of the amount.
    if isinstance(amount, float) and amount.is_integer():
        amount = int(amount)
    # format() instead of '+' so a numeric amount does not raise TypeError.
    return 'subscription_{}'.format(amount)


df_captains['subscription_group'] = df_captains['amount'].apply(_subscription_group)

In [137]:
# Captains with no segment row get the 'HH' segment.
# Assign the result back: `df[col].fillna(..., inplace=True)` is chained
# in-place assignment and leaves the frame unchanged under Copy-on-Write.
df_captains['segment'] = df_captains['segment'].fillna('HH')

In [138]:
# Captains without a subscription have no start/end date after the left merge;
# give them the bounds of the current week. The result is assigned back because
# `df[col].fillna(..., inplace=True)` is chained in-place assignment and leaves
# the frame unchanged under Copy-on-Write.
week_start = pd.to_datetime(dt)
week_end = (week_start + timedelta(6)).strftime('%Y-%m-%d')
df_captains['start_time'] = df_captains['start_time'].fillna(dt)
df_captains['end_time'] = df_captains['end_time'].fillna(week_end)

# `week` is also NaN for non-subscribed captains. It is a grouping key of the
# final pivot, and rows with a NaN key are dropped there, which would remove the
# whole non_subscribed group. Every row belongs to the week starting on `dt`,
# so set the '<ISO year>-<ISO week>' label on all rows.
iso_week = week_start.isocalendar()
df_captains['week'] = '{}-{:02d}'.format(iso_week[0], iso_week[1])

In [143]:
# Average every KPI per city / subscription group / segment / week and write
# the flattened table to x.csv.
group_keys = ['city', 'subscription_group', 'segment', 'start_time', 'end_time', 'week']
metric_columns = [
    'active_days', 'total_pings', 'accepted_pings', 'cancelled_pings', 'orders',
    'ocara_customer_cancelled', 'ocara_rider_cancelled', 'cicc_pings',
    'login_hours', 'order_earnings', 'apr', 'dpr', 'ccr',
]
weekly_summary = df_captains.pivot_table(
    index=group_keys,
    values=metric_columns,
    aggfunc='mean',
).reset_index()
weekly_summary.to_csv('x.csv', index=False)