In [86]:
import pandas as pd
import numpy as np
import pandasql as ps
from datetime import datetime, timedelta
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from pyhive import presto
from pymongo import MongoClient
import warnings
warnings.filterwarnings("ignore")

# Connection settings for the Presto gateway, gathered in one mapping so they
# can be inspected before the connection is opened.
presto_settings = {
    'host': 'presto.processing.yoda.run',
    'port': 80,
    'protocol': 'http',
    'catalog': 'hive',
    'username': 'mayank.jha@rapido.bike',
    # 'requests_kwargs': req_kw,
}
presto_conn = presto.connect(**presto_settings)

In [113]:
city = 'Hyderabad'

# Partition key of the snapshot to analyse: the most recent Sunday (today
# itself when today is a Sunday), formatted as YYYYMMDD.
today = datetime.today()
idx = today.isoweekday() % 7  # days elapsed since Sunday (Sunday -> 0)
sunday = today - timedelta(days=idx)
dt = sunday.strftime('%Y%m%d')
print(dt)

20210919


In [114]:
# Captain level classification rows for the chosen snapshot date and city.
level_query_template = """select * from datasets_internal.captain_level_classification where yyyymmdd = '{sd}' and city_name = '{ct}' """
level_query = level_query_template.format(ct=city, sd=dt)

df_captains = pd.read_sql(sql=level_query, con=presto_conn)
print(len(df_captains))
df_captains.head()

249802


Unnamed: 0,captain_id,ltr,rides_target,apr,avg_ratings,cancel_rate,city_name,mode,captain_level,prev_captain_level,level_change_value,yyyymmdd
0,5853ceef417054710ef1c5b9,140,0,0.0,0.0,0.0,Hyderabad,bike,1,1.0,0,20210919
1,58f5eca729320c5d07081dfb,179,0,0.0,0.0,0.0,Hyderabad,bike,1,1.0,0,20210919
2,5909316fa9b45ce06d4e696b,136,0,0.0,0.0,0.0,Hyderabad,bike,1,1.0,0,20210919
3,59ac332a5564bc9b1a5f8b9c,15,0,0.0,0.0,0.0,Hyderabad,bike,1,1.0,0,20210919
4,59bbaad4b2d5ac993c94e11d,46,0,0.0,0.0,0.0,Hyderabad,bike,1,1.0,0,20210919


In [93]:
# Captain id -> mobile number mapping.
# NOTE: this query has no city filter (and no placeholders, so it needs no
# .format() call); it returns the mapping for every captain in the table.
mobile_query = """select mobilenumber, captainid from datasets.captain_single_view """

df_mobile = pd.read_sql(mobile_query, presto_conn)
print(len(df_mobile))

4478849


In [115]:
# Load the captain -> mobile mapping from a local CSV snapshot. The snapshot
# can be (re)generated from a freshly queried frame with:
#   df_mobile.to_csv('mobile_number.csv', index=False)
mobile_snapshot_path = 'mobile_number.csv'
df_mobile = pd.read_csv(mobile_snapshot_path)

In [116]:
# Attach each captain's mobile number; the join key from the mobile table is
# dropped afterwards because it duplicates captain_id.
df_captains = (
    df_captains
    .merge(df_mobile, how='left', left_on='captain_id', right_on='captainid')
    .drop(columns='captainid')
)
df_captains.head()

Unnamed: 0,captain_id,ltr,rides_target,apr,avg_ratings,cancel_rate,city_name,mode,captain_level,prev_captain_level,level_change_value,yyyymmdd,mobilenumber
0,5853ceef417054710ef1c5b9,140,0,0.0,0.0,0.0,Hyderabad,bike,1,1.0,0,20210919,8125345539
1,58f5eca729320c5d07081dfb,179,0,0.0,0.0,0.0,Hyderabad,bike,1,1.0,0,20210919,9553028964
2,5909316fa9b45ce06d4e696b,136,0,0.0,0.0,0.0,Hyderabad,bike,1,1.0,0,20210919,9492078855
3,59ac332a5564bc9b1a5f8b9c,15,0,0.0,0.0,0.0,Hyderabad,bike,1,1.0,0,20210919,8790297882
4,59bbaad4b2d5ac993c94e11d,46,0,0.0,0.0,0.0,Hyderabad,bike,1,1.0,0,20210919,9618668304


In [117]:
# Captains for which the left join found no mobile number.
df_captains.loc[df_captains['mobilenumber'].isna()]

Unnamed: 0,captain_id,ltr,rides_target,apr,avg_ratings,cancel_rate,city_name,mode,captain_level,prev_captain_level,level_change_value,yyyymmdd,mobilenumber


In [118]:
# Number of distinct captains with a positive rides_target value.
df_captains.loc[df_captains['rides_target'] > 0, 'captain_id'].nunique()

16837

In [119]:
# Sum of the rides_target column over every captain row loaded above.
df_captains['rides_target'].sum()

257761

In [120]:
# Keep only the columns used by the level analysis and expose rides_target
# under the shorter name 'rides'.
kept_columns = [
    'captain_id', 'mobilenumber', 'city_name', 'ltr', 'rides_target',
    'apr', 'avg_ratings', 'cancel_rate', 'captain_level',
]
df_captains = df_captains[kept_columns].rename(columns={'rides_target': 'rides'})

In [121]:
# Keep only level-1 (bronze) captains that did at least one ride; captains in
# levels 2 to 4 and bronze captains with 0 rides are excluded.
is_level_one = df_captains['captain_level'] == 1
has_rides = df_captains['rides'] > 0

df_bronze_captains = df_captains.loc[is_level_one & has_rides]

In [122]:
# Per-level thresholds (ltr, rating, cancel, rides, apr) for the chosen city.
# Each threshold is the first element of the corresponding array column.
config_query_template = """select city, 
        displayname, 
        level, 
        ltr[1] as ltr, 
        rating[1] as rating,
        cancelcount[1] as cancel,
        weeklyrides[1] as rides,
        acceptancerate[1] as apr
    from 
        hive.experiments.captain_level_config clc
            left join legacy.cities c 
                on clc.city = c._id
    where displayname = '{ct}'
    order by level
        """
config_query = config_query_template.format(ct=city)

df_config = pd.read_sql(sql=config_query, con=presto_conn)
print(len(df_config))
df_config.head()

4


Unnamed: 0,city,displayname,level,ltr,rating,cancel,rides,apr
0,5740135d4fdf4798208bba24,Hyderabad,1,0,0.0,1.0,0,0.0
1,5740135d4fdf4798208bba24,Hyderabad,2,20,4.0,0.25,10,0.3
2,5740135d4fdf4798208bba24,Hyderabad,3,150,4.3,0.2,20,0.4
3,5740135d4fdf4798208bba24,Hyderabad,4,300,4.5,0.1,45,0.7


In [123]:
# Attach, to every bronze captain, the thresholds of the next level up.
# The row is selected by its `level` value rather than by the positional
# label 1, so the lookup does not depend on df_config's row order or index.
# The '*_bronze' column names are kept because later cells read them.
NEXT_LEVEL = 2
next_level_rows = df_config.loc[pd.to_numeric(df_config['level']) == NEXT_LEVEL]
if next_level_rows.empty:
    raise ValueError('df_config has no row for level {}'.format(NEXT_LEVEL))

df_final = df_bronze_captains.copy()
for metric in ('ltr', 'rides', 'apr', 'rating', 'cancel'):
    # .iloc[0] on the single column keeps that column's own dtype.
    df_final[metric + '_bronze'] = next_level_rows[metric].iloc[0]

In [124]:
# Coerce the threshold columns to numeric dtypes so they can be compared with
# the captain metrics.
threshold_cols = ['ltr_bronze', 'rides_bronze', 'apr_bronze', 'rating_bronze', 'cancel_bronze']
df_final[threshold_cols] = df_final[threshold_cols].apply(pd.to_numeric)

In [125]:
# Preview the first rows, including the attached '*_bronze' threshold columns.
df_final.head()

Unnamed: 0,captain_id,mobilenumber,city_name,ltr,rides,apr,avg_ratings,cancel_rate,captain_level,ltr_bronze,rides_bronze,apr_bronze,rating_bronze,cancel_bronze
28,5ae0dfde34e64e539ac5d3d0,6300269581,Hyderabad,262,8,0.67,4.0,0.0,1,20,10,0.3,4.0,0.25
56,5b8e5a8998cab328cc085a10,9133232213,Hyderabad,100,16,0.76,0.0,0.0,1,20,10,0.3,4.0,0.25
72,5ba9043cc5d31019d1cc7e2b,7287933786,Hyderabad,126,5,0.82,4.25,0.17,1,20,10,0.3,4.0,0.25
90,5bcef46bd3b4d34a58337e2e,7799894112,Hyderabad,227,1,0.5,0.0,0.0,1,20,10,0.3,4.0,0.25
117,5bfe65dad76a02356e923ae9,8074470331,Hyderabad,132,13,0.95,3.75,0.13,1,20,10,0.3,4.0,0.25


In [126]:
# Tag every captain with the criteria they currently miss, joined with '_' in
# the fixed order ltr, rides, rating, apr, cancel (e.g. 'rides_cancel').
# Captains missing nothing get the empty string.
# Built column-wise so it is fast and the 'reason' column is always created,
# even when df_final has no rows.
failed_checks = [
    ('ltr', df_final['ltr'] < df_final['ltr_bronze']),
    ('rides', df_final['rides'] < df_final['rides_bronze']),
    ('rating', df_final['avg_ratings'] < df_final['rating_bronze']),
    ('apr', df_final['apr'] < df_final['apr_bronze']),
    # Cancellation is the one metric where reaching the threshold is a miss.
    ('cancel', df_final['cancel_rate'] >= df_final['cancel_bronze']),
]

reason_labels = pd.Series('', index=df_final.index, dtype=object)
for label, failed in failed_checks:
    # Append '<label>_' only on the rows that fail this check.
    reason_labels = reason_labels.where(~failed, reason_labels + label + '_')

# Strip the trailing '_' left by the last appended label.
df_final['reason'] = reason_labels.str[:-1]

In [83]:
# Number of captains per reason combination, written to x.csv.
reason_counts = df_final['reason'].value_counts()
reason_counts.to_csv('x.csv')

In [61]:
# Total rides per reason combination, largest first.
rides_by_reason = df_final.pivot_table(index='reason', values='rides', aggfunc='sum')
rides_by_reason.sort_values(by='rides', ascending=False)

Unnamed: 0_level_0,rides
reason,Unnamed: 1_level_1
rating,20084
rides,10351
cancel,8308
rides_cancel,5992
rides_rating,4742
rating_apr,2581
apr,2230
rides_rating_cancel,2080
ltr_rides,1817
rating_cancel,1733


In [81]:
# Captains that miss no criterion at all (empty reason string).
df_final.loc[df_final['reason'] == '']

Unnamed: 0,captain_id,mobilenumber,city_name,ltr,rides,apr,avg_ratings,cancel_rate,captain_level,ltr_bronze,rides_bronze,apr_bronze,rating_bronze,cancel_bronze,reason


In [127]:
### Consideration for LTR :

# Captains whose only unmet criterion is ltr. The selection is copied so the
# new columns are written to an independent frame instead of a slice of
# df_final (avoids pandas' chained-assignment pitfall).
df_temp = df_final[df_final['reason'] == 'ltr'].copy()
df_temp['ltr_target'] = df_temp['ltr_bronze']
df_temp['possible_level'] = 2
df_temp['mobilenumber'].to_csv('x.csv', index=False)

In [129]:
### Consideration for cancellation rate :

# Captains whose only unmet criterion is the cancellation rate.
df_temp = df_final[df_final['reason'] == 'cancel']

# Pair each captain with every config level whose ltr / rides / rating / apr
# thresholds they already meet (the cancel threshold is left out of the join).
data_sql_query = ps.sqldf("select lt.*, ft.level from df_temp lt left join df_config ft on lt.ltr >= ft.ltr and lt.rides >= ft.rides and lt.avg_ratings >= ft.rating and lt.apr >= ft.apr")

# Highest matching level per captain.
df_priority = data_sql_query.groupby('captain_id', as_index=False)['level'].max()
df_comparison = df_temp.merge(df_priority, how='left', on='captain_id')
print(len(df_temp))
print(df_comparison['level'].value_counts())

# Look up the cancel threshold of that level as the captain's target.
cancel_by_level = df_config[['level', 'cancel']]
df_cancel = (
    df_comparison
    .merge(cancel_by_level, how='left', on='level')
    .rename(columns={'level': 'possible_level', 'cancel': 'cancel_target'})
)
df_cancel['mobilenumber'].to_csv('x.csv', index=False)

649
2    573
3     76
Name: level, dtype: int64


In [128]:
### Consideration for rides :

# Captains whose only unmet criterion is the ride count.
df_temp = df_final[df_final['reason'] == 'rides']

# Pair each captain with every config level whose ltr / rating / apr / cancel
# thresholds they already meet (the rides threshold is left out of the join).
data_sql_query = ps.sqldf("select lt.*, ft.level from df_temp lt left join df_config ft on lt.ltr >= ft.ltr and lt.avg_ratings >= ft.rating  and lt.apr >= ft.apr and lt.cancel_rate <= ft.cancel")

# Highest matching level per captain.
df_priority = data_sql_query.groupby('captain_id', as_index=False)['level'].max()
df_comparison = df_temp.merge(df_priority, how='left', on='captain_id')
print(len(df_temp))
print(df_comparison['level'].value_counts())

# Both frames carry a 'rides' column, so the merge suffixes them: rides_x is
# the captain's own count, rides_y the threshold of the matched level.
rides_by_level = df_config[['level', 'rides']]
df_rides = (
    df_comparison
    .merge(rides_by_level, how='left', on='level')
    .rename(columns={'level': 'possible_level', 'rides_x': 'rides', 'rides_y': 'rides_target'})
)
df_rides['mobilenumber'].to_csv('x.csv', index=False)

2504
2    1568
3     662
4     274
Name: level, dtype: int64


In [130]:
### Consideration for rating :

# Captains whose only unmet criterion is the average rating.
df_temp = df_final[df_final['reason'] == 'rating']

# Pair each captain with every config level whose ltr / rides / apr / cancel
# thresholds they already meet (the rating threshold is left out of the join).
data_sql_query = ps.sqldf("select lt.*, ft.level from df_temp lt left join df_config ft on lt.ltr >= ft.ltr and lt.rides >= ft.rides  and lt.apr >= ft.apr and lt.cancel_rate <= ft.cancel")

# Highest matching level per captain.
df_priority = data_sql_query.groupby('captain_id', as_index=False)['level'].max()
df_comparison = df_temp.merge(df_priority, how='left', on='captain_id')
print(len(df_temp))
print(df_comparison['level'].value_counts())

# Look up the rating threshold of that level as the captain's target.
rating_by_level = df_config[['level', 'rating']]
df_rating = (
    df_comparison
    .merge(rating_by_level, how='left', on='level')
    .rename(columns={'rating': 'rating_target', 'level': 'possible_level'})
)
df_rating['mobilenumber'].to_csv('x.csv', index=False)

942
2    673
3    241
4     28
Name: level, dtype: int64


In [61]:
### Consideration for apr :

# Captains whose only unmet criterion is the acceptance rate (apr).
df_temp = df_final[df_final['reason'] == 'apr']

# Pair each captain with every config level whose ltr / rides / rating /
# cancel thresholds they already meet (the apr threshold is left out).
data_sql_query = ps.sqldf("select lt.*, ft.level from df_temp lt left join df_config ft on lt.ltr >= ft.ltr and lt.rides >= ft.rides and lt.avg_ratings >= ft.rating and lt.cancel_rate <= ft.cancel")

# Highest matching level per captain.
df_priority = data_sql_query.groupby(['captain_id']).agg({'level':'max'}).reset_index()
df_comparison = df_temp.merge(df_priority, how='left', on='captain_id')
print(len(df_temp))
print(df_comparison['level'].value_counts())

# Stored under its own name (df_apr) so the rating cell's df_rating result is
# not overwritten. Both frames carry an 'apr' column, so the merge suffixes
# them: apr_x is the captain's own value, apr_y the matched level's threshold.
df_apr = (
    df_comparison
    .merge(df_config[['level', 'apr']], how='left', on='level')
    .rename(columns={'apr_x': 'apr', 'apr_y': 'apr_target', 'level': 'possible_level'})
)
df_apr.head()

68
2    62
4     4
8     2
Name: level, dtype: int64


Unnamed: 0,captain_id,city_name,ltr,rides,apr,avg_ratings,cancel_rate,captain_level,ltr_bronze,rides_bronze,apr_bronze,avg_ratings_bronze,cancel_rate_bronze,reason,possible_level,apr_target
0,5c24b0f244742d49fd43db11,Delhi,397,33,0.49,19.08,0.08,1,50,5,0.5,4.0,0.2,apr,4,0.6
1,5f900d5e685417b2a774c158,Delhi,107,15,0.24,16.73,0.12,1,50,5,0.5,4.0,0.2,apr,2,0.5
2,5d787f94a24c96105e464220,Delhi,470,8,0.39,14.67,0.11,1,50,5,0.5,4.0,0.2,apr,2,0.5
3,5c9e015db50ab0456be78361,Delhi,698,5,0.39,20.0,0.17,1,50,5,0.5,4.0,0.2,apr,2,0.5
4,58b96150314896c17b70e7eb,Delhi,165,26,0.36,20.0,0.04,1,50,5,0.5,4.0,0.2,apr,4,0.6


In [42]:
# Export captains with the given reason combination, using the '*_bronze'
# threshold of each failing criterion as its target.
reason = 'ltr_rating'

# Copied so the new columns are written to an independent frame instead of a
# slice of df_final (avoids pandas' chained-assignment pitfall).
df_temp = df_final[df_final['reason'] == reason].copy()
df_temp['possible_level'] = 2

# One '<criterion>_target' column per criterion named in the reason string.
for r in reason.split('_'):
    df_temp[r+'_target'] = df_temp[r+'_bronze']

df_temp.to_csv('x.csv', index=False)

In [40]:
### Consideration for rides and cancel :
# For captains failing both rides and cancel, compute two candidate levels
# (one per failing criterion, each time leaving that criterion out of the
# join), then decide which targets to report and write the result to x.csv.

df_temp = df_final[df_final['reason'] == 'rides_cancel']
#calculate level for rides
# Join on ltr / rating / apr / cancel only; the rides threshold is left out.
data_sql_query = ps.sqldf("select lt.*, ft.level from df_temp lt left join df_config ft on lt.ltr >= ft.ltr and lt.avg_ratings >= ft.rating and lt.apr >= ft.apr and lt.cancel_rate <= ft.cancel")

# Highest matching level per captain, merged back onto the captain rows.
df_priority = data_sql_query.groupby(['captain_id']).agg({'level':'max'}).reset_index()
df_comparison = df_temp.merge(df_priority,how='left',left_on='captain_id',right_on='captain_id')
#print(len(df_temp))
#print(df_comparison['level'].value_counts())

# rides_x is the captain's own count, rides_y the matched level's threshold.
df1 = df_comparison.merge(df_config[['level','rides']], how = 'left', left_on = 'level', right_on = 'level').rename(columns={'rides_x':'rides','rides_y':'rides_target'})

#calculate level for cancel
# Same selection as above; the join now leaves out the cancel threshold.
df_temp = df_final[df_final['reason'] == 'rides_cancel']
data_sql_query = ps.sqldf("select lt.*, ft.level from df_temp lt left join df_config ft on lt.ltr >= ft.ltr and lt.avg_ratings >= ft.rating and lt.apr >= ft.apr and lt.rides >= ft.rides")

df_priority = data_sql_query.groupby(['captain_id']).agg({'level':'max'}).reset_index()
df_comparison = df_temp.merge(df_priority,how='left',left_on='captain_id',right_on='captain_id')
#print(len(df_temp))
#print(df_comparison['level'].value_counts())

df2 = df_comparison.merge(df_config[['level','cancel']], how = 'left', left_on = 'level', right_on = 'level').rename(columns={'cancel':'cancel_target'})

# Both frames carry 'level', so after this merge level_x is the rides-based
# level (from df1) and level_y the cancel-based level (from df2).
# NOTE(review): the trailing rename looks like a no-op here -- neither input
# appears to have a column literally named 'cancel' at this point.
df1 = df1.merge(df2[['captain_id','level','cancel_target']], how = 'left', left_on = 'captain_id', right_on = 'captain_id').rename(columns={'cancel':'cancel_target'})
# Not the cell's last expression, so this preview is not displayed.
df1.head()

for index, row in df1.iterrows():
    # Both candidate levels are 1: report both targets and possible level 2.
    # NOTE(review): df_config.at[<level>, ...] uses the level value as a row
    # LABEL. With df_config as loaded above (default 0-based index, ordered
    # by level) label 1 is the level-2 row -- confirm this offset is intended.
    if row['level_x'] == row['level_y'] == 1:
        df1.at[index, 'rides_target'] = df_config.at[row['level_x'],'rides']
        df1.at[index, 'cancel_target'] = df_config.at[row['level_x'],'cancel']
        # The condition is always true inside this branch, so this is always 2.
        df1.at[index, 'possible_level'] = 2 if row['level_x'] == 1 else row['level_x']
    # Rides-based level is higher: keep the rides target, blank out cancel.
    elif row['level_x'] > row['level_y']:
        df1.at[index, 'cancel_target'] = 'NA'
        df1.at[index, 'possible_level'] = row['level_x']
    # Otherwise (cancel-based level higher, or equal levels other than 1):
    # keep the cancel target and blank out rides.
    # NOTE(review): 'NA' is a string written into an otherwise numeric column.
    else:
        df1.at[index, 'rides_target'] = 'NA'
        df1.at[index, 'possible_level'] = row['level_y']

df1.drop(['level_x','level_y'], axis = 1, inplace=True)
df1.head()
df1.to_csv('x.csv', index=False)

In [109]:
### Consideration for ltr and cancel :
# For captains failing both ltr and cancel, compute two candidate levels
# (one per failing criterion, each time leaving that criterion out of the
# join), then decide which targets to report and write the result to x.csv.

df_temp = df_final[df_final['reason'] == 'ltr_cancel']
#calculate level for ltr
# Join on rides / rating / apr / cancel only; the ltr threshold is left out.
data_sql_query = ps.sqldf("select lt.*, ft.level from df_temp lt left join df_config ft on lt.rides >= ft.rides and lt.avg_ratings >= ft.rating and lt.apr >= ft.apr and lt.cancel_rate <= ft.cancel")

# Highest matching level per captain, merged back onto the captain rows.
df_priority = data_sql_query.groupby(['captain_id']).agg({'level':'max'}).reset_index()
df_comparison = df_temp.merge(df_priority,how='left',left_on='captain_id',right_on='captain_id')
#print(len(df_temp))
#print(df_comparison['level'].value_counts())

# ltr_x is the captain's own value, ltr_y the matched level's threshold.
df1 = df_comparison.merge(df_config[['level','ltr']], how = 'left', left_on = 'level', right_on = 'level').rename(columns={'ltr_x':'ltr','ltr_y':'ltr_target'})

#calculate level for cancel
# Same selection as above; the join now leaves out the cancel threshold.
df_temp = df_final[df_final['reason'] == 'ltr_cancel']
data_sql_query = ps.sqldf("select lt.*, ft.level from df_temp lt left join df_config ft on lt.ltr >= ft.ltr and lt.avg_ratings >= ft.rating and lt.apr >= ft.apr and lt.rides >= ft.rides")

df_priority = data_sql_query.groupby(['captain_id']).agg({'level':'max'}).reset_index()
df_comparison = df_temp.merge(df_priority,how='left',left_on='captain_id',right_on='captain_id')
#print(len(df_temp))
#print(df_comparison['level'].value_counts())

df2 = df_comparison.merge(df_config[['level','cancel']], how = 'left', left_on = 'level', right_on = 'level').rename(columns={'cancel':'cancel_target'})

# Both frames carry 'level', so after this merge level_x is the ltr-based
# level (from df1) and level_y the cancel-based level (from df2).
# NOTE(review): the trailing rename looks like a no-op here -- neither input
# appears to have a column literally named 'cancel' at this point.
df1 = df1.merge(df2[['captain_id','level','cancel_target']], how = 'left', left_on = 'captain_id', right_on = 'captain_id').rename(columns={'cancel':'cancel_target'})
# Not the cell's last expression, so this preview is not displayed.
df1.head()

for index, row in df1.iterrows():
    # Equal candidate levels: report both targets.
    # NOTE(review): df_config.at[<level>, ...] uses the level value as a row
    # LABEL. With a default 0-based index ordered by level, label L is the
    # row of level L+1, yet possible_level is set to level_x itself (the
    # rides_cancel cell reports 2 when both levels are 1) -- confirm which is
    # intended. A level equal to the last label + 1 would raise KeyError.
    if row['level_x'] == row['level_y']:
        df1.at[index, 'ltr_target'] = df_config.at[row['level_x'],'ltr']
        df1.at[index, 'cancel_target'] = df_config.at[row['level_x'],'cancel']
        df1.at[index, 'possible_level'] = row['level_x']
    # ltr-based level is higher: keep the ltr target, blank out cancel.
    elif row['level_x'] > row['level_y']:
        df1.at[index, 'cancel_target'] = 'NA'
        df1.at[index, 'possible_level'] = row['level_x']
    # cancel-based level is higher: keep the cancel target, blank out ltr.
    # NOTE(review): 'NA' is a string written into an otherwise numeric column.
    else:
        df1.at[index, 'ltr_target'] = 'NA'
        df1.at[index, 'possible_level'] = row['level_y']

df1.drop(['level_x','level_y'], axis = 1, inplace=True)
df1.head()
df1.to_csv('x.csv', index=False)

In [113]:
### Consideration for ltr and rides :
# For captains failing both ltr and rides, compute two candidate levels
# (one per failing criterion, each time leaving that criterion out of the
# join), then decide which targets to report and write the result to x.csv.

df_temp = df_final[df_final['reason'] == 'ltr_rides']
#calculate level for ltr
# Join on rides / rating / apr / cancel only; the ltr threshold is left out.
data_sql_query = ps.sqldf("select lt.*, ft.level from df_temp lt left join df_config ft on lt.rides >= ft.rides and lt.avg_ratings >= ft.rating and lt.apr >= ft.apr and lt.cancel_rate <= ft.cancel")

# Highest matching level per captain, merged back onto the captain rows.
df_priority = data_sql_query.groupby(['captain_id']).agg({'level':'max'}).reset_index()
df_comparison = df_temp.merge(df_priority,how='left',left_on='captain_id',right_on='captain_id')
#print(len(df_temp))
#print(df_comparison['level'].value_counts())

# ltr_x is the captain's own value, ltr_y the matched level's threshold.
df1 = df_comparison.merge(df_config[['level','ltr']], how = 'left', left_on = 'level', right_on = 'level').rename(columns={'ltr_x':'ltr','ltr_y':'ltr_target'})

df_temp = df_final[df_final['reason'] == 'ltr_rides']
#calculate level for rides
# Join on ltr / rating / apr / cancel only; the rides threshold is left out.
data_sql_query = ps.sqldf("select lt.*, ft.level from df_temp lt left join df_config ft on lt.ltr >= ft.ltr and lt.avg_ratings >= ft.rating and lt.apr >= ft.apr and lt.cancel_rate <= ft.cancel")

df_priority = data_sql_query.groupby(['captain_id']).agg({'level':'max'}).reset_index()
df_comparison = df_temp.merge(df_priority,how='left',left_on='captain_id',right_on='captain_id')
#print(len(df_temp))
#print(df_comparison['level'].value_counts())

# rides_x is the captain's own count, rides_y the matched level's threshold.
df2 = df_comparison.merge(df_config[['level','rides']], how = 'left', left_on = 'level', right_on = 'level').rename(columns={'rides_x':'rides','rides_y':'rides_target'})

# Both frames carry 'level', so after this merge level_x is the ltr-based
# level (from df1) and level_y the rides-based level (from df2).
df1 = df1.merge(df2[['captain_id','level','rides_target']], how = 'left', left_on = 'captain_id', right_on = 'captain_id')
# Not the cell's last expression, so this preview is not displayed.
df1.head()

for index, row in df1.iterrows():
    # Equal candidate levels: report both targets.
    # NOTE(review): df_config.at[<level>, ...] uses the level value as a row
    # LABEL. With a default 0-based index ordered by level, label L is the
    # row of level L+1, yet possible_level is set to level_x itself (the
    # rides_cancel cell reports 2 when both levels are 1) -- confirm which is
    # intended. A level equal to the last label + 1 would raise KeyError.
    if row['level_x'] == row['level_y']:
        df1.at[index, 'ltr_target'] = df_config.at[row['level_x'],'ltr']
        df1.at[index, 'rides_target'] = df_config.at[row['level_x'],'rides']
        df1.at[index, 'possible_level'] = row['level_x']
    # ltr-based level is higher: keep the ltr target, blank out rides.
    elif row['level_x'] > row['level_y']:
        df1.at[index, 'rides_target'] = 'NA'
        df1.at[index, 'possible_level'] = row['level_x']
    # rides-based level is higher: keep the rides target, blank out ltr.
    # NOTE(review): 'NA' is a string written into an otherwise numeric column.
    else:
        df1.at[index, 'ltr_target'] = 'NA'
        df1.at[index, 'possible_level'] = row['level_y']

df1.drop(['level_x','level_y'], axis = 1, inplace=True)
df1.head()
df1.to_csv('x.csv', index=False)

In [27]:
### Consideration for rides and rating :
# For captains failing both rides and rating, compute two candidate levels
# (one per failing criterion, each time leaving that criterion out of the
# join), then decide which targets to report and write the result to x.csv.

df_temp = df_final[df_final['reason'] == 'rides_rating']
#calculate level for rides
# The rides threshold is left out of the join.
# NOTE(review): the rating condition is still part of this join even though
# rating is one of the two failing criteria here -- confirm this is intended.
data_sql_query = ps.sqldf("select lt.*, ft.level from df_temp lt left join df_config ft on lt.ltr >= ft.ltr and lt.avg_ratings >= ft.rating and lt.apr >= ft.apr and lt.cancel_rate <= ft.cancel")

# Highest matching level per captain, merged back onto the captain rows.
df_priority = data_sql_query.groupby(['captain_id']).agg({'level':'max'}).reset_index()
df_comparison = df_temp.merge(df_priority,how='left',left_on='captain_id',right_on='captain_id')
#print(len(df_temp))
#print(df_comparison['level'].value_counts())

# rides_x is the captain's own count, rides_y the matched level's threshold.
df1 = df_comparison.merge(df_config[['level','rides']], how = 'left', left_on = 'level', right_on = 'level').rename(columns={'rides_x':'rides','rides_y':'rides_target'})

#calculate level for rating
# Join on ltr / apr / rides / cancel only; the rating threshold is left out.
df_temp = df_final[df_final['reason'] == 'rides_rating']
data_sql_query = ps.sqldf("select lt.*, ft.level from df_temp lt left join df_config ft on lt.ltr >= ft.ltr and lt.apr >= ft.apr and lt.rides >= ft.rides and lt.cancel_rate <= ft.cancel")

df_priority = data_sql_query.groupby(['captain_id']).agg({'level':'max'}).reset_index()
df_comparison = df_temp.merge(df_priority,how='left',left_on='captain_id',right_on='captain_id')
#print(len(df_temp))
#print(df_comparison['level'].value_counts())

df2 = df_comparison.merge(df_config[['level','rating']], how = 'left', left_on = 'level', right_on = 'level').rename(columns={'rating':'rating_target'})

# Both frames carry 'level', so after this merge level_x is the rides-based
# level (from df1) and level_y the rating-based level (from df2).
# NOTE(review): the trailing rename looks like a no-op here -- neither input
# appears to have a column literally named 'rating' at this point.
df1 = df1.merge(df2[['captain_id','level','rating_target']], how = 'left', left_on = 'captain_id', right_on = 'captain_id').rename(columns={'rating':'rating_target'})
# Not the cell's last expression, so this preview is not displayed.
df1.head()

for index, row in df1.iterrows():
    # Equal candidate levels: report both targets.
    # NOTE(review): df_config.at[<level>, ...] uses the level value as a row
    # LABEL. With a default 0-based index ordered by level, label L is the
    # row of level L+1, yet possible_level is set to level_x itself (the
    # rides_cancel cell reports 2 when both levels are 1) -- confirm which is
    # intended. A level equal to the last label + 1 would raise KeyError.
    if row['level_x'] == row['level_y']:
        df1.at[index, 'rides_target'] = df_config.at[row['level_x'],'rides']
        df1.at[index, 'rating_target'] = df_config.at[row['level_x'],'rating']
        df1.at[index, 'possible_level'] = row['level_x']
    # rides-based level is higher: keep the rides target, blank out rating.
    elif row['level_x'] > row['level_y']:
        df1.at[index, 'rating_target'] = 'NA'
        df1.at[index, 'possible_level'] = row['level_x']
    # rating-based level is higher: keep the rating target, blank out rides.
    # NOTE(review): 'NA' is a string written into an otherwise numeric column.
    else:
        df1.at[index, 'rides_target'] = 'NA'
        df1.at[index, 'possible_level'] = row['level_y']

df1.drop(['level_x','level_y'], axis = 1, inplace=True)
df1.head()
df1.to_csv('x.csv', index=False)

In [19]:
# Distinct (day, captain identity, phone, device make/model, DRM device id)
# rows for the 'riderhomescreen' event between 20210815 and 20210819,
# restricted to 24-character profile identities. The DRM device id is read
# from the nested eventProps JSON.
y = """select distinct
        yyyymmdd, 
        profile_identity, 
        profile_phone,
        deviceInfo_make,
        deviceInfo_model,
        JSON_EXTRACT(x,'$.drm_device_id') as drm_device_id
    from 
        (
        select eventname, yyyymmdd, profile_identity, profile_phone, deviceInfo_make, deviceInfo_model,
            JSON_EXTRACT(JSON_PARSE(eventProps),'$.eventProps') as x from raw.clevertap_captain_events_master 
            where eventname = 'riderhomescreen' and yyyymmdd between '20210815' and '20210819' and length(profile_identity) = 24 
        ) """

# Run the query and persist the result next to the notebook.
x = pd.read_sql(sql=y, con=presto_conn)
x.to_csv(path_or_buf='x.csv', index=False)

In [17]:
# Write the query result held in x to x.csv, without the index column.
x.to_csv('x.csv', index=False)

In [16]:
# (rows, columns) of the query result.
x.shape

(708815, 4)

In [9]:
# Number of distinct drm_device_id values in the result.
x['drm_device_id'].nunique()

298709

In [10]:
# Number of distinct profile_identity values in the result.
x['profile_identity'].nunique()

297119

In [11]:
# Number of distinct profile_phone values in the result.
x['profile_phone'].nunique()

309782

In [13]:
# Rows whose profile_identity is the empty string.
x.loc[x['profile_identity'] == '']

Unnamed: 0,yyyymmdd,profile_identity,profile_phone,drm_device_id
