In [39]:
import datetime
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import binom, norm
from splitio.exceptions import TimeoutException
from tqdm import tqdm


In [40]:
## The Split Python SDK

In [41]:
from splitio import get_factory

In [42]:
# The Split SDK key is a credential: read it from the environment instead of
# committing it with the notebook.
# NOTE(review): the key that was previously hardcoded here should be treated as
# leaked and rotated.
api_key = os.environ.get('SPLIT_API_KEY')
if not api_key:
    raise RuntimeError('Set the SPLIT_API_KEY environment variable to your Split SDK key before running this notebook')

In [43]:
# SDK configuration handed to the factory.
# NOTE(review): 'ready' is presumably the readiness timeout in milliseconds --
# confirm against the Split SDK docs for the installed version.
config = dict(ready=5000)

# Build the factory and take a client from it.
# NOTE(review): later cells call get_factory again and rebind `factory`/`split`;
# the instance created here is never destroyed in this cell.
factory = get_factory(api_key, config=config)
split = factory.client()


## Mimic an A/A test

## Randomizing user behaviour

In [46]:
# Event names and traffic type passed to split.track().
session_event = 'new_session'
booking_event_type = 'booking'
traffic_type = 'user'

# Split being evaluated; `version` is embedded in user keys and output file names.
split_name = "SplitCheck_LongWait"
version = '1'

# Attributes sent with every get_treatment() call; the simulation cells add
# a 'userID' entry per user.
attributes = {'country': 'uk'}

In [47]:
def getSessions(binom_n = 50, binom_p = 0.01, verbose=False, random_state=None):
    """Draw the number of sessions for one simulated user.

    The count is ``1 + Binomial(binom_n, binom_p)``, so every user has at
    least one session.

    Parameters
    ----------
    binom_n : int
        Number of trials of the binomial draw.
    binom_p : float
        Success probability of each trial.
    verbose : bool
        When True, print a timestamp and the drawn value.
    random_state : optional
        Seed or generator forwarded to ``binom.rvs`` so a run can be
        reproduced. ``None`` (the default) keeps scipy's default behaviour.

    Returns
    -------
    The drawn session count (always >= 1).
    """
    ns = 1 + binom.rvs(binom_n, binom_p, random_state=random_state)
    if verbose:
        print (str(datetime.datetime.now()), ns)
    return ns

def getBookings(booking_n = 100, booking_p = 0.005, verbose=False, random_state=None):
    """Draw the number of bookings made in one simulated session.

    The count is a single ``Binomial(booking_n, booking_p)`` draw and may be 0.

    Parameters
    ----------
    booking_n : int
        Number of trials of the binomial draw.
    booking_p : float
        Success probability of each trial.
    verbose : bool
        When True, print a timestamp and the drawn value.
    random_state : optional
        Seed or generator forwarded to ``binom.rvs`` so a run can be
        reproduced. ``None`` (the default) keeps scipy's default behaviour.

    Returns
    -------
    The drawn booking count (>= 0).
    """
    nb = binom.rvs(booking_n, booking_p, random_state=random_state)
    if verbose:
        print (str(datetime.datetime.now()), nb)
    return nb
def getBookingValue(booking_value_mean = 100, booking_value_standard_deviation = 15, verbose=False, random_state=None):
    """Draw the monetary value of one simulated booking.

    The value is a single draw from a normal distribution with the given
    mean and standard deviation.

    Parameters
    ----------
    booking_value_mean : float
        Mean (``loc``) of the normal distribution.
    booking_value_standard_deviation : float
        Standard deviation (``scale``) of the normal distribution.
    verbose : bool
        When True, print a timestamp and the drawn value.
    random_state : optional
        Seed or generator forwarded to ``norm.rvs`` so a run can be
        reproduced. ``None`` (the default) keeps scipy's default behaviour.

    Returns
    -------
    The drawn booking value.
    """
    bv = norm.rvs(booking_value_mean, booking_value_standard_deviation, random_state=random_state)
    if verbose:
        print (str(datetime.datetime.now()), bv)
    return bv
    

In [48]:
n_users = 1000
# Seconds to wait before destroying the client.
# NOTE(review): presumably this gives the SDK time to post queued events and
# impressions -- confirm against the Split SDK docs.
destroy_wait_seconds = 1200

# A fresh factory/client is created in this cell so the cell can be re-run
# after the client has been destroyed at the end of a previous run.
# Per Split's Python SDK documentation the readiness timeout surfaces as a
# TimeoutException from factory creation (confirm for the installed version),
# so the guard belongs here rather than around get_treatment().
try:
    factory = get_factory(api_key, config=config)
    split = factory.client()
except TimeoutException as exc:
    raise RuntimeError('Split SDK failed to initialise within the configured timeout') from exc

# Rows are buffered in plain lists and converted to DataFrames once after the
# loop; DataFrame.append per row is quadratic and no longer exists in pandas 2.
session_rows = []
booking_rows = []

try:
    for user_ID in tqdm(range(n_users)):
        attributes['userID'] = split_name+'V'+version+'_'+str(user_ID)
        user_key = attributes['userID']

        # assign user to treatment
        treatment = split.get_treatment(user_key, split_name, attributes)

        # pick the number of sessions user has
        n_sessions = getSessions()

        for session_ID in range(n_sessions):
            session_key = user_key+'_'+str(session_ID)

            # track a new user session
            trackEvent = split.track(user_key, traffic_type, session_event)

            # pick how many bookings the user makes in this session
            n_bookings = getBookings()

            session_rows.append({'userID': user_key,
                                 'Treatment': treatment,
                                 'sessionID': session_key,
                                 'bookings': n_bookings})

            for booking_ID in range(n_bookings):
                # pick the value of the booking
                booking_value = getBookingValue()
                trackEvent = split.track(user_key, traffic_type, booking_event_type , booking_value)

                booking_rows.append({'userID': user_key,
                                     'Treatment': treatment,
                                     'sessionID': session_key,
                                     'bookingID': session_key+'_'+str(booking_ID),
                                     'bookingValue': booking_value})

    # Explicit column lists keep the frames well-formed even when no rows were generated.
    sessions_df = pd.DataFrame(session_rows, columns=['userID', 'Treatment', 'sessionID', 'bookings'])
    bookings_df = pd.DataFrame(booking_rows, columns=['userID', 'Treatment', 'sessionID', 'bookingID', 'bookingValue'])

    # Persist before the long wait so an interrupted kernel does not lose the simulated data.
    bookings_df.to_csv('bookings_df_V'+version+'_'+split_name+'.csv', index=False)
    sessions_df.to_csv('sessions_df_V'+version+'_'+split_name+'.csv', index=False)

    print (time.strftime("%d/%m/%Y %H:%M:%S"))
    time.sleep(destroy_wait_seconds)
finally:
    # Always release the client, even if the loop or the wait is interrupted.
    print ('destroying @ ',time.strftime("%d/%m/%Y %H:%M:%S"))
    split.destroy()

100%|██████████| 1000/1000 [00:03<00:00, 307.22it/s]


19/03/2019 14:47:10
destroying @  19/03/2019 15:07:10


In [49]:
# Per-treatment summary: distinct users, distinct sessions and summed bookings.
(sessions_df
 .groupby('Treatment')
 .agg({'userID': 'nunique', 'sessionID': 'nunique', 'bookings': 'sum'}))

Unnamed: 0_level_0,userID,sessionID,bookings
Treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
off,514,760,398.0
on,486,709,365.0


In [50]:
# 486 is the 'on' user count from the summary table above.
# NOTE(review): 273 is not derived anywhere in this notebook -- confirm where it comes from.
273 / 486

0.5617283950617284

Exception caught fetching split changes
Traceback (most recent call last):
  File "/Users/lizzieeardley/anaconda3/lib/python3.7/site-packages/urllib3/connection.py", line 159, in _new_conn
    (self._dns_host, self.port), self.timeout, **extra_kw)
  File "/Users/lizzieeardley/anaconda3/lib/python3.7/site-packages/urllib3/util/connection.py", line 57, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "/Users/lizzieeardley/anaconda3/lib/python3.7/socket.py", line 748, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 8] nodename nor servname provided, or not known

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/lizzieeardley/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py", line 600, in urlopen
    chunked=chunked)
  File "/Users/lizzieeardley/anaconda3/lib/python3.7/site-packages/urll

In [30]:
# Share of all simulated users (both treatments) with at least one booking.
# NOTE(review): getProportionMetric is defined further down the notebook, so this
# cell fails on a fresh top-to-bottom run.
getProportionMetric(sessions_df, event='bookings')

0.4908

In [28]:
# Preview only the first rows instead of rendering the whole frame.
bookings_df.head(10)

Unnamed: 0,userID,Treatment,sessionID,bookingID,bookingValue
0,1,off,1_0,1_0_0,93.020805
1,2,on,2_0,2_0_0,117.101307
2,2,on,2_1,2_1_0,98.243049
3,2,on,2_2,2_2_0,97.500446
4,2,on,2_3,2_3_0,94.851755
5,3,on,3_0,3_0_0,101.632899
6,3,on,3_1,3_1_0,99.653563
7,4,off,4_1,4_1_0,96.606278
8,5,on,5_0,5_0_0,84.400376
9,6,off,6_0,6_0_0,94.699328


## Calculating the metrics manually

### Users that book
#### fraction of unique users with at least one booking 

In [22]:
def getProportionMetric(df_in, event='bookings'):
    """Fraction of distinct users whose summed ``event`` count is positive.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame with a 'userID' column and a numeric ``event`` column; a user
        may appear on several rows.
    event : str
        Name of the column holding the per-row event count.

    Returns
    -------
    float
        Number of users with ``sum(event) > 0`` divided by the number of
        distinct users; NaN when ``df_in`` has no rows.
    """
    # Without rows there are no users, so the proportion is undefined.
    if len(df_in) == 0:
        return float('nan')
    events_per_user = df_in.groupby('userID')[event].sum()
    # The mean of a boolean flag is the share of users for which it is True.
    return (events_per_user > 0).mean()


### Average Booking value per user

In [23]:
def getAverageValue(df_in, event = 'bookingID',  value='bookingValue'):
    """Mean and variance, across users, of each user's average value per event.

    Parameters
    ----------
    df_in : pandas.DataFrame
        One row per event, with 'userID', ``event`` and ``value`` columns.
    event : str
        Column identifying an event; distinct values are counted per user.
    value : str
        Column holding the value of each event; summed per user.

    Returns
    -------
    tuple
        ``(mean, variance)`` of ``sum(value) / nunique(event)`` per user.
    """
    per_user = df_in.groupby('userID').agg({value: 'sum', event: 'nunique'})
    # Column-wise division replaces the original row-by-row apply; the cast keeps
    # the statistics numeric if the value column was stored with object dtype.
    average_per_user = (per_user[value] / per_user[event]).astype(float)
    return average_per_user.mean(), average_per_user.var()

### Total Booking value per user

In [24]:
def getTotalValue(df_in, event = 'bookingID',  value='bookingValue'):
    """Mean and variance, across users, of each user's summed ``value``.

    ``event`` is accepted for signature parity with the other metric helpers
    but is not used. Returns ``(mean, variance)`` of the per-user totals.
    """
    totals_by_user = df_in.groupby('userID').agg({value: 'sum'}).reset_index()[value]
    return totals_by_user.mean(), totals_by_user.var()

### Bookings per user

In [25]:
def getEventsPerUser(event_df,all_users_df, event = 'bookingID'):
    """Mean and variance of the number of distinct events per user.

    Every user present in ``all_users_df`` is included; users with no rows in
    ``event_df`` count as zero events. Returns ``(mean, variance)``.
    """
    # One row per user seen in all_users_df (the sessionID aggregate is only a placeholder).
    user_index = all_users_df.groupby('userID').agg({'sessionID': 'min'}).reset_index()
    per_user_counts = event_df.groupby('userID').agg({event: 'nunique'}).reset_index()

    # Left join keeps users without events; their missing count becomes 0.
    counts = (user_index
              .merge(per_user_counts, how='left', on='userID')[event]
              .fillna(0))
    return counts.mean(), counts.var()

### Bookings per session

In [26]:
def getEventsPerSession(event_df,all_users_df, denom_event = 'sessionID', num_event = 'bookingID'):
    """Mean and variance, across users, of events per session.

    For each user in ``all_users_df`` the ratio is distinct ``num_event`` values
    in ``event_df`` divided by distinct ``denom_event`` values in
    ``all_users_df``. The result is the mean of those per-user ratios, not a
    pooled total-events / total-sessions figure. Returns ``(mean, variance)``.
    """
    denominators = all_users_df.groupby('userID').agg({denom_event: 'nunique'}).reset_index()
    numerators = event_df.groupby('userID').agg({num_event: 'nunique'}).reset_index()

    # Left join keeps users without events; their numerator becomes 0.
    joined = denominators.merge(numerators, how='left', on='userID')
    ratio = joined[num_event].fillna(0) / joined[denom_event]
    return ratio.mean(), ratio.var()

### Total Bookings

In [34]:
def getAcrossBookings(event_df, event = 'bookingID'):
    """Return the number of distinct values in column ``event`` of ``event_df``."""
    distinct_count = event_df[event].nunique()
    return distinct_count

## Evaluating results

In [35]:
treatments = ['on', 'off']

for treatment in treatments:
    print ('\n'+treatment+'\n')

    # Restrict both frames to the rows of the current treatment.
    sessions_df_t = sessions_df[sessions_df['Treatment'].eq(treatment)]
    bookings_df_t = bookings_df[bookings_df['Treatment'].eq(treatment)]

    # (label, thunk) pairs; each metric is computed right before it is printed.
    report = (
        ('Users : ', lambda: sessions_df_t.userID.nunique()),
        ('Users That Book : ', lambda: getProportionMetric(sessions_df_t)),
        ('Average Booking Value : ', lambda: getAverageValue(bookings_df_t)),
        ('Total Booking Value : ', lambda: getTotalValue(bookings_df_t)),
        ('Bookings Per User : ', lambda: getEventsPerUser(bookings_df_t, sessions_df_t)[0]),
        ('Bookings Per Session : ', lambda: getEventsPerSession(bookings_df_t, sessions_df_t)[0]),
        ('Total Bookings in Treatment : ', lambda: getAcrossBookings(bookings_df_t)),
        ('Total Sessions in Treatment : ', lambda: getAcrossBookings(sessions_df_t, event = 'sessionID')),
    )
    for label, compute in report:
        print (label, compute())

    



on

Users :  2499
Users That Book :  0.4841936774709884
Average Booking Value :  (99.7944596899719, 180.18608755275878)
Total Booking Value :  (146.2340956909665, 6194.781543324574)
Bookings Per User :  0.709483793517407
Bookings Per Session :  0.47005468854208327
Total Bookings in Treatment :  1773
Total Sessions in Treatment :  3733

off

Users :  2501
Users That Book :  0.4974010395841663
Average Booking Value :  (100.04356750137678, 182.67144347836975)
Total Booking Value :  (146.971204938461, 5702.677243642334)
Bookings Per User :  0.7313074770091963
Bookings Per Session :  0.4916033586565372
Total Bookings in Treatment :  1829
Total Sessions in Treatment :  3751


In [36]:
# Ratios against the totals printed by the evaluation cell (1773/1829 bookings,
# 3733/3751 sessions for on/off).
# NOTE(review): the numerators are not derived in this notebook -- confirm their source.
for numerator, denominator in ((1714, 1773.0), (1786, 1829.0), (3635, 3733.0), (3650, 3751.0)):
    print (numerator / denominator)

0.9667230682459109
0.9764898851831602
0.9737476560407179
0.9730738469741402


In [65]:
# Bookings made by users assigned to the 'off' treatment.
is_off = bookings_df['Treatment'] == 'off'
off_b = bookings_df.loc[is_off]
off_b.head()

Unnamed: 0,userID,Treatment,sessionID,bookingID,bookingValue
0,6,off,6_3,6_3_0,77.939915
2,14,off,14_0,14_0_0,111.906032
3,16,off,16_0,16_0_0,99.36149
4,16,off,16_1,16_1_0,92.817026
8,21,off,21_0,21_0_0,90.131066


In [40]:
# First few sessions that contain at least one booking.
sessions_df[sessions_df['bookings'] > 0].head()

Unnamed: 0,userID,Treatment,sessionID,bookings
9,6,off,6_3,1.0
14,9,on,9_0,1.0
20,14,off,14_0,1.0
22,16,off,16_0,1.0
23,16,off,16_1,1.0


In [46]:
# Sessions belonging to users assigned to the 'on' treatment.
sessions_df_on = sessions_df[sessions_df['Treatment'] == 'on']

In [49]:
# Collapse the 'on' sessions to one row per user: session count, total bookings
# and the user's treatment.
user_df_on = (
    sessions_df_on
    .groupby('userID')
    .agg({'sessionID':'nunique', 'bookings':'sum', 'Treatment':'first'})
    .reset_index()
)
# 1 when the user made at least one booking, else 0 (vectorised, no row-wise apply).
user_df_on['booking_user'] = (user_df_on['bookings'] > 0).astype(int)

In [50]:
# Preview the first ten per-user rows.
user_df_on.iloc[:10]

Unnamed: 0,userID,sessionID,bookings,Treatment,booking_user
0,0,1,0.0,on,0
1,1,1,0.0,on,0
2,2,1,0.0,on,0
3,3,1,0.0,on,0
4,4,1,0.0,on,0
5,5,1,0.0,on,0
6,7,2,0.0,on,0
7,8,2,0.0,on,0
8,9,1,1.0,on,1
9,10,2,0.0,on,0


In [52]:
# Share of 'on' users with at least one booking: booking users / distinct users.
n_booking_users = user_df_on['booking_user'].sum()
n_users_on = user_df_on['userID'].nunique()
user_that_book = n_booking_users / n_users_on
print (user_that_book)

0.47578947368421054


In [20]:
#test for speed
factory = get_factory(api_key, config=config)
split = factory.client()
verbose_str = False        
n_users = 5
bookings_df_test=pd.DataFrame(columns = ['userID', 'Treatment', 'sessionID', 'bookingID', 'bookingValue'])#, index=[x for x in range(n_users)])
sessions_df_test=pd.DataFrame(columns = ['userID', 'Treatment', 'sessionID'])#, index=[x for x in range(n_users)])

for user_ID in tqdm(range(n_users)):
    if verbose_str == True:  print(str(datetime.datetime.now()), user_ID, 'start')
    attributes['userID'] = str(user_ID)

    try:
        # assign user to treatment
        treatment = split.get_treatment(attributes['userID'], split_name, attributes)

    except TimeoutException:
       # The SDK failed to initialize in a second. Abort!
       sys.exit()
      
    # pick the number of sessions user has
    n_sessions = getSessions(verbose=verbose_str)
    
    for session_ID in range(n_sessions):
        # track a new user session
        trackEvent = split.track(attributes['userID'], traffic_type, session_event)
        
        # pick how many bookings the user makes in this session
        n_bookings = getBookings(verbose=verbose_str)
        
        session_data = {'userID':user_ID, 
                        'Treatment':treatment,
                        'sessionID': str(user_ID)+'_'+str(session_ID), 
                        'bookings': n_bookings}

        sessions_df_test = sessions_df_test.append(session_data, ignore_index=True)        


        for booking_ID in range(n_bookings):
            
                # pick the value of the booking
                booking_value = getBookingValue(verbose=verbose_str)
                trackEvent = split.track(attributes['userID'], traffic_type, booking_event_type , booking_value)
    
                booking_data = {'userID':user_ID, 
                                'Treatment':treatment,
                                'sessionID': str(user_ID)+'_'+str(session_ID), 
                                'bookingID': str(user_ID)+'_'+str(session_ID)+'_'+str(booking_ID),
                                'bookingValue': booking_value}
        
                bookings_df_test = bookings_df_test.append(booking_data, ignore_index=True)
                
    if verbose_str == True: print (str(datetime.datetime.now()), user_ID, 'end')

100%|██████████| 5/5 [00:00<00:00, 201.03it/s]
