In [71]:
import os
import time
from datetime import date, timedelta

import keyring
import numpy as np
import pandas as pd
import psycopg2
import snowflake.connector
from pandas.io import sql
from scipy import stats
from sqlalchemy import create_engine

# pandas display options: cap the rendered width of a cell value at 50 characters
# and allow up to 500 columns to be shown before pandas elides them.
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.max_columns', 500)

from matplotlib import pyplot as plt
import seaborn as sns
# Current seaborn colour palette; not referenced again in the visible part of the notebook.
color = sns.color_palette()
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline
sns.set_style("darkgrid")

In [72]:
# Login name handed to the Snowflake connector. Set the SNOWFLAKE_USER environment
# variable to run the notebook as someone else; the original author's login remains
# the default so existing runs behave as before.
snowflake_username = os.environ.get('SNOWFLAKE_USER', 'matthew.bessey@disneystreaming.com')

In [73]:
# Open the Snowflake session. The 'externalbrowser' authenticator hands the login
# over to the identity provider in a browser window.
connection_settings = {
    'authenticator': 'externalbrowser',
    'user': snowflake_username,
    'account': 'disneystreaming.us-east-1',
}
ctx = snowflake.connector.connect(**connection_settings)

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...


In [131]:
# Values of the `ds` partition column for the two snapshots being compared.
# The cohort of accounts is selected from the PRE snapshot only.
PRE_PERIOD_DS = '2020-01-27'
POST_PERIOD_DS = '2020-02-02'

# Engagement rows for both snapshot dates, restricted to a cohort of entitled,
# non-pre-launch US accounts taken from the pre-period snapshot.
#
# The two snapshot dates are read with one `select distinct ... where ds in (...)`,
# which returns the same de-duplicated row set as a `union` of two per-date selects.
#
# NOTE(review): `limit 500000` has no `order by`, so which accounts make up the
# cohort is not guaranteed to be stable between runs - confirm that is acceptable.
# NOTE(review): the final filter keeps only swid_holdout below 243, so the
# "holdout" bucket (>= 246) used by the Python grouping can never appear in this
# extract - confirm the intended cut-offs.
query = """
select *
from
  (
    select distinct
      o.swid
      , o.swid_holdout
      , e.*
      from "DSS_PROD"."DSS"."SFMC_ACCOUNT_SWID_MAP" a
      join oneid_combined o on a.swid = o.swid
      join "DSS_PROD"."DISNEY_PLUS"."DIM_DISNEY_DAILY_ACCOUNT_ENGAGEMENT" e on a.account_id = e.account_id
      where e.ds in ('{pre_ds}', '{post_ds}')
  )
where account_id in
(
  select 
  distinct e.account_id
  from "DSS_PROD"."DISNEY_PLUS"."DIM_DISNEY_DAILY_ACCOUNT_ENGAGEMENT" e
  where e.ds = '{pre_ds}'
  and e.is_pre_launch != '1'
  and e.account_home_country = 'US'
  and e.first_account_subscription_signup_date is not NULL
  and is_entitled = '1'
  limit 500000
)
and swid_holdout < '243'
;
""".format(pre_ds=PRE_PERIOD_DS, post_ds=POST_PERIOD_DS)

In [132]:
# Execute the extract over the open Snowflake connection and keep the result
# as the `engagement` frame.
engagement = pd.read_sql(sql=query, con=ctx)

In [133]:
# Normalise every column label to lowercase so later cells can use one spelling.
engagement.columns = engagement.columns.map(str.lower)

In [134]:
# create function and apply for mapping of holdout groups on 'swid_holdout'
def holdout_grouping(df):
    """Label one record with its marketing group based on 'swid_holdout'.

    `df` is a single record supporting `df['swid_holdout']` (for example one
    row passed in by `DataFrame.apply(..., axis=1)`).

    Returns "holdout" for values of 246 and above, "trending holdout" for
    values from 197 up to (but not including) 246, and "exposed" otherwise.
    """
    bucket = df['swid_holdout']
    if bucket >= 246:
        return "holdout"
    # Anything reaching this point is already below 246.
    if bucket >= 197:
        return "trending holdout"
    return "exposed"
    
# Label every row with its marketing group. This applies the same cut-offs as
# holdout_grouping (>= 246 -> "holdout", >= 197 -> "trending holdout", otherwise
# "exposed") on the whole column at once; a row-wise apply over the full frame
# builds a Series per row and is far slower. np.select takes the first matching
# condition, which mirrors the if/elif order.
engagement['marketing_holdout'] = np.select(
    [engagement['swid_holdout'] >= 246, engagement['swid_holdout'] >= 197],
    ["holdout", "trending holdout"],
    default="exposed",
)

In [135]:
# Discard rows whose entitlement flag is the literal 'unknown', then store the
# remaining flags as integers.
has_known_entitlement = engagement['is_entitled'] != 'unknown'
engagement = engagement.loc[has_known_entitlement].assign(
    is_entitled=lambda frame: frame['is_entitled'].astype(int)
)

In [136]:
# Split the extract into its two daily snapshots using the `ds` column.
snapshot_day = engagement['ds']
pre = engagement.loc[snapshot_day == date(2020, 1, 27)]
post = engagement.loc[snapshot_day == date(2020, 2, 2)]

In [137]:
# Columns that are not carried into the comparison. What remains after this drop
# is relabelled by position in the next cell, so the set dropped here determines
# which columns those labels land on.
columnsToRemove = [
    # identifiers, snapshot date and account attributes
    'swid', 'swid_holdout', 'ds', 'account_id', 'is_flagged',
    'first_account_subscription_signup_week', 'last_account_subscription_signup_week',
    'is_entitled_l1', 'is_entitled_l7', 'is_entitled_l28', 'is_entitled_itd',
    'is_pre_launch', 'is_pre_launch_nltt',
    'account_home_country', 'subscription_state_upd', 'subscription_type',
    # metrics with the _l1 / _l28 suffixes
    'total_login_days_l1', 'total_login_days_l28',
    'total_streams_l1', 'total_streams_l28',
    'num_streaming_profiles_l1', 'num_streaming_profiles_l28',
    'num_general_streaming_profiles_l1', 'num_general_streaming_profiles_l28',
    'num_kids_streaming_profiles_l1', 'num_kids_streaming_profiles_l28',
    'num_streaming_devices_l1', 'num_streaming_devices_l28',
    'account_total_stream_days_l1', 'account_total_stream_days_l28',
    'account_profile_total_stream_days_l1', 'account_profile_total_stream_days_l28',
    'total_stream_days_general_profiles_l1', 'total_stream_days_general_profiles_l28',
    'total_stream_days_kids_profiles_l1', 'total_stream_days_kids_profiles_l28',
    'total_stream_time_ms_l1', 'total_stream_time_ms_l28',
    'total_stream_time_general_profiles_ms_l1', 'total_stream_time_general_profiles_ms_l28',
    'total_stream_time_kids_profiles_ms_l1', 'total_stream_time_kids_profiles_ms_l28',
    'total_stream_time_web_ms_l1', 'total_stream_time_web_ms_l28',
    'total_stream_time_mobile_ms_l1', 'total_stream_time_mobile_ms_l28',
    'total_stream_time_connected_tv_ms_l1', 'total_stream_time_connected_tv_ms_l28',
    'total_stream_time_unknown_ms_l1', 'total_stream_time_unknown_ms_l28',
    # remaining columns dropped in full (including their _l7 / _itd variants)
    'last_stream_date',
    'account_profile_total_stream_days_l7', 'account_profile_total_stream_days_itd',
    'total_stream_time_unknown_ms_l7', 'total_stream_time_unknown_ms_itd',
]

preCleaned = pre.drop(columns=columnsToRemove)
postCleaned = post.drop(columns=columnsToRemove)

In [138]:
# Short labels assigned BY POSITION to the columns of preCleaned / postCleaned.
# The list needs exactly one entry per column, in the frames' current column
# order; pandas raises if the lengths differ but cannot detect a changed order.
# NOTE(review): a name-based mapping (DataFrame.rename with a dict) would be safer
# if the source table's column order can change - confirm and consider switching.
# NOTE(review): 'tl_stream_time_connected_tv_ms_itd' lacks the leading 't' its
# sibling labels have. The same spelling is used in binaryConstructionList and
# test_columns, so any correction has to be made in all three places together.
renamedColumns = ['first_signup_date','last_signup_date', 'is_entitled','ttl_login_days_l7', 'ttl_login_days_itd', 'ttl_streams_l7',
    'ttl_streams_itd', 'streaming_profiles_l7','streaming_profiles_itd', 'general_streaming_profiles_l7','general_streaming_profiles_itd', 
    'kids_streaming_profiles_l7','kids_streaming_profiles_itd', 'streaming_devices_l7','streaming_devices_itd', 'ttl_stream_days_l7',
    'ttl_stream_days_itd','ttl_stream_days_general_profiles_l7','ttl_stream_days_general_profiles_itd','ttl_stream_days_kids_profiles_l7',
    'ttl_stream_days_kids_profiles_itd', 'ttl_stream_time_ms_l7','ttl_stream_time_ms_itd', 'ttl_stream_time_general_profiles_ms_l7',
    'ttl_stream_time_general_profiles_ms_itd','ttl_stream_time_kids_profiles_ms_l7','ttl_stream_time_kids_profiles_ms_itd', 
    'ttl_stream_time_web_ms_l7','ttl_stream_time_web_ms_itd', 'ttl_stream_time_mobile_ms_l7','ttl_stream_time_mobile_ms_itd',
    'ttl_stream_time_connected_tv_ms_l7','tl_stream_time_connected_tv_ms_itd', 'days_since_last_stream','marketing_holdout'
]

# Overwrite the column labels of both snapshots with the short names.
preCleaned.columns = renamedColumns
postCleaned.columns = renamedColumns

In [139]:
# Metrics that get a companion 0/1 column (suffix `_bin`):
# the flag is 1 when the metric is greater than zero, else 0.
binaryConstructionList = [
    'ttl_login_days_l7', 'ttl_login_days_itd',
    'ttl_streams_l7', 'ttl_streams_itd',
    'general_streaming_profiles_l7', 'general_streaming_profiles_itd',
    'kids_streaming_profiles_l7', 'kids_streaming_profiles_itd',
    'ttl_stream_time_web_ms_l7', 'ttl_stream_time_web_ms_itd',
    'ttl_stream_time_mobile_ms_l7', 'ttl_stream_time_mobile_ms_itd',
    'ttl_stream_time_connected_tv_ms_l7', 'tl_stream_time_connected_tv_ms_itd',
    'days_since_last_stream',
]

In [140]:
# create binary variables _bin
# For every metric in binaryConstructionList add `<metric>_bin`, which is 1 where
# the metric is strictly positive and 0 otherwise (NaN compares as not positive,
# so it maps to 0). The comparison runs on whole columns; the earlier row-wise
# apply produced the same flags but built a Series per row for every metric.
for frame in (preCleaned, postCleaned):
    for metric in binaryConstructionList:
        frame[metric + '_bin'] = (frame[metric] > 0).astype(int)

In [141]:
# Per-group averages for the pre-period snapshot. numeric_only=True states
# explicitly that the non-numeric signup-date columns are left out of the table;
# recent pandas versions raise instead of skipping them when it is omitted.
preCleaned.groupby('marketing_holdout').mean(numeric_only=True)

Unnamed: 0_level_0,is_entitled,ttl_login_days_l7,ttl_login_days_itd,ttl_streams_l7,ttl_streams_itd,streaming_profiles_l7,streaming_profiles_itd,general_streaming_profiles_l7,general_streaming_profiles_itd,kids_streaming_profiles_l7,kids_streaming_profiles_itd,streaming_devices_l7,streaming_devices_itd,ttl_stream_days_l7,ttl_stream_days_itd,ttl_stream_days_general_profiles_l7,ttl_stream_days_general_profiles_itd,ttl_stream_days_kids_profiles_l7,ttl_stream_days_kids_profiles_itd,ttl_stream_time_ms_l7,ttl_stream_time_ms_itd,ttl_stream_time_general_profiles_ms_l7,ttl_stream_time_general_profiles_ms_itd,ttl_stream_time_kids_profiles_ms_l7,ttl_stream_time_kids_profiles_ms_itd,ttl_stream_time_web_ms_l7,ttl_stream_time_web_ms_itd,ttl_stream_time_mobile_ms_l7,ttl_stream_time_mobile_ms_itd,ttl_stream_time_connected_tv_ms_l7,tl_stream_time_connected_tv_ms_itd,days_since_last_stream,ttl_login_days_l7_bin,ttl_login_days_itd_bin,ttl_streams_l7_bin,ttl_streams_itd_bin,general_streaming_profiles_l7_bin,general_streaming_profiles_itd_bin,kids_streaming_profiles_l7_bin,kids_streaming_profiles_itd_bin,ttl_stream_time_web_ms_l7_bin,ttl_stream_time_web_ms_itd_bin,ttl_stream_time_mobile_ms_l7_bin,ttl_stream_time_mobile_ms_itd_bin,ttl_stream_time_connected_tv_ms_l7_bin,tl_stream_time_connected_tv_ms_itd_bin,days_since_last_stream_bin
marketing_holdout,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1
exposed,1.0,4.052727,41.568901,12.019507,148.566748,0.873262,1.774025,0.82547,1.684557,0.048209,0.100711,1.197484,3.855177,2.21208,26.979651,2.408623,30.815455,0.137585,1.37544,19163470.0,258405700.0,18264750.0,248991000.0,898727.980137,9414646.0,798708.579846,12257750.0,4169407.0,56043670.0,13927680.0,184807300.0,8.20691,0.820559,1.0,0.639158,0.971168,0.629989,0.970729,0.041658,0.07687,0.062876,0.266016,0.293131,0.682124,0.488523,0.822099,0.676172
trending holdout,1.0,4.050903,41.494255,12.085874,149.467341,0.875235,1.781146,0.826666,1.690615,0.048943,0.101432,1.198502,3.837103,2.224679,26.996362,2.415282,30.805714,0.141502,1.382068,19233960.0,259279600.0,18283550.0,249600400.0,950404.554257,9679222.0,777658.559799,12218700.0,4166556.0,55927440.0,14023820.0,185815000.0,8.19295,0.820332,0.999989,0.641094,0.971511,0.631326,0.970843,0.042325,0.077511,0.063119,0.263309,0.293645,0.681323,0.488951,0.820842,0.674671


In [142]:
# Per-group averages for the post-period snapshot. numeric_only=True states
# explicitly that the non-numeric signup-date columns are left out of the table;
# recent pandas versions raise instead of skipping them when it is omitted.
postCleaned.groupby('marketing_holdout').mean(numeric_only=True)

Unnamed: 0_level_0,is_entitled,ttl_login_days_l7,ttl_login_days_itd,ttl_streams_l7,ttl_streams_itd,streaming_profiles_l7,streaming_profiles_itd,general_streaming_profiles_l7,general_streaming_profiles_itd,kids_streaming_profiles_l7,kids_streaming_profiles_itd,streaming_devices_l7,streaming_devices_itd,ttl_stream_days_l7,ttl_stream_days_itd,ttl_stream_days_general_profiles_l7,ttl_stream_days_general_profiles_itd,ttl_stream_days_kids_profiles_l7,ttl_stream_days_kids_profiles_itd,ttl_stream_time_ms_l7,ttl_stream_time_ms_itd,ttl_stream_time_general_profiles_ms_l7,ttl_stream_time_general_profiles_ms_itd,ttl_stream_time_kids_profiles_ms_l7,ttl_stream_time_kids_profiles_ms_itd,ttl_stream_time_web_ms_l7,ttl_stream_time_web_ms_itd,ttl_stream_time_mobile_ms_l7,ttl_stream_time_mobile_ms_itd,ttl_stream_time_connected_tv_ms_l7,tl_stream_time_connected_tv_ms_itd,days_since_last_stream,ttl_login_days_l7_bin,ttl_login_days_itd_bin,ttl_streams_l7_bin,ttl_streams_itd_bin,general_streaming_profiles_l7_bin,general_streaming_profiles_itd_bin,kids_streaming_profiles_l7_bin,kids_streaming_profiles_itd_bin,ttl_stream_time_web_ms_l7_bin,ttl_stream_time_web_ms_itd_bin,ttl_stream_time_mobile_ms_l7_bin,ttl_stream_time_mobile_ms_itd_bin,ttl_stream_time_connected_tv_ms_l7_bin,tl_stream_time_connected_tv_ms_itd_bin,days_since_last_stream_bin
marketing_holdout,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1
exposed,0.981254,3.994616,44.995343,11.258667,158.270203,0.846044,1.792218,0.798654,1.700851,0.047816,0.103258,1.159068,3.933396,2.139027,28.824363,2.318349,32.814927,0.135358,1.49462,17909510.0,273907800.0,17046190.0,263730400.0,863327.104257,10177400.0,724963.652856,12884900.0,3884849.0,59388310.0,13049850.0,196123700.0,9.189447,0.803752,1.0,0.621287,0.97312,0.611827,0.972721,0.041372,0.078534,0.059056,0.270227,0.281928,0.688366,0.476096,0.826511,0.662197
trending holdout,0.981268,4.002901,44.926116,11.395213,159.291741,0.848934,1.799742,0.801589,1.707488,0.047821,0.103891,1.161288,3.915815,2.152517,28.852549,2.332128,32.817782,0.138023,1.50281,18079690.0,274936600.0,17190270.0,264474400.0,889419.93883,10462120.0,713062.287752,12835230.0,3867951.0,59264440.0,13239440.0,197293700.0,9.181887,0.80517,0.999989,0.622895,0.973721,0.613172,0.973075,0.041384,0.079142,0.058847,0.267582,0.283118,0.68751,0.477993,0.825329,0.664121


In [89]:
# One frame per (snapshot, marketing group).
# NOTE: despite the variable names, the *Trending frames hold the 'exposed' group
# and the *Holdout frames hold the 'trending holdout' group.
pre_group = preCleaned['marketing_holdout']
post_group = postCleaned['marketing_holdout']

preTrending = preCleaned.loc[pre_group.eq('exposed')]
preHoldout = preCleaned.loc[pre_group.eq('trending holdout')]
postTrending = postCleaned.loc[post_group.eq('exposed')]
postHoldout = postCleaned.loc[post_group.eq('trending holdout')]

In [90]:
# Columns compared between the two groups: the raw metrics first, then the
# derived `_bin` flags.
test_columns = [
    # raw metrics
    'is_entitled',
    'ttl_login_days_l7', 'ttl_login_days_itd',
    'ttl_streams_l7', 'ttl_streams_itd',
    'streaming_profiles_l7', 'streaming_profiles_itd',
    'general_streaming_profiles_l7', 'general_streaming_profiles_itd',
    'kids_streaming_profiles_l7', 'kids_streaming_profiles_itd',
    'streaming_devices_l7', 'streaming_devices_itd',
    'ttl_stream_days_l7', 'ttl_stream_days_itd',
    'ttl_stream_days_general_profiles_l7', 'ttl_stream_days_general_profiles_itd',
    'ttl_stream_days_kids_profiles_l7', 'ttl_stream_days_kids_profiles_itd',
    'ttl_stream_time_ms_l7', 'ttl_stream_time_ms_itd',
    'ttl_stream_time_general_profiles_ms_l7', 'ttl_stream_time_general_profiles_ms_itd',
    'ttl_stream_time_kids_profiles_ms_l7', 'ttl_stream_time_kids_profiles_ms_itd',
    'ttl_stream_time_web_ms_l7', 'ttl_stream_time_web_ms_itd',
    'ttl_stream_time_mobile_ms_l7', 'ttl_stream_time_mobile_ms_itd',
    'ttl_stream_time_connected_tv_ms_l7', 'tl_stream_time_connected_tv_ms_itd',
    'days_since_last_stream',
    # derived 0/1 flags
    'ttl_login_days_l7_bin', 'ttl_login_days_itd_bin',
    'ttl_streams_l7_bin', 'ttl_streams_itd_bin',
    'general_streaming_profiles_l7_bin', 'general_streaming_profiles_itd_bin',
    'kids_streaming_profiles_l7_bin', 'kids_streaming_profiles_itd_bin',
    'ttl_stream_time_web_ms_l7_bin', 'ttl_stream_time_web_ms_itd_bin',
    'ttl_stream_time_mobile_ms_l7_bin', 'ttl_stream_time_mobile_ms_itd_bin',
    'ttl_stream_time_connected_tv_ms_l7_bin', 'tl_stream_time_connected_tv_ms_itd_bin',
    'days_since_last_stream_bin',
]

In [91]:
def run_p_test(exposed,holdout,test_columns):
    """Compare two groups column by column with a two-sample t-test.

    Parameters
    ----------
    exposed, holdout : pandas.DataFrame
        The two groups; both must contain every column in `test_columns`.
    test_columns : iterable of str
        Columns to compare. An empty iterable yields an empty result.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name (index name 'name') with the columns
        'p' (p-value from scipy's `ttest_ind` with its default settings),
        'diff' (percentage difference of the exposed mean relative to the
        holdout mean), 'exposed_mean' and 'holdout_mean'.

        Placeholders: if the test or the means raise TypeError (or a
        RuntimeWarning promoted to an error), 'p' is "Broke!" and the other
        three values are "Undefined". If the holdout mean is zero the
        relative difference does not exist, so 'diff' is "Undefined" while
        the p-value and both means are still reported.
    """
    result_columns = ['name', 'p', 'diff', 'exposed_mean', 'holdout_mean']
    records = []

    for column in test_columns:
        try:
            p_value = stats.ttest_ind(exposed[column], holdout[column])[1]
            exposed_mean = exposed[column].mean()
            holdout_mean = holdout[column].mean()
            # Dividing by a zero numpy float yields inf/nan rather than raising,
            # so the zero case is checked explicitly.
            if holdout_mean != 0:
                diff = (exposed_mean / holdout_mean - 1) * 100
            else:
                diff = "Undefined"
        except (TypeError, RuntimeWarning):
            # Reset every field so a failed column never reports values that
            # were computed for the previous column.
            p_value = "Broke!"
            diff = exposed_mean = holdout_mean = "Undefined"

        records.append({
            'name': column,
            'p': p_value,
            'diff': diff,
            'exposed_mean': exposed_mean,
            'holdout_mean': holdout_mean,
        })

    # Build the frame once; growing a DataFrame row by row re-copies it on every
    # iteration, and DataFrame.append no longer exists in pandas 2.x.
    p_values = pd.DataFrame(records, columns=result_columns)
    return p_values.set_index('name')


In [97]:
# Post-period comparison; postTrending holds the 'exposed' rows and postHoldout
# the 'trending holdout' rows.
p_values1 = run_p_test(
    exposed=postTrending,
    holdout=postHoldout,
    test_columns=test_columns,
)

In [98]:
# Show the post-period comparison table (one row per tested column).
p_values1

Unnamed: 0_level_0,p,diff,exposed_mean,holdout_mean
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
is_entitled,0.481829,-0.294974,0.7901958,0.7925336
ttl_login_days_l7,0.097592,-1.166632,3.383234,3.423169
ttl_login_days_itd,0.039276,-1.109683,40.56307,41.01824
ttl_streams_l7,0.4028,-1.513071,9.056352,9.195486
ttl_streams_itd,0.380873,-1.030026,137.7003,139.1334
streaming_profiles_l7,0.627032,-0.497277,0.686679,0.6901107
streaming_profiles_itd,0.704971,-0.253327,1.675159,1.679414
general_streaming_profiles_l7,0.854581,-0.185998,0.6494823,0.6506926
general_streaming_profiles_itd,0.781289,-0.180749,1.598592,1.601487
kids_streaming_profiles_l7,0.223395,-5.497069,0.03755452,0.039739
