In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from pandas.io import sql
import snowflake.connector
import keyring
import psycopg2 
import time
from datetime import date, timedelta
from scipy import stats

pd.set_option('display.max_colwidth', 50)
pd.set_option('display.max_columns', 500)

from matplotlib import pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
sns.set_style("darkgrid")

In [2]:
snowflake_username = 'matthew.bessey@disneystreaming.com'

In [3]:
ctx = snowflake.connector.connect(authenticator='externalbrowser', 
                                  user=snowflake_username, 
                                  account='disneystreaming.us-east-1')

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...


In [5]:
# set date parameters for query

subscription_start_date_min = "'2019-11-12'"
subscription_start_date_max = "'2019-11-19'" # max of subscription start date
engagement_date = "'2020-01-20'" # date for which we want to pull engagement behaviors

In [45]:
query= """
select o.swid
, o.swid_holdout
, a.accountid
, s.subscription_id
, e.*
from subscription s
join account a on s.account_id = a.accountid
join oneid_combined o on a.swid = o.swid
join "DSS_PROD"."DISNEY_PLUS"."DIM_DISNEY_DAILY_ACCOUNT_ENGAGEMENT" e on a.accountid = e.account_id
where s.partner = 'disney'
and s.calculated_subscription_start_dtm >= {}
and s.CALCULATED_SUBSCRIPTION_START_DTM <= {}
--and s.is_entitled = 1
and e.ds = {}
and e.is_pre_launch != 1
limit 1000000;
""".format(subscription_start_date_min, subscription_start_date_max, engagement_date)`

In [46]:
# run the query and write to engagement
engagement = pd.read_sql(query,ctx)

In [47]:
# map columns to lowercase
engagement.columns = engagement.columns.str.lower()

In [48]:
# create function and apply for mapping of holdout groups on 'swid_holdout'
def holdout_grouping(df):
    if df['swid_holdout'] < 243:
        return "all marketing"
    elif df['swid_holdout'] >= 243 and df['swid_holdout'] < 246:
        return "no onboarding"
    else:
        return "no marketing"
    
engagement['marketing_holdout'] = engagement.apply(holdout_grouping,axis=1)

In [49]:
#drop rows w/ no entitlement data, rewrite as int
engagement = engagement.drop(engagement[engagement['is_entitled']=='unknown'].index,axis=0)
engagement.is_entitled = engagement.is_entitled.astype(int)

In [50]:
# remove non-US countries and then country column
countryUS_filter = engagement['account_home_country'] == 'US'
engagementCleaned = engagement[countryUS_filter]
engagementCleaned = engagementCleaned.drop('account_home_country',axis=1)

In [51]:
# remove unnecessary columns
columnsToRemove = [        
    'swid',
    'swid_holdout',
    'accountid',
    'subscription_id',
    'ds',
    'account_id',
    'is_flagged',
    'first_account_subscription_signup_week',
    'last_account_subscription_signup_week',
    'is_entitled_l1',
    'is_entitled_l7',
    'is_entitled_l28',
    'is_entitled_itd',
    'is_pre_launch',
    'is_pre_launch_nltt',
    'account_home_country',
    'subscription_state_upd',
    'subscription_type',
    'total_login_days_l1',
    'total_login_days_l28',
    'total_streams_l1',
    'total_streams_l28',
    'num_streaming_profiles_l1',
    'num_streaming_profiles_l28',
    'num_general_streaming_profiles_l1',
    'num_general_streaming_profiles_l28',
    'num_kids_streaming_profiles_l1',
    'num_kids_streaming_profiles_l28',
    'num_streaming_devices_l1',
    'num_streaming_devices_l28',
    'account_total_stream_days_l1',
    'account_total_stream_days_l28',
    'account_profile_total_stream_days_l1',
    'account_profile_total_stream_days_l28',
    'total_stream_days_general_profiles_l1',
    'total_stream_days_general_profiles_l28',
    'total_stream_days_kids_profiles_l1',
    'total_stream_days_kids_profiles_l28',
    'total_stream_time_ms_l1',
    'total_stream_time_ms_l28',
    'total_stream_time_general_profiles_ms_l1',
    'total_stream_time_general_profiles_ms_l28',
    'total_stream_time_kids_profiles_ms_l1',
    'total_stream_time_kids_profiles_ms_l28',
    'total_stream_time_web_ms_l1',
    'total_stream_time_web_ms_l28',
    'total_stream_time_mobile_ms_l1',
    'total_stream_time_mobile_ms_l28',
    'total_stream_time_connected_tv_ms_l1',
    'total_stream_time_connected_tv_ms_l28',
    'total_stream_time_unknown_ms_l1',
    'total_stream_time_unknown_ms_l28',
    'last_stream_date',
    'account_profile_total_stream_days_l7',
    'account_profile_total_stream_days_itd',
    'total_stream_time_unknown_ms_l7',
    'total_stream_time_unknown_ms_itd',
]

engagementCleaned = engagement.drop(columnsToRemove,axis=1)

In [52]:
renamedColumns = [
        'first_signup_date',
    'last_signup_date', 
    'is_entitled',
    'ttl_login_days_l7', 
    'ttl_login_days_itd', 
    'ttl_streams_l7',
    'ttl_streams_itd', 
    'streaming_profiles_l7',
    'streaming_profiles_itd', 
    'general_streaming_profiles_l7',
    'general_streaming_profiles_itd', 
    'kids_streaming_profiles_l7',
    'kids_streaming_profiles_itd', 
    'streaming_devices_l7',
    'streaming_devices_itd', 
    'ttl_stream_days_l7',
    'ttl_stream_days_itd',
    'ttl_stream_days_general_profiles_l7',
    'ttl_stream_days_general_profiles_itd',
    'ttl_stream_days_kids_profiles_l7',
    'ttl_stream_days_kids_profiles_itd', 
    'ttl_stream_time_ms_l7',
    'ttl_stream_time_ms_itd', 
    'ttl_stream_time_general_profiles_ms_l7',
    'ttl_stream_time_general_profiles_ms_itd',
    'ttl_stream_time_kids_profiles_ms_l7',
    'ttl_stream_time_kids_profiles_ms_itd', 
    'ttl_stream_time_web_ms_l7',
    'ttl_stream_time_web_ms_itd', 
    'ttl_stream_time_mobile_ms_l7',
    'ttl_stream_time_mobile_ms_itd',
    'ttl_stream_time_connected_tv_ms_l7',
    'tl_stream_time_connected_tv_ms_itd', 
    'days_since_last_stream',
    'marketing_holdout'
]

engagementCleaned.columns = renamedColumns

In [53]:
# rename for conciseness
engmt = engagementCleaned
del engagementCleaned
del engagement

In [54]:
# define columns for binary construction
# binary = 1 if action occurred in interval, else 0

binaryConstructionList = [
    'ttl_login_days_l7',
    'ttl_login_days_itd',
    'ttl_streams_l7',
    'ttl_streams_itd',
    'general_streaming_profiles_l7', 'general_streaming_profiles_itd',
    'kids_streaming_profiles_l7', 'kids_streaming_profiles_itd',
    'ttl_stream_time_web_ms_l7',
    'ttl_stream_time_web_ms_itd', 
    'ttl_stream_time_mobile_ms_l7',
    'ttl_stream_time_mobile_ms_itd', 
    'ttl_stream_time_connected_tv_ms_l7',
    'tl_stream_time_connected_tv_ms_itd', 
    'days_since_last_stream'
]

In [55]:
# create binary variables _bin
for i in binaryConstructionList:
    engmt[i + '_bin'] = engmt.apply(lambda df:
                                   1 if df[i] > 0
                                   else 0,
                                   axis=1)

In [56]:
engmt.groupby('marketing_holdout').mean()

Unnamed: 0_level_0,is_entitled,ttl_login_days_l7,ttl_login_days_itd,ttl_streams_l7,ttl_streams_itd,streaming_profiles_l7,streaming_profiles_itd,general_streaming_profiles_l7,general_streaming_profiles_itd,kids_streaming_profiles_l7,kids_streaming_profiles_itd,streaming_devices_l7,streaming_devices_itd,ttl_stream_days_l7,ttl_stream_days_itd,ttl_stream_days_general_profiles_l7,ttl_stream_days_general_profiles_itd,ttl_stream_days_kids_profiles_l7,ttl_stream_days_kids_profiles_itd,ttl_stream_time_ms_l7,ttl_stream_time_ms_itd,ttl_stream_time_general_profiles_ms_l7,ttl_stream_time_general_profiles_ms_itd,ttl_stream_time_kids_profiles_ms_l7,ttl_stream_time_kids_profiles_ms_itd,ttl_stream_time_web_ms_l7,ttl_stream_time_web_ms_itd,ttl_stream_time_mobile_ms_l7,ttl_stream_time_mobile_ms_itd,ttl_stream_time_connected_tv_ms_l7,tl_stream_time_connected_tv_ms_itd,days_since_last_stream,ttl_login_days_l7_bin,ttl_login_days_itd_bin,ttl_streams_l7_bin,ttl_streams_itd_bin,general_streaming_profiles_l7_bin,general_streaming_profiles_itd_bin,kids_streaming_profiles_l7_bin,kids_streaming_profiles_itd_bin,ttl_stream_time_web_ms_l7_bin,ttl_stream_time_web_ms_itd_bin,ttl_stream_time_mobile_ms_l7_bin,ttl_stream_time_mobile_ms_itd_bin,ttl_stream_time_connected_tv_ms_l7_bin,tl_stream_time_connected_tv_ms_itd_bin,days_since_last_stream_bin
marketing_holdout,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1
all marketing,0.841875,3.934847,44.40716,12.055239,167.298378,0.881907,1.92674,0.825926,1.82016,0.056464,0.119438,1.231052,4.310684,2.162608,29.425761,2.398445,34.297271,0.163773,1.69731,19598560.0,292059000.0,18484870.0,280284700.0,1113686.0,11774310.0,0.0,0.0,0.0,0.0,0.0,0.0,12.905421,0.769587,1.0,0.600434,0.981437,0.591179,0.98116,0.048876,0.091367,0.0,0.0,0.0,0.0,0.0,0.0,0.666117
no marketing,0.839362,3.905069,44.081291,12.010605,166.123428,0.882329,1.924699,0.82606,1.818307,0.056709,0.11972,1.2251,4.291274,2.139064,29.211591,2.385738,34.110748,0.158382,1.667652,19450980.0,289892100.0,18371050.0,278067400.0,1079930.0,11824680.0,0.0,0.0,0.0,0.0,0.0,0.0,13.078399,0.764839,1.0,0.598029,0.980993,0.588513,0.98063,0.048204,0.089848,0.0,0.0,0.0,0.0,0.0,0.0,0.668378
no onboarding,0.836891,3.905578,44.094853,11.796437,166.200207,0.880875,1.914873,0.824583,1.80952,0.056894,0.116888,1.232828,4.304269,2.149768,29.240403,2.391548,34.178516,0.154588,1.618093,19405480.0,290352000.0,18422520.0,279818200.0,982956.2,10533710.0,0.0,0.0,0.0,0.0,0.0,0.0,12.818621,0.763298,1.0,0.599415,0.979945,0.591582,0.979687,0.048115,0.087967,0.0,0.0,0.0,0.0,0.0,0.0,0.663625


In [57]:
(engmt.groupby('marketing_holdout').mean().iloc[0,]
 /engmt.groupby('marketing_holdout').mean().iloc[1,]-1)*100

is_entitled                                0.299391
ttl_login_days_l7                          0.762536
ttl_login_days_itd                         0.739246
ttl_streams_l7                             0.371621
ttl_streams_itd                            0.707276
streaming_profiles_l7                     -0.047787
streaming_profiles_itd                     0.106048
general_streaming_profiles_l7             -0.016211
general_streaming_profiles_itd             0.101933
kids_streaming_profiles_l7                -0.432153
kids_streaming_profiles_itd               -0.235370
streaming_devices_l7                       0.485832
streaming_devices_itd                      0.452306
ttl_stream_days_l7                         1.100661
ttl_stream_days_itd                        0.733167
ttl_stream_days_general_profiles_l7        0.532622
ttl_stream_days_general_profiles_itd       0.546817
ttl_stream_days_kids_profiles_l7           3.403649
ttl_stream_days_kids_profiles_itd          1.778451
ttl_stream_t

In [19]:
engmt.head()

Unnamed: 0,first_signup_date,last_signup_date,is_entitled,ttl_login_days_l7,ttl_login_days_itd,ttl_streams_l7,ttl_streams_itd,streaming_profiles_l7,streaming_profiles_itd,general_streaming_profiles_l7,general_streaming_profiles_itd,kids_streaming_profiles_l7,kids_streaming_profiles_itd,streaming_devices_l7,streaming_devices_itd,ttl_stream_days_l7,ttl_stream_days_itd,ttl_stream_days_general_profiles_l7,ttl_stream_days_general_profiles_itd,ttl_stream_days_kids_profiles_l7,ttl_stream_days_kids_profiles_itd,ttl_stream_time_ms_l7,ttl_stream_time_ms_itd,ttl_stream_time_general_profiles_ms_l7,ttl_stream_time_general_profiles_ms_itd,ttl_stream_time_kids_profiles_ms_l7,ttl_stream_time_kids_profiles_ms_itd,ttl_stream_time_web_ms_l7,ttl_stream_time_web_ms_itd,ttl_stream_time_mobile_ms_l7,ttl_stream_time_mobile_ms_itd,ttl_stream_time_connected_tv_ms_l7,tl_stream_time_connected_tv_ms_itd,days_since_last_stream,marketing_holdout,ttl_login_days_l7_bin,ttl_login_days_itd_bin,ttl_streams_l7_bin,ttl_streams_itd_bin,general_streaming_profiles_l7_bin,general_streaming_profiles_itd_bin,kids_streaming_profiles_l7_bin,kids_streaming_profiles_itd_bin,ttl_stream_time_web_ms_l7_bin,ttl_stream_time_web_ms_itd_bin,ttl_stream_time_mobile_ms_l7_bin,ttl_stream_time_mobile_ms_itd_bin,ttl_stream_time_connected_tv_ms_l7_bin,tl_stream_time_connected_tv_ms_itd_bin,days_since_last_stream_bin
0,2019-11-13,2019-11-13,1,2,32,2,53,1,1,1,1,0,0,1,3,2,30,2,30,0,0,4822861,134781569,4822861,134781569,0,0,0,0,0,0,0,0,1.0,all marketing,1,1,1,1,1,1,0,0,0,0,0,0,0,0,1
1,2019-11-13,2019-11-13,1,7,51,95,635,1,1,1,1,0,0,1,2,7,46,7,46,0,0,132259783,832465844,132259783,832465844,0,0,0,0,0,0,0,0,0.0,all marketing,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0
2,2019-11-12,2019-11-12,1,0,45,0,173,0,1,0,1,0,0,0,4,0,40,0,40,0,0,0,418585135,0,418585135,0,0,0,0,0,0,0,0,9.0,all marketing,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1
3,2019-11-12,2019-11-12,0,0,36,0,293,0,1,0,1,0,0,0,2,0,31,0,31,0,0,0,414577979,0,414577979,0,0,0,0,0,0,0,0,31.0,all marketing,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1
4,2019-11-14,2019-12-21,1,0,37,0,53,0,1,0,1,0,0,0,3,0,26,0,26,0,0,0,139515348,0,139515348,0,0,0,0,0,0,0,0,24.0,all marketing,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1


In [58]:
marketing_slice = engmt[engmt['marketing_holdout'] == 'all marketing']
holdout_slice = engmt[engmt['marketing_holdout'] == 'no marketing']

In [59]:
test_columns = [
    'is_entitled',
    'ttl_login_days_l7', 'ttl_login_days_itd', 'ttl_streams_l7',
       'ttl_streams_itd', 'streaming_profiles_l7', 'streaming_profiles_itd',
       'general_streaming_profiles_l7', 'general_streaming_profiles_itd',
       'kids_streaming_profiles_l7', 'kids_streaming_profiles_itd',
       'streaming_devices_l7', 'streaming_devices_itd', 'ttl_stream_days_l7',
       'ttl_stream_days_itd', 'ttl_stream_days_general_profiles_l7',
       'ttl_stream_days_general_profiles_itd',
       'ttl_stream_days_kids_profiles_l7', 'ttl_stream_days_kids_profiles_itd',
       'ttl_stream_time_ms_l7', 'ttl_stream_time_ms_itd',
       'ttl_stream_time_general_profiles_ms_l7',
       'ttl_stream_time_general_profiles_ms_itd',
       'ttl_stream_time_kids_profiles_ms_l7',
       'ttl_stream_time_kids_profiles_ms_itd', 'ttl_stream_time_web_ms_l7',
       'ttl_stream_time_web_ms_itd', 'ttl_stream_time_mobile_ms_l7',
       'ttl_stream_time_mobile_ms_itd', 'ttl_stream_time_connected_tv_ms_l7',
       'tl_stream_time_connected_tv_ms_itd', 'days_since_last_stream',
       'ttl_login_days_l7_bin', 'ttl_login_days_itd_bin',
       'ttl_streams_l7_bin', 'ttl_streams_itd_bin',
       'general_streaming_profiles_l7_bin',
       'general_streaming_profiles_itd_bin', 'kids_streaming_profiles_l7_bin',
       'kids_streaming_profiles_itd_bin', 'ttl_stream_time_web_ms_l7_bin',
       'ttl_stream_time_web_ms_itd_bin', 'ttl_stream_time_mobile_ms_l7_bin',
       'ttl_stream_time_mobile_ms_itd_bin',
       'ttl_stream_time_connected_tv_ms_l7_bin',
       'tl_stream_time_connected_tv_ms_itd_bin', 'days_since_last_stream_bin'
]

In [60]:
p_values = {}

for i in test_columns:
    try: 
        p1 = stats.ttest_ind(marketing_slice[i], holdout_slice[i])[1]
        descr1 = marketing_slice[i]
    except (TypeError,RuntimeWarning): 
        p1 = "Broke!"
    p_values[i] = p1
    print("Completed",i)


Completed is_entitled
Completed ttl_login_days_l7
Completed ttl_login_days_itd
Completed ttl_streams_l7
Completed ttl_streams_itd
Completed streaming_profiles_l7
Completed streaming_profiles_itd
Completed general_streaming_profiles_l7
Completed general_streaming_profiles_itd
Completed kids_streaming_profiles_l7
Completed kids_streaming_profiles_itd
Completed streaming_devices_l7
Completed streaming_devices_itd
Completed ttl_stream_days_l7
Completed ttl_stream_days_itd
Completed ttl_stream_days_general_profiles_l7
Completed ttl_stream_days_general_profiles_itd
Completed ttl_stream_days_kids_profiles_l7
Completed ttl_stream_days_kids_profiles_itd
Completed ttl_stream_time_ms_l7
Completed ttl_stream_time_ms_itd
Completed ttl_stream_time_general_profiles_ms_l7
Completed ttl_stream_time_general_profiles_ms_itd
Completed ttl_stream_time_kids_profiles_ms_l7
Completed ttl_stream_time_kids_profiles_ms_itd
Completed ttl_stream_time_web_ms_l7
Completed ttl_stream_time_web_ms_itd
Completed ttl_str

In [61]:
p_values

{'is_entitled': 0.1849802367507216,
 'ttl_login_days_l7': 0.04665624317807763,
 'ttl_login_days_itd': 0.0038449202277494963,
 'ttl_streams_l7': 0.7171487965974768,
 'ttl_streams_itd': 0.2698818484277066,
 'streaming_profiles_l7': 0.9332977884944804,
 'streaming_profiles_itd': 0.7948405694755132,
 'general_streaming_profiles_l7': 0.9771744277758684,
 'general_streaming_profiles_itd': 0.7989146915543986,
 'kids_streaming_profiles_l7': 0.8590511173425637,
 'kids_streaming_profiles_itd': 0.8979909070525216,
 'streaming_devices_l7': 0.42870778369947105,
 'streaming_devices_itd': 0.27961712137004185,
 'ttl_stream_days_l7': 0.05833017425080381,
 'ttl_stream_days_itd': 0.030300735467008212,
 'ttl_stream_days_general_profiles_l7': 0.4277799436835724,
 'ttl_stream_days_general_profiles_itd': 0.20466223601862868,
 'ttl_stream_days_kids_profiles_l7': 0.2439570636867984,
 'ttl_stream_days_kids_profiles_itd': 0.4582676193619514,
 'ttl_stream_time_ms_l7': 0.4103573216552181,
 'ttl_stream_time_ms_itd'