In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from pandas.io import sql
import snowflake.connector
import keyring
import psycopg2 
import time
from datetime import date, timedelta
from scipy import stats

pd.set_option('display.max_colwidth', 50)
pd.set_option('display.max_columns', 500)

from matplotlib import pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
sns.set_style("darkgrid")

In [2]:
snowflake_username = 'matthew.bessey@disneystreaming.com'

In [3]:
ctx = snowflake.connector.connect(authenticator='externalbrowser', 
                                  user=snowflake_username, 
                                  account='disneystreaming.us-east-1')

Initiating login request with your identity provider. A browser window should have opened for you to complete the login. If you can't see it, check existing browser windows, or your OS settings. Press CTRL+C to abort and try again...


In [4]:
# set date parameters for query

subscription_start_date_min = "'2019-11-12'"
subscription_start_date_max = "'2019-11-13'" # max of subscription start date
engagement_date = "'2019-12-19'" # date for which we want to pull engagement behaviors

In [5]:
query= """
select o.swid
, o.swid_holdout
, a.accountid
, s.subscription_id
, e.*
from subscription s
join account a on s.account_id = a.accountid
join oneid_combined o on a.swid = o.swid
join "DSS_PROD"."DISNEY_PLUS"."DIM_DISNEY_DAILY_ACCOUNT_ENGAGEMENT" e on a.accountid = e.account_id
where s.partner = 'disney'
and s.calculated_subscription_start_dtm >= {}
and s.CALCULATED_SUBSCRIPTION_START_DTM <= {}
--and s.is_entitled = 1
and e.ds = {}
and e.is_pre_launch != 1
limit 100000;
""".format(subscription_start_date_min, subscription_start_date_max, engagement_date)

In [6]:
# run the query and write to engagement
engagement = pd.read_sql(query,ctx)

In [7]:
# map columns to lowercase
engagement.columns = engagement.columns.str.lower()

In [8]:
# create function and apply for mapping of holdout groups on 'swid_holdout'
def holdout_grouping(df):
    if df['swid_holdout'] < 243:
        return "all marketing"
    elif df['swid_holdout'] >= 243 and df['swid_holdout'] < 246:
        return "no onboarding"
    else:
        return "no marketing"
    
engagement['marketing_holdout'] = engagement.apply(holdout_grouping,axis=1)

In [9]:
#drop rows w/ no entitlement data, rewrite as int
engagement = engagement.drop(engagement[engagement['is_entitled']=='unknown'].index,axis=0)
engagement.is_entitled = engagement.is_entitled.astype(int)

In [11]:
# remove non-US countries and then country column
countryUS_filter = engagement['account_home_country'] == 'US'
engagementCleaned = engagement[countryUS_filter]
engagementCleaned = engagementCleaned.drop('account_home_country',axis=1)

In [12]:
# remove unnecessary columns
columnsToRemove = [        
    'swid',
    'swid_holdout',
    'accountid',
    'subscription_id',
    'ds',
    'account_id',
    'is_flagged',
    'first_account_subscription_signup_week',
    'last_account_subscription_signup_week',
    'is_entitled_l1',
    'is_entitled_l7',
    'is_entitled_l28',
    'is_entitled_itd',
    'is_pre_launch',
    'is_pre_launch_nltt',
    'account_home_country',
    'subscription_state_upd',
    'subscription_type',
    'total_login_days_l1',
    'total_login_days_l28',
    'total_streams_l1',
    'total_streams_l28',
    'num_streaming_profiles_l1',
    'num_streaming_profiles_l28',
    'num_general_streaming_profiles_l1',
    'num_general_streaming_profiles_l28',
    'num_kids_streaming_profiles_l1',
    'num_kids_streaming_profiles_l28',
    'num_streaming_devices_l1',
    'num_streaming_devices_l28',
    'account_total_stream_days_l1',
    'account_total_stream_days_l28',
    'account_profile_total_stream_days_l1',
    'account_profile_total_stream_days_l28',
    'total_stream_days_general_profiles_l1',
    'total_stream_days_general_profiles_l28',
    'total_stream_days_kids_profiles_l1',
    'total_stream_days_kids_profiles_l28',
    'total_stream_time_ms_l1',
    'total_stream_time_ms_l28',
    'total_stream_time_general_profiles_ms_l1',
    'total_stream_time_general_profiles_ms_l28',
    'total_stream_time_kids_profiles_ms_l1',
    'total_stream_time_kids_profiles_ms_l28',
    'total_stream_time_web_ms_l1',
    'total_stream_time_web_ms_l28',
    'total_stream_time_mobile_ms_l1',
    'total_stream_time_mobile_ms_l28',
    'total_stream_time_connected_tv_ms_l1',
    'total_stream_time_connected_tv_ms_l28',
    'total_stream_time_unknown_ms_l1',
    'total_stream_time_unknown_ms_l28',
    'last_stream_date',
    'account_profile_total_stream_days_l7',
    'account_profile_total_stream_days_itd',
    'total_stream_time_unknown_ms_l7',
    'total_stream_time_unknown_ms_itd',
]

engagementCleaned = engagement.drop(columnsToRemove,axis=1)

In [13]:
renamedColumns = [
        'first_signup_date',
    'last_signup_date', 
    'is_entitled',
    'ttl_login_days_l7', 
    'ttl_login_days_itd', 
    'ttl_streams_l7',
    'ttl_streams_itd', 
    'streaming_profiles_l7',
    'streaming_profiles_itd', 
    'general_streaming_profiles_l7',
    'general_streaming_profiles_itd', 
    'kids_streaming_profiles_l7',
    'kids_streaming_profiles_itd', 
    'streaming_devices_l7',
    'streaming_devices_itd', 
    'ttl_stream_days_l7',
    'ttl_stream_days_itd',
    'ttl_stream_days_general_profiles_l7',
    'ttl_stream_days_general_profiles_itd',
    'ttl_stream_days_kids_profiles_l7',
    'ttl_stream_days_kids_profiles_itd', 
    'ttl_stream_time_ms_l7',
    'ttl_stream_time_ms_itd', 
    'ttl_stream_time_general_profiles_ms_l7',
    'ttl_stream_time_general_profiles_ms_itd',
    'ttl_stream_time_kids_profiles_ms_l7',
    'ttl_stream_time_kids_profiles_ms_itd', 
    'ttl_stream_time_web_ms_l7',
    'ttl_stream_time_web_ms_itd', 
    'ttl_stream_time_mobile_ms_l7',
    'ttl_stream_time_mobile_ms_itd',
    'ttl_stream_time_connected_tv_ms_l7',
    'tl_stream_time_connected_tv_ms_itd', 
    'days_since_last_stream',
    'marketing_holdout'
]

engagementCleaned.columns = renamedColumns

In [14]:
# rename for conciseness
engmt = engagementCleaned
del engagementCleaned
del engagement

In [16]:
# define columns for binary construction
# binary = 1 if action occurred in interval, else 0

binaryConstructionList = [
    'ttl_login_days_l7',
    'ttl_login_days_itd',
    'ttl_streams_l7',
    'ttl_streams_itd',
    'general_streaming_profiles_l7', 'general_streaming_profiles_itd',
    'kids_streaming_profiles_l7', 'kids_streaming_profiles_itd',
    'ttl_stream_time_web_ms_l7',
    'ttl_stream_time_web_ms_itd', 
    'ttl_stream_time_mobile_ms_l7',
    'ttl_stream_time_mobile_ms_itd', 
    'ttl_stream_time_connected_tv_ms_l7',
    'tl_stream_time_connected_tv_ms_itd', 
    'days_since_last_stream'
]

In [17]:
# create binary variables _bin
for i in binaryConstructionList:
    engmt[i + '_bin'] = engmt.apply(lambda df:
                                   1 if df[i] > 0
                                   else 0,
                                   axis=1)

In [18]:
engmt.groupby('marketing_holdout').mean()

Unnamed: 0_level_0,is_entitled,ttl_login_days_l7,ttl_login_days_itd,ttl_streams_l7,ttl_streams_itd,streaming_profiles_l7,streaming_profiles_itd,general_streaming_profiles_l7,general_streaming_profiles_itd,kids_streaming_profiles_l7,kids_streaming_profiles_itd,streaming_devices_l7,streaming_devices_itd,ttl_stream_days_l7,ttl_stream_days_itd,ttl_stream_days_general_profiles_l7,ttl_stream_days_general_profiles_itd,ttl_stream_days_kids_profiles_l7,ttl_stream_days_kids_profiles_itd,ttl_stream_time_ms_l7,ttl_stream_time_ms_itd,ttl_stream_time_general_profiles_ms_l7,ttl_stream_time_general_profiles_ms_itd,ttl_stream_time_kids_profiles_ms_l7,ttl_stream_time_kids_profiles_ms_itd,ttl_stream_time_web_ms_l7,ttl_stream_time_web_ms_itd,ttl_stream_time_mobile_ms_l7,ttl_stream_time_mobile_ms_itd,ttl_stream_time_connected_tv_ms_l7,tl_stream_time_connected_tv_ms_itd,days_since_last_stream,ttl_login_days_l7_bin,ttl_login_days_itd_bin,ttl_streams_l7_bin,ttl_streams_itd_bin,general_streaming_profiles_l7_bin,general_streaming_profiles_itd_bin,kids_streaming_profiles_l7_bin,kids_streaming_profiles_itd_bin,ttl_stream_time_web_ms_l7_bin,ttl_stream_time_web_ms_itd_bin,ttl_stream_time_mobile_ms_l7_bin,ttl_stream_time_mobile_ms_itd_bin,ttl_stream_time_connected_tv_ms_l7_bin,tl_stream_time_connected_tv_ms_itd_bin,days_since_last_stream_bin
marketing_holdout,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1
all marketing,0.902305,4.939052,28.757787,18.321513,128.595381,1.326457,1.947358,1.24858,1.823036,0.07851,0.134936,1.969399,4.119188,3.404008,21.119652,4.013798,25.347244,0.236573,1.426587,31566570.0,226341600.0,29922960.0,216148400.0,1643607.0,10193200.0,0.0,0.0,0.0,0.0,0.0,0.0,4.770257,0.89168,1.0,0.816554,0.98403,0.810556,0.983788,0.067347,0.104884,0.0,0.0,0.0,0.0,0.0,0.0,0.519496
no marketing,0.910774,4.982515,28.90666,18.204937,129.015428,1.321677,1.923374,1.241707,1.799691,0.080483,0.136282,1.993057,4.142453,3.470044,21.338133,4.038313,25.467987,0.235279,1.401903,32455070.0,229582600.0,30761550.0,219304100.0,1693529.0,10278460.0,0.0,0.0,0.0,0.0,0.0,0.0,4.492553,0.903831,1.0,0.830805,0.984058,0.824119,0.9838,0.068141,0.108254,0.0,0.0,0.0,0.0,0.0,0.0,0.514271
no onboarding,0.920034,4.983165,28.919192,18.110269,127.676768,1.338384,1.97138,1.260943,1.853535,0.079125,0.132997,1.967172,4.173401,3.398148,21.09596,3.958754,25.349327,0.254209,1.445286,31629720.0,225849000.0,29625740.0,215119100.0,2003979.0,10729920.0,0.0,0.0,0.0,0.0,0.0,0.0,4.620394,0.89899,1.0,0.819024,0.982323,0.813973,0.982323,0.06734,0.107744,0.0,0.0,0.0,0.0,0.0,0.0,0.505892


In [19]:
(engmt.groupby('marketing_holdout').mean().iloc[0,]
 /engmt.groupby('marketing_holdout').mean().iloc[1,]-1)*100

is_entitled                               -0.929830
ttl_login_days_l7                         -0.872316
ttl_login_days_itd                        -0.515011
ttl_streams_l7                             0.640355
ttl_streams_itd                           -0.325579
streaming_profiles_l7                      0.361701
streaming_profiles_itd                     1.246989
general_streaming_profiles_l7              0.553448
general_streaming_profiles_itd             1.297168
kids_streaming_profiles_l7                -2.452059
kids_streaming_profiles_itd               -0.987323
streaming_devices_l7                      -1.187024
streaming_devices_itd                     -0.561628
ttl_stream_days_l7                        -1.903031
ttl_stream_days_itd                       -1.023902
ttl_stream_days_general_profiles_l7       -0.607059
ttl_stream_days_general_profiles_itd      -0.474096
ttl_stream_days_kids_profiles_l7           0.550120
ttl_stream_days_kids_profiles_itd          1.760742
ttl_stream_t

In [None]:
p_values = []

for i in range(16,len(marketing_slice.columns)):
    try: 
        p1 = stats.ttest_ind(marketing_slice.iloc[:,i],
                        holdout_slice.iloc[:,i])[1]
        descr1 = marketing_slice.columns[i]
    except (TypeError,RuntimeWarning): 
        p1 = "Broke!"
    p_values.append([descr1,p1])
    print("Completed",i)


In [None]:
p_values