In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import os

In [3]:
os.listdir()

['.ipynb_checkpoints',
 'relax_data_science_challenge.pdf',
 'takehome_challenge2.ipynb',
 'takehome_users.csv',
 'takehome_user_engagement.csv']

In [5]:
users = pd.read_csv('takehome_users.csv', encoding='ANSI')
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [25]:
logins = pd.read_csv('takehome_user_engagement.csv', encoding='ANSI')
logins.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [26]:
logins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [31]:
# Use logins to determine which users are "adopted users" 
# i.e. - which have logged in 3 separate days in a 7 day period
logins['time_stamp'] = pd.to_datetime(logins['time_stamp'])
logins['time_stamp'] = logins['time_stamp'].dt.normalize()

logins.set_index('time_stamp', inplace=True, drop=True)
logins.drop('visited',axis=1, inplace=True)

In [53]:
def user_adoption(user_id):
    '''Takes in a user_id and returns a 1 if user has logged in on 3 separate days within 
    any given 7 day period. returns 0 if not'''
    
    # first check if 3 or more logins 
    if len(logins[logins['user_id']==user_id]) < 3:
        return 0
    
    # if passes that check, then look at rolling 7 day windows
    if logins[logins['user_id']==user_id].rolling('7d').count().max().values[0] >= 3:
        return 1
           
    else:
        return 0

In [64]:
from tqdm import tqdm

adopted_users = []
number = []

for user in tqdm(logins.user_id.unique()):
    adopted_users.append(user_adoption(user))
    number.append(user)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8823/8823 [00:26<00:00, 328.80it/s]


In [69]:
#len(adopted_users)
series = pd.Series(data=adopted_users, index=number, name='adopted_user')
series.head()

1    0
2    1
3    0
4    0
5    0
Name: adopted_user, dtype: int64

In [70]:
# Merge adopted users (df) back into users
users = pd.merge(users, series.to_frame(), left_on='object_id', right_index=True)
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0


In [74]:
users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8823 entries, 0 to 11999
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   8823 non-null   int64  
 1   creation_time               8823 non-null   object 
 2   name                        8823 non-null   object 
 3   email                       8823 non-null   object 
 4   creation_source             8823 non-null   object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    8823 non-null   int64  
 7   enabled_for_marketing_drip  8823 non-null   int64  
 8   org_id                      8823 non-null   int64  
 9   invited_by_user_id          4776 non-null   float64
 10  adopted_user                8823 non-null   int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 827.2+ KB


In [77]:
# to drop: creation time, name, email, last session creation time
#users.drop(['creation_time','name','email','last_session_creation_time'], axis=1, inplace=True)

# categorical: creation source, opted in to mailing list, enabled for marketing drip, org_id
for col in ['creation_source', 'opted_in_to_mailing_list','enabled_for_marketing_drip', 'org_id']:
    users[col] = users[col].astype('category')
    
# create - invited by a user: yes/no (as a category)
users['invited'] = ['no' if x == np.nan else 'yes' for x in users['invited_by_user_id']]

In [80]:
users['invited'] = users['invited'].astype('category')

In [82]:
users.drop('invited_by_user_id', axis=1, inplace=True)

In [83]:
users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8823 entries, 0 to 11999
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   object_id                   8823 non-null   int64   
 1   creation_source             8823 non-null   category
 2   opted_in_to_mailing_list    8823 non-null   category
 3   enabled_for_marketing_drip  8823 non-null   category
 4   org_id                      8823 non-null   category
 5   adopted_user                8823 non-null   int64   
 6   invited                     8823 non-null   category
dtypes: category(5), int64(2)
memory usage: 282.2 KB


### Preprocessing & Modelling

In [86]:
users.org_id.value_counts()

0      228
1      172
2      150
3      125
4      122
      ... 
396      6
400      6
397      5
386      4
416      2
Name: org_id, Length: 417, dtype: int64

In [87]:
# for first go round, will drop org_id because simply so many, factoring that in would be a 
# future idea
users.drop('org_id', axis=1, inplace=True)

In [88]:
data = pd.get_dummies(users)

In [92]:
data.head()

Unnamed: 0,object_id,adopted_user,creation_source_GUEST_INVITE,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH,opted_in_to_mailing_list_0,opted_in_to_mailing_list_1,enabled_for_marketing_drip_0,enabled_for_marketing_drip_1,invited_yes
0,1,0,1,0,0,0,0,0,1,1,0,1
1,2,1,0,1,0,0,0,1,0,1,0,1
2,3,0,0,1,0,0,0,1,0,1,0,1
3,4,0,1,0,0,0,0,1,0,1,0,1
4,5,0,1,0,0,0,0,1,0,1,0,1


In [101]:
from sklearn.preprocessing import StandardScaler

X = data.drop(['adopted_user','object_id'], axis=1).values
y = data['adopted_user']

scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [103]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression().fit(X_scaled, y)

In [106]:
pd.DataFrame(zip(data.drop(['adopted_user','object_id'],axis=1).columns, 
                 np.transpose(clf.coef_)), columns=['features', 'coef']) 

Unnamed: 0,features,coef
0,creation_source_GUEST_INVITE,[0.10759473207851669]
1,creation_source_ORG_INVITE,[-0.026171106592053513]
2,creation_source_PERSONAL_PROJECTS,[0.05909756546648548]
3,creation_source_SIGNUP,[-0.07955336561527107]
4,creation_source_SIGNUP_GOOGLE_AUTH,[-0.0348959142301648]
5,opted_in_to_mailing_list_0,[-0.009381564535978597]
6,opted_in_to_mailing_list_1,[0.009381564535978597]
7,enabled_for_marketing_drip_0,[0.0003954107084390705]
8,enabled_for_marketing_drip_1,[-0.00039541070843943484]
9,invited_yes,[0.0]


### Summary of Findings

The biggest factor I found that predicts user adoption is the creation source. Specifically, it appears that users signed up via Guest Invite and Personal Projects referrals are the most likely to become adopted users.

One other factor that I did not fully dive into in this analysis was looking at adoption rates across the individual organizations. This would make an excellent step for further analysis. If we can identify specific organizations with higher/lower than normal adoption rates we may be able to better understand how to leverage insights from the higher adoption organizations and share those best practices with other groups to drive further adoption.