In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

In [3]:
# Load the per-visit engagement log and parse 'time_stamp' into datetimes
# in a single expression, then preview the first rows.
df_en = (
    pd.read_csv('takehome_user_engagement.csv')
    .assign(time_stamp=lambda visits: pd.to_datetime(visits['time_stamp']))
)
df_en.head(20)

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1
5,2013-12-31 03:45:04,2,1
6,2014-01-08 03:45:04,2,1
7,2014-02-03 03:45:04,2,1
8,2014-02-08 03:45:04,2,1
9,2014-02-09 03:45:04,2,1


In [4]:
# has_week decides whether a user was "engaged": visits on at least
# `min_visits` distinct calendar days that all fall inside one
# `window_days`-day span.
def has_week(series, window_days=7, min_visits=3):
    """Return True if the timestamps show enough distinct-day visits in one window.

    Parameters
    ----------
    series : array-like of datetime64 (e.g. a pandas Series of timestamps)
        Visit timestamps of a single user, in any order. NaT entries are ignored.
    window_days : int or float, default 7
        Maximum elapsed time, in days, between the first and last visit of a
        qualifying run (inclusive bound, i.e. ``elapsed <= window_days``).
    min_visits : int, default 3
        Number of distinct calendar days that must be visited inside the window.

    Returns
    -------
    bool
        True when some window contains visits on at least `min_visits`
        different calendar days, False otherwise (including empty input).

    Notes
    -----
    Several visits on the same calendar day count as one day, so three
    logins on a single day no longer qualify on their own.
    """
    stamps = np.asarray(series, dtype='datetime64[ns]')
    stamps = np.sort(stamps[~np.isnat(stamps)])
    # Calendar day of every visit, used to tell "separate days" apart.
    days = stamps.astype('datetime64[D]')
    one_day = np.timedelta64(1, 'D')

    start = 0
    for end in range(len(stamps)):
        # Shrink the window from the left until it spans at most window_days.
        while (stamps[end] - stamps[start]) / one_day > window_days:
            start += 1
        # Cheap guard: fewer visits than required cannot cover enough days.
        if end - start + 1 >= min_visits:
            if len(np.unique(days[start:end + 1])) >= min_visits:
                return True
    return False

In [5]:
# Evaluate has_week once per user over that user's visit timestamps; the
# (name, function) pair labels the resulting column 'has_week'.
visits_by_user = df_en.groupby('user_id')['time_stamp']
df_all = visits_by_user.agg([('has_week', has_week)])
df_all.head()

Unnamed: 0_level_0,has_week
user_id,Unnamed: 1_level_1
1,False
2,True
3,False
4,False
5,False


In [6]:
# Keep only the users flagged by has_week and turn the user_id index level
# back into an ordinary column so it can be matched against object_id later.
df_true = df_all.loc[df_all['has_week'].eq(True)].reset_index(level=0)
df_true['user_id'].values

array([    2,    10,    20, ..., 11969, 11975, 11988], dtype=int64)

In [7]:
#read all users data into df_users
# The file is decoded as Latin-1 (passed explicitly via `encoding`);
# tail() previews the last five rows.
df_users=pd.read_csv('takehome_users.csv', encoding='latin-1')
df_users.tail()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
11995,11996,2013-09-06 06:14:15,Meier Sophia,SophiaMeier@gustr.com,ORG_INVITE,1378448000.0,0,0,89,8263.0
11996,11997,2013-01-10 18:28:37,Fisher Amelie,AmelieFisher@gmail.com,SIGNUP_GOOGLE_AUTH,1358275000.0,0,0,200,
11997,11998,2014-04-27 12:45:16,Haynes Jake,JakeHaynes@cuvox.de,GUEST_INVITE,1398603000.0,1,1,83,8074.0
11998,11999,2012-05-31 11:55:59,Faber Annett,mhaerzxp@iuxiw.com,PERSONAL_PROJECTS,1338638000.0,0,0,6,
11999,12000,2014-01-26 08:57:12,Lima Thaís,ThaisMeloLima@hotmail.com,SIGNUP,1390727000.0,0,1,0,


In [8]:
#create new column 'adopted', initialised to 0 for every user
df_users['adopted'] = 0 

In [9]:
#fill 'adopted' with the data calculated above
# Vectorised membership test: 1 where the user's object_id appears among the
# engaged user_ids in df_true, 0 otherwise. This replaces a per-user Python
# loop that rescanned both frames on every iteration.
df_users['adopted'] = df_users['object_id'].isin(df_true['user_id']).astype('int64')

In [10]:
# list the columns of df_users (now including 'adopted')
df_users.columns

Index(['object_id', 'creation_time', 'name', 'email', 'creation_source',
       'last_session_creation_time', 'opted_in_to_mailing_list',
       'enabled_for_marketing_drip', 'org_id', 'invited_by_user_id',
       'adopted'],
      dtype='object')

In [11]:
# inspect the two time columns before any conversion
df_users[['creation_time','last_session_creation_time']]

Unnamed: 0,creation_time,last_session_creation_time
0,2014-04-22 03:53:30,1.398139e+09
1,2013-11-15 03:45:04,1.396238e+09
2,2013-03-19 23:14:52,1.363735e+09
3,2013-05-21 08:09:28,1.369210e+09
4,2013-01-17 10:14:20,1.358850e+09
5,2013-12-17 03:37:06,1.387424e+09
6,2012-12-16 13:24:32,1.356010e+09
7,2013-07-31 05:34:02,
8,2013-11-05 04:04:24,
9,2013-01-16 22:08:03,1.401833e+09


In [12]:
#convert 'creation_time' and 'last_session_creation_time' to datetime
# NOTE(review): `datetime` is not referenced in this cell; imports normally live in the first cell
import datetime
# creation_time is parsed into datetime values
df_users['creation_time'] = pd.to_datetime(df_users['creation_time'])
#last_session_creation_time is numeric and is interpreted as seconds since the Unix epoch (unit='s')
df_users['last_session_creation_time'] = pd.to_datetime(df_users['last_session_creation_time'],unit='s')

In [13]:
# re-inspect the two time columns after conversion to datetimes
df_users[['creation_time','last_session_creation_time']]

Unnamed: 0,creation_time,last_session_creation_time
0,2014-04-22 03:53:30,2014-04-22 03:53:30
1,2013-11-15 03:45:04,2014-03-31 03:45:04
2,2013-03-19 23:14:52,2013-03-19 23:14:52
3,2013-05-21 08:09:28,2013-05-22 08:09:28
4,2013-01-17 10:14:20,2013-01-22 10:14:20
5,2013-12-17 03:37:06,2013-12-19 03:37:06
6,2012-12-16 13:24:32,2012-12-20 13:24:32
7,2013-07-31 05:34:02,NaT
8,2013-11-05 04:04:24,NaT
9,2013-01-16 22:08:03,2014-06-03 22:08:03


In [14]:
# New feature 'usage_length': elapsed seconds between account creation and
# the last recorded session (NaN when the last session is missing).
account_age = df_users['last_session_creation_time'] - df_users['creation_time']
df_users['usage_length'] = account_age.dt.total_seconds()

In [15]:
#create dummies for 'creation_source'

In [16]:
# one indicator column per distinct value of 'creation_source'
creation_source_dummies=pd.get_dummies(df_users['creation_source'])

In [17]:
# append the indicator columns to the user table (column-wise, aligned on the index)
df_new = pd.concat([df_users,creation_source_dummies], axis=1)

In [18]:
# list the columns of the combined frame
df_new.columns

Index(['object_id', 'creation_time', 'name', 'email', 'creation_source',
       'last_session_creation_time', 'opted_in_to_mailing_list',
       'enabled_for_marketing_drip', 'org_id', 'invited_by_user_id', 'adopted',
       'usage_length', 'GUEST_INVITE', 'ORG_INVITE', 'PERSONAL_PROJECTS',
       'SIGNUP', 'SIGNUP_GOOGLE_AUTH'],
      dtype='object')

In [19]:
# A missing inviter id is recorded as 0.
df_new['invited_by_user_id'] = df_new['invited_by_user_id'].fillna(0)

In [20]:
# Collapse the inviter id into a flag: values equal to 0 are kept, every
# other value becomes 1.
inviter = df_new['invited_by_user_id']
df_new['invited_by_user_id'] = inviter.where(inviter == 0, 1)

In [21]:
# drop every row that still has a missing value in any column
# NOTE(review): presumably these are the users with no recorded last session
# (NaT last_session_creation_time / NaN usage_length) — confirm
b = df_new.dropna()

In [22]:
# check the dtype of each remaining column
b.dtypes

object_id                              int64
creation_time                 datetime64[ns]
name                                  object
email                                 object
creation_source                       object
last_session_creation_time    datetime64[ns]
opted_in_to_mailing_list               int64
enabled_for_marketing_drip             int64
org_id                                 int64
invited_by_user_id                   float64
adopted                                int64
usage_length                         float64
GUEST_INVITE                           uint8
ORG_INVITE                             uint8
PERSONAL_PROJECTS                      uint8
SIGNUP                                 uint8
SIGNUP_GOOGLE_AUTH                     uint8
dtype: object

In [23]:
# Remove the columns that are not passed on to the models: the raw time
# columns, the name/email strings, and the original 'creation_source'.
unused_columns = [
    'creation_time',
    'name',
    'email',
    'last_session_creation_time',
    'creation_source',
]
c = b.drop(columns=unused_columns)

In [24]:
# move object_id out of the columns and into the row index
d = c.set_index('object_id')

In [25]:
# display the table that the target and features are taken from
d

Unnamed: 0_level_0,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted,usage_length,GUEST_INVITE,ORG_INVITE,PERSONAL_PROJECTS,SIGNUP,SIGNUP_GOOGLE_AUTH
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1,0,11,1.0,0,0.0,1,0,0,0,0
2,0,0,1,1.0,1,11750400.0,0,1,0,0,0
3,0,0,94,1.0,0,0.0,0,1,0,0,0
4,0,0,1,1.0,0,86400.0,1,0,0,0,0
5,0,0,193,1.0,0,432000.0,1,0,0,0,0
6,0,0,197,1.0,0,172800.0,1,0,0,0,0
7,0,1,37,0.0,0,345600.0,0,0,0,1,0
10,1,1,318,1.0,1,43459200.0,0,1,0,0,0
11,0,0,69,0.0,0,86400.0,0,0,0,1,0
13,0,0,254,1.0,0,0.0,0,1,0,0,0


In [26]:
# split off the target: pop() removes 'adopted' from d in place and returns it
# NOTE: re-running this cell without rebuilding d raises KeyError, because the column is already gone
y=d.pop('adopted')

In [27]:
# the columns left in d are the feature matrix (X refers to the same object as d, not a copy)
X=d

In [28]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [29]:
# KNeighborsClassifier

# Grid: neighbour counts 1..49 combined with Minkowski power p of 2 or 3.
param_grid = dict(n_neighbors=np.arange(1, 50), p=[2, 3])

knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)

# 5-fold cross-validated search on the training split only.
knn_cv.fit(X_train, y_train)

# Predict the held-out test rows with the selected model.
y_pred = knn_cv.predict(X_test)

# Report: confusion matrix first, then per-class precision / recall / F1.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1407   27]
 [  45  286]]
             precision    recall  f1-score   support

          0       0.97      0.98      0.98      1434
          1       0.91      0.86      0.89       331

avg / total       0.96      0.96      0.96      1765



In [30]:
# DecisionTreeClassifier

# 'auto' is left out of max_features: for classifiers it meant the same as
# 'sqrt' (so it only duplicated grid points) and current scikit-learn
# releases reject it.
param_grid = {'max_depth':[3,5,8,15,25,30,None],
              'max_features': ['sqrt', 'log2', None],
              'min_samples_leaf': [1,2,5,10],
              'min_samples_split': [2,5,10,15,100],
              'criterion': ['gini', 'entropy']}

# Named dt_clf rather than `tree` so the `sklearn.tree` module imported in
# the first cell is not shadowed. A fixed random_state makes the search
# (and therefore best_params_) repeatable between runs.
dt_clf = DecisionTreeClassifier(random_state=42)
tree_cv = GridSearchCV(dt_clf, param_grid, cv=5)

# 5-fold cross-validated search on the training split only.
tree_cv.fit(X_train, y_train)
# Predict the labels of the test data: y_pred
y_pred = tree_cv.predict(X_test)

# Generate the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[1398   36]
 [  42  289]]
             precision    recall  f1-score   support

          0       0.97      0.97      0.97      1434
          1       0.89      0.87      0.88       331

avg / total       0.96      0.96      0.96      1765



In [31]:
# show the hyper-parameter combination selected by the decision-tree grid search
print(tree_cv.best_params_)

{'criterion': 'entropy', 'max_depth': 5, 'max_features': None, 'min_samples_leaf': 10, 'min_samples_split': 2}


In [32]:
# Take the winning tree straight from the grid search instead of retyping its
# parameters by hand: GridSearchCV (refit left at its default) has already
# refit the best configuration on the whole training split, and this stays
# correct if a re-run selects different parameters.
tree_clf = tree_cv.best_estimator_

# Pair each feature name with its importance in the fitted tree.
fi = pd.DataFrame({'feature': X.columns,
                   'importance': tree_clf.feature_importances_},
                  columns=['feature', 'importance'])

In [33]:
# rank the features from most to least important
fi.sort_values('importance',ascending=False)

Unnamed: 0,feature,importance
4,usage_length,0.987779
2,org_id,0.009469
9,SIGNUP_GOOGLE_AUTH,0.000832
3,invited_by_user_id,0.000781
7,PERSONAL_PROJECTS,0.00059
0,opted_in_to_mailing_list,0.00055
1,enabled_for_marketing_drip,0.0
5,GUEST_INVITE,0.0
6,ORG_INVITE,0.0
8,SIGNUP,0.0


In [34]:
# RandomForestClassifier

# 'auto' is left out of max_features: for classifiers it meant the same as
# 'sqrt' (so it only duplicated grid points) and current scikit-learn
# releases reject it.
param_grid = {"max_depth": [5,8,15,25,30,None], 
              "min_samples_leaf": [1,2,5,10],
              "min_samples_split": [2,5,10,15,100],
              'max_features': ['sqrt', 'log2', None],
              "n_estimators": [10,20,100],
              'criterion': ['gini', 'entropy']}
# A fixed random_state makes the forests, and so the search result, repeatable.
rfc = RandomForestClassifier(random_state=42)
rfc_cv = GridSearchCV(rfc, param_grid, cv=5)

# Fit the classifier to the training data
rfc_cv.fit(X_train, y_train)
# Predict the labels of the test data: y_pred
y_pred = rfc_cv.predict(X_test)

# Generate the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1404   30]
 [  44  287]]
             precision    recall  f1-score   support

          0       0.97      0.98      0.97      1434
          1       0.91      0.87      0.89       331

avg / total       0.96      0.96      0.96      1765



In [35]:
# LogisticRegression

# Inverse regularisation strength C over seven orders of magnitude, with both
# L1 and L2 penalties.
param_grid = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100,1000],
              'penalty': ['l1', 'l2']}

# Create the classifier: logreg
# The solver is pinned to 'liblinear' because it supports both penalties in
# the grid; the current default solver ('lbfgs') does not support 'l1', so
# half of the grid would fail to fit if the default were relied on.
logreg = LogisticRegression(solver='liblinear')

logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

# Fit the classifier to the training data
logreg_cv.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = logreg_cv.predict(X_test)

# Compute and print the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1410   24]
 [  44  287]]
             precision    recall  f1-score   support

          0       0.97      0.98      0.98      1434
          1       0.92      0.87      0.89       331

avg / total       0.96      0.96      0.96      1765



As we can see, all of the models above perform about equally well. The most important feature is 'usage_length'. Based on this, I would recommend encouraging existing users to log in and use their accounts.
The next most important feature is how the user signed up. Google Authentication, guest invitation, and personal projects have the highest user adoption rates. Encouraging users to sign up with Google Authentication, to invite others, and to work on their personal projects might be effective for future user adoption.