In [619]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [620]:
engagement = pd.read_csv("takehome_user_engagement.csv")

In [621]:
engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   time_stamp  207917 non-null  object
 1   user_id     207917 non-null  int64 
 2   visited     207917 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 4.8+ MB


In [622]:
engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [623]:
takehome_users = pd.read_csv("takehome_users.csv", encoding='latin-1') # This data was not utf-8 formatted, so I had to specify the encoding keyargument to 'latin-1'

In [624]:
takehome_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   object_id                   12000 non-null  int64  
 1   creation_time               12000 non-null  object 
 2   name                        12000 non-null  object 
 3   email                       12000 non-null  object 
 4   creation_source             12000 non-null  object 
 5   last_session_creation_time  8823 non-null   float64
 6   opted_in_to_mailing_list    12000 non-null  int64  
 7   enabled_for_marketing_drip  12000 non-null  int64  
 8   org_id                      12000 non-null  int64  
 9   invited_by_user_id          6417 non-null   float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB


In [625]:
takehome_users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


### To satisfy the requirements for an "Adopted user" the engagement data will be grouped by the day.  Then a count collumn will be added broadcasting the value of 1.  Then a 7 day rolling count will be created.  If the count column is greater than 3, that user is considered "adopted."

In [626]:
engagement.time_stamp = pd.to_datetime(engagement.time_stamp) # Changing the datatype of the timestamp to a datetime object.

In [627]:
engagement.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207917 entries, 0 to 207916
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   time_stamp  207917 non-null  datetime64[ns]
 1   user_id     207917 non-null  int64         
 2   visited     207917 non-null  int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 4.8 MB


In [628]:
engagement.index = pd.to_datetime(engagement.time_stamp)
engagement = engagement.drop('time_stamp',1)
engagement.head()

Unnamed: 0_level_0,user_id,visited
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-04-22 03:53:30,1,1
2013-11-15 03:45:04,2,1
2013-11-29 03:45:04,2,1
2013-12-09 03:45:04,2,1
2013-12-25 03:45:04,2,1


In [629]:
weekly_logins = engagement.groupby('user_id')['visited'].rolling('7D').count() # 7 day rollup grouped by user_id
weekly_logins.head()

user_id  time_stamp         
1        2014-04-22 03:53:30    1.0
2        2013-11-15 03:45:04    1.0
         2013-11-29 03:45:04    1.0
         2013-12-09 03:45:04    1.0
         2013-12-25 03:45:04    1.0
Name: visited, dtype: float64

In [630]:
adopted = weekly_logins.groupby('user_id').max() > 2 # This is to set the second condition for being an adopted user; they have to have visited the site for 3 separate days.
adopted.head()

user_id
1    False
2     True
3    False
4    False
5    False
Name: visited, dtype: bool

In [631]:
adopted = weekly_logins.groupby('user_id').max() > 2
users = takehome_users.join(adopted)
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,visited
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,False
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,True
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,False
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,False


In [632]:
users.loc[users.visited.isnull(), 'visited'] = False # Users who never visited the site have neigher True/False value because their user_id is not found in the engagement table.  Therefore a value of False is imputed.
users.loc[users.invited_by_user_id.isnull(), 'invited_by_user_id'] = 0 # Imputed a value of 0 for not user_id.
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,visited
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,False
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,False
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,True
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,False
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,False


In [633]:
users = users.rename(columns = {'visited':'Target_Variable'})
users.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,Target_Variable
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,False
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,False
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,True
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,False
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,False


I dropped the object id, creation_time, name, e-mail, ord_id, invited_by_user_id and last_session_creation_time and created dummy variables out of the creation source.

After dropping these columns, the Target Variable will be quantized into 0 and 1.  Then separated into a y target variable for LogisticRegression.

In [634]:
users = users.drop(['object_id', 'creation_time', 'email', 'last_session_creation_time', 'org_id', 'invited_by_user_id'], axis = 1)
users.head()

Unnamed: 0,name,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,Target_Variable
0,Clausen August,GUEST_INVITE,1,0,False
1,Poole Matthew,ORG_INVITE,0,0,False
2,Bottrill Mitchell,ORG_INVITE,0,0,True
3,Clausen Nicklas,GUEST_INVITE,0,0,False
4,Raw Grace,GUEST_INVITE,0,0,False


In [635]:
users['Target_Variable'].replace(True, '1', inplace=True)
users['Target_Variable'].replace(False, '0', inplace=True)
users.head()

Unnamed: 0,name,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,Target_Variable
0,Clausen August,GUEST_INVITE,1,0,0
1,Poole Matthew,ORG_INVITE,0,0,0
2,Bottrill Mitchell,ORG_INVITE,0,0,1
3,Clausen Nicklas,GUEST_INVITE,0,0,0
4,Raw Grace,GUEST_INVITE,0,0,0


In [636]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   name                        12000 non-null  object
 1   creation_source             12000 non-null  object
 2   opted_in_to_mailing_list    12000 non-null  int64 
 3   enabled_for_marketing_drip  12000 non-null  int64 
 4   Target_Variable             12000 non-null  object
dtypes: int64(2), object(3)
memory usage: 468.9+ KB


In [637]:
users = users.drop(['name'], axis = 1) # Dropped the name column
users.head()

Unnamed: 0,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,Target_Variable
0,GUEST_INVITE,1,0,0
1,ORG_INVITE,0,0,0
2,ORG_INVITE,0,0,1
3,GUEST_INVITE,0,0,0
4,GUEST_INVITE,0,0,0


In [638]:
y = users.Target_Variable # X and y are separated
X = users.drop(['Target_Variable'], axis = 1)

In [639]:
X = pd.get_dummies(X) # X is fully quantized for Logistic Regression
X.head()

Unnamed: 0,opted_in_to_mailing_list,enabled_for_marketing_drip,creation_source_GUEST_INVITE,creation_source_ORG_INVITE,creation_source_PERSONAL_PROJECTS,creation_source_SIGNUP,creation_source_SIGNUP_GOOGLE_AUTH
0,1,0,1,0,0,0,0
1,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0
3,0,0,1,0,0,0,0
4,0,0,1,0,0,0,0


In [640]:
y.value_counts() # This is the fraction of users considered adopted to those not adopted.

0    10398
1     1602
Name: Target_Variable, dtype: int64

# Modelling A LogisticRegression

In [641]:
from sklearn.model_selection import train_test_split

In [642]:
X_train, y_train, X_test, y_test = train_test_split(X, y, test_size = 0.3) # Splitting up our training data into training/testing data.

In [643]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [644]:
score = accuracy_score(y_test, Pred_y) # Decision tree score higher than the Logisitc Regression Score.
score

0.8680555555555556

In [645]:
Model = LogisticRegression() # This is a Logistic Regression Model
Model.fit(X_train, X_test)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [646]:
pred_y = Model.predict(y_train)
pred_y # As expected, This array represents the predicted target variable from the y_train data information.

array(['0', '0', '0', ..., '0', '0', '0'], dtype=object)

In [647]:
accuracy_score(y_test, pred_y)

0.8680555555555556

The Logistic regression model has about 85% accuracy where there are only true positives and false positives such that the recal was less than 1.  The features that determined the model were creation source and whether or not they opted in for a mailing list and whether or not they enabled for marketing drip.