In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [176]:
user_engagement = pd.read_csv('takehome_user_engagement.csv', parse_dates=['time_stamp'], index_col='time_stamp')
user_engagement.head()

Unnamed: 0_level_0,user_id,visited
time_stamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-04-22 03:53:30,1,1
2013-11-15 03:45:04,2,1
2013-11-29 03:45:04,2,1
2013-12-09 03:45:04,2,1
2013-12-25 03:45:04,2,1


In [177]:
user_engagement.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 207917 entries, 2014-04-22 03:53:30 to 2014-01-26 08:57:12
Data columns (total 2 columns):
user_id    207917 non-null int64
visited    207917 non-null int64
dtypes: int64(2)
memory usage: 4.8 MB


In [178]:
user_engagement.describe()

Unnamed: 0,user_id,visited
count,207917.0,207917.0
mean,5913.314197,1.0
std,3394.941674,0.0
min,1.0,1.0
25%,3087.0,1.0
50%,5682.0,1.0
75%,8944.0,1.0
max,12000.0,1.0


In [179]:
# The earliest time_stamp
user_engagement.index.min()

Timestamp('2012-05-31 08:20:06')

In [180]:
# The latest time_stamp
user_engagement.index.max()

Timestamp('2014-06-06 14:58:50')

In [181]:
# to count the # of null values
user_engagement.isnull().sum()

user_id    0
visited    0
dtype: int64

In [182]:
# to count the number of unique users
unique_user = user_engagement['user_id'].unique()
unique_user

array([    1,     2,     3, ..., 11998, 11999, 12000])

"Adopted user" as a user who has logged into the product on three separate days in at least one seven-day period.

In [183]:
adopted = []

for i in unique_user:
    id_filter = user_engagement['user_id'] == i
    df_filter = user_engagement[id_filter].resample('1D').count()
    df_filter = df_filter.rolling(window=7).sum()
    df_filter = df_filter.dropna()
    adopted.append(any(df_filter['visited'].values >= 3) * 1)

In [187]:
adoption = list(zip(unique_user, adopted))

df_adopt = pd.DataFrame(adoption)
df_adopt.columns = ['object_id', 'adopted_user']
df_adopt.head()

Unnamed: 0,object_id,adopted_user
0,1,0
1,2,1
2,3,0
3,4,0
4,5,0


In [188]:
user = pd.read_csv('takehome_users.csv', sep=',', encoding='latin-1')
user.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [189]:
user['creation_time'] = pd.to_datetime(user['creation_time'])
user['last_session_creation_time'] = pd.to_datetime(user['last_session_creation_time'])
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null datetime64[ns]
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null datetime64[ns]
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: datetime64[ns](2), float64(1), int64(4), object(3)
memory usage: 937.6+ KB


In [190]:
user.describe()

Unnamed: 0,object_id,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
count,12000.0,12000.0,12000.0,12000.0,6417.0
mean,6000.5,0.2495,0.149333,141.884583,5962.957145
std,3464.24595,0.432742,0.356432,124.056723,3383.761968
min,1.0,0.0,0.0,0.0,3.0
25%,3000.75,0.0,0.0,29.0,3058.0
50%,6000.5,0.0,0.0,108.0,5954.0
75%,9000.25,0.0,0.0,238.25,8817.0
max,12000.0,1.0,1.0,416.0,11999.0


In [191]:
#The number of null values in the 2 columns
user.isnull().sum()

object_id                        0
creation_time                    0
name                             0
email                            0
creation_source                  0
last_session_creation_time    3177
opted_in_to_mailing_list         0
enabled_for_marketing_drip       0
org_id                           0
invited_by_user_id            5583
dtype: int64

In [192]:
print("The percentage of null values in 'last_session_creation_time' column:",5583/12000)
print("The percentage of null values in 'invited_by_user_id' column:",3177/12000)

The percentage of null values in 'last_session_creation_time' column: 0.46525
The percentage of null values in 'invited_by_user_id' column: 0.26475


These 2 columns have many null values. We will drop the two columns for our analysis.

In [193]:
# drop the 2 columns
user.drop(['last_session_creation_time', 'invited_by_user_id'], axis=1, inplace=True)
user.head(3)

Unnamed: 0,object_id,creation_time,name,email,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1,0,11
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,0,0,1
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,0,0,94


### The chart shows a steady increase of visits during the 2 years.

In [194]:
# merge the 2 datasets
df = user.merge(df_adopt, on='object_id', how='outer')
df.head()

Unnamed: 0,object_id,creation_time,name,email,creation_source,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,adopted_user
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1,0,11,0.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,0,0,1,1.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,0,0,94,0.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,0,0,1,0.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,0,0,193,0.0


In [195]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 0 to 11999
Data columns (total 9 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null datetime64[ns]
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
adopted_user                  8823 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(4), object(3)
memory usage: 937.5+ KB


In [196]:
df['adopted_user'] = df.adopted_user.fillna(0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12000 entries, 0 to 11999
Data columns (total 9 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null datetime64[ns]
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
adopted_user                  12000 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(4), object(3)
memory usage: 937.5+ KB


## We use logistic regression to model the data

In [197]:
# identify features and target
X = df[['creation_source', 'opted_in_to_mailing_list', 'enabled_for_marketing_drip', 'org_id',]]
y = df.adopted_user

In [198]:
# to change categorical data into numerical data
X = pd.get_dummies(X, columns = ['creation_source', 'org_id'], drop_first=True)

In [199]:
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# split the dataset into training and test data sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# create and fit logistic regression model
clf = LogisticRegression()
clf.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [200]:
# validate with test data
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_pred, y_test)
print("The logisic regression model has an accuracy of {}".format(round(accuracy,3)))

The logisic regression model has an accuracy of 0.871


In [201]:
# determine which features are the most important
i = 0
features = []
for feature in X.columns.values:
    features.append((clf.coef_[0][i], feature))
    i += 1
#print the top 10
sorted(features, reverse=True)[:10]

[(1.7332237439601519, 'org_id_387'),
 (1.357577822207927, 'org_id_291'),
 (1.3404888525000007, 'org_id_235'),
 (1.338642118356902, 'org_id_366'),
 (1.2257030501441784, 'org_id_392'),
 (1.1802757991802297, 'org_id_62'),
 (1.1796517434651732, 'org_id_82'),
 (1.1430090160504398, 'org_id_117'),
 (1.0937458174131225, 'org_id_415'),
 (1.0747985264446596, 'org_id_161')]

# Based on the values above, we observed that the organization the users belong to is the most important feature in predicting user adoption.