In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
#allows pretty print of dataframes
from IPython.display import display


#1) Reading in csv files with pandas dataframe

#a) Reading in user data
users = pd.read_csv("takehome_users.csv", sep=",", engine='python')
display(users.head())

print(users.info())

#b) Reading in user engagement data
user_engagement = pd.read_csv("takehome_user_engagement.csv")
display(user_engagement.head(5))




Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 10 columns):
object_id                     12000 non-null int64
creation_time                 12000 non-null object
name                          12000 non-null object
email                         12000 non-null object
creation_source               12000 non-null object
last_session_creation_time    8823 non-null float64
opted_in_to_mailing_list      12000 non-null int64
enabled_for_marketing_drip    12000 non-null int64
org_id                        12000 non-null int64
invited_by_user_id            6417 non-null float64
dtypes: float64(2), int64(4), object(4)
memory usage: 937.6+ KB
None


Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


In [64]:
#2) Using 'User_engagement' data to find who is an "adopted user"
# An "adopted user" is a user who has logged into the product on three seperate day sin at least one seven-day period


#a) Converting 'time_stamp' column to datetimes
user_engagement["time_stamp"] = pd.to_datetime(user_engagement["time_stamp"])
# print(user_engagement.info())

#b) setting 'time_stamp' to date time index
user_engagement = user_engagement.set_index("time_stamp")
# print("User engagement df with date time index:\n\n", user_engagement.head())

#c)Creating dataframe to count number of times each user accessed website within 1 week period
user_engagement_weekly_count = user_engagement.groupby(['user_id', pd.TimeGrouper(freq='7D')]).sum()

# print("User engaement df of weekly count:\n\n ")
# display(user_engagement_weekly_count.head(20))

#d) Filtering df to get users with 3 or more logins in a week -- these are the "adopted users"
user_engagement_weekly_count_fltr = user_engagement_weekly_count.loc[user_engagement_weekly_count["visited"]>=3, :]
# print("DF with only users that have accessed website 3 or more times w/i 1 week:\n\n")
# display(user_engagement_weekly_count_fltr)

#e) Extracting unique list of "adopted users"
user_engagement_weekly_count_fltr = user_engagement_weekly_count_fltr.reset_index()

adopted_users = user_engagement_weekly_count_fltr["user_id"].unique()
# print("This is a list of 'adopted users':\n\n", adopted_users)

#f) Making a dataframe containing all adopted users
adopted_users_df = pd.DataFrame({"user_id":adopted_users, "user_adoption": 'yes'})
print("This is a df of adopted users:")
display(adopted_users_df.head())


This is a df of adopted users:


Unnamed: 0,user_adoption,user_id
0,yes,2
1,yes,10
2,yes,42
3,yes,43
4,yes,53


In [73]:
#3) Joining "adopted users" with "user" dataframe
#a) Performing 'outer join'
combined_df = pd.merge(users,adopted_users_df, left_on ='object_id', right_on = "user_id", how='outer')

#b) Filling nan's in 'user_adoption' column with 'no'
combined_df["user_adoption"].fillna("no", inplace=True)


#c) dropping 'user_id' column
combined_df = combined_df.drop("user_id", axis=1)
display(combined_df.head())


Unnamed: 0,object_id,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,user_adoption
0,1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,no
1,2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,yes
2,3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,no
3,4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,no
4,5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,no


In [82]:
#4) Determining most important features to predict if a user will be an 'adopted user' or not

#a) Dropping unnecessary columns
combined_df_model = combined_df.drop(["creation_time", 'name', 'email'], axis=1) #email & names could be 

#b) dropping all rows with nans
combined_df_model_d= combined_df_model.dropna()
# display(combined_df_model_d)

#c) dummifying categorical columns
combined_df_model_w_dum = pd.get_dummies(combined_df_model_d, drop_first=True)
display(combined_df_model_w_dum)


Unnamed: 0,object_id,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,creation_source_ORG_INVITE,user_adoption_yes
0,1,1.398139e+09,1,0,11,10803.0,0,0
1,2,1.396238e+09,0,0,1,316.0,1,1
2,3,1.363735e+09,0,0,94,1525.0,1,0
3,4,1.369210e+09,0,0,1,5151.0,0,0
4,5,1.358850e+09,0,0,193,5240.0,0,0
5,6,1.387424e+09,0,0,197,11241.0,0,0
9,10,1.401833e+09,1,1,318,4143.0,1,1
12,13,1.396196e+09,0,0,254,11204.0,1,0
16,17,1.397314e+09,1,0,175,1600.0,0,0
21,22,1.392012e+09,0,0,7,2994.0,1,0


In [89]:
#5) Feature Selection with Recursive Feature Elimination (RFE)

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression


#a) Selectinf features and target variable
X= combined_df_model_w_dum.drop("user_adoption_yes", axis=1)
Y = combined_df_model_w_dum["user_adoption_yes"]

#b) Instantiating model & use RFE
model_logreg = LogisticRegression()
#RFE
rfe = RFE(model_logreg, 3)#calling RFE with model and number of best features we would like to keep

rfe = rfe.fit(X, Y) #fitting rfe with features and target variable

features_selected = rfe.support_ #indicates whether feature is selected or not (True or False)
features_selected_rank = rfe.ranking_ #gives ranking of feature (lower is better)


#Finding column names of features selected
orig_columns = X.columns.values #orignal column names in list form
#print(orig_columns)

selected_column_names = np.array(orig_columns) * features_selected

#cleaning up list of selected column names, removing empty strings
final_selected_column_names = [x for x in selected_column_names if len(x)>1]
print("These are the Features that have been selected via RFE:\n\n", final_selected_column_names)


These are the Features that have been selected via RFE:

 ['object_id', 'last_session_creation_time', 'invited_by_user_id']


# Conclusion

It was determined that 'object_id', 'last_session_creation_time', 'invited_by_user_id' are the three most important features for prediciting wheter a user will be an'adopted user' or not. One recommendation that can be gained from looking at these features is for Relax to create an incentive referal program to increase the potential of gaining an 'adopter user'('object_id' & 'invited_by_user are top features). Futher analaysis is warranted where features are engineered to include first names, and email domains or even collecting additional data about users.  
