In [None]:
import numpy as np
import pandas as pd
import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

The purpose of this notebook is to predict whether a user will become an 'adopted user' of a software product developed by Relax Inc., who "makes productivity and project management software that's popular with both individuals and teams".

Adopted User = "a user who has logged into the product on three separate days in at least one seven day period"

The resulting models reached a predictive accuracy of ~98% or ~86%, depending on which variables were included in the input dataset. If the total number of times a user logged into the program was included, the model had a predictive accuracy of ~98%. However, if that variable was not included, the model had an accuracy of ~86%. See the PDF write-up for conclusions.

The methods used in this notebook are data manipulation, data wrangling, and machine learning with random forests using scikit-learn. Because of the high performance of the random forest models, no other models were created.

Required Files Before Running:
- takehome_users.csv
- takehome_user_engagement.csv

These files must be in the same folder as this notebook for it to run correctly. Otherwise, change the file paths in the next cell to point to these files.

In [None]:
# Loading in the two data sets:
## takehome_users.csv is read with the latin-1 codec because the default utf-8 codec
## was unable to load it without errors.
## `infer_datetime_format` is intentionally not passed: it only has an effect together with
## `parse_dates` (which is not used here) and is deprecated since pandas 2.0.

users_df = pd.read_csv('takehome_users.csv', encoding='latin-1')
logins_df = pd.read_csv('takehome_user_engagement.csv')

In [None]:
# Preview the first rows of the users table to confirm it loaded as expected.
# NOTE(review): this table has `name` and `email` columns - clear the output before sharing.
users_df.head()

In [None]:
# Column dtypes and non-null counts of the users table.
users_df.info()

In [None]:
# Preview the first rows of the login (engagement) table.
logins_df.head()

In [None]:
# Column dtypes and non-null counts of the login (engagement) table.
logins_df.info()

In [None]:
# Assigning the label 'adopted user' according to its definition:
# "a user who has logged into the product on three separate days in at least one seven day period"
# Interpretation used here: a user is adopted when at least REQUIRED_LOGIN_DAYS distinct calendar days
# with a login fall inside some run of WINDOW_DAYS consecutive calendar days.
# A user labeled as an adopted user is assigned a value of 1 and non-adopted users are assigned a value of 0.

WINDOW_DAYS = 7          # length of the period, in calendar days
REQUIRED_LOGIN_DAYS = 3  # distinct login days needed inside one period

# Data Manipulation - Changing all time values into time data types:
logins_df['time_stamp'] = pd.to_datetime(logins_df['time_stamp'], format='%Y-%m-%d %H:%M:%S')

# One row per (user, calendar day), explicitly sorted, so that repeated logins on the same day
# and the row order of the CSV cannot influence the label.
login_days = (
    logins_df.assign(login_day=logins_df['time_stamp'].dt.normalize())
             .loc[:, ['user_id', 'login_day']]
             .drop_duplicates()
             .sort_values(['user_id', 'login_day'])
)

# For every login day, look up the same user's (REQUIRED_LOGIN_DAYS - 1)-th next distinct login day.
# It is NaT when the user has too few later login days; NaT comparisons evaluate to False.
nth_day_ahead = login_days.groupby('user_id')['login_day'].shift(-(REQUIRED_LOGIN_DAYS - 1))

# The days fit into one period when first and last are fewer than WINDOW_DAYS days apart
# (e.g. Monday through Sunday is a span of 6 days; a span of 7 days would cover 8 calendar days).
fits_in_window = (nth_day_ahead - login_days['login_day']) < pd.Timedelta(days=WINDOW_DAYS)
adopted_user_ids = login_days.loc[fits_in_window, 'user_id'].unique()

# Total number of login rows per user (users without any login are absent from this Series).
logins_per_user = logins_df.groupby('user_id').size()

# Creating a new DataFrame for the total number of logins and the label of the user.
# Column order (USER_ID, TOTAL_LOGINS, LABEL) is kept because later cells rely on LABEL being last.
label_df = pd.DataFrame({'USER_ID': range(1, 12001)})
label_df['TOTAL_LOGINS'] = label_df['USER_ID'].map(logins_per_user).fillna(0).astype('int64')
label_df['LABEL'] = label_df['USER_ID'].isin(adopted_user_ids).astype('int64')
            

In [None]:
### Sanity check: the non-null counts show whether any TOTAL_LOGINS / LABEL value in label_df is still missing.
label_df.info()

In [None]:
# Creating final DataFrame:

## Merging label_df onto the users table. `merge` returns a new frame, so users_df is left untouched.
## validate='many_to_one' makes the merge fail loudly if label_df ever held duplicate USER_IDs,
## which would otherwise silently duplicate user rows.
final_df = users_df.merge(label_df,
                          how='left',
                          left_on='object_id',
                          right_on='USER_ID',
                          validate='many_to_one')

## Removing columns object_id, creation_time, name, email, last_session_creation_time, org_id, and USER_ID because they are
## not relevant to the definition of what is considered to be an adopted user.
final_df = final_df.drop(columns=['object_id',
                                  'creation_time',
                                  'name',
                                  'email',
                                  'last_session_creation_time',
                                  'org_id',
                                  'USER_ID'])

## Changing creation_source & invited_by_user_id to categorical values:

###### creation_source categorical assignments:
CREATION_SOURCE_CODES = {'PERSONAL_PROJECTS': 0,
                         'GUEST_INVITE': 1,
                         'ORG_INVITE': 2,
                         'SIGNUP': 3,
                         'SIGNUP_GOOGLE_AUTH': 4}

### Series.map turns unmapped values into NaN without complaint, so unknown sources are rejected explicitly.
unknown_sources = set(final_df['creation_source'].unique()) - set(CREATION_SOURCE_CODES)
if unknown_sources:
    raise KeyError(f'Unmapped creation_source values: {sorted(map(str, unknown_sources))}')

final_df['creation_source'] = final_df['creation_source'].map(CREATION_SOURCE_CODES).astype('int64')

### If the user was invited by another user (an inviter id is present), the value becomes 1.
### If the inviter id is missing (NaN), the value becomes 0.
final_df['invited_by_user_id'] = final_df['invited_by_user_id'].notna().astype('int64')

In [None]:
# Preparing final_df for machine learning:
##### Two feature sets are fed to the machine learning algorithms: one includes the total number of logins a
##### user has made (all variables with "with_total") and the other leaves this information out (all variables
##### with "no_total").

SPLIT_SEED = 42                # fixed so that Restart & Run All reproduces the same split
TARGET_COLUMN = 'LABEL'
TOTAL_COLUMN = 'TOTAL_LOGINS'

# Splitting the label from the variables (by column name rather than by position):
labels_with_total = final_df[TARGET_COLUMN].astype('int32')  # .astype() already returns a copy
labels_no_total = labels_with_total.copy()

variables_with_total = final_df.drop(columns=[TARGET_COLUMN])
variables_no_total = variables_with_total.drop(columns=[TOTAL_COLUMN])

# Splitting the dataset into train and test sets:
## The split is done ONCE and the "no_total" sets are derived from it, so both models are trained and
## evaluated on exactly the same users and their accuracies are directly comparable.
## stratify keeps the share of adopted users equal in the train and test sets.
X_train_with_total, X_test_with_total, Y_train_with_total, Y_test_with_total = train_test_split(
    variables_with_total,
    labels_with_total,
    test_size=0.25,
    random_state=SPLIT_SEED,
    stratify=labels_with_total,
)

X_train_no_total = X_train_with_total.drop(columns=[TOTAL_COLUMN])
X_test_no_total = X_test_with_total.drop(columns=[TOTAL_COLUMN])
Y_train_no_total = Y_train_with_total.copy()
Y_test_no_total = Y_test_with_total.copy()

In [None]:
# Using Scikit-Learn's Random Forest to find Feature Importance:

FOREST_SEED = 42              # fixed so that Restart & Run All reproduces the same numbers
TREE_COUNTS = range(64, 129)  # the recommended standard of 64-128 trees, both ends included


def fit_best_forest(X_train, Y_train, X_test, Y_test, tree_counts=TREE_COUNTS, seed=FOREST_SEED):
    """Choose a forest size by out-of-bag accuracy, then score that forest on the test set.

    The number of trees is selected using only the training data (out-of-bag accuracy), so the
    test set is not used for model selection and the returned test accuracy is not inflated by it.

    Parameters
    ----------
    X_train, Y_train : training features and labels.
    X_test, Y_test : held-out features and labels, used only for the final score.
    tree_counts : iterable of candidate n_estimators values, in ascending order.
    seed : random_state passed to every forest.

    Returns
    -------
    (forest, best_trees, test_accuracy) : the fitted RandomForestClassifier, the selected
    number of trees, and its mean accuracy on the test set.
    """
    oob_accuracy = {}
    for trees in tree_counts:
        candidate = RandomForestClassifier(n_estimators=trees,
                                           max_features='sqrt',
                                           oob_score=True,
                                           random_state=seed)
        candidate.fit(X_train, Y_train)
        oob_accuracy[trees] = candidate.oob_score_

    # max() returns the first maximal entry, i.e. the smallest forest among ties when tree_counts is
    # ascending. (Averaging tied tree counts could select a size that was not itself a top scorer.)
    best_trees = max(oob_accuracy, key=oob_accuracy.get)

    forest = RandomForestClassifier(n_estimators=best_trees, max_features='sqrt', random_state=seed)
    forest.fit(X_train, Y_train)
    return forest, best_trees, forest.score(X_test, Y_test)


## Model trained on the dataset that includes the total number of logins:
random_forest, best_trees, best_accuracy_with_total = fit_best_forest(
    X_train_with_total, Y_train_with_total, X_test_with_total, Y_test_with_total)
feature_importance_with_total = random_forest.feature_importances_
# Column names are taken from the training frame so names and importances cannot get out of step.
feature_importance_df_with_total = pd.DataFrame([feature_importance_with_total],
                                                columns=X_train_with_total.columns)

## Repeating process for the dataset without the total number of logins:
random_forest, best_trees, best_accuracy_no_total = fit_best_forest(
    X_train_no_total, Y_train_no_total, X_test_no_total, Y_test_no_total)
feature_importance_no_total = random_forest.feature_importances_
feature_importance_df_no_total = pd.DataFrame([feature_importance_no_total],
                                              columns=X_train_no_total.columns)

## One row per model; TOTAL_LOGINS is NaN for the model that never saw that column.
feature_importance_df = pd.concat([feature_importance_df_with_total, feature_importance_df_no_total],
                                  ignore_index=True)
feature_importance_df['ACCURACY'] = [best_accuracy_with_total, best_accuracy_no_total]
feature_importance_df.index = ['WITH TOTAL', 'WITHOUT TOTAL']

In [None]:
######
###### The total number of logins a user has seems to be the extremely dominant factor in the random forest trained
###### with the dataset containing the total number of logins for each user.
######## The creation_source would be the most dominant factor in the model trained without the login counts.
########### The accuracy value is significantly better for the model trained with the total counts. However, the model
########### trained without the total counts shows more insight into a user when no login information is known beforehand.
feature_importance_df

In [None]:
# Share of each creation_source category among the users labelled as adopted,
# to see which category is most common for them.
examine_df = final_df[final_df.LABEL == 1]

# (printed prefix, integer code used for that creation_source in final_df)
source_codes = (
    ('PERSONAL_PROJECTS: ', 0),
    ('GUEST_INVITE: ', 1),
    ('ORG_INVITE: ', 2),
    ('SIGNUP: ', 3),
    ('SIGNUP_GOOGLE_AUTH: ', 4),
)

for prefix, code in source_codes:
    matching_users = examine_df[examine_df.creation_source == code]
    print(prefix, len(matching_users) / len(examine_df))