In [98]:
import pandas as pd

In [99]:
user_df = pd.read_csv('takehome_users.csv', encoding='latin-1')
user_df_saved = user_df.copy()
engagement_df = pd.read_csv('takehome_user_engagement.csv')

In [100]:
# Convert time_stamp to datetime
engagement_df['time_stamp'] = pd.to_datetime(engagement_df['time_stamp'])



In [101]:
# creates a list of users who have logged in on three separate days in at least one seven-day period
adopted_users_list = []
for user in engagement_df['user_id'].unique():
    user_df = engagement_df[engagement_df['user_id'] == user]
    user_df = user_df.set_index('time_stamp')
    user_df = user_df.resample('D').count()
    user_df = user_df[user_df['visited'] > 0]
    user_df = user_df.rolling(window=7).sum()
    if user_df['visited'].max() >= 3:
        adopted_users_list.append(user)

In [102]:
# Create a column in user_df to indicate if a user is an adopted user
user_df_saved['adopted_user'] = user_df_saved['object_id'].apply(lambda x: 1 if x in adopted_users_list else 0)

In [103]:
#creates dummy variables for the creation_source column
dummy_df = pd.get_dummies(user_df_saved['creation_source'])
user_df_saved = pd.concat([user_df_saved, dummy_df], axis=1)
user_df_saved.drop('creation_source', axis=1, inplace=True)

In [104]:
#fill in missing values with 0
user_df_saved.fillna(0, inplace=True)

In [105]:
#drop columns that are not needed
user_df_saved.drop(['object_id', 'name', 'email',], axis=1, inplace=True)

In [108]:
#sets date columns to datetime
user_df_saved['creation_time'] = pd.to_datetime(user_df_saved['creation_time'])
user_df_saved['last_session_creation_time'] = pd.to_datetime(user_df_saved['last_session_creation_time'], unit='s')

In [109]:
user_df_saved.corr()['adopted_user'].sort_values(ascending=False)

adopted_user                  1.000000
last_session_creation_time    0.248590
org_id                        0.063737
GUEST_INVITE                  0.043657
SIGNUP_GOOGLE_AUTH            0.034821
invited_by_user_id            0.021602
opted_in_to_mailing_list      0.006780
SIGNUP                        0.006635
enabled_for_marketing_drip    0.005074
ORG_INVITE                   -0.003146
PERSONAL_PROJECTS            -0.075949
creation_time                -0.088143
Name: adopted_user, dtype: float64

In [113]:
#makes the last_session_creation_time and first session column into a unix timestamp
user_df_saved['last_session_creation_time'] = user_df_saved['last_session_creation_time'].astype('int64')
user_df_saved['creation_time'] = user_df_saved['creation_time'].astype('int64')


In [115]:
#find the most important features using a random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = user_df_saved.drop('adopted_user', axis=1)
y = user_df_saved['adopted_user']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=34)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

feature_importances = pd.DataFrame(rf.feature_importances_,
                                      index = X_train.columns,
                                        columns=['importance']).sort_values('importance', ascending=False)

print(feature_importances)

print(rf.score(X_test, y_test))

                            importance
last_session_creation_time    0.681473
creation_time                 0.214105
org_id                        0.052932
invited_by_user_id            0.031788
opted_in_to_mailing_list      0.004325
enabled_for_marketing_drip    0.003989
PERSONAL_PROJECTS             0.003604
GUEST_INVITE                  0.002188
ORG_INVITE                    0.002156
SIGNUP_GOOGLE_AUTH            0.001868
SIGNUP                        0.001571
0.9680555555555556
