# Setup

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier

We initially read the user table. Two fields (`creation_time` and `last_session_creation_time`) should be times, but `last_session_creation_time` is given as a Unix epoch timestamp. This won't be read as a date time using the `parse_dates` parameter, but can be converted later.

Although `opted_in_to_mailing_list` and `enabled_for_marketing_drip` are boolean flags, the read below leaves them in their stored form rather than converting them. Additionally, `invited_by_user_id` should be integer-typed, but the usual int type isn't nullable, so we use the Pandas `Int64` type, which is nullable.

In [5]:
# Load the user table: parse `creation_time` as a datetime while reading, and use the nullable Int64 dtype for
#   `invited_by_user_id` so that missing inviters do not force the column to float.
users_df = pd.read_csv('./takehome_users.csv', 
                       parse_dates=['creation_time'], 
                       dtype= {'invited_by_user_id': 'Int64'})

In [None]:
users_df.head()

In [None]:
users_df.info()

In [None]:
users_df['last_session_creation_time'] = pd.to_datetime(users_df['last_session_creation_time'],unit='s')

In [None]:
users_df.head()

In [None]:
users_df.info()

Now, read the engagement table. Here, the only optional argument is to read the `time_stamp` column as a datetime.

In [6]:
engage_df = pd.read_csv('./takehome_user_engagement.csv', parse_dates=['time_stamp'])

In [None]:
engage_df.head()

In [None]:
engage_df.info()

In [None]:
engage_df.visited.unique()

In [None]:
engage_df.visited.value_counts()

There is no reason to continue carrying `visited` as a field as it adds nothing of value.

In [None]:
engage_df.drop('visited', axis=1, inplace=True)

# Feature Engineering

Most user data is present. There are two fields with null values - `last_session_creation_time` and `invited_by_user_id`. The latter is not surprising as we would not expect all users to join via invite. However, we might expect all users to have a `last_session_creation_time`. We can count the number of sessions per user.

In [None]:
session_count_df = engage_df.groupby('user_id').count()

Notably, the grouped session data has 8823 rows, the same as the number of non-null `last_session_creation_time` entries in the user data.

In [None]:
session_count_df.info()

Change the field name to something more descriptive.

In [None]:
session_count_df.rename({'time_stamp': 'session_count'}, axis=1, inplace=True)

In [None]:
session_count_df.info()

In [None]:
users_df = users_df.merge(session_count_df, how='left', left_on='object_id', right_index=True)

In [None]:
users_df.info()

Unsurprisingly, session counts are missing (null) for users with a null `last_session_creation_time`.

In [None]:
users_df[users_df.last_session_creation_time.isna()].info()

In the `session_count` field, null values have a natural interpretation as a zero count.

In [None]:
users_df.fillna(value={'session_count': 0}, inplace=True)

In [None]:
users_df.info()

The `session_count` column can be cast to integer now since it no longer has null values.

In [None]:
users_df['session_count'] = users_df['session_count'].astype(int)

In [None]:
users_df.info()

We'll add the number of invites from an individual user as well.

In [None]:
invites_df = users_df.groupby('invited_by_user_id')['object_id'].count()

In [None]:
invites_df.head()

In [None]:
users_df = users_df.merge(invites_df, how='left', left_on='object_id', right_index=True)

In [None]:
users_df.rename({'object_id_x':'object_id', 'object_id_y': 'invite_count'},axis=1, inplace=True)

In [None]:
users_df.info()

The `object_id` field became object type. We will change it back.

In [None]:
users_df['object_id'] = users_df['object_id'].astype(int)

As earlier, it makes sense for all null `invite_count` entries to be zero.

In [None]:
users_df.fillna(value={'invite_count': 0}, inplace=True)

In [None]:
users_df.info()

Rather than track who a user was invited by, we can track simply whether they were invited or not.

In [None]:
users_df['was_invited'] = users_df['invited_by_user_id'].notna().astype(int)

In [None]:
users_df.head()

In [None]:
users_df.was_invited.value_counts()

The field `org_id` is meaningless by itself, but we can track the number of users in each org grouping.

In [None]:
orgs_df = users_df[['object_id', 'org_id']].groupby('org_id').count()
orgs_df.rename({'object_id': 'org_count'}, axis=1, inplace=True)

In [None]:
orgs_df.head()

In [None]:
users_df = users_df.merge(orgs_df, how='left', left_on='org_id', right_index=True)

In [None]:
users_df.head()

In [None]:
users_df.info()

The specific time that an account was created is unlikely to tell us anything. However, we can determine how early/late the account was created relative to others.

In [None]:
min_creation_time = users_df.creation_time.min()
print(min_creation_time)

In [None]:
users_df['creation_days_since_start'] = users_df['creation_time'] - min_creation_time

In [None]:
# Keep only the whole-day component of each timedelta.
users_df['creation_days_since_start'] = users_df['creation_days_since_start'].dt.days

In [None]:
users_df.head()

Most importantly, we need to determine whether or not users are adopted users. An adopted user is considered to be a user who has logged into the product on three separate days in at least one seven-day period. Since we're only given a single login timestamp for each session, we can't consider total session length.

In [None]:
users_df.session_count.describe()

In [None]:
# At least three separate days are required over a single seven day period. 
REQUIRED_COUNT = 3
REQUIRED_LENGTH = 7

def is_adopted_user(timestamps):
    '''
    Given a series of timestamps corresponding to a user's session logins, determines whether that user is an
    adopted user.

    A user is adopted when at least REQUIRED_COUNT distinct calendar days with a login fall inside some period of
    REQUIRED_LENGTH consecutive days.

    Parameters:
        timestamps: iterable of datetime-like objects exposing a .date() method (e.g. a Series of Timestamps).

    Returns:
        True if the user is adopted, False otherwise.
    '''
    # Collapse the logins to distinct calendar days and sort them. Several logins on the same day must count as a
    #   single day, otherwise e.g. two logins on one day plus one on another would be mistaken for three days.
    dates = sorted({t.date() for t in timestamps})

    # With fewer distinct days than required, no period can qualify.
    if len(dates) < REQUIRED_COUNT:
        return False

    # Compare each day with the day REQUIRED_COUNT - 1 positions later in the sorted list. If fewer than
    #   REQUIRED_LENGTH days separate them, all days in between fall within the same period. For example, logins on
    #   January 1st, 4th, and 8th differ by seven days, which is an 8 day period and therefore does not qualify.
    jmp = REQUIRED_COUNT - 1
    return any((last_session - first_session).days < REQUIRED_LENGTH
               for first_session, last_session in zip(dates, dates[jmp:]))

In [None]:
# Collect the ids of all users whose login history qualifies them as adopted.
adopted_ids = [user_id
               for user_id, sessions in engage_df.groupby('user_id')['time_stamp']
               if is_adopted_user(sessions)]

# users_df has not been indexed by `object_id` yet (that happens later), so match on the `object_id` column
#   instead of addressing rows by label. Users without any recorded session default to 0.
users_df['is_adopted'] = users_df['object_id'].isin(adopted_ids).astype(int)

In [None]:
users_df.info()

In [None]:
users_df.is_adopted.value_counts()

In [None]:
# Share of adopted users among users with at least one recorded session, computed from the data rather than
#   from hard-coded counts.
print(f"{users_df['is_adopted'].sum() / (users_df['session_count'] > 0).sum():0.3%}")

In [None]:
# Share of adopted users among all users, computed from the data rather than from hard-coded counts.
print(f"{users_df['is_adopted'].sum() / len(users_df):0.3%}")

Of the 8823 users who logged in at least once, only 1602 became adopted (18.157% of those users, or 13.350% of all users).

Of the remaining fields, `object_id`, `name` and `email` will not play a role in the analysis. Additionally, `creation_time`, `org_id`, `last_session_creation_time` and `invited_by_user_id` are now largely covered by other fields.

In [None]:
users_df.set_index('object_id',drop=True, inplace=True)

In [None]:
users_df.drop(['name', 'email', 'creation_time', 'org_id', 'last_session_creation_time', 'invited_by_user_id'], 
              axis=1, inplace=True)

In [None]:
users_df.info()

In [None]:
# One-hot encode `creation_source`. dtype=int pins the indicator columns to a numeric type regardless of the
#   pandas version's default, since the later normalization and correlation steps expect numeric columns.
users_df = pd.concat(
        [users_df.drop('creation_source',axis=1),
         pd.get_dummies(users_df['creation_source'], prefix='source', dtype=int)],
         axis=1)

In [None]:
users_df.head()

In [None]:
users_df.info()

# EDA

In [None]:
users_df.describe()

Set aside users who logged in at least three times, as these are the only users who could potentially become adopted.

In [None]:
heavy_users = users_df.session_count >= 3

In [None]:
print(heavy_users.sum())

Normalize features.

In [None]:
# Scale every column in place by dividing it by its own maximum.
# NOTE(review): this is applied to the whole table, including the `is_adopted` target, and before the train/test
#   split is made - confirm this is intended.
users_df /= users_df.max()

In [None]:
users_df.describe()

In [None]:
f, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(users_df.corr(), cmap='vlag', vmin=-1, vmax=1, annot=True, center=0, fmt='0.3f',
            square=True, linewidths=.5, cbar_kws={"shrink": .8}, annot_kws={'rotation': 45});

The field `is_adopted` has little correlation with other fields at first glance. This remains the same when looking only at users with three or more sessions.

In [None]:
f, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(users_df[heavy_users].corr(), cmap='vlag', vmin=-1, vmax=1, annot=True, center=0, fmt='0.3f',
            square=True, linewidths=.5, cbar_kws={"shrink": .8}, annot_kws={'rotation': 45});

## PCA Reduction

In [None]:
# Let PCA pick the number of components itself (n_components='mle') and print the share of variance explained
#   by each retained component.
# NOTE(review): users_df still contains the `is_adopted` target column here, so the target takes part in the
#   decomposition - confirm this is intended.
pca = PCA(n_components='mle')
pca.fit(users_df)
print(pca.explained_variance_ratio_)

In [None]:
np.cumsum(pca.explained_variance_ratio_)

The PCA reduction only eliminates two dimensions, so we don't gain much.

# Model Fitting

There is some imbalance in our target feature, so we stratify when creating our train/test splits.

In [None]:
# Split settings, reused by the classifiers below.
TEST_SIZE = 0.20
RANDOM_STATE = 42

# Separate the target from the features, then hold out a test set that preserves the class proportions.
y = users_df['is_adopted']
X = users_df.drop('is_adopted', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=RANDOM_STATE,
)

In [None]:
# Metric dictionaries for every evaluated classifier, keyed by classifier name.
scores = {}
def score_classifier(name, y_true, y_pred):
    '''
    Computes accuracy, balanced accuracy, F1, precision and recall for a set of predictions and stores them in the
    module-level `scores` dictionary under `name`, replacing any earlier entry with that name.

    Parameters:
        name: key under which the metrics are stored.
        y_true: true labels.
        y_pred: predicted labels.
    '''
    # zero_division=0 is passed to every metric that accepts it, so that a classifier which never predicts the
    #   positive class is scored as 0 consistently instead of only precision being handled explicitly.
    scores[name] = {
        'acc': metrics.accuracy_score(y_true, y_pred),
        'bal_acc': metrics.balanced_accuracy_score(y_true, y_pred),
        'f1': metrics.f1_score(y_true, y_pred, zero_division=0),
        'prec': metrics.precision_score(y_true, y_pred, zero_division=0),
        'rec': metrics.recall_score(y_true, y_pred, zero_division=0),
    }

In [None]:
def train_and_score_classifier(name, clf):
    '''
    Fits `clf` on the training split, predicts the test split, and records the resulting metrics under `name`.
    '''
    clf.fit(X_train, y_train)
    test_predictions = clf.predict(X_test)
    score_classifier(name, y_true=y_test, y_pred=test_predictions)

In [None]:
def print_classifier_scores(name):
    '''
    Prints the metrics stored in `scores` for the classifier `name`, one metric per line.
    '''
    stored = scores[name]
    # Display label paired with the key it is stored under, in print order.
    layout = (
        ('Accuracy', 'acc'),
        ('Balanced Accuracy', 'bal_acc'),
        ('F1', 'f1'),
        ('Precision', 'prec'),
        ('Recall', 'rec'),
    )
    for label, key in layout:
        print(f'{label}: {stored[key]}')

## Baselines

Two dummy classifiers are trained to provide baselines. The first always chooses the majority class and the second makes a random choice based on the proportion of adopted users in the data set.

In [None]:
dummy_freq_clf = DummyClassifier(strategy="most_frequent")
train_and_score_classifier('dummy_freq', dummy_freq_clf)
print_classifier_scores('dummy_freq')

In [None]:
dummy_strat_clf = DummyClassifier(strategy="stratified", random_state=RANDOM_STATE)
train_and_score_classifier('dummy_strat', dummy_strat_clf)
print_classifier_scores('dummy_strat')

## Logistic Regression

It seems that logistic regression on its own always predicts no adoption.

In [None]:
lr_clf = LogisticRegression()
train_and_score_classifier('logistic', lr_clf)
print_classifier_scores('logistic')

A balanced regression is an improvement.

In [None]:
bal_lr_clf = LogisticRegression(class_weight='balanced')
train_and_score_classifier('bal_logistic', bal_lr_clf)
print_classifier_scores('bal_logistic')

In [None]:
lr_l1_clf = LogisticRegression(penalty='l1', solver='liblinear')
train_and_score_classifier('logistic_l1_reg', lr_l1_clf)
print_classifier_scores('logistic_l1_reg')

In [None]:
bal_lr_l1_clf = LogisticRegression(penalty='l1', solver='liblinear', class_weight='balanced')
train_and_score_classifier('bal_logistic_l1_reg', bal_lr_l1_clf)
print_classifier_scores('bal_logistic_l1_reg')

## KNN

In [None]:
knn_clf = KNeighborsClassifier()
train_and_score_classifier('knn_reg', knn_clf)
print_classifier_scores('knn_reg')

## Ridge Regression

In [None]:
rc_clf = RidgeClassifier()
train_and_score_classifier('ridge', rc_clf)
print_classifier_scores('ridge')

In [None]:
bal_rc_clf = RidgeClassifier(class_weight='balanced')
train_and_score_classifier('bal_ridge', bal_rc_clf)
print_classifier_scores('bal_ridge')

## Naive Bayes

In [None]:
nb_clf = BernoulliNB()
train_and_score_classifier('naive_bayes', nb_clf)
print_classifier_scores('naive_bayes')

## Random Forests

Unlike with logistic regression, balancing the class weight doesn't improve the random forest results.

In [None]:
rf_clf = RandomForestClassifier(random_state=RANDOM_STATE)
train_and_score_classifier('random_forest', rf_clf)
print_classifier_scores('random_forest')

In [None]:
bal_rf_clf = RandomForestClassifier(class_weight='balanced', random_state=RANDOM_STATE)
train_and_score_classifier('bal_random_forest', bal_rf_clf)
print_classifier_scores('bal_random_forest')

## Support Vector Classifier

In [None]:
svc_clf = LinearSVC(dual=False)
train_and_score_classifier('svc', svc_clf)
print_classifier_scores('svc')

As with logistic regression, we see a slight improvement with a balanced class weight.

In [None]:
bal_svc_clf = LinearSVC(class_weight='balanced', dual=False)
train_and_score_classifier('bal_svc', bal_svc_clf)
print_classifier_scores('bal_svc')

## Gradient Boosting

In [None]:
gb_clf = HistGradientBoostingClassifier(random_state=RANDOM_STATE)
train_and_score_classifier('grad_boost', gb_clf)
print_classifier_scores('grad_boost')