## Relax Inc. Data science challenge

In [1]:
import pandas as pd
import chardet
import time

### detect encoding type in the file using chardet
Input must be read as bytes

In [2]:
chardet.detect(open('users.csv','rb').read())#confidence 1 implies 100% accurate evaluation

{'confidence': 0.7298523315812625, 'encoding': 'ISO-8859-1', 'language': ''}

In [3]:
users = pd.read_csv('users.csv', encoding='ISO-8859-1', index_col='object_id')
engagement = pd.read_csv('user_engagement.csv')

In [4]:
users.head()

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0
2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0
3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0
4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0
5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0


In [5]:
engagement.head()

Unnamed: 0,time_stamp,user_id,visited
0,2014-04-22 03:53:30,1,1
1,2013-11-15 03:45:04,2,1
2,2013-11-29 03:45:04,2,1
3,2013-12-09 03:45:04,2,1
4,2013-12-25 03:45:04,2,1


Visited column has all 1s, so drop visited<br>
Convert time_stamp to datetime

In [6]:
engagement.drop(['visited'], axis=1, inplace=True)
engagement['time_stamp'] = pd.to_datetime(engagement['time_stamp'])

In [7]:
from datetime import datetime, timedelta

def custom(x):
    """
    Takes property object x
    converts x to list
    returns 1 if a user logged into the product on three separate
    days in at least one seven day period 
    """
    if len(x) >= 3:  # condition to eliminate if user has less than 3 time stamps
        x = [i for i in x]  # convert property object to list
        x.sort()  # sort the dates in increasing order
        x = [x[i+1] - x[i] for i in range(len(x)-2)]  # compute cumulative difference of current and next day
        # sum i, i+1, i+2 terms and check if it`s less than 7 which gives if it`s in a seven day period
        x = [1 for i in range(len(x)-2) if x[i] + x[i+1] + x[i+2] <= timedelta(days=7)]
        # condition to check if there`s a 1 in x, return 1 if true
        if 1 in x:
            return 1

df1 = engagement.groupby('user_id').agg(custom)  # group by user_id and aggregate using custom function
df1.fillna(0, inplace=True)  # fill null values with 0
df1.columns = ['adopted_user']
df1.head()

Unnamed: 0_level_0,adopted_user
user_id,Unnamed: 1_level_1
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0


In [8]:
print('Total number of users:', len(users))
print('Number of adopted users:', len(df1[df1['adopted_user'] == 1]))

Total number of users: 12000
Number of adopted users: 1322


In [9]:
final = users.join(df1, how='left')
final.head()

Unnamed: 0_level_0,creation_time,name,email,creation_source,last_session_creation_time,opted_in_to_mailing_list,enabled_for_marketing_drip,org_id,invited_by_user_id,adopted_user
object_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,2014-04-22 03:53:30,Clausen August,AugustCClausen@yahoo.com,GUEST_INVITE,1398139000.0,1,0,11,10803.0,0.0
2,2013-11-15 03:45:04,Poole Matthew,MatthewPoole@gustr.com,ORG_INVITE,1396238000.0,0,0,1,316.0,0.0
3,2013-03-19 23:14:52,Bottrill Mitchell,MitchellBottrill@gustr.com,ORG_INVITE,1363735000.0,0,0,94,1525.0,0.0
4,2013-05-21 08:09:28,Clausen Nicklas,NicklasSClausen@yahoo.com,GUEST_INVITE,1369210000.0,0,0,1,5151.0,0.0
5,2013-01-17 10:14:20,Raw Grace,GraceRaw@yahoo.com,GUEST_INVITE,1358850000.0,0,0,193,5240.0,0.0


### Feature engineering
* We don\`t need name, email.
* Fill adopted_user null values with 0, because those could be considered not adopted users
* Let\`s also fill invited_by_user_id null values with 0 and check how it performs. 0 because we can consider 0 to be invited by none.
* Drop null rows
* Make a column usage which is difference between last_session_creation_time and creation_time
* Map strings to ints

In [47]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, confusion_matrix
from sklearn import decomposition
from sklearn.preprocessing import MinMaxScaler

In [44]:
x = final.drop(['name', 'email'], axis=1)

creation = {'PERSONAL_PROJECTS': 1,
            'GUEST_INVITE': 2,
            'ORG_INVITE': 3,
            'SIGNUP': 4,
            'SIGNUP_GOOGLE_AUTH': 5}
x['creation_source'] = x['creation_source'].map(lambda x: creation[x])

x['invited_by_user_id'].fillna(0, inplace=True)

x.dropna(axis=0, inplace=True)

x['last_session_creation_time'] = x['last_session_creation_time'].map(lambda x: datetime.
                                            fromtimestamp(int(x)).strftime('%Y-%m-%d %H:%M:%S'))

# some last_session_creation_time and creation time are wrong
x['last_session_creation_time'] = pd.to_datetime(x['last_session_creation_time'])
x['creation_time'] = pd.to_datetime(x['creation_time'])

x['usage'] = x['last_session_creation_time'] - x['creation_time']
x.drop(['creation_time', 'last_session_creation_time'], axis=1, inplace=True)
x = x[x['usage'] >= timedelta(days=0)]

x['usage'] = x['usage'].map(lambda x: x.total_seconds())

complete = x

## Decision Tree classifier
### Without including 'usage' feature

In [52]:
y = complete['adopted_user']
x = complete.drop(['adopted_user', 'usage'], axis=1)
scaler = MinMaxScaler()
scaler.fit(x)
scaler.fit_transform(x)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42, stratify=y)
tree = DecisionTreeClassifier()

tree.fit(X_train, y_train)

y_pred = tree.predict(X_test)

# score
print('Feature importance')
print(tree.feature_importances_)
print('F1 score {}'.format(f1_score(y_test, y_pred)))
# print('Precision {}'.format(precision_score(y_test, y_pred)))
# print('Recall {}'.format(recall_score(y_test, y_pred)))
print('Test Accuracy {}'.format(accuracy_score(y_test, y_pred)))
print()
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))

Feature importance
[ 0.05812533  0.07204793  0.06203569  0.46855366  0.3392374 ]
F1 score 0.20854271356783924
Test Accuracy 0.6706743335075798

Confusion Matrix
[[1200  277]
 [ 353   83]]


### Including 'usage' feature

In [54]:
y = complete['adopted_user']
x = complete.drop(['adopted_user'], axis=1)
scaler = MinMaxScaler()
scaler.fit(x)
scaler.fit_transform(x)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42, stratify=y)
tree = DecisionTreeClassifier()

tree.fit(X_train, y_train)

y_pred = tree.predict(X_test)

# score
print(x.columns)
print('Feature importance')
print(tree.feature_importances_)
print('F1 score {}'.format(f1_score(y_test, y_pred)))
# print('Precision {}'.format(precision_score(y_test, y_pred)))
# print('Recall {}'.format(recall_score(y_test, y_pred)))
print('Test Accuracy {}'.format(accuracy_score(y_test, y_pred)))
print()
print('Confusion Matrix')
print(confusion_matrix(y_test, y_pred))

Index(['creation_source', 'opted_in_to_mailing_list',
       'enabled_for_marketing_drip', 'org_id', 'invited_by_user_id', 'usage'],
      dtype='object')
Feature importance
[ 0.01458177  0.0043199   0.00272969  0.06316761  0.03383866  0.88136237]
F1 score 0.8700696055684455
Test Accuracy 0.9414532148457919

Confusion Matrix
[[1426   51]
 [  61  375]]


* When 'usage' feature is included, Decision tree is classfifying with better accuracy and F1 score compared to when 'usage' feature is not included
* We can consider this Feature importance from Decision trees to know the more important features in predicting adopted user because the model is giving good result
     * The order of Feature importance is
         1. usage
         2. org_id - seems reasonable
         3. invited_by_user_id - may be 0 (no reference) is used to classify that is why it is given more importance
         4. creation_source
         5. opted_in_to_mailing_list
         6. enabled_for_marketing_drip