In [1]:
%matplotlib notebook
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from statsmodels.stats.outliers_influence import variance_inflation_factor

import warnings
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv('equity_value_data.csv')
#data.drop('Unnamed: 0',axis = 1, inplace =True)
data['date'] = [dt.datetime.strptime(i[0:10], "%Y-%m-%d") for i in data.timestamp]
#data.head()

In [None]:
# timestring = [i[11:19] for i in data.timestamp]
# x = pd.DataFrame(timestring)
# x.rename(columns = {0:'time'},inplace = True)
# x.time.unique()

In [6]:
# np.min(data.close_equity)
## 10.0

In [7]:
############# ANSWER TO QUESTION A ###############
data['interval'] = pd.DataFrame(data.groupby('user_id')['date'].diff())
data['interval_days'] = [i.total_seconds()/(24 * 60 * 60) for i in data['interval']]
churndata = data[data.interval_days >= 28] 
churnid = list(churndata.user_id.unique())
churnrate = len(churnid)/len(list(data.user_id.unique()))
print('Answer A: The percentage of users that have churned is')
print(churnrate)

Answer A: The percentage of users that have churned is
0.04996418338108882


In [8]:
features = pd.read_csv('features_data.csv')
features['churned'] = features.user_id.apply(lambda x: 1 if x in churnid else 0)
# print(features.head())
# print(features.columns)

In [9]:
for i in ['risk_tolerance', 'investment_experience', 'liquidity_needs',
       'platform', 'instrument_type_first_traded','time_horizon']:
    print(i + ' has the following values:')
    print(features[i].unique())
    print()

risk_tolerance has the following values:
['high_risk_tolerance' 'med_risk_tolerance' 'low_risk_tolerance']

investment_experience has the following values:
['limited_investment_exp' 'no_investment_exp' 'good_investment_exp'
 'extensive_investment_exp']

liquidity_needs has the following values:
['very_important_liq_need' 'not_important_liq_need'
 'somewhat_important_liq_need']

platform has the following values:
['Android' 'iOS' 'both']

instrument_type_first_traded has the following values:
['stock' 'etp' 'adr' 'wrt' 'mlp' '0' 'lp' 'rlt' 'reit' 'cef' 'tracking']

time_horizon has the following values:
['med_time_horizon' 'short_time_horizon' 'long_time_horizon']



In [10]:
# dfinst = pd.DataFrame(df.groupby('instrument_type_first_traded')['user_id'].count())
# dfinstchurned = pd.DataFrame(df[df.churned == 1].groupby('instrument_type_first_traded')['user_id'].count())
# print(dfinst.sort_values('user_id',ascending = False))
# print(dfinstchurned.sort_values('user_id', ascending = False))
# (dfinstchurned/dfinst).sort_values('user_id', ascending = False)

In [11]:
df = features.copy()
#********* encode category features
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
ohe = OneHotEncoder(sparse = False)
ohedf = pd.DataFrame(ohe.fit_transform(df[['platform','instrument_type_first_traded']]))
ohedf.rename(columns = {0:'Android', 1:'both', 2:'iOS',
                       3:'0',4:'adr',5:'cef',6:'etp',7:'lp',8:'mlp',9:'reit',10:'rlt',11:'stock',12:'tracking',13:'wrt'}, inplace = True)

oe = OrdinalEncoder(categories = [['low_risk_tolerance', 'med_risk_tolerance', 'high_risk_tolerance'],
                                  ['no_investment_exp','limited_investment_exp','good_investment_exp','extensive_investment_exp'],
                                  ['not_important_liq_need','somewhat_important_liq_need','very_important_liq_need'],
                                  ['short_time_horizon','med_time_horizon','long_time_horizon']
                                 ])
oedf = pd.DataFrame(oe.fit_transform(df[['risk_tolerance','investment_experience','liquidity_needs','time_horizon']]))
oedf.rename(columns = dict(zip([0,1,2,3],['risk_tolerance','investment_experience','liquidity_needs','time_horizon'])), inplace = True)  

dffinal = pd.merge(ohedf, oedf, how = 'outer',left_index = True, right_index = True)
dffinal = pd.merge(df[['time_spent','first_deposit_amount','user_id','churned']],dffinal, how = 'outer', left_index = True, right_index = True)
dffinal.set_index('user_id',inplace = True)
#dffinal.head()

In [12]:
#********* scaling features with max > 1
to_scale = [col for col in dffinal.columns if dffinal[col].max()>1]
mms = MinMaxScaler()
scaled = pd.DataFrame(mms.fit_transform(dffinal[to_scale]),columns = to_scale)
scaled.set_index(dffinal.index, inplace = True)

dffinal.drop(to_scale, axis = 1, inplace = True)
dffinal = pd.concat([scaled,dffinal],axis = 1)

In [13]:
#******** display all fields from the final data set
fields_all = [
              'time_spent', 'first_deposit_amount', 
              'risk_tolerance','investment_experience', 'liquidity_needs', 'time_horizon', 
              'churned',
              'Android', 'both', 'iOS', 
              '0', 'adr', 'cef', 'etp', 'lp', 'mlp', 'reit', 'rlt', 'stock', 'tracking', 'wrt'
             ]

In [14]:
#******** remove one dummy from each non-ordinal category to avoid multicollinearity
# remove 'Android' based on correlation
# remove  'Stock'  based on VIF
# remove 'churned' the dependent variable
features_selected = [
                    'time_spent', 'first_deposit_amount', 
                    'risk_tolerance','investment_experience', 'liquidity_needs', 'time_horizon', 
                    'both', 'iOS',
                    '0','adr', 'cef', 'etp', 'lp', 'mlp', 'reit', 'rlt','tracking', 'wrt' 
                     ]

In [15]:
# Define feature set and DV
X = dffinal[features_selected]
y = dffinal['churned']

# Use VIF to check multicollinearity
vif = pd.DataFrame([variance_inflation_factor(X.values, i) for i in range(X.shape[1])], index=X.columns)
#print(vif)
# vif of stock is still pretty large, hence removed
#X.drop('stock', axis = 1, inplace = True)
#vif = pd.DataFrame([variance_inflation_factor(X.values, i) for i in range(X.shape[1])], index=X.columns)
#print(vif)

In [16]:
correlated_features = set()
correlation_matrix = X.corr()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.7:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)
print(correlated_features)

set()


In [17]:
# Remove correlated features
#X.drop('iOS', axis = 1, inplace = True)

# Use SMOTE to deal with imbalanced data
sm = SMOTE(random_state = 50)
X_sm, y_sm = sm.fit_resample(X,y)

# Train-Test splitting
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, random_state = 50)

In [18]:
# print(X.shape)
# print(X_sm.shape)
# print(X_train.shape)

In [19]:
# Use RFECV to select features

#**** define a classifer
#clf = LogisticRegression()
#clf = KNeighborsClassifier(n_neighbors=5)
clf = RandomForestClassifier(random_state = 100)

# define RFECV
rfecv  = RFECV(estimator = clf, step = 1, scoring = 'f1')

# fit a model
rfecv.fit(X_train, y_train)

RFECV(estimator=RandomForestClassifier(random_state=100), scoring='f1')

In [20]:
print('Optimal number of features: {}'.format(rfecv.n_features_))

Optimal number of features: 13


In [21]:
# plt.figure(figsize=(10, 7))
# plt.title('Recursive Feature Elimination with CV', fontsize=13, fontweight='bold')
# plt.xlabel('Number of features used', fontsize=13)
# plt.ylabel('classification score', fontsize=13)
# plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_, color='#303F9F', linewidth=2)
# plt.show()

In [22]:
#rfecv.support_

In [23]:
# Drop features that were not selected
X.drop(X.columns[np.where(rfecv.support_ == False)[0]], axis=1, inplace=True)
X_sm.drop(X_sm.columns[np.where(rfecv.support_ == False)[0]], axis=1, inplace=True)
X_train.drop(X_train.columns[np.where(rfecv.support_ == False)[0]], axis=1, inplace=True)
X_test.drop(X_test.columns[np.where(rfecv.support_ == False)[0]], axis=1, inplace=True)

In [24]:
###################### Answer to QUESTION C ##################
featureimportance = pd.DataFrame(pd.Series(dict(zip(X.columns,rfecv.estimator_.feature_importances_))))
featureimportance.rename(columns = {0:'importance'}).sort_values('importance', ascending = False)

Unnamed: 0,importance
first_deposit_amount,0.340412
time_spent,0.283236
investment_experience,0.120819
risk_tolerance,0.064993
time_horizon,0.056948
liquidity_needs,0.048801
iOS,0.027806
etp,0.015572
both,0.014681
adr,0.012557


In [25]:
#**** Re-fit the model
clf.fit(X_train, y_train)
y_predicted = clf.predict(X_test)

print(classification_report(y_test, y_predicted))
print(confusion_matrix(y_test, y_predicted))

#*** perform cross validation
print(cross_val_score(clf, X_train, y_train, cv = 5, scoring = 'precision').mean())
print(cross_val_score(clf, X_train, y_train, cv = 5, scoring = 'recall').mean())

              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1326
           1       0.93      0.93      0.93      1327

    accuracy                           0.93      2653
   macro avg       0.93      0.93      0.93      2653
weighted avg       0.93      0.93      0.93      2653

[[1235   91]
 [  96 1231]]
0.9222054118746339
0.9145308302518883


In [26]:
#**** Apply the model to the original dataset (without re-sampling from SMOTE)
y_OriginalSample = clf.predict(X)
print(classification_report(y, y_OriginalSample))
print(confusion_matrix(y, y_OriginalSample))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      5305
           1       0.72      0.82      0.76       279

    accuracy                           0.97      5584
   macro avg       0.85      0.90      0.88      5584
weighted avg       0.98      0.97      0.98      5584

[[5214   91]
 [  50  229]]


In [27]:
###################### Answer to QUESTION B ##################
pd.DataFrame(y_OriginalSample).set_index(dffinal.index).to_csv('classification_result.csv')
pd.DataFrame(y_OriginalSample).set_index(dffinal.index)

Unnamed: 0_level_0,0
user_id,Unnamed: 1_level_1
895044c23edc821881e87da749c01034,1
458b1d95441ced242949deefe8e4b638,0
c7936f653d293479e034865db9bb932f,0
b255d4bd6c9ba194d3a350b3e76c6393,0
4a168225e89375b8de605cbc0977ae91,0
...,...
03880c726d8a4e5db006afe4119ad974,1
ae8315109657f44852b24c6bca4decd6,1
f29c174989f9737058fe808fcf264135,0
24843497d1de88b2e7233f694436cb3a,0
