In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the cleaned dataset from its feather file.
cleaned_path = "results/cleaned_df.feather"
df = pd.read_feather(cleaned_path)

In [3]:
# Eyeball five randomly chosen rows as a quick sanity check.
df.sample(n=5)

Unnamed: 0,CreditScore,GeographyGermany,GeographySpain,GenderMale,Age,Tenure,Balance,NumOfProducts,HasCrCard1,IsActiveMember1,EstimatedSalary,Exited
3919,763.0,0.0,1.0,0.0,39.0,7.0,0.0,2.0,1.0,0.0,9.876104,0
3982,614.0,0.0,0.0,1.0,46.0,4.0,0.0,1.0,1.0,0.0,11.21695,1
5297,572.0,1.0,0.0,1.0,51.0,8.0,11.490179,3.0,1.0,1.0,12.170525,1
9225,594.0,1.0,0.0,0.0,32.0,4.0,11.69588,2.0,1.0,1.0,12.001277,0
9083,427.0,0.0,1.0,1.0,40.0,8.0,0.0,2.0,1.0,1.0,11.32505,0


In [4]:
# Summary statistics for the numeric columns, with the quartiles spelled out
# explicitly (these are the values describe() uses by default).
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,CreditScore,GeographyGermany,GeographySpain,GenderMale,Age,Tenure,Balance,NumOfProducts,HasCrCard1,IsActiveMember1,EstimatedSalary
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,650.5288,0.2509,0.2477,0.5457,38.9218,5.0128,7.441327,1.5302,0.7055,0.5151,11.208386
std,96.653299,0.433553,0.431698,0.497932,10.487806,2.892174,5.6064,0.581654,0.45584,0.499797,1.000216
min,350.0,0.0,0.0,0.0,18.0,0.0,0.0,1.0,0.0,0.0,2.532108
25%,584.0,0.0,0.0,0.0,32.0,3.0,0.0,1.0,0.0,0.0,10.839642
50%,652.0,0.0,0.0,1.0,37.0,5.0,11.484521,1.0,1.0,1.0,11.514873
75%,718.0,1.0,0.0,1.0,44.0,7.0,11.75701,2.0,1.0,1.0,11.914311
max,850.0,1.0,1.0,1.0,92.0,10.0,12.432806,4.0,1.0,1.0,12.20604


Everything seems to be in order. Let's try two other strategies for evaluating feature importance.

In [5]:
# Separate the target (`Exited`) from the predictors.
# `columns=` is used instead of a positional axis argument: pandas 2.0 removed
# the positional form, so `df.drop('Exited', 1)` raises a TypeError there.
y = df["Exited"]
X = df.drop(columns="Exited")

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=22,
)
X_train, X_test, y_train, y_test = split_parts

## Logistic Regression (With L1 Regularization)

L1 regularization encourages sparsity. With a strong enough regularization penalty, the model can determine which features are not worth the cost. I could get a relative ranking of the features by increasing the penalty and noting which features drop out first, but I'm running out of time.

In [70]:
# L1-penalised logistic regression used as a feature selector: coefficients
# shrunk exactly to zero mark features the model judges not worth their cost.
# NOTE(review): X_train is fitted unscaled, so the penalty acts on coefficients
# of features with very different ranges (e.g. CreditScore vs. the 0/1
# indicator columns) — consider standardising before reading too much into
# which weights vanish.
log_reg = LogisticRegression(
    penalty='l1',
    C=2**-8,  # Inverse regularization strength: smaller C means a stronger penalty
    solver='liblinear',
    random_state=22,  # liblinear shuffles the data; pin the seed (same one as the split) so refits are repeatable
)
log_reg.fit(X_train, y_train)
y_predict_proba = log_reg.predict_proba(X_test)  # class probabilities, used for log loss
y_predict = log_reg.predict(X_test)  # hard class labels, used for accuracy

In [71]:
# Log loss of the logistic model's predicted probabilities on the held-out split.
log_loss(y_true=y_test, y_pred=y_predict_proba)

0.46310564268084053

In [72]:
# Fraction of held-out rows the logistic model labels correctly.
accuracy_score(y_true=y_test, y_pred=y_predict)

0.79000000000000004

In [73]:
# Names of the features whose coefficient is exactly zero in the fitted model.
zero_weight_mask = log_reg.coef_.ravel() == 0
X_train.columns.values[zero_weight_mask]

array(['GeographyGermany', 'GeographySpain', 'GenderMale', 'NumOfProducts',
       'HasCrCard1'], dtype=object)

First, notice that this regularized model fits more poorly than the naive model. I could adjust the C parameter to be more forgiving, but I am more concerned with identifying the weak features than with predictive power.

Geography, gender, the number of products, and credit card ownership all drop out. It's interesting that this model removed geography entirely while the previous model did not. If I were to increase the regularization strength, I would predict that tenure and salary would be the next to drop out.

## Random Forest Feature Importance

Random forests are capable of modeling nonlinear relationships. They tend to be excellent predictors as well.

By measuring how much each feature reduces impurity, averaged across trees of limited depth, we can rank the relative information gain between features.

In [35]:
# Depth-limited random forest. The seed is pinned (same value as the train/test
# split) because forest construction is randomised: without it the metrics and
# the feature-importance ranking below change from run to run.
random_forest = RandomForestClassifier(max_depth=7, random_state=22)

In [36]:
# Fit the forest on the training split, then score the held-out split.
random_forest.fit(X_train, y_train)
rf_predict = random_forest.predict(X_test)  # hard class labels
rf_predict_proba = random_forest.predict_proba(X_test)  # class probabilities
log_loss(y_true=y_test, y_pred=rf_predict_proba)

0.35299731435725068

In [37]:
# Fraction of held-out rows the random forest labels correctly.
accuracy_score(y_true=y_test, y_pred=rf_predict)

0.85799999999999998

Notice that the random forest is a better model than the previous two.

In [39]:
# Per-feature importance scores from the fitted forest; later cells index
# X_train's columns with the sort order of this array.
rf_feature_importance = random_forest.feature_importances_

In [42]:
# Column positions ordered from least to most important.
features_ranked = rf_feature_importance.argsort()

In [51]:
# Feature names from most to least important (reverse of the ascending ranking).
names_worst_to_best = X_train.columns.values[features_ranked]
names_worst_to_best[::-1]

array(['Age', 'NumOfProducts', 'IsActiveMember1', 'Balance',
       'GeographyGermany', 'CreditScore', 'EstimatedSalary', 'Tenure',
       'GenderMale', 'GeographySpain', 'HasCrCard1'], dtype=object)

A surprising result! The number of products was determined to be a weak feature in both the previous models, but now it's considered the second most important feature. If I had more time, I would try converting this feature into an ordered factor to tease out this nonlinear relationship.

The random forest tends to agree with the other models on the other weak features. 