In [1]:
import warnings
warnings.filterwarnings('ignore') 
import csv
import numpy as np
import scipy.sparse as spr
import pandas as pd

# Load the churn dataset as a list of dicts, one per CSV row, keyed by column name.
# newline='' is what the csv module asks for, so that the reader itself handles
# any newlines embedded in quoted fields.
with open('telecom_churn.csv', 'rt', newline='') as f:
    data = list(csv.DictReader(f))

# Encode the target as an integer: the literal string 'True' becomes 1, anything else 0.
# NOTE(review): csv.DictReader leaves every other field as a str, so the
# DictVectorizer used further down one-hot encodes the numeric columns as well
# (the weight table shows e.g. "Customer service calls | 4") — confirm this is intended.
for row in data:
    row['Churn'] = 1 if row['Churn'] == 'True' else 0

In [2]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Features: every column of a record except the 'Churn' label.
_all_xs = [
    {name: value for name, value in record.items() if name != 'Churn'}
    for record in data
]
# Labels: the 'Churn' column as an integer array, in the same row order.
_all_ys = np.array([int(record['Churn']) for record in data])

# Shuffle features and labels together with a fixed seed, then hold out 25% of
# the shuffled rows for validation (also with a fixed seed).
all_xs, all_ys = shuffle(_all_xs, _all_ys, random_state=0)
train_xs, valid_xs, train_ys, valid_ys = train_test_split(
    all_xs,
    all_ys,
    test_size=0.25,
    random_state=0,
)

print('{} items total'.format(len(all_xs)))

3333 items total


In [3]:
warnings.filterwarnings('ignore', category=DeprecationWarning)
from xgboost import XGBClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

class CSCTransformer:
    """Stateless pipeline step that converts its input to CSC format.

    ``fit`` learns nothing; ``transform`` simply calls ``tocsc()`` on whatever
    it is given, so the input must provide that method.
    """

    def fit(self, *args, **kwargs):
        """Do nothing and return ``self``.

        Any positional and keyword arguments are accepted and ignored, so both
        ``fit(X, y)`` and keyword-style calls such as ``fit(X, y=y)`` work.
        """
        return self

    def transform(self, xs):
        """Return ``xs.tocsc()``."""
        return xs.tocsc()
    
# Vectoriser that turns the feature dicts into a matrix.
vec = DictVectorizer()
# Gradient-boosted classifier, constructed with the library defaults.
clf = XGBClassifier()
# Full model: vectorise -> convert to CSC -> classify.
# `vec` and `clf` are kept as separate names so they can be inspected after fitting.
pipeline = make_pipeline(vec, CSCTransformer(), clf)

def evaluate(_clf):
    """Report 10-fold cross-validated accuracy for ``_clf``, then fit it.

    Cross-validation runs on the module-level ``all_xs`` / ``all_ys``; the
    printed line shows the mean fold accuracy and twice the standard deviation
    across folds. Afterwards ``_clf`` is fitted on ``train_xs`` / ``train_ys``.
    Nothing is returned.
    """
    fold_accuracies = cross_val_score(
        _clf, all_xs, all_ys, scoring='accuracy', cv=10)
    mean_accuracy = np.mean(fold_accuracies)
    spread = 2 * np.std(fold_accuracies)
    print('Accuracy: {:.4f} ± {:.4f}'.format(mean_accuracy, spread))
    # Fit on the training split so that the parts of the estimator passed in
    # are themselves fitted once this function returns.
    _clf.fit(train_xs, train_ys)
     
# Print the cross-validated accuracy and leave `pipeline` fitted on the training split.
evaluate(pipeline)

Accuracy: 0.8788 ± 0.0195


In [4]:
from xgboostinterpreter import xgboostinterpreter as xgbi

In [5]:
# Print a weight table for the fitted classifier. `vec` is presumably used to map
# feature indices back to names, and 15 looks like the row limit (the table below
# has 15 rows) — TODO confirm against the xgboostinterpreter API.
xgbi.weight(clf, vec, 15)

+-----------+------------------------+-------+
|    Weight | Feature                | Value |
+-----------+------------------------+-------+
| 0.0498887 | International plan     |    No |
| 0.0475853 | Customer service calls |     4 |
| 0.0452156 | Total intl calls       |     2 |
| 0.0414905 | Customer service calls |     5 |
| 0.0312565 | Total intl calls       |     1 |
| 0.0292658 | Customer service calls |     6 |
| 0.0248381 | Number vmail messages  |     0 |
| 0.0180267 | State                  |    NJ |
| 0.0179378 | State                  |    TX |
| 0.0172371 | Total day calls        |   106 |
| 0.0169974 | Total intl charge      |  3.75 |
| 0.0153934 | Account length         |   133 |
|  0.015176 | Total day calls        |   125 |
| 0.0149604 | Total night calls      |   132 |
| 0.0146561 | Total day calls        |   124 |
+-----------+------------------------+-------+


In [6]:
# Explain the prediction for the first validation record: the output below shows
# the predicted class, its probability and a per-feature contribution table.
xgbi.predict(clf, valid_xs[0], vec)

Answer:  0
with probability:  0.710383
+--------------+------------------------+-------+
| Contribution | Feature                | Value |
+--------------+------------------------+-------+
|     1.176790 | Total night calls      |   132 |
|     0.183176 | Number vmail messages  |     0 |
|     0.041739 | Customer service calls |     1 |
|     0.007670 | Total night calls      |   113 |
|     0.005896 | Total night calls      |    92 |
|     0.005811 | Account length         |    87 |
|     0.005522 | Total night calls      |    96 |
|     0.004717 | Total day calls        |   107 |
|     0.004552 | Total night calls      |   110 |
|     0.003040 | State                  |    IL |
|     0.002366 | Total intl charge      |  2.54 |
|     0.002361 | Total day calls        |    98 |
|     0.002302 | State                  |    WI |
|     0.002215 | State                  |    VA |
|     0.002130 | Total intl charge      |  3.02 |
|     0.002013 | Total day calls        |    70 |
|     0.001