In [24]:
### Author: Matthew Oremland

# import necessary packages
import json
print 'JSON', json.__version__

import pandas as pd
print 'pandas', pd.__version__
import numpy as np
print 'numpy', np.__version__
import sklearn
print 'sklearn', sklearn.__version__
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

JSON 2.0.9
pandas 0.24.2
numpy 1.16.0
sklearn 0.19.2


In [2]:
# Read data, cast as pandas DataFrame
with open('challenge.json') as json_file:  
    raw_data = json.load(json_file)

data = pd.DataFrame(raw_data)

In [3]:
# Remove data with no variation; it cannot be used in any meaningful way. For this analysis, also remove date data
remove_list = []
for var in data.columns.tolist():
    if len(data[var].unique()) == 1:
        print 'Removing', var, '- no variation.'
        remove_list.append(var)
data = data.drop(columns=remove_list+['register_date'])

Removing branch_id - no variation.


In [4]:
# create dummy variables for categorical variables
category_vars = ['segment_code', 'sales_channel', 'group_code']
for cvar in category_vars:
    dummy_list = pd.get_dummies(data[cvar], prefix=cvar)
    data = data.join(dummy_list)
data = data.drop(columns=category_vars)

In [5]:
# Define X and y data sets; separate into train and test sets
all_vars, yvar = data.columns.tolist(), 'is_churn'
X = data[[v for v in all_vars if v != yvar]]
y = data[[yvar]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [8]:
# establish baseline naive model for comparison: zero model predicts 0 for each entry
churn_data = data[yvar]
churn_vals = [float(c) for c in churn_data.tolist() if c != None]
zero_model_score = round(1 - sum(churn_vals)/float(len(churn_vals)), 3)
print 'Zero classifier model score:', zero_model_score

Zero classifier model score: 0.809


In [9]:
# run logistic regressions individually and overall
# display scores of models on test set
quant_vars = ['total_price', 'unit_price', 'item_total_price', 'quantity']
category_vars = ['sales_channel', 'segment_code', 'group_code']
LogReg = LogisticRegression()
for qv in quant_vars:
    logisticModel = LogReg.fit(X_train[[qv]], y_train.values.ravel())
    print qv, 'score improvement on zero model:', round(logisticModel.score(X_test[[qv]], y_test), 3) - zero_model_score
for cv in category_vars:
    pars = [v for v in X_train.columns.tolist() if cv in v]
    logisticModel = LogReg.fit(X_train[pars], y_train.values.ravel())
    print cv, 'score improvement on zero model:', round(logisticModel.score(X_test[pars], y_test), 3) - zero_model_score
logisticModel = LogReg.fit(X_train, y_train.values.ravel())
print 'overall model score improvement on zero model:', round(logisticModel.score(X_test, y_test), 3) - zero_model_score

total_price score improvement on zero model: 0.011
unit_price score improvement on zero model: -0.007
item_total_price score improvement on zero model: -0.001
quantity score improvement on zero model: -0.001
sales_channel score improvement on zero model: 0.068
segment_code score improvement on zero model: 0.114
group_code score improvement on zero model: 0.115
overall model score improvement on zero model: 0.114


In [21]:
# Build a Random Forest Classifier model and determine the score on the test set
RF = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
RF.fit(X_train, y_train)
print 'Random Forest score improvement on zero model:', round(RF.score(X_test, y_test), 3) - zero_model_score

  This is separate from the ipykernel package so we can avoid doing imports until


Random Forest score improvement on zero model: 0.148
