# Modeling

In [277]:
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
%matplotlib inline
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from itertools import product
from collections import defaultdict
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

import sys
sys.path.append("../model")
from new_model import Pipeline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Read data

In [283]:
df = get_data()
df["current_state"] = ((df["current_state"] != "finished") & (df["current_state"] != "started")).astype(int)
y = df.pop("current_state").values
pipeline = Pipeline()
pipeline.fit(df, y)

In [284]:
X_train, X_test, y_train, y_test = train_test_split(pipeline.df, pipeline.y)

In [285]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [286]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
predictions = lr.predict(X_test)
print("Accuracy: {}".format(lr.score(X_test, y_test)))
print("Precision: {}".format(precision_score(y_test, predictions)))
print("Recall: {}".format(recall_score(y_test, predictions)))

Accuracy: 0.8898900110680686
Precision: 0.9474710881984116
Recall: 0.7430301895039181


In [245]:
# taken from churn case study solution
def plot_feature_importances(df, feature_importances):
    feat_scores = pd.DataFrame({'Mean Decrease Impurity' : feature_importances},
                           index=df.columns)
    feat_scores = feat_scores.sort_values(by='Mean Decrease Impurity')
    feat_scores.plot(kind='barh', figsize=(8,8))

In [246]:
# stolen from sklearn's example
def plot_confusion_matrix(cm):
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title("Confusion Matrix")
    plt.colorbar()
    plt.xticks(np.arange(2), ["Finished", "Cancelled"], rotation=45)
    plt.yticks(np.arange(2), ["Finished", "Cancelled"])
    for i, j in product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], 'd'), horizontalalignment="center",
                 color="white" if cm[i, j] > cm.max() / 2 else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [269]:
plot_feature_importances(df, np.abs(lr.coef_[0]))

In [271]:
cm = confusion_matrix(y_test, predictions)
plot_confusion_matrix(cm)

In [249]:
rf = RandomForestClassifier(n_estimators=50)
rf.fit(X_train, y_train)
predictions = rf.predict(X_test)
print("Accuracy: {}".format(rf.score(X_test, y_test)))
print("Precision: {}".format(precision_score(y_test, predictions)))
print("Recall: {}".format(recall_score(y_test, predictions)))

Accuracy: 0.9456339781063481
Precision: 0.9788534494871435
Recall: 0.8874710715726448


In [272]:
plot_feature_importances(df, rf.feature_importances_)

In [273]:
cm = confusion_matrix(y_test, predictions)
plot_confusion_matrix(cm)

In [1097]:
gbc = GradientBoostingClassifier(n_estimators=20, max_depth=8, subsample=0.5, 
                                 max_features='auto', learning_rate=0.05)
gbc.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='deviance', max_depth=8,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=20,
              presort='auto', random_state=None, subsample=0.5, verbose=0,
              warm_start=False)

In [1098]:
gbc.score(X_test, y_test)

0.90986262768580484

In [275]:
plot_feature_importances(df_model, gbc.feature_importances_)

In [274]:
predictions = gbc.predict(X_test)
cm = confusion_matrix(y_test, predictions)
plot_confusion_matrix(cm)