In [None]:
# !pip3 install -U -q ipywidgets
# !pip3 install -U -q pyarrow
# !jupyter nbextension enable --py widgetsnbextension

In [1]:
# Standard Data Science Helpers
import numpy as np
import pandas as pd
import scipy
import random 

from sklearn.datasets import make_classification, make_blobs
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt

import chart_studio.plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode(connected=True)


from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

import cufflinks as cf
cf.go_offline(connected=True)
cf.set_config_file(colorscale='plotly', world_readable=True)

# Extra options
pd.options.display.max_rows = 30
pd.options.display.max_columns = 25

# Show all code cells outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
import os
from IPython.display import Image, display, HTML
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from sklearn.base import clone
 
from IPython.core.debugger import set_trace

In [2]:
#Some utility functions.

class DataContainer():
    """Plain mutable namespace shared between notebook cells.

    Instances start out with no attributes; the setters below (and direct
    attribute assignment from other cells) attach data to them.
    """

    def __init__(self):
        """Create an empty container; nothing is initialised up front."""

    def set_x(self, X):
        """Store ``X`` on the instance as the attribute ``X``."""
        self.X = X

    def set_y(self, y):
        """Store ``y`` on the instance as the attribute ``y``."""
        self.y = y
        

from sklearn.metrics import classification_report, confusion_matrix
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : array-like, shape (n_rows, n_cols)
        Confusion matrix. Rows are drawn along the 'True label' axis and
        columns along the 'Predicted label' axis.
    classes : sequence
        Tick labels, used for both axes.
    normalize : bool, default False
        If True, every row is divided by its row sum before printing and
        plotting. Rows whose sum is zero are shown as zeros.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A row with no samples would divide by zero and yield NaN cells
        # (and a NaN colour threshold below); keep such rows at 0 instead.
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so that the axis labels are taken into account.
    plt.tight_layout()
    
    
def classification_assessment(X_test, y_test, y_test_predicted, clf):
    """Report how a fitted binary classifier performs on a test set.

    Prints the classification report, plots the (non-normalized) confusion
    matrix with the positive class listed first, prints the ROC AUC of a
    constant "no skill" scorer and of ``clf``, and plots both ROC curves.

    Parameters
    ----------
    X_test : features passed to ``clf.predict_proba``.
    y_test : true labels of the test set.
    y_test_predicted : hard predictions for ``X_test``.
    clf : fitted classifier exposing ``predict_proba``.

    Returns
    -------
    The second column of ``clf.predict_proba(X_test)``.
    """
    print(classification_report(y_test, y_test_predicted))
    matrix = confusion_matrix(y_test, y_test_predicted, labels=[1, 0])
    np.set_printoptions(precision=2)

    # Figure 1: raw-count confusion matrix.
    plt.figure()
    plot_confusion_matrix(
        matrix,
        classes=['incident=1', 'no incident=0'],
        normalize=False,
        title='Confusion matrix',
    )
    # Constant score for every sample: the "no skill" reference.
    baseline_scores = [0] * len(y_test)

    # Figure 2: ROC curves.
    plt.figure()
    # Only the second probability column is kept.
    positive_scores = clf.predict_proba(X_test)[:, 1]

    baseline_auc = roc_auc_score(y_test, baseline_scores)
    model_auc = roc_auc_score(y_test, positive_scores)
    print('No Skill: ROC AUC=%.3f' % baseline_auc)
    print('Logistic: ROC AUC=%.3f' % model_auc)

    # Thresholds returned by roc_curve are not needed for the plot.
    baseline_fpr, baseline_tpr, _ = roc_curve(y_test, baseline_scores)
    model_fpr, model_tpr, _ = roc_curve(y_test, positive_scores)
    plt.plot(baseline_fpr, baseline_tpr, linestyle='--', label='No Skill')
    plt.plot(model_fpr, model_tpr, marker='.', label='Logistic')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

    return positive_scores
    
    
def dilute_class(X, y, class_tag, dilute_factor):
    """Randomly keep only a fraction of one class; other classes are untouched.

    Parameters
    ----------
    X : array-like, indexed along its first axis in step with ``y``.
    y : array-like of labels, one per row of ``X``.
    class_tag : label value of the class to thin out.
    dilute_factor : float in [0, 1]
        Fraction of the ``class_tag`` rows to keep; the kept count is
        ``int(n_class_rows * dilute_factor)``.

    Returns
    -------
    (diluted_X, diluted_y) : tuple of numpy arrays
        All rows of the other classes in their original order, followed by
        the sampled ``class_tag`` rows in the order they were drawn.

    Raises
    ------
    ValueError
        If ``dilute_factor`` is outside [0, 1].

    Notes
    -----
    Sampling uses the stdlib ``random`` module, so seed it with
    ``random.seed`` for reproducible results.
    """
    # Accept plain lists as well as arrays.
    X = np.asarray(X)
    y = np.asarray(y)
    if not 0 <= dilute_factor <= 1:
        raise ValueError(
            "dilute_factor must be within [0, 1], got {!r}".format(dilute_factor))

    in_class = y == class_tag
    # Row positions of the class; computed once and reused for X and y
    # (avoids a Python-level sum() over the mask and repeated mask indexing).
    class_rows = np.flatnonzero(in_class)
    keep_count = int(len(class_rows) * dilute_factor)
    # Sample positions within the class without replacement.
    picked = np.asarray(random.sample(range(len(class_rows)), keep_count), dtype=int)
    kept_rows = class_rows[picked]

    diluted_X = np.concatenate([X[~in_class], X[kept_rows]])
    diluted_y = np.concatenate([y[~in_class], y[kept_rows]])

    return diluted_X, diluted_y

In [3]:
#See https://plotly.com/python/histograms/

%matplotlib qt

# Shared holder for this notebook: cells and widget callbacks below store the
# generated dataset and the matplotlib figure/axes handles on it so that later
# cells can reuse them.
data_container = DataContainer()
from ipywidgets import interact, widgets

@interact_manual
def choose_dataset( num_samples=[10000, 500000, 1000000], 
                    center_zero=widgets.FloatSlider(min=-2,max=-0.5,step=0.1,value=-1),
                    center_one=widgets.FloatSlider(min=0.5,max=2,step=0.1,value=1),
                    std_zero=widgets.FloatSlider(min=0.2,max=2,step=0.1,value=0.8),
                    std_one=widgets.FloatSlider(min=0.2,max=2,step=0.1,value=0.5)
                  ):
    """Generate a two-class, one-feature blob dataset and plot both classes.

    The generated ``X`` and ``y`` are stored on ``data_container`` (via
    ``set_x`` / ``set_y``) for use by later cells.

    Parameters
    ----------
    num_samples : total number of samples passed to ``make_blobs``.
    center_zero, center_one : cluster centres of class 0 and class 1.
    std_zero, std_one : cluster standard deviations of class 0 and class 1.
    """
    display(HTML(f'<h2>Plotting dataset of size {num_samples}</h2>'))
    # Fixed random_state: identical slider values always give the same data.
    X, y = make_blobs(n_samples=num_samples, n_features=1,
                      centers=[[center_zero], [center_one]],
                      cluster_std=[std_zero, std_one],
                      shuffle=False, random_state=4)
    data_container.set_x(X)
    data_container.set_y(y)

    # Flatten the single feature column of each class for the summary and plot.
    X_ones = X[y==1].T[0]
    X_zeros = X[y==0].T[0]
    print("1: mean: {:0.3f}, std: {:0.3f} \n0: mean: {:0.3f}, std: {:0.3f} ".format(X_ones.mean(), X_ones.std(), X_zeros.mean(), X_zeros.std()))

    fig = go.Figure()
    fig.add_trace(go.Histogram(x=X_ones, nbinsx=500, histnorm='probability density', opacity=0.7, name='ones'))
    fig.add_trace(go.Histogram(x=X_zeros, nbinsx=500, histnorm='probability density', opacity=0.7, name='zeros'))
    # Draw the semi-transparent histograms on top of each other and label the
    # figure so it can be read on its own.
    fig.update_layout(barmode='overlay',
                      title='Class-conditional distribution of x',
                      xaxis_title='x',
                      yaxis_title='probability density')
    fig.show()
      


interactive(children=(Dropdown(description='num_samples', options=(10000, 500000, 1000000), value=10000), Floa…

In [4]:
# %matplotlib qt
# One figure with two stacked axes, both kept on the shared container:
#   probs_ax - later cells scatter predicted probabilities against x here
#   x_ax     - later cells draw histograms of the diluted x values here
probs_fig, probs_axs = plt.subplots(2,1)
data_container.probs_ax, data_container.x_ax = probs_axs
data_container.probs_ax.set_title("Classifier's probabilities over test set")
data_container.probs_ax.set_xlabel("x")
data_container.probs_ax.set_ylabel("prob(y(x)=1)")

# Keep the figure handle on the container as well.
data_container.probs_fig = probs_fig


Text(0.5, 1.0, "Classifier's probabilities over test set")

Text(0.5, 0, 'x')

Text(0, 0.5, 'prob(y(x)=1)')

<h2>Now, let's apply a standard classification and assess the results.</h2>

In [5]:
# %matplotlib inline
# Hold out 20% of the generated data as the test set used by the rest of the
# notebook. The split is seeded (like make_blobs and the classifier) so that
# a fresh "Run All" reproduces the same numbers.
X_train, X_test, y_train, y_test = train_test_split(data_container.X, data_container.y, test_size=0.2, random_state=4)
clf_lr = LogisticRegressionCV(cv=5, random_state=4).fit(X_train, y_train)
y_hat_lr = clf_lr.predict(X_test)
prediction_probs = classification_assessment(X_test, y_test, y_hat_lr, clf_lr)
# %matplotlib qt
# Labelled so that this baseline shows up in the legend drawn by later cells.
data_container.probs_ax.scatter(X_test, prediction_probs, label="baseline (undiluted data)")
# plt.show()


              precision    recall  f1-score   support

           0       0.96      0.92      0.94      1007
           1       0.92      0.96      0.94       993

    accuracy                           0.94      2000
   macro avg       0.94      0.94      0.94      2000
weighted avg       0.94      0.94      0.94      2000

Confusion matrix, without normalization
[[955  38]
 [ 81 926]]
No Skill: ROC AUC=0.500
Logistic: ROC AUC=0.983


<matplotlib.collections.PathCollection at 0x1323eaf50>

<h2> Next, we're going to dilute the '1' class to obtain an imbalanced dataset </h2>

In [6]:
import plotly.express as px

@interact_manual
def dilute_ones( dilute_factor=widgets.FloatSlider(min=0.01,max=1,step=0.01,value=0.01)):
    """Thin out the '1' class of the stored dataset and plot the result.

    Reads ``data_container.X`` / ``data_container.y``, keeps a random
    ``dilute_factor`` fraction of the rows labelled 1 (see ``dilute_class``)
    and stores the outcome as ``data_container.diluted_X`` /
    ``data_container.diluted_y``.

    Parameters
    ----------
    dilute_factor : fraction of the class-1 rows to keep.
    """
    diluted_X, diluted_y = dilute_class(data_container.X, data_container.y, 1, dilute_factor)
    data_container.diluted_X = diluted_X
    data_container.diluted_y = diluted_y

    # Single feature column of each class after dilution.
    zeros_x = diluted_X[diluted_y == 0].T[0]
    ones_x = diluted_X[diluted_y == 1].T[0]

    # Interactive view; traces are named like the ones in choose_dataset.
    fig = go.Figure()
    fig.add_trace(go.Histogram(x=zeros_x, nbinsx=1000, name='zeros'))
    fig.add_trace(go.Histogram(x=ones_x, nbinsx=1000, name='ones'))
    fig.show()

    # Same data on the lower axis of the shared matplotlib figure.
    # NOTE(review): every click adds histograms to the same axis; clear it
    # first (x_ax.cla()) if only the latest dilution should be visible.
    data_container.x_ax.hist(zeros_x, bins=1000)
    data_container.x_ax.hist(ones_x, bins=1000)

interactive(children=(FloatSlider(value=0.01, description='dilute_factor', max=1.0, min=0.01, step=0.01), Butt…

In [13]:
#Predict on diluted data, optionally dilute 0 class as well.

def apply_clf_correction(non_corrected_clf, correction_method, y, diluted_X, diluted_y):
    """Return a fitted classifier corrected for the dilution of its training data.

    Parameters
    ----------
    non_corrected_clf : fitted binary classifier exposing ``intercept_``
        Model trained on the diluted sample. It is not modified.
    correction_method : str
        'none'      - return an unchanged copy of ``non_corrected_clf``.
        'prior'     - shift the intercept by
                      ``-log((1 - tau) / tau * y_bar / (1 - y_bar))``.
        'weighting' - refit a ``LogisticRegressionCV`` on the diluted sample
                      with class weights ``w1 = tau / y_bar`` and
                      ``w0 = (1 - tau) / (1 - y_bar)``.
        'gev'       - not implemented; a message is printed and an unchanged
                      copy is returned.
    y : array-like of 0/1 labels
        Labels of the undiluted data; their mean is ``tau``.
    diluted_X, diluted_y : arrays
        The diluted training sample; the mean of ``diluted_y`` is ``y_bar``.

    Returns
    -------
    A fitted classifier that can be used directly for ``predict`` /
    ``predict_proba``. Callers must NOT call ``fit`` on it again: refitting
    would discard the corrected intercept.

    Raises
    ------
    ValueError
        For an unknown ``correction_method``, or when ``tau`` or ``y_bar`` is
        not strictly between 0 and 1 (the correction terms are undefined).
    """
    # Local import so the block does not rely on a new notebook-level import.
    from copy import deepcopy

    # deepcopy rather than sklearn.base.clone: clone() copies hyper-parameters
    # only and returns an *unfitted* estimator that cannot predict.
    corrected_clf = deepcopy(non_corrected_clf)

    if correction_method == "none":
        return corrected_clf
    if correction_method == "gev":
        print("Correction method 'gev' is not implemented; returning the uncorrected classifier.")
        return corrected_clf
    if correction_method not in ("prior", "weighting"):
        raise ValueError("Unknown correction_method: {!r}".format(correction_method))

    tau = float(np.mean(y))            # fraction of ones in the undiluted data
    y_bar = float(np.mean(diluted_y))  # fraction of ones in the diluted sample
    if not (0 < tau < 1 and 0 < y_bar < 1):
        raise ValueError(
            "Both class fractions must be strictly between 0 and 1 "
            "(tau={}, y_bar={})".format(tau, y_bar))

    if correction_method == "prior":
        corrected_clf.intercept_ = non_corrected_clf.intercept_ - np.log((1 - tau) / tau * y_bar / (1 - y_bar))
    else:  # "weighting"
        w1 = tau / y_bar
        w0 = (1 - tau) / (1 - y_bar)
        corrected_clf = LogisticRegressionCV(cv=5, random_state=0, class_weight={0: w0, 1: w1}).fit(diluted_X, diluted_y)

    return corrected_clf


@interact_manual
def dilute_zeros_and_predict( zero_dilute_factor=widgets.FloatSlider(min=0.01,max=1,step=0.01,value=1), 
                           correction_method = ['none', 'prior', 'weighting', 'gev']):
    """Optionally thin out the '0' class too, fit, correct and assess a classifier.

    Starts from ``data_container.diluted_X`` / ``data_container.diluted_y``
    (the output of ``dilute_ones``), keeps a ``zero_dilute_factor`` fraction
    of the rows labelled 0, fits a ``LogisticRegressionCV`` on the result and
    applies ``correction_method`` via ``apply_clf_correction``. The corrected
    model is assessed on the notebook-level ``X_test`` / ``y_test``, and its
    predicted probabilities are added to ``data_container.probs_ax``.

    Parameters
    ----------
    zero_dilute_factor : fraction of the class-0 rows to keep.
    correction_method : one of 'none', 'prior', 'weighting', 'gev'.
    """
    # Dilute the 0's class. X and y must both come from this call: dilute_class
    # reorders rows, so pairing the new X with the stored, pre-dilution labels
    # would misalign (or mismatch in length) features and labels.
    diluted_X, diluted_y = dilute_class(data_container.diluted_X, data_container.diluted_y, 0, zero_dilute_factor)
    diluted_X_zeros = diluted_X[diluted_y == 0]
    diluted_X_ones = diluted_X[diluted_y == 1]
    y = data_container.y

    #Predict
    display(HTML(f'<h3>Using a dataset of size {len(diluted_y)}, {len(diluted_X_zeros)} of which are 0, '
                 f'and {len(diluted_X_ones)} are 1</h3>'))
    diluted_clf = LogisticRegressionCV(cv=5, random_state=0).fit(diluted_X, diluted_y)
    print("Before correction: Theta0_D: {}, theta1_D: {}".format(diluted_clf.intercept_, diluted_clf.coef_))
    # The returned model is already fitted; refitting here would undo the
    # prior correction of the intercept.
    corrected_clf = apply_clf_correction(diluted_clf, correction_method, y, diluted_X, diluted_y)

    #Assess the classifier on the original test data
    y_hat = corrected_clf.predict(X_test)
    # classification_report expects (y_true, y_pred) in that order.
    print(classification_report(y_test, y_hat))
    probs = corrected_clf.predict_proba(X_test)
    data_container.probs_ax.scatter(X_test, probs[:,1], label=f"0_{zero_dilute_factor}_1_{'tbd'}_{correction_method}")
    data_container.probs_ax.legend()
    print("Theta0_D: {}, theta1_D: {}".format(corrected_clf.intercept_, corrected_clf.coef_))
    print(confusion_matrix(y_test, y_hat, labels=[1,0]))

    fig = go.Figure()
    fig.add_trace(go.Histogram(x=diluted_X_zeros.T[0], nbinsx=1000, name='zeros'))
    fig.add_trace(go.Histogram(x=diluted_X_ones.T[0], nbinsx=1000, name='ones'))
    fig.show()


interactive(children=(FloatSlider(value=1.0, description='zero_dilute_factor', max=1.0, min=0.01, step=0.01), …

In [None]:
#Prior correction
# Stand-alone walk-through of prior correction on the 1-diluted data.
# The classifier is fitted here because no fitted `diluted_clf` exists at
# notebook scope, and the model is assessed on the original test split
# (X_test / y_test) and drawn on the shared probability axis.
y = data_container.y
diluted_y = data_container.diluted_y
tau = np.mean(y)            # fraction of ones in the undiluted data
y_bar = np.mean(diluted_y)  # fraction of ones in the diluted sample
print(y_bar)
diluted_clf = LogisticRegressionCV(cv=5, random_state=0).fit(data_container.diluted_X, diluted_y)
corrected_intercept = diluted_clf.intercept_ - np.log( (1-tau)/tau*y_bar/(1-y_bar)    )
diluted_clf.intercept_ = corrected_intercept
y_hat_D_corrected = diluted_clf.predict(X_test)
# classification_report expects (y_true, y_pred) in that order.
print(classification_report(y_test, y_hat_D_corrected))
probs_D_corrected = diluted_clf.predict_proba(X_test)
data_container.probs_ax.scatter(X_test, probs_D_corrected[:,1], label="prior correction (manual)")
data_container.probs_ax.legend()
plt.show()

In [None]:
x = 7
# f-prefix is required for {x} to be interpolated; without it the braces are
# printed literally.
print(f"x is: {x}")