In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the pseudonymised modelling dataset (semicolon-delimited CSV) and preview the first rows.
csv_path = './Model Data Set (pseudo).csv'
df = pd.read_csv(csv_path, sep=';')
df.head()

Unnamed: 0,CREDIT_UNION_ID,ACCOUNT_NUM,FLG_202201,FLG_202202,FLG_202203,FLG_202204,FLG_202205,FLG_202206,DEEP_CHANNELS_202201,DEEP_CHANNELS_202202,...,AMP_202203,AMP_202204,AMP_202205,AMP_202206,NUM_TRANSACTIONS_202201,NUM_TRANSACTIONS_202202,NUM_TRANSACTIONS_202203,NUM_TRANSACTIONS_202204,NUM_TRANSACTIONS_202205,NUM_TRANSACTIONS_202206
0,A,ZWZZ!W,1,1,1,1,1,1,0,0,...,3,3,2,4,68,86,130,100,68,112
1,A,&WXYY&,1,1,1,1,1,1,0,0,...,1,2,2,1,14,8,4,24,12,10
2,A,Y%@YZ&,1,1,1,1,1,1,0,0,...,2,2,1,2,12,6,10,12,12,20
3,A,!W%&#!,1,1,1,1,1,1,0,0,...,3,3,2,2,38,24,36,54,72,48
4,A,%##AXY,1,1,1,1,1,1,0,0,...,3,2,3,2,24,12,12,8,6,20


In [3]:
# (rows, columns) of the loaded frame, as a quick sanity check on the load.
df.shape

(91848, 68)

In [4]:
# Work on an independent deep copy so that later in-place changes to `data`
# can never leak back into the originally loaded `df`.
# (A shallow copy, `deep=False`, only copies the container and keeps sharing
# the underlying values with `df`.)
data = df.copy(deep=True)

# Preparing Variables

In [5]:
#Declare independent variables (X) and dependent variable (y)

# The predictors are the first five monthly snapshots (202201-202205) of every
# DEEP_* product flag listed below plus the AMP_* series; the target is the
# activity flag of the sixth month.
feature_months = ['202201', '202202', '202203', '202204', '202205']
deep_products = ['PIX', 'BILLS', 'CARDS', 'CHECKING', 'CREDIT', 'INVESTMENTS', 'PAYMENTS']

# Column names are generated product by product, month by month, so the
# resulting order is DEEP_PIX_202201 ... DEEP_PAYMENTS_202205, AMP_202201 ... AMP_202205.
independent_variables = [
    'DEEP_{}_{}'.format(product, month)
    for product in deep_products
    for month in feature_months
]
independent_variables.extend('AMP_{}'.format(month) for month in feature_months)

X = data[independent_variables]
y = data['FLG_202206']

# Handling class imbalance

We know from our exploratory analysis that this dataset is heavily imbalanced, with churn in the 6th month as the minority class (represented as inactivity in that month, i.e. FLG_202206 = 0).

The problem with classifiers and class imbalance is that the classifier will more easily predict the majority class, simply because most cases belong to that class. For that reason, model performance metrics have to be carefully selected. Precision, recall and F1 will be used as the main metrics for evaluating performance. In our specific case we are most interested in those metrics for the prediction of the minority class (0 in our case).

So in this study we will contrast two widely used classification models: Logistic Regression and RandomForestClassifier, both in their scikit-learn implementations. Tree ensembles are supposedly better at handling imbalance. A common way of getting better results is to resample the training data, so we will compare the metrics of the baseline models against models trained on resampled data (RandomOverSampler, SMOTE, NearMiss and RandomUnderSampler).


Reference:

https://medium.com/grabngoinfo/four-oversampling-and-under-sampling-methods-for-imbalanced-classification-using-python-7304aedf9037

https://towardsdatascience.com/a-look-at-precision-recall-and-f1-score-36b5fd0dd3ec

https://www.analyticsvidhya.com/blog/2020/10/overcoming-class-imbalance-using-smote-techniques/

In [6]:
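# Quick Primer on reading the Confusion Matrix
# (scikit-learn layout: rows = actual class, columns = predicted class, labels in sorted order 0, 1)
# True Negative | False Positive
# False Negative | True Positive
# In this layout "positive" means class 1; for churn (class 0) read the class-0 row of the classification report.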

# Precision 
# Measure of how many of the positive predictions made are correct (true positives).
# Formula: TP/(TP+FP)

# Recall 
# Measure of how many of the actual positive cases in the data the classifier correctly predicted.
# It is sometimes also referred to as Sensitivity
# Formula: TP/(TP+FN)

# Accuracy
# Measure of the number of correct predictions over all predictions
# Formula: (TP+TN)/(TP+TN+FP+FN)

In [7]:
# Creating the modeling dataset
from sklearn.datasets import make_classification
# Data processing
import pandas as pd
import numpy as np
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Model and performance
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve 
# Oversampling and under sampling
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from collections import Counter
# Processing time
import time

In [27]:
def model_performance(model_name,resampling_name,y_test,model_prediction,verbose):
    """
    Builds and prints the evaluation reports for one model/resampling run.

    Inputs
    model_name = Str with model name (only used in the printed header)
    resampling_name = Str with resampling method (only used in the printed header)
    y_test = Vector with the true labels of the test set
    model_prediction = Vector with the labels predicted for the test set
    verbose = STR; any value other than 'off' also prints the elapsed time

    Returns
    Tuple (classification report, confusion matrix), after printing both
    """
    timer_start = time.time()  # reference point for the optional timing log

    report = classification_report(y_test, model_prediction)
    matrix = confusion_matrix(y_test, model_prediction)

    # Header first, then the report followed by the matrix.
    print('\n',model_name,'with',resampling_name,' Classification Report:')
    for section in (report, matrix):
        print(section)

    if verbose != 'off':
        elapsed = time.time() - timer_start
        print("\nModel Performane processing time: --- %s seconds ---" % (elapsed,))
    return report, matrix
    
    # #Precision-Recall Curve gives us the correct accuracy in this imbalanced dataset case. We can see that we have a very poor accuracy for the model.
    # precision, recall, thresholds = precision_recall_curve(model_prediction, y_test)

    # # create plot
    # plt.plot(precision, recall, label='Precision-recall curve')
    # plt.xlabel('Precision')
    # plt.ylabel('Recall')
    # plt.title('Precision-recall curve')
    # plt.legend(loc="lower left")

In [26]:
def define_model(model_name,random_state,verbose):
    """
    Instantiates the classifier requested by name.

    INPUT
    model_name = Str with model name: 'Random Forest' or 'Logistic Regression'
    random_state = INT random state number passed to the classifier
    verbose = STR to switch execution log on or off (any value but 'off' logs)

    OUTPUT
    Returns the instantiated (not yet fitted) model chosen by the user.

    RAISES
    ValueError if model_name is not one of the supported names.
    """
    start_time = time.time() #Count processing time
    supported_models = ('Random Forest', 'Logistic Regression')

    # Fail loudly on an unknown name: returning None here would only defer the
    # failure to an obscure AttributeError when the caller invokes `.fit`.
    if model_name not in supported_models:
        raise ValueError(
            "No compatible model: %r. Supported models: %s"
            % (model_name, ', '.join(supported_models))
        )

    if verbose != 'off': print('\nInstantiating',model_name,'model.')

    # Build only the estimator that was actually requested.
    if model_name == 'Random Forest':
        model = RandomForestClassifier(random_state = random_state)
    else:
        model = LogisticRegression(random_state = random_state)

    if verbose != 'off':
        print('\nModel ready:',model)
        print("Model Instatiating processing time: --- %s seconds ---" % (time.time() - start_time))
    return model

In [35]:
def split_resample_sets(model_name,resampling_name,X,y,test_size,random_state,verbose):
    """
    Splits X and y into train/test sets, scales the features when the model
    needs it, and resamples the TRAIN set only.

    INPUT
    model_name = Str with model name: 'Random Forest' or 'Logistic Regression'
    resampling_name = Str with resampling method: 'Baseline', 'Random Over Sampling',
                      'SMOTE', 'NearMiss KNN' (also accepted as 'Near Miss KNN')
                      or 'Random Under Sampling'
    X = DataFrame with independent variables
    y = Vector with dependent (response) variable
    test_size = Float (0 to 1) for test size percentage of the train/test split
    random_state = INT random state number
    verbose = STR to switch execution log on or off (any value but 'off' logs)

    OUTPUT
    X_train = Independent variables of the train set (scaled for Logistic Regression, resampled unless 'Baseline')
    X_test = Independent variables of the test set (scaled for Logistic Regression, never resampled)
    y_train = Dependent variable of the train set (resampled unless 'Baseline')
    y_test = Dependent variable of the test set

    RAISES
    ValueError if model_name or resampling_name is not one of the supported names.
    """
    start_time = time.time() #Count processing time

    supported_models = ('Random Forest', 'Logistic Regression')
    supported_resamplers = ('Baseline', 'Random Over Sampling', 'SMOTE',
                            'NearMiss KNN', 'Near Miss KNN', 'Random Under Sampling')
    # Validate up front: an unknown model used to leave X_train undefined and an
    # unknown resampler used to be silently treated as "no resampling".
    if model_name not in supported_models:
        raise ValueError(
            "No compatible model: %r. Supported models: %s"
            % (model_name, ', '.join(supported_models))
        )
    if resampling_name not in supported_resamplers:
        raise ValueError(
            "No compatible resampling technique: %r. Supported techniques: %s"
            % (resampling_name, ', '.join(supported_resamplers))
        )

    if verbose != 'off': print('\nIniatialing split for train and test sets. \nAnalyzing need for variable rescaling.')
    # Split into train and test BEFORE any fitting step so that no statistic
    # of the test rows can influence preprocessing.
    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = test_size, random_state = random_state)

    if model_name == 'Logistic Regression':
        if verbose != 'off': print('\nLogistic Regression requies scaling. \nVariable rescaling necessary.')
        # Fit the scaler on the train split only, then apply it to both splits.
        scaler = StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
    else:
        if verbose != 'off': print('\nRandom Forest does not require rescalling.')

    if verbose != 'off': print('\nApplying resampling technique choosen.')
    if resampling_name == 'Baseline':
        if verbose != 'off': print("\nBaseline doesn't require resampling.")
        resampler = None
    elif resampling_name == 'Random Over Sampling':
        resampler = RandomOverSampler(random_state=random_state)
    elif resampling_name == 'SMOTE':
        resampler = SMOTE(random_state=random_state)
    elif resampling_name in ('NearMiss KNN', 'Near Miss KNN'):
        # NearMiss picks samples deterministically; recent imbalanced-learn
        # releases do not accept a random_state argument for it.
        resampler = NearMiss(version=3)
    else:  # 'Random Under Sampling'
        resampler = RandomUnderSampler(random_state=random_state)

    if resampler is not None:
        if verbose != 'off': print('\nApplying',resampling_name)
        # Only the train set is rebalanced; the test set keeps the real class ratio.
        X_train, y_train= resampler.fit_resample(X_train, y_train)

    if verbose != 'off': print("\nResampling processing time: --- %s seconds ---" % (time.time() - start_time))
    return X_train, X_test, y_train, y_test

In [33]:
def model_predict(model_name,resampling_name,X,y,random_state,test_size,verbose):
    """
    Runs one full experiment: instantiates the model, splits (and optionally
    scales/resamples) the data, fits, predicts on the test set and reports
    the performance.

    INPUT
    model_name = Str with model name
    resampling_name = Str with resampling technique name
    X = DataFrame with independent variables
    y = Vector with dependent (response) variable
    random_state = INT random state number
    test_size = Float (0 to 1) for test size percentage of the train/test split
    verbose = STR to switch execution log on or off (any value but 'off' logs)

    OUTPUT
    cr = Classification report returned by model_performance
    cm = Confusion matrix returned by model_performance
    """
    start_time = time.time() #Count total processing time
    print('\n--------------------------------------------------------------------------------\n--------------------------------------------------------------------------------')
    if verbose != 'off': print('\nStarting new sequence:',model_name,'with',resampling_name)

    #Define model
    model = define_model(model_name,random_state,verbose)
    #Split train and test sets + Apply scaling when need + Apply resampling
    X_train, X_test, y_train, y_test = split_resample_sets(model_name,resampling_name,X,y,test_size,random_state,verbose)

    #Train model (timed on its own so the log reflects the fit step only)
    if verbose != 'off': print('\nFitting model.')
    fit_start = time.time()
    model = model.fit(X_train, y_train)
    if verbose != 'off': print("\nFitting model processing time: --- %s seconds ---" % (time.time() - fit_start))

    #Predict on trained model (timed on its own as well)
    if verbose != 'off': print('\nPredicting on model.')
    predict_start = time.time()
    model_prediction = model.predict(X_test)
    if verbose != 'off': print("\nModel prediction processing time: --- %s seconds ---" % (time.time() - predict_start))

    #Evaluate model performance
    cr,cm = model_performance(model_name,resampling_name,y_test,model_prediction,verbose)

    # The total is always printed and covers every step above.
    print("\nTotal processing time: --- %s seconds ---" % (time.time() - start_time))

    return cr,cm

## Processing and evaluating models

In [36]:
# Settings shared by every run
random_state = 26
test_size = 0.15
# Classifiers to compare
model_name = ['Random Forest','Logistic Regression']
# Resampling strategies to compare. split_resample_sets selects the strategy by
# exact string comparison, so every label here must be spelled the way it expects.
resampling_name = ['Baseline','Random Over Sampling','SMOTE','Near Miss KNN','Random Under Sampling']

# Collects one (model, resampling, classification report, confusion matrix) tuple per run
model_prediction = []

# Every model is paired with every resampling strategy, models in the outer position.
run_grid = [(model, resampling) for model in model_name for resampling in resampling_name]
for model, resampling in run_grid:
    cr,cm = model_predict(model,resampling,X,y,random_state,test_size,'off')
    model_prediction.append((model,resampling,cr,cm))


--------------------------------------------------------------------------------
--------------------------------------------------------------------------------

 Random Forest with Baseline  Classification Report:
              precision    recall  f1-score   support

           0       0.07      0.00      0.00       423
           1       0.97      1.00      0.98     13355

    accuracy                           0.97     13778
   macro avg       0.52      0.50      0.49     13778
weighted avg       0.94      0.97      0.95     13778

[[    1   422]
 [   14 13341]]

Total processing time: --- 15.233237028121948 seconds ---

--------------------------------------------------------------------------------
--------------------------------------------------------------------------------

 Random Forest with Random Over Sampling  Classification Report:
              precision    recall  f1-score   support

           0       0.09      0.02      0.03       423
           1       0.97     