# Logistic Regression

In [27]:
# !pip install imblearn

import os
import pandas as pd
import numpy as np

# store elements as dictionary keys and their counts as dictionary values
from collections import Counter

# scikit-learn
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline

# Classification metrics
from sklearn.metrics import confusion_matrix, classification_report

# Function for creating model pipelines - sklearn
from sklearn.pipeline import make_pipeline

# Function for creating model pipelines - imblearn
from imblearn.pipeline import make_pipeline as imbl_pipe

# Over-sampling using SMOTE
from imblearn.over_sampling import SMOTE
# Import classifier
from sklearn.linear_model import LogisticRegression 
import datetime as dt
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from sklearn.model_selection import GridSearchCV

## Read proprocessed data

In [3]:
df = pd.read_parquet('df_age_18.parquet')

## Display counts and percentages of target variable

In [4]:
# Display counts and percentages
value_counts = df['Y'].value_counts()
percentages = df['Y'].value_counts(normalize=True) * 100

result_df = pd.DataFrame({'Count': value_counts, 'Percentage': percentages})
print(result_df)

     Count  Percentage
Y                     
1  1147466   65.274965
0   610430   34.725035


## Drop unnecessary columns

In [5]:
df.drop(columns=['#','EBANK_FIRST_OPEN_DATE','CIF_OPEN_DATE',
 'MB2_FIRST_OPEN_DATE',
 'SYM_RUN_DATE'],inplace=True)

## Check null columns

In [6]:
df.isnull().sum()

CLIENT_NO                                  0
CIF_OPEN_CHANNEL                           0
CIF_MOB                                    0
EBANK_FIRST_OPEN_CHANNEL                   0
EBANK_CHANNEL_GROUP                        0
EBANK_MOB                                  0
GENDER                                     0
AGE                                        0
MB2_FIRST_CHANNEL                          0
MB2_CHANNEL_GROUP                          0
EBANK_FLAG                                 0
LOGIN_MB2_DAY_CNT_1M                       0
LOGIN_MB1_DAY_CNT_1M                       0
TRANS_DAY_CNT_1M                           0
QUICKVIEW_MB2_DAY_CNT_1M                   0
ACTIVE_DAY_CNT_1M                          0
CC_FLAG                                    0
CC_SPENDING_AMT_12M                        0
CC_SPENDING_CNT_12M                        0
CC_SPENDING_AMT_POS_12M                    0
CC_SPENDING_AMT_ECOM_12M                   0
CC_SPENDING_AMT_CASH_12M                   0
CC_SPENDIN

## Fill missing values with 0

In [7]:
df.fillna(0,inplace=True)

## Recheck after fill missing values

In [8]:
df.isnull().sum()

CLIENT_NO                            0
CIF_OPEN_CHANNEL                     0
CIF_MOB                              0
EBANK_FIRST_OPEN_CHANNEL             0
EBANK_CHANNEL_GROUP                  0
EBANK_MOB                            0
GENDER                               0
AGE                                  0
MB2_FIRST_CHANNEL                    0
MB2_CHANNEL_GROUP                    0
EBANK_FLAG                           0
LOGIN_MB2_DAY_CNT_1M                 0
LOGIN_MB1_DAY_CNT_1M                 0
TRANS_DAY_CNT_1M                     0
QUICKVIEW_MB2_DAY_CNT_1M             0
ACTIVE_DAY_CNT_1M                    0
CC_FLAG                              0
CC_SPENDING_AMT_12M                  0
CC_SPENDING_CNT_12M                  0
CC_SPENDING_AMT_POS_12M              0
CC_SPENDING_AMT_ECOM_12M             0
CC_SPENDING_AMT_CASH_12M             0
CC_SPENDING_CNT_POS_12M              0
CC_SPENDING_CNT_ECOM_12M             0
CC_SPENDING_CNT_CASH_12M             0
CC_SPENDING_AMT_9M       

## Create a dataframe for correlation calculation

In [9]:
df_corr = df[df.select_dtypes(include='number').columns.tolist()]

## Define function for filter correlation with threhold

In [10]:
# with the following function we can select highly correlated features
# it will remove the first feature that is correlated with anything other feature

def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

In [11]:
corr_features = correlation(df_corr, 0.7)
len(set(corr_features))

87

In [12]:
corr_features

{'ACCOUNT_MANAGEMENT_CNT_1M',
 'ACCOUNT_MANAGEMENT_DAY_CNT_1M',
 'ACCOUNT_MANAGEMENT_DAY_CNT_3M',
 'ACTIVE_DAY_CNT_1M',
 'AR_DAY_CNT_1M',
 'AR_DAY_CNT_3M',
 'CC_SPENDING_AMT_3M',
 'CC_SPENDING_AMT_6M',
 'CC_SPENDING_AMT_9M',
 'CC_SPENDING_AMT_CASH_3M',
 'CC_SPENDING_AMT_CASH_6M',
 'CC_SPENDING_AMT_CASH_9M',
 'CC_SPENDING_AMT_ECOM_3M',
 'CC_SPENDING_AMT_ECOM_6M',
 'CC_SPENDING_AMT_ECOM_9M',
 'CC_SPENDING_AMT_POS_12M',
 'CC_SPENDING_AMT_POS_3M',
 'CC_SPENDING_AMT_POS_6M',
 'CC_SPENDING_AMT_POS_9M',
 'CC_SPENDING_CNT_12M',
 'CC_SPENDING_CNT_3M',
 'CC_SPENDING_CNT_6M',
 'CC_SPENDING_CNT_9M',
 'CC_SPENDING_CNT_CASH_12M',
 'CC_SPENDING_CNT_CASH_3M',
 'CC_SPENDING_CNT_CASH_6M',
 'CC_SPENDING_CNT_CASH_9M',
 'CC_SPENDING_CNT_ECOM_3M',
 'CC_SPENDING_CNT_ECOM_6M',
 'CC_SPENDING_CNT_ECOM_9M',
 'CC_SPENDING_CNT_POS_3M',
 'CC_SPENDING_CNT_POS_6M',
 'CC_SPENDING_CNT_POS_9M',
 'CIF_MOB',
 'CREDIT_CARD_MANAGEMENT_CNT_1M',
 'CREDIT_CARD_MANAGEMENT_DAY_CNT_1M',
 'CREDIT_CARD_MANAGEMENT_DAY_CNT_3M',
 'CRE

## Drop columns with high correlation

In [13]:
df_filtered = df.drop(corr_features,axis=1)

## Save filtered data to parquet file

In [14]:
df_filtered.to_parquet('df_filtered_corr.parquet')

In [28]:
df_filtered = pd.read_parquet('df_filtered_corr.parquet')

## Separate dataframe into separate object

In [29]:
# Object for target variable
y = df_filtered.Y

# object for input features
X = df_filtered.drop(['CLIENT_NO','Y'], axis=1)

# display shapes of X and y
print(X.shape, y.shape)

(1757896, 55) (1757896,)


## List numerical features

In [30]:
num_columns = X.select_dtypes(include='number').columns.tolist()
num_columns

['AGE',
 'EBANK_FLAG',
 'LOGIN_MB2_DAY_CNT_1M',
 'LOGIN_MB1_DAY_CNT_1M',
 'QUICKVIEW_MB2_DAY_CNT_1M',
 'CC_FLAG',
 'CC_SPENDING_AMT_12M',
 'CC_SPENDING_AMT_ECOM_12M',
 'CC_SPENDING_AMT_CASH_12M',
 'CC_SPENDING_CNT_POS_12M',
 'CC_SPENDING_CNT_ECOM_12M',
 'CA_FLAG',
 'TD_FLAG',
 'DIGI_FLAG',
 'CREDIT_TXN_AMT_12M',
 'MB2_TRANS_CNT_3M',
 'MB2_TRANS_AMT_3M',
 'MB2_BP_TRANS_CNT_3M',
 'MB2_BP_TRANS_AMT_3M',
 'MB2_TO_TRANS_CNT_3M',
 'MB2_TO_TRANS_AMT_3M',
 'TOTAL_LOAN_CNT_2023',
 'TOTAL_LOAN_AMT_2023',
 'AUTO_LOAN_AMT_2023',
 'BUSINESS_LOAN_AMT_2023',
 'SECURED_LOAN_AMT_2023',
 'UNSECURED_LOAN_AMT_2023',
 'LOAN_FLAG',
 'AUTO_LOAN_FLAG',
 'MORTGAGE_LOAN_FLAG',
 'BUSINESS_LOAN_FLAG',
 'SECURED_LOAN_FLAG',
 'UNSECURED_LOAN_FLAG',
 'CURBAL_LOANS',
 'VIEW_ACCOUNT_INFO_CNT_3M',
 'ACCOUNT_MANAGEMENT_CNT_3M',
 'LOAN_MANAGEMENT_CNT_3M',
 'CREDIT_CARD_MANAGEMENT_CNT_3M',
 'MORE_USAGE_CNT_3M',
 'SECURITY_CNT_3M',
 'AR_CNT_3M',
 'PIGGY_CNT_3M',
 'VOICEBOT_CNT_3M',
 'VIB_CARE_INSURANCE_CNT_3M',
 'MORE_USAG

## List categorical features

In [31]:
cat_columns = X.select_dtypes(include='object').columns.tolist()
cat_columns

['CIF_OPEN_CHANNEL',
 'EBANK_FIRST_OPEN_CHANNEL',
 'EBANK_CHANNEL_GROUP',
 'GENDER',
 'MB2_FIRST_CHANNEL',
 'MB2_CHANNEL_GROUP']

## Split X and y into train and test sets

In [32]:
random_state = 10

# Split X and y into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=random_state,
                                                    stratify=df_filtered.Y)

# Print number of observations in X_train, X_test, y_train, and y_test
print(len(X_train), len(X_test), len(y_train), len(y_test))

1230527 527369 1230527 527369


In [33]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1230527 entries, 1099931 to 313896
Data columns (total 55 columns):
 #   Column                         Non-Null Count    Dtype  
---  ------                         --------------    -----  
 0   CIF_OPEN_CHANNEL               1230527 non-null  object 
 1   EBANK_FIRST_OPEN_CHANNEL       1230527 non-null  object 
 2   EBANK_CHANNEL_GROUP            1230527 non-null  object 
 3   GENDER                         1230527 non-null  object 
 4   AGE                            1230527 non-null  float64
 5   MB2_FIRST_CHANNEL              1230527 non-null  object 
 6   MB2_CHANNEL_GROUP              1230527 non-null  object 
 7   EBANK_FLAG                     1230527 non-null  int64  
 8   LOGIN_MB2_DAY_CNT_1M           1230527 non-null  int64  
 9   LOGIN_MB1_DAY_CNT_1M           1230527 non-null  int64  
 10  QUICKVIEW_MB2_DAY_CNT_1M       1230527 non-null  int64  
 11  CC_FLAG                        1230527 non-null  int64  
 12  CC_SPENDING_AM

In [34]:
num_features = []

for i in num_columns:
    location = X.columns.get_loc(i)
    num_features.append(location)
print(num_features)

[4, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54]


In [35]:
cat_features = []

for i in cat_columns:
    location = X.columns.get_loc(i)
    cat_features.append(location)
print(cat_features)

[0, 1, 2, 3, 5, 6]


In [52]:
# Define column transformer
# Need to be numeric not string to specify columns name 
preprocess = make_column_transformer(
    (MinMaxScaler(), num_features),
    (OneHotEncoder(sparse=False), cat_features)
)
preprocess

## Fit preprocessing pipeline on train data and apply to train and test data

In [53]:
preprocess.fit(X_train)



In [54]:
# Apply preprocessing pipeline to train and test data
X_train_preprocessed = preprocess.transform(X_train)
X_test_preprocessed = preprocess.transform(X_test)

## Define SMOTE object and resample train data to address class imbalance

In [55]:
# Define SMOTE object
smote = SMOTE(sampling_strategy='auto', random_state=random_state)

In [58]:
# Fit SMOTE on preprocessed train data
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train)

## Save resampled train and test data to parquet files

In [60]:
pd.DataFrame(X_train_resampled).to_parquet("X_train_resampled.parquet",index=False)
pd.DataFrame(y_train_resampled).to_parquet("y_train_resampled.parquet",index=False)
pd.DataFrame(X_test_preprocessed).to_parquet("X_test_preprocessed.parquet",index=False)
pd.DataFrame(y_test).to_parquet("y_test.parquet",index=False)

# Define logistic regression model and fit on preprocessed train data

In [62]:
lr = LogisticRegression(random_state=random_state)
lr.fit(X_train_preprocessed, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Make predictions on test and resampled train data and print classification reports

In [70]:
predictions_test = lr.predict(X_test_preprocessed)
print(f"First 10 Predictions:   {predictions_test[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0 0 1 1 1 1 1 0 1 1]
First 10 Actual labels: [0, 0, 1, 1, 1, 1, 1, 0, 1, 1]


In [71]:
predictions_train = lr.predict(X_train_resampled)
print(f"First 10 Predictions:   {predictions_train[:10]}")
print(f"First 10 Actual labels: {y_train_resampled[:10].tolist()}")

First 10 Predictions:   [1 0 1 0 0 1 1 1 0 1]
First 10 Actual labels: [1, 0, 1, 1, 0, 1, 1, 1, 0, 1]


In [72]:
print(classification_report(y_test, predictions_test))


              precision    recall  f1-score   support

           0       0.95      0.86      0.90    183129
           1       0.93      0.98      0.95    344240

    accuracy                           0.94    527369
   macro avg       0.94      0.92      0.93    527369
weighted avg       0.94      0.94      0.94    527369


In [74]:
print(classification_report(y_train_resampled, predictions_train))


              precision    recall  f1-score   support

           0       0.97      0.86      0.91    803226
           1       0.87      0.98      0.92    803226

    accuracy                           0.92   1606452
   macro avg       0.92      0.92      0.92   1606452
weighted avg       0.92      0.92      0.92   1606452


In [76]:
cm = confusion_matrix(y_test, predictions_test)
print(cm)

[[156992  26137]
 [  7453 336787]]


In [75]:
import joblib
filename = './Models/Logistic_regression.sav'
joblib.dump(lr, filename)

['./Models/Logistic_regression.sav']