Remember that our data is imbalanced with respect to the target: more than 80% of the rows are existing customers.

I'll address this problem with random oversampling and then compare whether the model gets better results than with the original class balance. Oversampling balances the classes by generating copies of rows from the class that has fewer rows (the minority class).

In [1]:
import os

import numpy as np
import pandas as pd

from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import train_test_split

In [2]:
# Folder that holds the project's input/output CSV files.
# It can be overridden with the BANK_DATA_DIR environment variable so the
# notebook also runs on other machines; when the variable is unset (or empty)
# the original local folder is used, which keeps existing setups unchanged.
_default_data_dir = "C:/Users/barre/Desktop/Machine Learning/projects/bank/in"
_data_dir = os.environ.get("BANK_DATA_DIR") or _default_data_dir

# Path template: dir_in.format(<file name>) gives the full path of a file.
# Trailing separators are stripped to avoid "//", and literal braces in the
# folder name are escaped so that str.format only fills the final "{}".
dir_in = (
    _data_dir.rstrip("/\\").replace("{", "{{").replace("}", "}}") + "/{}"
)

In [3]:
# Read the dataset produced by the feature-selection step and preview it.
selection_path = dir_in.format('data_selection.csv')
df = pd.read_csv(selection_path)
df.head(3)

Unnamed: 0,Total_Trans_Ct,Total_Revolving_Bal,Total_Relationship_Count,Months_Inactive_12_mon,Target
0,42,777,5,1,1
1,33,864,6,1,1
2,20,0,4,1,1


In [4]:
# Separate the label column (y) from the predictor columns (X).
y = df['Target']
X = df.drop(columns=['Target'])

In [5]:
# Row counts for existing customers (Target == 1, shown as True)
# versus all other rows (shown as False).
is_existing = y == 1
print("Existing Customers")
print(is_existing.value_counts())

Existing Customers
True     8235
False    1561
Name: Target, dtype: int64


In [6]:
# How many attrited-customer rows (Target == 0) are needed to reach a 30/70
# attrited/existing split while keeping every existing-customer row.
# The existing-customer count is read from `y` instead of being typed in by
# hand, so the figure stays correct if the input data changes.
n_existing = int((y == 1).sum())
print((n_existing / 0.7) - n_existing)

3529.2857142857156


In [7]:
# Desired class sizes after resampling: keep every existing customer
# (class 1) and grow the attrited class (class 0) to roughly 30% of the rows.
# Both counts are derived from `y` rather than hard-coded; for 8235 existing
# customers the rounded-up attrited target is 3530.
n_existing = int((y == 1).sum())
n_attrited_target = int(np.ceil((n_existing / 0.7) - n_existing))

# A fixed random_state makes the sampler pick the same duplicate rows on
# every run, so the saved balanced dataset is reproducible.
ros = RandomOverSampler(
    sampling_strategy={1: n_existing, 0: n_attrited_target},
    random_state=42,
)

# oversampling
X_ros, y_ros = ros.fit_resample(X, y)

# checking balance (printed as fractions of the resampled rows)
print("---------- Proportion -----------")
print("% Attrited Customers:", len(y_ros[(y_ros == 0)]) / len(y_ros))
print("% Existing Customers: ", len(y_ros[(y_ros == 1)]) / len(y_ros))

---------- Proportion -----------
% Attrited Customers: 0.30004249893752655
% Existing Customers:  0.6999575010624735


# Saving

In [8]:
# Feature columns to keep in the saved dataset, in output order.
cols = [
    'Total_Trans_Ct',
    'Total_Revolving_Bal',
    'Total_Relationship_Count',
    'Months_Inactive_12_mon',
]

# Re-wrap the resampled features and target as DataFrames with explicit
# column names so they can be concatenated side by side.
y_ros = pd.DataFrame(data=y_ros, columns=['Target'])
X_ros = pd.DataFrame(data=X_ros, columns=cols)

In [9]:
# Put features and target back into one frame and write it next to the
# input file, without the row index.
df_ros = pd.concat(objs=[X_ros, y_ros], axis='columns')
balanced_path = dir_in.format('data_balanced.csv')
df_ros.to_csv(balanced_path, sep=',', index=False)