In [85]:
from utility import *
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('data/processed/BankChurners_after_preprocessing.csv')

In [86]:
RANDOM_STATE = 5
TARGET_COL_NAME = 'Attrition_Flag'

In [87]:
# Encode the target column
df[TARGET_COL_NAME].replace({'Existing Customer': 1, 'Attrited Customer': 0}, inplace=True)

# Encode boolean columns using astype(int) method
bool_columns = ['Gender_F', 'Gender_M', 'Marital_Status_Divorced', 'Marital_Status_Married', 'Marital_Status_Single']
df[bool_columns] = df[bool_columns].astype(int)

Split the data to training and testing sets

In [88]:
target = df[TARGET_COL_NAME]
df_train = drop_col(df, 'id')
df_train = drop_col(df_train, TARGET_COL_NAME)
X_train, X_test, y_train, y_test = train_test_split(df_train, target, test_size=0.25, random_state=RANDOM_STATE)

### Baseline: Logistic Regression

We'll use logistic regression as a baseline, just like in the laboratories, and utilize the other models to try to enhance the findings provided by the baseline.

In [89]:
model = LogisticRegression(max_iter=1000, tol=0.001)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8874407582938388


In [90]:
# Apply oversampling to the training set using SMOTE
oversampler = SMOTE(random_state=RANDOM_STATE)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

In [91]:
model = LogisticRegression(max_iter=1000, tol=0.001)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8364928909952607


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Multi Layered Perceptron (MLP)

In [64]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix,\
                classification_report, accuracy_score


model_nnet = MLPClassifier(hidden_layer_sizes=[5,5,5],
                           alpha=0.001,
                           activation='logistic',
                           max_iter=800,
                           solver='lbfgs',random_state=RANDOM_STATE)
model_nnet.fit(X_train,y_train)
y_pred = model_nnet.predict(X_train)

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_train, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8431863067807768


In [65]:
# Scale the input features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Apply oversampling to the training set using SMOTE
oversampler = SMOTE(random_state=RANDOM_STATE)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

Results from MLP strongly depend on the setting of its hyperparameters. Since the training does not take too long, we decided to use Grid search for various hyperparameters.

In [73]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
import warnings
from sklearn.exceptions import ConvergenceWarning


# Define the parameter grid for random search
param_grid = {
    'hidden_layer_sizes': [[2], [2, 2], [4, 4], [5, 5], [8, 8], [2, 2, 2], [8, 8, 8]],
    'alpha': uniform(loc=0.001, scale=0.1)
}

# Create an MLPClassifier instance
model_nnet = MLPClassifier(activation='logistic', max_iter=1000, solver='lbfgs', random_state=RANDOM_STATE)

# Perform random search with F1 score as the scoring metric, ignore warning that training stopped on max_iter
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    random_search = RandomizedSearchCV(model_nnet, param_distributions=param_grid, n_iter=10, random_state=42,
                                       scoring='f1')
    random_search.fit(X_train, y_train)

# Get the best hyperparameters
best_params = random_search.best_params_
best_score = random_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best F1 Score:", best_score)

Best Hyperparameters: {'alpha': 0.03845401188473625, 'hidden_layer_sizes': [8, 8]}
Best F1 Score: 0.947049271900843


In [75]:
model_nnet = MLPClassifier(hidden_layer_sizes=best_params['hidden_layer_sizes'],
                           alpha=best_params['alpha'],
                           activation='logistic',
                           max_iter=2500,
                           solver='lbfgs',random_state=RANDOM_STATE)
model_nnet.fit(X_train,y_train)
y_pred = model_nnet.predict(X_train)

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_train, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9647095565271705


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [None]:
# Prepare the data
X = df.drop([TARGET_COL_NAME], axis=1).values
y = df[TARGET_COL_NAME].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=RANDOM_STATE)

# Scale the input features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Apply oversampling to the training set using SMOTE
oversampler = SMOTE(random_state=RANDOM_STATE)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

# Convert the numpy arrays to tensors
X_train = torch.Tensor(X_train)
y_train = torch.Tensor(y_train).view(-1, 1)
X_test = torch.Tensor(X_test)
y_test = torch.Tensor(y_test).view(-1, 1)

# Define the neural network architecture
class Net(nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, 8)
        self.dropout1 = nn.Dropout(0.5)  # Dropout layer with 50% probability
        self.fc2 = nn.Linear(8, 4)
        self.dropout2 = nn.Dropout(0.5)  # Dropout layer with 50% probability
        self.fc3 = nn.Linear(4, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.dropout2(x)
        x = self.fc3(x)
        x = self.sigmoid(x)
        return x

# Initialize the neural network
input_size = X_train.shape[1]
net = Net(input_size)

# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr=0.0006)

# Train the neural network
num_epochs = 100
batch_size = 32

loss_values = []  # To store the loss values

for epoch in range(num_epochs):
    for i in range(0, X_train.size(0), batch_size):
        inputs = X_train[i:i+batch_size]
        targets = y_train[i:i+batch_size]

        # Forward pass
        outputs = net(inputs)
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Store the loss value
    loss_values.append(loss.item())

    # Print the loss every 10 epochs
    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

# Plot the loss values
plt.plot(loss_values)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.show()

# Evaluate the trained model on the test set
with torch.no_grad():
    net.eval()
    y_pred = net(X_test)
    y_pred = torch.round(y_pred)

    accuracy = (y_pred == y_test).sum().item() / y_test.size(0)
    print(f'Test Accuracy: {accuracy}')