Data Preprocessing

In [None]:
# necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn.linear_model import LogisticRegression
import pickle


In [None]:
# Relative path to the dataset CSV (the "Churn Modeling" file)
file_path = "Churn Modeling.csv"

# Load the dataset
dataset = pd.read_csv(file_path)

# Explore the dataset
print("First few rows of the dataset:")
print(dataset.head())
print("\nInformation about the dataset:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit an extra "None" line).
dataset.info()
print("\nStatistical summary of the dataset:")
print(dataset.describe())

# Histogram of the "Exited" variable (the column used as the target further down)
plt.hist(dataset["Exited"])
plt.xlabel("Exited")
plt.ylabel("Count")
plt.title("Distribution of Exited")
plt.show()

# Preprocess the data.
# Build a new frame without the 'RowNumber' column; `dataset` itself is left untouched.
cleaned_dataset = dataset.drop(columns='RowNumber')

# Encode 'Gender' as a 0/1 flag (Female -> 1, Male -> 0)
gender_codes = {'Female': 1, 'Male': 0}
cleaned_dataset['Gender'] = cleaned_dataset['Gender'].replace(gender_codes)

# Discard every column that contains at least one null value
cleaned_dataset = cleaned_dataset.dropna(axis=1)

# Separate the model inputs from the target: the identifier/name columns,
# 'Geography' and the target itself are excluded from the features.
non_feature_columns = ["CustomerId", "Surname", "Exited", "Geography"]
selected_features = cleaned_dataset.drop(columns=non_feature_columns)
target_variable = cleaned_dataset["Exited"]

# Balance the dataset by undersampling the 0 class: the first `num_one_targets`
# rows whose target is 0 are kept and every later 0-row is removed, leaving as
# many 0s as 1s (when there are more 0s than 1s to begin with).
# NOTE: assumes the target only contains 0/1 values, so its sum is the number of 1s.
num_one_targets = int(np.sum(target_variable))

# Vectorised replacement for the row-by-row loop: a running count of 0-targets
# marks the rows where the quota of zeros has already been used up.
is_zero_target = (target_variable == 0).to_numpy()
zero_targets_seen = np.cumsum(is_zero_target)
surplus_zero_mask = is_zero_target & (zero_targets_seen > num_one_targets)

# Total number of 0-targets encountered (same final value the loop counter had)
zero_targets_counter = int(is_zero_target.sum())

# Collect index *labels* (not positions) because DataFrame.drop / Series.drop
# remove rows by label; this stays correct even if the index is not 0..n-1.
indices_to_remove = target_variable.index[surplus_zero_mask].tolist()

selected_features_balanced = selected_features.drop(indices_to_remove)
target_variable_balanced = target_variable.drop(indices_to_remove)

# Standardize the features using StandardScaler
# NOTE(review): the scaler is fitted on the whole balanced set before any
# train/validation/test split is made, so held-out rows contribute to the
# scaling statistics. Fitting on the training portion only would avoid this.
scaler = StandardScaler()
selected_features_scaled = scaler.fit_transform(selected_features_balanced)

# Shuffle the data with a fixed seed so a fresh "Restart & Run All" reproduces
# the same row order (and therefore the same splits further down).
shuffle_rng = np.random.default_rng(42)
shuffled_indices = shuffle_rng.permutation(selected_features_scaled.shape[0])

# Give the balanced frames a clean 0..n-1 index (rows were dropped above)
selected_features_balanced = selected_features_balanced.reset_index(drop=True)
target_variable_balanced = target_variable_balanced.reset_index(drop=True)

selected_features_shuffled = selected_features_scaled[shuffled_indices]
# Select targets by position and renumber the result, so that row k of the
# target always lines up with row k of the shuffled feature array regardless
# of whether later code indexes it by label or by position.
target_variable_shuffled = (
    target_variable_balanced.iloc[shuffled_indices].reset_index(drop=True)
)

# Persist the shuffled features and targets together in a single .npz archive,
# stored under the keys "features" and "target".
save_path = "preprocessed_datasets.npz"
arrays_to_save = {
    "features": selected_features_shuffled,
    "target": target_variable_shuffled,
}
np.savez(save_path, **arrays_to_save)

print("Preprocessed datasets saved successfully.")



Logistic Regression model

In [None]:

# Performance reporting function
def clf_performance(classifier, model_name, X=None, y=None):
    """Print a model's name followed by its accuracy on an evaluation set.

    Parameters
    ----------
    classifier : object
        Fitted estimator exposing ``score(X, y)``.
    model_name : str
        Label printed on the line above the accuracy.
    X, y : optional
        Evaluation inputs and targets. When omitted, the notebook-level
        ``X_test`` / ``y_test`` are used, so existing two-argument calls keep
        working; passing them explicitly removes the dependency on globals.

    Returns
    -------
    None
        The result is only printed.
    """
    # Resolve the fallbacks at call time: X_test / y_test are defined by the
    # notebook after this function, so they cannot be used as default values.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    print(model_name)
    print('Test Accuracy:', classifier.score(X, y))

# Hold out 20% of the shuffled samples for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    selected_features_shuffled,
    target_variable_shuffled,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic regression classifier with scikit-learn's default settings
logistic_model = LogisticRegression().fit(X_train, y_train)

# Report accuracy on the held-out split
clf_performance(logistic_model, "Logistic Regression")


Neural network model

In [None]:
# Split the shuffled features/targets (already in memory from the preprocessing
# cell) by position into train (80%), validation (10%) and test (the remainder).
samples_count = selected_features_shuffled.shape[0]
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Position where the validation slice ends and the test slice begins
validation_end = train_samples_count + validation_samples_count

train_inputs = selected_features_shuffled[:train_samples_count]
validation_inputs = selected_features_shuffled[train_samples_count:validation_end]
test_inputs = selected_features_shuffled[validation_end:]

train_targets = target_variable_shuffled[:train_samples_count]
validation_targets = target_variable_shuffled[train_samples_count:validation_end]
test_targets = target_variable_shuffled[validation_end:]

# For each split print: sum of targets, number of samples, and their ratio
# (a quick check of how balanced each split is).
for split_targets, split_count in (
    (train_targets, train_samples_count),
    (validation_targets, validation_samples_count),
    (test_targets, test_samples_count),
):
    positives = np.sum(split_targets)
    print(positives, split_count, positives / split_count)



Model training and evaluation

In [None]:

# Cast every split to the dtypes used for training: float32 inputs, int32 targets
train_inputs = train_inputs.astype(np.float32)
train_targets = train_targets.astype(np.int32)

validation_inputs = validation_inputs.astype(np.float32)
validation_targets = validation_targets.astype(np.int32)

test_inputs = test_inputs.astype(np.float32)
test_targets = test_targets.astype(np.int32)

# Convert target variable to one-hot encoded format.
# The class count is passed explicitly: without it to_categorical infers the
# width from the largest label present, so a split that happened to contain
# only 0s would be encoded with a single column and no longer match the
# two-unit output layer of the network.
num_target_classes = 2  # targets are treated as binary (0 / 1)
train_targets_encoded = tf.keras.utils.to_categorical(
    train_targets, num_classes=num_target_classes
)
validation_targets_encoded = tf.keras.utils.to_categorical(
    validation_targets, num_classes=num_target_classes
)
test_targets_encoded = tf.keras.utils.to_categorical(
    test_targets, num_classes=num_target_classes
)


# Model: a small fully connected network with a two-way softmax output
output_size = 2

model = tf.keras.models.Sequential([
    # Declare the input shape with an explicit Input object rather than the
    # `input_shape=` argument on the first Dense layer, which current Keras
    # releases flag as deprecated for Sequential models.
    tf.keras.Input(shape=(selected_features_shuffled.shape[1],)),
    tf.keras.layers.Dense(units=100, activation='relu'),
    tf.keras.layers.Dropout(0.5),  # Dropout regularization
    tf.keras.layers.Dense(units=40, activation='relu'),
    tf.keras.layers.Dense(units=output_size, activation='softmax')
])

# Compile the model (categorical cross-entropy matches the one-hot encoded targets)
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Define early stopping criteria: stop once the validation loss has not improved
# for 10 consecutive epochs. restore_best_weights rolls the model back to the
# best epoch seen; without it the model would keep the weights from the last
# (i.e. 10 epochs past the best) epoch when training is stopped early.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)


# Fit on the training split, tracking the validation split after every epoch;
# the early-stopping callback may end training before the 100-epoch limit.
history = model.fit(
    x=train_inputs,
    y=train_targets_encoded,
    validation_data=(validation_inputs, validation_targets_encoded),
    batch_size=32,
    epochs=100,
    callbacks=[early_stopping],
)

# Score the trained model on the held-out test split.
# evaluate() returns the loss followed by the compiled metric (accuracy).
evaluation = model.evaluate(x=test_inputs, y=test_targets_encoded)
test_loss, test_accuracy = evaluation

# Report both figures: loss with two decimals, accuracy as a percentage
print(f"Test Loss: {test_loss:.2f}")
print(f"Test Accuracy: {test_accuracy:.2%}")

Predictions

In [None]:
# Predict on the test inputs and round each output value to the nearest
# integer, then display the rounded array.
predicted_outputs = model.predict(test_inputs)
predictions = np.round(predicted_outputs, 0)
print(predictions)