# Install the ucimlrepo package

In [23]:
# The line below installs the ucimlrepo package,
# which is used to fetch datasets from the UCI Machine Learning Repository.
# It is commented out: uncomment it only if ucimlrepo is not already installed.
# !pip install ucimlrepo

# Import the dataset

In [24]:
# Import the pandas library for data manipulation and analysis
import pandas as pd

# https://archive.ics.uci.edu/dataset/222/bank+marketing
# Import the fetch_ucirepo function from the ucimlrepo package
from ucimlrepo import fetch_ucirepo

# Fetch the bank marketing dataset from the UCI repository using its numeric ID
# (222 is the ID that appears in the dataset URL noted above).
# NOTE(review): presumably this downloads the data over the network -- confirm against the ucimlrepo docs.
bank_marketing = fetch_ucirepo(id=222)

In [None]:
# Pull the features (independent variables) and the targets (dependent variable)
# out of the fetched dataset in a single step
X, y = bank_marketing.data.features, bank_marketing.data.targets

# Keep the feature column labels and the label of the first target column
features_names, target_name = X.columns, y.columns[0]

# Report, one item per line: the (rows, columns) shape of both dataframes,
# the feature column names, and the target column name
print(
    f"Shape of features: {X.shape}",
    f"Shape of targets: {y.shape}",
    f"Features names: {features_names}",
    f"Target name: {target_name}",
    sep="\n",
)

# Rebind bank_marketing so the name no longer refers to the fetched dataset object
# (X and y still hold their own references to the two dataframes)
bank_marketing = 0

In [None]:
# Preview the input data: show the first 5 rows of the features dataframe
X.head(n=5)

In [None]:
# Preview the target data: show the first 5 rows of the targets dataframe
y.head(n=5)

In [None]:
# Count the classes of the target variable
# Tally every class once up front instead of recomputing the counts on each loop iteration
target_column = y[target_name]
class_counts = target_column.value_counts()

# Walk the classes in order of first appearance (the order unique() returns)
for class_label in target_column.unique():
    # Print the class label and the number of rows carrying that label
    print(f"Class {class_label}: {class_counts[class_label]}")

In [None]:
# Report how many missing values each target column contains
print(f"Missing values of target variable: {y.isna().sum()}")

# Tally the missing values per feature column and show the result
missing_values = X.isna().sum()
print(missing_values)

In [None]:
# Keep only the feature columns that contain no missing values.
# dropna() discards rows by default; axis="columns" (the same as axis=1)
# makes it discard every column that holds at least one missing value instead.
X = X.dropna(axis="columns")
# Show the shape of the features dataframe that remains after the drop
print(X.shape)

In [None]:
# Show the data type of every feature column, then the data type of the target column(s)
print(X.dtypes, y.dtypes, sep="\n")

In [32]:
# One-hot encode the categorical feature columns.
# Each categorical column is replaced by one indicator column per distinct category,
# giving the model a purely numeric/boolean input.
# Example: a column 'color' with the categories ['red', 'green', 'blue'] becomes the three columns
# 'color_red', 'color_green' and 'color_blue'; in every row only the column that matches
# the original category is set (True) and the other two are unset (False).
XcatEncoded = pd.get_dummies(data=X)

In [None]:
# List the column labels that exist after one-hot encoding,
# which shows how each categorical variable was expanded into indicator columns
print(XcatEncoded.columns)

# Preview the first 5 rows of the encoded data
# to see the original categorical values expressed as indicator columns
XcatEncoded.head(n=5)

# Split the data into training, validation, and testing sets

In [34]:
# Import the numpy library for numerical operations
import numpy as np

# Import the train_test_split function from sklearn to split the data into training, validation, and testing sets
from sklearn.model_selection import train_test_split

# Import the DecisionTreeClassifier class from sklearn to create and train a decision tree model
from sklearn.tree import DecisionTreeClassifier

# Import the accuracy_score function from sklearn to evaluate the accuracy of the model
from sklearn.metrics import accuracy_score

In [35]:
# Hold out 20% of the rows as the test set (X_test, y_test) so the model can be
# evaluated on data it never saw; the remaining 80% goes to (X_temp, y_temp).
# random_state=42 makes the shuffle, and therefore the split, reproducible.
# NOTE(review): if the class counts printed earlier turn out to be imbalanced,
# consider passing stratify=y so both parts keep the same class ratio.
X_temp, X_test, y_temp, y_test = train_test_split(
    XcatEncoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Train a logistic regression classifier

In [None]:
# Import the LogisticRegression class from sklearn to create and train a logistic regression model
from sklearn.linear_model import LogisticRegression

# Create the classifier: at most 1000 solver iterations, with a fixed seed for reproducibility
clf = LogisticRegression(max_iter=1000, random_state=42)

# Train the logistic regression model on the temporary training set.
# y_temp is a single-column DataFrame; flatten it to a 1-D array of shape (n_samples,),
# which is the shape fit() expects, so scikit-learn does not have to convert a
# column vector itself (and emit a DataConversionWarning while doing so).
clf.fit(X_temp, y_temp.to_numpy().ravel())

In [None]:
# Bring in the remaining evaluation metrics
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Model outputs on the training set: hard class predictions, plus the predicted
# probability taken from column 1 of predict_proba, i.e. the class at clf.classes_[1]
# (presumably 'yes' -- confirm against clf.classes_)
y_train_pred = clf.predict(X_temp)
y_train_prob = clf.predict_proba(X_temp)[:, 1]

# Label-based metrics on the training set; 'yes' is treated as the positive class
train_accuracy = accuracy_score(y_temp, y_train_pred)
train_precision = precision_score(y_temp, y_train_pred, pos_label='yes')
train_recall = recall_score(y_temp, y_train_pred, pos_label='yes')
train_f1 = f1_score(y_temp, y_train_pred, pos_label='yes')

# Ranking-based metric on the training set, computed from the predicted probabilities
train_roc_auc = roc_auc_score(y_temp, y_train_prob)

# Report every training-set metric with four decimals, one per line
print(
    f"Training Set Accuracy: {train_accuracy:.4f}",
    f"Training Set Precision: {train_precision:.4f}",
    f"Training Set Recall: {train_recall:.4f}",
    f"Training Set F1 Score: {train_f1:.4f}",
    f"Training Set ROC AUC: {train_roc_auc:.4f}",
    sep="\n",
)

In [None]:
# Import the confusion_matrix function and the plotting libraries
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate the confusion matrix for the training set.
# The label order is pinned explicitly so that row/column 0 is always 'no' and
# row/column 1 is always 'yes', matching the hard-coded tick labels below, and so
# the matrix stays 2x2 even if one class is absent from the data being scored.
train_confusion_matrix = confusion_matrix(y_temp, y_train_pred, labels=['no', 'yes'])

# Plot the confusion matrix for the training set
plt.figure(figsize=(3, 3))
# Draw the matrix as an annotated heatmap (integer cell counts, blue colour scale)
sns.heatmap(train_confusion_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.xlabel('Predicted')  # Columns hold the predicted labels
plt.ylabel('Actual')  # Rows hold the true labels
plt.title('Training Set Confusion Matrix')  # Title of the plot
plt.show()  # Display the plot


In [None]:
# Model outputs on the test set: hard class predictions, plus the predicted
# probability taken from column 1 of predict_proba, i.e. the class at clf.classes_[1]
# (presumably 'yes' -- confirm against clf.classes_)
y_test_pred = clf.predict(X_test)
y_test_prob = clf.predict_proba(X_test)[:, 1]

# Label-based metrics on the test set; 'yes' is treated as the positive class
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, pos_label='yes')
test_recall = recall_score(y_test, y_test_pred, pos_label='yes')
test_f1 = f1_score(y_test, y_test_pred, pos_label='yes')

# Ranking-based metric on the test set, computed from the predicted probabilities
test_roc_auc = roc_auc_score(y_test, y_test_prob)

# Report every test-set metric with four decimals, one per line
print(
    f"Test Set Accuracy: {test_accuracy:.4f}",
    f"Test Set Precision: {test_precision:.4f}",
    f"Test Set Recall: {test_recall:.4f}",
    f"Test Set F1 Score: {test_f1:.4f}",
    f"Test Set ROC AUC: {test_roc_auc:.4f}",
    sep="\n",
)

In [None]:
# Calculate the confusion matrix for the test set.
# The label order is pinned explicitly so that row/column 0 is always 'no' and
# row/column 1 is always 'yes', matching the hard-coded tick labels below, and so
# the matrix stays 2x2 even if one class is absent from the data being scored.
test_confusion_matrix = confusion_matrix(y_test, y_test_pred, labels=['no', 'yes'])

# Plot the confusion matrix for the test set
plt.figure(figsize=(3, 3))
# Draw the matrix as an annotated heatmap (integer cell counts, blue colour scale)
sns.heatmap(test_confusion_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
plt.xlabel('Predicted')  # Columns hold the predicted labels
plt.ylabel('Actual')  # Rows hold the true labels
plt.title('Test Set Confusion Matrix')  # Title of the plot
plt.show()  # Display the plot