# Install the ucimlrepo package

In [None]:
# The commented-out line below installs the ucimlrepo package,
# which is used to fetch datasets from the UCI Machine Learning Repository.
# If ucimlrepo is already installed, leave the line commented out;
# otherwise, remove the leading "# " and run this cell.
# !pip install ucimlrepo

# Import the dataset

In [2]:
# Import the pandas library for data manipulation and analysis
import pandas as pd

# Import the fetch_ucirepo function from the ucimlrepo package
from ucimlrepo import fetch_ucirepo

# Fetch the bank marketing dataset from the UCI repository using its ID (222).
# Dataset page: https://archive.ics.uci.edu/dataset/222/bank+marketing
bank_marketing = fetch_ucirepo(id=222)

In [None]:
# Split the fetched dataset into the features frame (independent variables)
# and the targets frame (dependent variable)
X, y = bank_marketing.data.features, bank_marketing.data.targets

# Keep the feature column labels and the label of the first target column
features_names, target_name = X.columns, y.columns[0]

# Report the shape (rows, columns) of each frame, followed by the column names
print(f"Shape of features: {X.shape}")
print(f"Shape of targets: {y.shape}")
print(f"Features names: {features_names}")
print(f"Target name: {target_name}")

# Rebind bank_marketing to a placeholder so the fetched object is no longer
# referenced through this name; X and y still hold the frames extracted above
bank_marketing = 0

In [None]:
# Preview the input data: show the first 5 rows of the features dataframe
X.head(n=5)

In [None]:
# Preview the target data: show the first 5 rows of the targets dataframe
y.head(n=5)

In [None]:
# Count the classes of the target variable.
# value_counts() is computed once up front instead of once per class,
# because its result does not change between loop iterations.
class_counts = y[target_name].value_counts()

# Iterate over unique() exactly as before so the classes are printed
# in the same order
for class_label in y[target_name].unique():
    # Print the class label and the number of rows carrying that label
    print(f"Class {class_label}: {class_counts[class_label]}")

In [None]:
# Missing entries in the targets dataframe, counted per column
# (isna() is the same check as isnull())
missing_values = y.isna().sum()
print(f"Missing values of target variable: {missing_values}")

# Missing entries in the features dataframe, counted per column;
# missing_values ends up holding these feature counts
missing_values = X.isna().sum()
print(missing_values)

In [None]:
# Discard every feature column that contains at least one missing value.
# dropna() removes rows by default; axis="columns" (the same as axis=1)
# makes it remove columns instead.
X = X.dropna(axis="columns")
# Show the (rows, columns) shape that remains after the columns were dropped
print(X.shape)

In [None]:
# Print the data types of the features, then of the target variable,
# each listing starting on its own line
print(X.dtypes, y.dtypes, sep="\n")

In [10]:
# One-hot encode the categorical variables so they can be fed to ML algorithms.
# Each categorical column is replaced by one indicator column per category.
# Example: a column 'color' with categories ['red', 'green', 'blue'] becomes
# 'color_red', 'color_green' and 'color_blue'; for every row, only the column
# matching its original category is set.
# NOTE(review): indicators are True/False in recent pandas releases; older
# releases may emit 0/1 integers - confirm against the installed version.
X = pd.get_dummies(data=X)

In [None]:
# List the column names produced by one-hot encoding to see how each
# categorical variable was expanded into binary columns
print(X.columns)

# Preview the first 5 rows of the encoded features to see how the original
# categorical values now appear as binary columns
X.head(n=5)