In [10]:
import pandas as pd

In [11]:
# Uncomment the following line to install the ucimlrepo package into the kernel's environment
# %pip install ucimlrepo

In [None]:
# Load the UCI "Bank Marketing" dataset (repository id 222)
# https://archive.ics.uci.edu/dataset/222/bank+marketing
from ucimlrepo import fetch_ucirepo

bank_marketing = fetch_ucirepo(id=222)

# Feature frame and target frame, as exposed by the fetched dataset object
X, y = bank_marketing.data.features, bank_marketing.data.targets

# Column labels kept for later cells: all feature columns, and the first target column
features_names, target_name = X.columns, y.columns[0]

print(f"Shape of features: {X.shape}")
print(f"Shape of targets: {y.shape}")
print(f"Features names: {features_names}")
print(f"Target name: {target_name}")

# Rebind the dataset handle to a placeholder value; it is not referenced
# again in the cells that follow (X and y keep the data alive).
bank_marketing = 0

In [None]:
# Preview the first five feature rows (five is also the head() default)
X.head(n=5)

In [None]:
# Preview the first five target rows (five is also the head() default)
y.head(n=5)

In [None]:
# Count the rows belonging to each class of the target variable.
# The per-class counts are computed once up front rather than re-running
# value_counts() for every class; classes are still printed in order of
# first appearance (the order returned by unique()).
class_counts = y[target_name].value_counts()
for i in y[target_name].unique():
    print(f"Class {i}: {class_counts[i]}")

In [None]:
# Missing-value audit, target first
print(f"Missing values of target variable: {y.isna().sum()}")

# ...then one missing-value count per feature column
missing_values = X.isna().sum()
print(missing_values)

In [None]:
# Keep only the feature columns that contain no missing values
# (axis="columns" makes dropna discard columns rather than rows).
X = X.dropna(axis="columns")

# Shapes after the drop: features first, then the (untouched) target
print(X.shape, y.shape, sep="\n")

In [None]:
# Show the dtype of every feature column, followed by the dtype of the target column(s)
print(X.dtypes, y.dtypes, sep="\n")

In [None]:
# One-hot encode the features: pd.get_dummies replaces the categorical
# columns with indicator columns and returns a new frame.
XcatEncoded = pd.get_dummies(data=X)

# Column labels of the encoded frame
print(XcatEncoded.columns)

# Last expression of the cell: preview of the encoded data
XcatEncoded.head(n=5)

# Split the data into training, validation, and testing sets

In [20]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Two-stage hold-out split, both stages seeded with random_state=42:
#   1) set aside 20% of all rows as the test set;
#   2) take 25% of the remaining 80% (i.e. about 20% of all rows) as the validation set.
# Net result: roughly 60% train / 20% validation / 20% test.
X_temp, X_test, y_temp, y_test = train_test_split(
    XcatEncoded,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=42,
)

# Train a decision tree classifier

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# Create a decision tree classifier. Hyperparameters set below:
# - max_depth: maximum depth of the tree, from 1 upwards (default=None, i.e. unlimited)
# - min_samples_leaf: minimum number of samples required at a leaf node, from 1 upwards (default=1)
# - ccp_alpha: complexity parameter for cost-complexity pruning, from 0.0 upwards (default=0.0)
# - criterion: function measuring the quality of a split: 'gini', 'entropy' or 'log_loss' (default='gini')
# - random_state: fixed seed. scikit-learn permutes the candidate features at every
#   split, so without a seed equally good splits may be chosen differently between
#   runs; the data splits in this notebook already use the same seed (42).

clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=2,
    min_samples_leaf=2,
    ccp_alpha=0.0,
    random_state=42,
)

clf.fit(X_train, y_train)

# Make predictions on the training set, and evaluate the classifier
y_pred = clf.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
print(f"Training Accuracy: {accuracy:.2f}")

# Make predictions on the validation set, and evaluate the classifier
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.2f}")


# Make predictions on the test set, and evaluate the classifier
# (y_pred and accuracy end up holding the test-set values)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.2f}")

In [None]:
# Summary of the fitted tree: realised depth and leaf count, plus the
# hyperparameters stored on the estimator.
print(
    f"Depth of the tree: {clf.get_depth()}",
    f"Min samples leaf: {clf.min_samples_leaf}",
    f"Ccp alpha: {clf.ccp_alpha}",
    f"Criterion: {clf.criterion}",
    f"Number of leaves of the tree: {clf.get_n_leaves()}",
    sep="\n",
)

In [None]:
# Visualize the decision tree, but only when it is small enough to stay readable
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

n_leaves = clf.get_n_leaves()
if n_leaves < 10:
    plt.figure(figsize=(20, 10))
    plot_tree(
        clf,
        filled=True,
        # Plain lists rather than a pandas Index / NumPy array: some scikit-learn
        # releases validate these two parameters strictly as lists. This also
        # matches how feature names are passed to export_text in this notebook.
        feature_names=list(XcatEncoded.columns),
        # Labels come from the fitted estimator so they line up with the
        # class order the tree itself uses.
        class_names=[str(label) for label in clf.classes_],
        fontsize=10,
    )
    plt.show()
else:
    print("The tree is too large to visualize, number of leaves: ", n_leaves)

In [None]:
# Render the fitted tree as indented text rules, labelled with the encoded column names
from sklearn.tree import export_text

encoded_feature_names = XcatEncoded.columns.tolist()
tree_rules = export_text(clf, feature_names=encoded_feature_names)
print(tree_rules)

# Cross-validation

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Cross-validation configuration
# Number of folds the training set is divided into
NOF_FOLDS = 5
# Share of all rows held out as the test set (a fraction, passed as test_size)
PERCENTAGE_TEST = 0.2

In [None]:
# Class balance of the full target: absolute counts, then relative frequencies
print(
    f"Number of instances in the target variable: {y.value_counts()}",
    f"Percentage of classes in the target variable: {y.value_counts(normalize=True)}",
    sep="\n",
)

In [28]:
# Step 1: hold out a test set (plain random split, no stratification).
# The remaining rows form the training set that cross-validation runs on.
X_train, X_test, y_train, y_test = train_test_split(
    XcatEncoded,
    y,
    test_size=PERCENTAGE_TEST,
    random_state=42,
)

In [None]:
# Then, perform (plain, non-stratified) K-fold cross-validation on the training set.
# For each fold: report the class balance of both parts, fit a fresh tree on the
# training part, and score it on the training and validation parts. The per-fold
# accuracies are accumulated and averaged once all folds are done.
from sklearn.model_selection import KFold

kf = KFold(n_splits=NOF_FOLDS, shuffle=True, random_state=42)

# Running sums of the per-fold accuracies; divided by NOF_FOLDS after the loop
average_accuracy_train = 0
average_accuracy_val = 0

for fold_index, (train_index, val_index) in enumerate(kf.split(X_train)):
    # kf.split yields positional indices, hence .iloc
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Print the number of instances per class in the training and validation folds
    print(f"Number of instances per class in the training fold: {y_train_fold.value_counts()}")
    print(f"Percentage of instances per class in the training fold: {y_train_fold.value_counts(normalize=True)}")
    print(f"Number of instances per class in the validation fold: {y_val_fold.value_counts()}")
    print(f"Percentage of instances per class in the validation fold: {y_val_fold.value_counts(normalize=True)}")

    # A new classifier per fold. random_state is fixed so that the fold scores
    # are reproducible: scikit-learn permutes candidate features at each split,
    # and the fold assignment above is already seeded with the same value.
    clf = DecisionTreeClassifier(
        criterion='gini',
        max_depth=5,
        min_samples_leaf=2,
        ccp_alpha=0.01,
        random_state=42,
    )
    clf.fit(X_train_fold, y_train_fold)

    # Accuracy on the data the tree was fitted on
    y_pred_train = clf.predict(X_train_fold)
    accuracy_train = accuracy_score(y_train_fold, y_pred_train)
    print(f"Training Accuracy: {accuracy_train:.5f}, fold {fold_index+1} of {NOF_FOLDS}")

    # Accuracy on the held-out part of this fold
    y_pred_val = clf.predict(X_val_fold)
    accuracy_val = accuracy_score(y_val_fold, y_pred_val)
    print(f"Validation Accuracy: {accuracy_val:.5f}, fold {fold_index+1} of {NOF_FOLDS}")
    print(100*"-")

    average_accuracy_train += accuracy_train
    average_accuracy_val += accuracy_val

print(100*"-")
average_accuracy_train /= NOF_FOLDS
print(f"Average Training Accuracy: {average_accuracy_train:.5f}")
average_accuracy_val /= NOF_FOLDS
print(f"Average Cross-validation Accuracy: {average_accuracy_val:.5f}")

# Stratified cross-validation

In [48]:
# Configuration for the stratified run
# Number of folds the training set is divided into
NOF_FOLDS = 5
# Share of all rows held out as the test set (a fraction, passed as test_size)
PERCENTAGE_TEST = 0.2

In [30]:
# Step 1: hold out a test set using stratified sampling. Passing stratify=y
# asks train_test_split to keep the class proportions of y in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    XcatEncoded,
    y,
    test_size=PERCENTAGE_TEST,
    random_state=42,
    stratify=y,
)

In [None]:
# Class balance after the stratified split: absolute counts and relative
# frequencies, first for the training set and then for the test set.
print(
    f"Number of instances in the training set: {y_train.value_counts()}",
    f"Percentage of classes in the training set: {y_train.value_counts(normalize=True)}",
    f"Number of instances in the test set: {y_test.value_counts()}",
    f"Percentage of classes in the test set: {y_test.value_counts(normalize=True)}",
    sep="\n",
)

In [None]:
# Then, perform stratified K-fold cross-validation on the training set.
# Unlike plain KFold, the splitter is given the labels (y_train) so it can
# stratify the folds by class. For each fold: report the class counts, fit a
# fresh tree on the training part, and score it on both parts. The per-fold
# accuracies are accumulated and averaged once all folds are done.
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=NOF_FOLDS, shuffle=True, random_state=42)

# Running sums of the per-fold accuracies; divided by NOF_FOLDS after the loop
average_accuracy_train = 0
average_accuracy_val = 0

for fold_index, (train_index, val_index) in enumerate(kf.split(X_train, y_train)):
    # kf.split yields positional indices, hence .iloc
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Print the number of instances per class in the training and validation folds
    print(f"Number of instances per class in the training fold: {y_train_fold.value_counts()}")
    print(f"Number of instances per class in the validation fold: {y_val_fold.value_counts()}")

    # A new classifier per fold. random_state is fixed so that the fold scores
    # are reproducible: scikit-learn permutes candidate features at each split,
    # and the fold assignment above is already seeded with the same value.
    clf = DecisionTreeClassifier(
        criterion='gini',
        max_depth=5,
        min_samples_leaf=2,
        ccp_alpha=0.01,
        random_state=42,
    )
    clf.fit(X_train_fold, y_train_fold)

    # Accuracy on the data the tree was fitted on
    y_pred_train = clf.predict(X_train_fold)
    accuracy_train = accuracy_score(y_train_fold, y_pred_train)
    print(f"Training Accuracy: {accuracy_train:.5f}, fold {fold_index+1} of {NOF_FOLDS}")

    # Accuracy on the held-out part of this fold
    y_pred_val = clf.predict(X_val_fold)
    accuracy_val = accuracy_score(y_val_fold, y_pred_val)
    print(f"Validation Accuracy: {accuracy_val:.5f}, fold {fold_index+1} of {NOF_FOLDS}")
    print(100*"-")

    average_accuracy_train += accuracy_train
    average_accuracy_val += accuracy_val

print(100*"-")
average_accuracy_train /= NOF_FOLDS
print(f"Average Training Accuracy: {average_accuracy_train:.5f}")
average_accuracy_val /= NOF_FOLDS
print(f"Average Cross-validation Accuracy: {average_accuracy_val:.5f}")