In [30]:
# Placeholder bindings for X and y (presumably replaced by the dataset notebook that is run below — TODO confirm)
X = y = 0

In [None]:
# This code is downloading the notebook from GitHub and running it
import requests
from pathlib import Path
url = "https://github.com/nbakas/MachineLearning/blob/main/08-BankingDataset.ipynb"
filename = url.split("/")[-1]
local_path = Path.cwd() / filename
if not local_path.exists():
    response = requests.get(url)
    response.raise_for_status()
    local_path.write_bytes(response.content)
%run 08-BankingDataset.ipynb

# Split the data into training, validation, and testing sets

In [32]:
# Import the numpy library for numerical operations
import numpy as np

# Import the train_test_split function from sklearn to split the data into training, validation, and testing sets
from sklearn.model_selection import train_test_split

# Import the DecisionTreeClassifier class from sklearn to create and train a decision tree model
from sklearn.tree import DecisionTreeClassifier

# Import the accuracy_score function from sklearn to evaluate the accuracy of the model
from sklearn.metrics import accuracy_score

In [33]:
# Step 1 of 2: hold out 20% of the full dataset as the test set (X_test, y_test).
# The remaining 80% (X_temp, y_temp) is divided again in the next step.
# random_state fixes the shuffle so the split is the same on every run.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [34]:
# Step 2 of 2: divide the temporary set into training (X_train, y_train) and validation (X_val, y_val) sets.
# Taking 25% of the temporary set (which is 80% of the data) gives a validation set of 20% of the original data,
# leaving 60% of the original data for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    random_state=42,
)

# Train a decision tree classifier

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
# Create a decision tree classifier. Hyperparameters used here:
# - max_depth: maximum depth of the tree; a positive integer, or None (the default) for no limit
# - min_samples_leaf: minimum number of samples required at a leaf node (scikit-learn default=1; 2 is used here)
# - ccp_alpha: complexity parameter for cost-complexity pruning; a non-negative value (default=0.0)
# - criterion: function used to measure the quality of a split; 'gini', 'entropy' or 'log_loss' (default='gini')
# - random_state: fixed seed so that re-running the cell builds the same tree (same seed as the data splits)

clf = DecisionTreeClassifier(criterion='gini', max_depth=2, min_samples_leaf=2, ccp_alpha=0.0, random_state=42)

clf.fit(X_train, y_train)

# Make predictions on the training, validation and test sets in turn, and report the accuracy of each.
# After the loop, y_pred and accuracy hold the values of the last split evaluated (the test set).
for split_name, X_split, y_split in (
    ("Training", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
):
    y_pred = clf.predict(X_split)
    accuracy = accuracy_score(y_split, y_pred)
    print(f"{split_name} Accuracy: {accuracy:.2f}")

In [None]:
# Summarize the fitted tree: each entry pairs a label with the value read from the classifier
tree_summary = [
    ("Depth of the tree", clf.get_depth()),  # depth of the fitted tree
    ("Min samples leaf", clf.min_samples_leaf),  # minimum number of samples required at a leaf node
    ("Ccp alpha", clf.ccp_alpha),  # complexity parameter for the cost-complexity pruning
    ("Criterion", clf.criterion),  # function used to measure the quality of a split
    ("Number of leaves of the tree", clf.get_n_leaves()),  # number of leaves of the fitted tree
]

# Print one "label: value" line per entry
for summary_label, summary_value in tree_summary:
    print(f"{summary_label}: {summary_value}")

In [None]:
# Import necessary libraries for visualization
from sklearn.tree import plot_tree  # plot_tree is used to visualize the decision tree
import matplotlib.pyplot as plt  # matplotlib.pyplot is used for creating static, animated, and interactive visualizations in Python

# Only plot small trees (fewer than 10 leaves); larger ones are not readable as a figure
n_leaves = clf.get_n_leaves()
if n_leaves < 10:
    # Set the size of the figure for better visualization
    plt.figure(figsize=(20, 10))

    # Plot the decision tree with filled nodes, feature names, class names, and specified font size
    plot_tree(clf,  # The fitted decision tree
              filled=True,  # Fill the nodes with colors
              feature_names=list(X.columns),  # Column names as a plain list, consistent with the export_text call
              # Class labels as strings, taken from the fitted classifier so that they
              # match the classes (and their order) the tree was actually trained on;
              # str() keeps numeric labels from breaking the node text
              class_names=[str(label) for label in clf.classes_],
              fontsize=10)  # Set the font size for the text in the plot

    # Display the plot
    plt.show()
else:
    # Print a message indicating that the tree is too large to visualize
    print("The tree is too large to visualize, number of leaves: ", n_leaves)

In [None]:
# export_text renders the fitted decision tree as plain-text rules
from sklearn.tree import export_text

# The feature names are passed as a plain Python list built from the columns of X
feature_name_list = list(X.columns)
tree_rules = export_text(clf, feature_names=feature_name_list)

# Show the rules
print(tree_rules)

# Cross-validation

In [39]:
# Number of folds used for cross-validation
NOF_FOLDS = 5

# Fraction of the data held out as the test set
PERCENTAGE_TEST = 0.2

In [None]:
# Class balance of the target variable: absolute counts first, then relative frequencies
class_counts = y.value_counts()
# normalize=True makes value_counts() return the relative frequency of each class instead of the count
class_shares = y.value_counts(normalize=True)

print(f"Number of instances in the target variable: {class_counts}")
print(f"Percentage of classes in the target variable: {class_shares}")

In [41]:
# Hold out a test set before cross-validating:
# - X, y are the features and the target variable
# - test_size is the proportion of the dataset that goes into the test split
# - random_state makes the split reproducible, so re-running the cell gives the same result
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=PERCENTAGE_TEST,
    random_state=42,
)

In [42]:
# Import KFold from sklearn.model_selection for cross-validation
from sklearn.model_selection import KFold

In [43]:
# Build the K-fold splitter:
# - n_splits: number of folds
# - shuffle: shuffle the data before splitting it into folds
# - random_state: seed for the shuffle, for reproducibility
kf = KFold(
    n_splits=NOF_FOLDS,
    shuffle=True,
    random_state=42,
)

In [None]:
# K-fold cross-validation on the training set: fit one tree per fold, report per-fold
# and average accuracies, and check that the validation folds together cover the training set.

# Per-fold accuracy scores, collected so that they can be averaged after the loop
train_accuracies = []
val_accuracies = []

# Positional validation indices gathered from every fold (used for the coverage check below)
all_val_indices = []

# Loop over each fold in the KFold split
for fold_index, (train_index, val_index) in enumerate(kf.split(X_train)):
    # Split the data into training and validation sets for the current fold
    # (the indices are positional, hence .iloc)
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Print the number and percentage of instances per class in the training fold
    print(f"Number of instances per class in the training fold: {y_train_fold.value_counts()}")
    print(f"Percentage of instances per class in the training fold: {y_train_fold.value_counts(normalize=True)}")

    # Print the number and percentage of instances per class in the validation fold
    print(f"Number of instances per class in the validation fold: {y_val_fold.value_counts()}")
    print(f"Percentage of instances per class in the validation fold: {y_val_fold.value_counts(normalize=True)}")

    # Initialize and train the DecisionTreeClassifier on the training fold.
    # random_state is fixed so that re-running the cell reproduces the same trees and scores.
    clf = DecisionTreeClassifier(criterion='gini', max_depth=5, min_samples_leaf=2, ccp_alpha=0.01, random_state=42)
    clf.fit(X_train_fold, y_train_fold)

    # Predict and calculate accuracy for the training fold
    y_pred_train = clf.predict(X_train_fold)
    accuracy_train = accuracy_score(y_train_fold, y_pred_train)
    print(f"Training Accuracy: {accuracy_train:.5f}, fold {fold_index+1} of {NOF_FOLDS}")

    # Predict and calculate accuracy for the validation fold
    y_pred_val = clf.predict(X_val_fold)
    accuracy_val = accuracy_score(y_val_fold, y_pred_val)
    print(f"Validation Accuracy: {accuracy_val:.5f}, fold {fold_index+1} of {NOF_FOLDS}")

    # Store the accuracy scores of this fold for averaging
    train_accuracies.append(accuracy_train)
    val_accuracies.append(accuracy_val)

    all_val_indices.extend(val_index.tolist())  # Append the validation indices to the list

    # Print a separator for readability
    print(100*"-")

# Print a separator for readability
print(100*"-")
# Print the first 10 validation indices after sorting them
print(sorted(all_val_indices)[:10])
print(100*"-")
# Print the total number of validation indices and the number of instances in the training set, to check if they are the same
print(f"len(all_val_indices)={len(all_val_indices)}, X_train.shape[0]={X_train.shape[0]}")
print(100*"-")

# Calculate and print the average training accuracy over all folds
average_accuracy_train = sum(train_accuracies) / len(train_accuracies)
print(f"Average Training Accuracy: {average_accuracy_train:.5f}")

# Calculate and print the average validation accuracy over all folds
average_accuracy_val = sum(val_accuracies) / len(val_accuracies)
print(f"Average Cross-validation Accuracy: {average_accuracy_val:.5f}")

# Stratified cross-validation

In [45]:
NOF_FOLDS = 5  # Number of folds for cross-validation
PERCENTAGE_TEST = 0.2  # Percentage of data to be used for testing

In [46]:
# Hold out a test set using stratified sampling:
# - X, y are the features and the target variable
# - test_size is the proportion of data used for testing
# - random_state is the seed for reproducibility
# - stratify=y keeps the class distribution of y in both the training and the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=PERCENTAGE_TEST,
    random_state=42,
    stratify=y,
)

In [None]:
# Report class counts and relative frequencies for the training set and then for the test set
for subset_name, subset_target in (("training", y_train), ("test", y_test)):
    print(f"Number of instances in the {subset_name} set: {subset_target.value_counts()}")
    print(f"Percentage of classes in the {subset_name} set: {subset_target.value_counts(normalize=True)}")

In [48]:
# StratifiedKFold is the splitter used for stratified cross-validation
from sklearn.model_selection import StratifiedKFold

# Build the splitter: NOF_FOLDS folds, shuffled before splitting, with a fixed seed for the shuffle
kf = StratifiedKFold(
    n_splits=NOF_FOLDS,
    shuffle=True,
    random_state=42,
)

In [None]:
# Stratified K-fold cross-validation on the training set: fit one tree per fold and
# report per-fold and average training/validation accuracies.

# Per-fold accuracy scores, collected so that they can be averaged after the loop
train_accuracies = []
val_accuracies = []

# Loop over each fold index and split indices for training and validation.
# Note that now (stratified) we also pass the target variable y_train to split the data.
for fold_index, (train_index, val_index) in enumerate(kf.split(X_train, y_train)):
    # Split the training data into training and validation folds (the indices are positional, hence .iloc)
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    # Split the target data into training and validation folds
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    # Print the number of instances per class in the training fold
    print(f"Number of instances per class in the training fold: {y_train_fold.value_counts()}")
    # Print the number of instances per class in the validation fold
    print(f"Number of instances per class in the validation fold: {y_val_fold.value_counts()}")

    # Initialize DecisionTreeClassifier with specified parameters.
    # random_state is fixed so that re-running the cell reproduces the same trees and scores.
    clf = DecisionTreeClassifier(criterion='gini', max_depth=5, min_samples_leaf=2, ccp_alpha=0.01, random_state=42)
    # Fit the classifier on the training fold
    clf.fit(X_train_fold, y_train_fold)

    # Predict on the training fold and calculate the training accuracy
    y_pred_train = clf.predict(X_train_fold)
    accuracy_train = accuracy_score(y_train_fold, y_pred_train)
    # Print training accuracy for the current fold
    print(f"Training Accuracy: {accuracy_train:.5f}, fold {fold_index+1} of {NOF_FOLDS}")

    # Predict on the validation fold and calculate the validation accuracy
    y_pred_val = clf.predict(X_val_fold)
    accuracy_val = accuracy_score(y_val_fold, y_pred_val)
    # Print validation accuracy for the current fold
    print(f"Validation Accuracy: {accuracy_val:.5f}, fold {fold_index+1} of {NOF_FOLDS}")

    # Store the accuracy scores of this fold for averaging
    train_accuracies.append(accuracy_train)
    val_accuracies.append(accuracy_val)

    # Print separator line
    print(100*"-")

# Print separator line
print(100*"-")
# Calculate average training accuracy over all folds
average_accuracy_train = sum(train_accuracies) / len(train_accuracies)
# Print average training accuracy
print(f"Average Training Accuracy: {average_accuracy_train:.5f}")
# Calculate average validation accuracy over all folds
average_accuracy_val = sum(val_accuracies) / len(val_accuracies)
# Print average cross-validation accuracy
print(f"Average Cross-validation Accuracy: {average_accuracy_val:.5f}")