# Libraries

In [1]:
# Uncomment the following line to install the ucimlrepo package
# !pip install ucimlrepo

In [2]:
# pandas is a well-known python library for data manipulation and analysis
# It is used to load and analyze data, usually stored in dataframes
# The dataframes are tables with rows and columns
# The rows are called samples or observations
# The columns are called features or variables
import pandas as pd

# Import Dataset

In [None]:
# Dataset page: https://archive.ics.uci.edu/dataset/222/bank+marketing

from ucimlrepo import fetch_ucirepo

# Download the dataset registered under id 222 in the UCI repository
bank_marketing = fetch_ucirepo(id=222)

# Pull the feature table and the target table out of the fetched payload
dataset_frames = bank_marketing.data
X = dataset_frames.features
y = dataset_frames.targets

# Show the dataset-level metadata
print(bank_marketing.metadata)

In [None]:
# Print the variable information that accompanies the fetched dataset
print(bank_marketing.variables)

In [None]:
# Display the first 5 rows of the features (head() returns 5 rows by default)
X.head()

In [None]:
# Display the first 5 rows of the targets (head() returns 5 rows by default)
y.head()

In [None]:
# Display 5 random rows of the features.
# A fixed random_state makes the displayed rows identical on every run
# (Restart Kernel -> Run All reproduces the same output).
X.sample(5, random_state=42)

In [None]:
# Display 5 random rows of the targets.
# A fixed random_state makes the displayed rows identical on every run
# (Restart Kernel -> Run All reproduces the same output).
y.sample(5, random_state=42)

In [None]:
# Column labels of the feature table and of the target table
feature_names = X.columns
target_name = y.columns

# Show the feature labels first, then the target labels
for column_labels in (feature_names, target_name):
    print(column_labels)

In [10]:
# Replace the column Index stored in target_name by the previous cell with a
# single human-readable label.
# NOTE(review): the target column itself is accessed as 'y' later in this
# notebook (y['y']), so this label is presumably for display only — confirm.
target_name = "subscribe"

# Missing values

In [None]:
# Count the missing entries in every feature column and show the counts
missing_values = X.isna().sum()
print(missing_values)

In [None]:
# Keep only the feature columns that contain no missing value
X = X.dropna(axis="columns")
# Re-count the missing entries per remaining column to confirm none are left
missing_values = X.isna().sum()
print(missing_values)

# Descriptive statistics metrics

In [None]:
# Summary statistics of the features as produced by describe() with its
# default column selection
print("Descriptive statistics for numerical features:")
numerical_stats = X.describe()
print(numerical_stats)

In [None]:
# Summary statistics restricted to object-dtype ('O') columns, i.e. the
# categorical features
categorical_stats = X.describe(include=['O'])
print("\nDescriptive statistics for categorical features:")
print(categorical_stats)

# For every categorical feature, list how many rows fall into each class
categorical_columns = X.select_dtypes(include=['O']).columns
for column in categorical_columns:
    print(f"\nNumber of instances per class for the categorical feature '{column}':")
    print(X[column].value_counts())

# Descriptive statistics plots

In [None]:
# Visualize the numerical data
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# One small histogram (with a KDE overlay) per numeric feature
numeric_columns = X.select_dtypes(include=[np.number]).columns
for column in numeric_columns:
    fig, ax = plt.subplots(figsize=(4, 3))
    sns.histplot(X[column], kde=True, ax=ax)
    ax.set_title(f'Histogram of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Visualize the categorical data
import matplotlib.pyplot as plt
import seaborn as sns

# One small bar chart per categorical feature, bars ordered from the most to
# the least frequent class
categorical_columns = X.select_dtypes(include=['O']).columns
for column in categorical_columns:
    classes_by_frequency = X[column].value_counts().index
    fig, ax = plt.subplots(figsize=(4, 3))
    sns.countplot(x=column, data=X, order=classes_by_frequency, ax=ax)
    ax.set_title(f'Bar plot of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Count')
    # Tilt the class labels so long names do not overlap
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()

# Encode features

In [None]:
# Encode the categorical variables using one-hot encoding.
# drop_first=True drops the first level of each encoded variable, so a variable
# with k levels yields k-1 indicator columns.
X = pd.get_dummies(X, drop_first=True)

# Display the first rows of the encoded features
print(X.head())

In [None]:
# The feature names are the column names of the encoded features;
# reassigning keeps feature_names in step with the current columns of X
feature_names = X.columns
# Show the column names as the cell output
feature_names

In [None]:
# Binary-encode the target column 'y': 'yes' becomes 1 and 'no' becomes 0
label_to_int = {'yes': 1, 'no': 0}
y_encoded = y['y'].map(label_to_int)
print(y_encoded.head())

In [20]:
# Convert the encoded features to a float numpy array.
# np.asarray accepts both a DataFrame and an ndarray, so re-running this cell
# after X has already been converted no longer fails (an ndarray has no
# .to_numpy attribute).
X = np.asarray(X, dtype=float)

In [21]:
# Convert the true/false values to floats: every entry of X is cast to float,
# so True/False indicator entries become 1.0/0.0
X = X.astype(float)

# Associations all by all features

In [22]:
import plotly.graph_objects as go

In [23]:
# Import necessary libraries for linear regression and correlation calculation
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr

# Define a function to plot numeric vs numeric data
def plot_numeric_numeric(xx, yy, xx_name, yy_name, plot_figure=False):
    """Measure the linear association between two numeric features.

    Fits a least-squares line ``yy ~ xx`` and computes the Pearson correlation
    coefficient of the two arrays. When ``plot_figure`` is true, a one-line
    summary (Pearson r, R-squared, fitted equation) is printed and a Plotly
    scatter of the data with the fitted trend line is shown.

    Parameters
    ----------
    xx, yy : 1-D numpy arrays of equal length
        The two features to compare; ``xx`` is the regressor.
    xx_name, yy_name : str
        Names used in the printed summary, axis titles and legend.
    plot_figure : bool, default False
        Whether to print the summary and show the figure.

    Returns
    -------
    float
        The Pearson correlation coefficient between ``xx`` and ``yy``.
    """
    # Sort both arrays by xx so the trend line is drawn left to right
    order = np.argsort(xx)
    xx = xx[order]
    yy = yy[order]

    # Fit the regression line on all rows
    model = LinearRegression()
    model.fit(xx.reshape(-1, 1), yy)
    y_pred = model.predict(xx.reshape(-1, 1))

    pearson_corr, _ = pearsonr(xx, yy)

    if plot_figure:
        # R-squared is computed on all rows, like the Pearson coefficient it is
        # reported next to (not on the thinned-out plotting sample).
        r_squared = model.score(xx.reshape(-1, 1), yy)

        # Keep roughly 1000 equally spaced points for plotting. The step is
        # clamped to 1 so inputs with fewer than 1000 rows do not produce a
        # zero slice step (which raises ValueError).
        step = max(1, len(xx) // 1000)
        xx_plot = xx[::step]
        yy_plot = yy[::step]
        y_pred_plot = y_pred[::step]

        plot_title = f"Pearson Correlation={pearson_corr:.5f}, R-Squared={r_squared:.5f}"
        # ':+.2f' prints the sign of the intercept itself, avoiding "+-1.23"
        equation = f"{yy_name}={model.coef_[0]:.2f}({xx_name}){model.intercept_:+.2f}"
        print(plot_title + ", " + equation)

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=xx_plot, y=yy_plot, mode='markers',
                                 name=f'Given Data Points of {xx_name} vs {yy_name}'))
        fig.add_trace(go.Scatter(x=xx_plot, y=y_pred_plot, mode='lines+markers',
                                 name='Trend Line', marker=dict(color='grey')))

        fig.update_layout(title=plot_title, xaxis_title=xx_name, yaxis_title=yy_name,
                          template='plotly_white',
                          legend=dict(yanchor="top", y=1.05, xanchor="left", x=0.01),
                          width=500, height=400, font=dict(size=10))
        fig.show()

    return pearson_corr

In [24]:
# Import necessary libraries for logistic regression and evaluation metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score

# Define a function to plot numeric vs categorical data
def plot_numeric_categorical(xx, yy, xx_name, yy_name, plot_figure=False):
    """Score how well a numeric feature predicts a two-class feature.

    Fits a one-variable logistic regression ``yy ~ xx`` and returns the F1
    score of its predictions on the same rows it was fitted on. When
    ``plot_figure`` is true, F1, accuracy and AUC (all computed on every row)
    are printed with the fitted equation, and a Plotly figure shows the data,
    the predicted probabilities and the predicted classes.

    Parameters
    ----------
    xx : 1-D numpy array
        Numeric predictor.
    yy : 1-D numpy array
        Class labels; ``f1_score`` is called with its default settings, so
        two classes are expected.
    xx_name, yy_name : str
        Names used in the printed summary and axis titles.
    plot_figure : bool, default False
        Whether to print the summary and show the figure.

    Returns
    -------
    float
        F1 score of the logistic-regression predictions against ``yy``.
    """
    # Sort both arrays by xx so the probability curve is drawn left to right
    order = np.argsort(xx)
    xx = xx[order]
    yy = yy[order]

    model = LogisticRegression()
    model.fit(xx.reshape(-1, 1), yy)

    # Hard class predictions and the probability of the second class
    y_pred = model.predict(xx.reshape(-1, 1))
    y_pred_proba = model.predict_proba(xx.reshape(-1, 1))[:, 1]

    f1 = f1_score(yy, y_pred)

    if plot_figure:
        # Compute every reported metric on all rows so the three numbers in
        # the title are comparable; a thinned-out sample could also contain a
        # single class, which makes roc_auc_score fail.
        accuracy = accuracy_score(yy, y_pred)
        auc = roc_auc_score(yy, y_pred_proba)

        plot_title = f"F1 Score={f1:.2f}, Accuracy={accuracy:.2f}, AUC={auc:.2f}"
        print(plot_title + f", {yy_name}=1/(1+exp(-({model.coef_[0][0]:.2f}({xx_name}) + {model.intercept_[0]:.2f})))")

        # Keep roughly 1000 equally spaced points for plotting. The step is
        # clamped to 1 so inputs with fewer than 1000 rows do not produce a
        # zero slice step (which raises ValueError).
        step = max(1, len(xx) // 1000)
        xx_plot = xx[::step]
        yy_plot = yy[::step]
        proba_plot = y_pred_proba[::step]
        pred_plot = y_pred[::step]

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=xx_plot, y=yy_plot, mode='markers', name='Given Data Points'))
        fig.add_trace(go.Scatter(x=xx_plot, y=proba_plot, mode='lines+markers',
                                 name='Logistic Regression Predictions',
                                 marker=dict(color='grey')))
        fig.add_trace(go.Scatter(x=xx_plot, y=pred_plot, mode='markers',
                                 name='Logistic Regression Predictions (binary)'))

        fig.update_layout(title=plot_title, xaxis_title=xx_name, yaxis_title=yy_name,
                          template='plotly_white', width=500, height=400, font=dict(size=10))
        fig.show()

    return f1

In [25]:

def plot_categorical_numeric(xx, yy, xx_name, yy_name, plot_figure=False):
    """Compute the ANOVA eta squared of a numeric feature grouped by a categorical one.

    Eta squared is the between-group sum of squares divided by the total sum
    of squares of ``yy``, where the groups are the distinct values of ``xx``.
    It is the share of the variance of ``yy`` explained by the grouping; values
    closer to 1 mean a stronger association. When ``plot_figure`` is true, one
    overlaid histogram of ``yy`` per class of ``xx`` is shown.

    Parameters
    ----------
    xx : 1-D numpy array
        Grouping (categorical) feature.
    yy : 1-D numpy array
        Numeric feature, same length as ``xx``.
    xx_name, yy_name : str
        Names used in the printed summary, legend and axis title.
    plot_figure : bool, default False
        Whether to print the summary and show the histograms.

    Returns
    -------
    float
        Eta squared; ``0.0`` when ``yy`` has no variance to explain.
    """
    unique_classes = np.unique(xx)
    groups_anova = [yy[xx == cls] for cls in unique_classes]  # Split yy by xx groups

    # The grand mean is needed for both sums of squares; compute it once
    grand_mean = np.mean(yy)
    ss_total = np.sum((yy - grand_mean) ** 2)  # Total sum of squares
    ss_between = np.sum([len(group) * (np.mean(group) - grand_mean) ** 2
                         for group in groups_anova])  # Between-group sum of squares

    # A constant yy has zero total variance; report "no association" instead
    # of dividing by zero and returning NaN.
    anova_eta_squared = ss_between / ss_total if ss_total > 0 else 0.0

    if plot_figure:
        my_title = f"Histograms of {yy_name} for each class of {xx_name}"
        print(my_title + f", Eta Squared={anova_eta_squared:.2f}")

        fig = go.Figure()

        # One semi-transparent histogram per class so the overlays stay readable
        for cls in unique_classes:
            fig.add_trace(go.Histogram(x=yy[xx == cls], name=f'{xx_name}={cls}', opacity=0.75))

        fig.update_layout(barmode='overlay', title=my_title,
                          xaxis_title=yy_name, yaxis_title='Count', template='plotly_white',
                          width=500, height=400, font=dict(size=10))
        fig.show()

    return anova_eta_squared

In [26]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import plotly.express as px
def plot_categorical_categorical(xx, yy, xx_name, yy_name, plot_figure=False):
    """Score the overlap of two two-valued features with an F1 score.

    Both inputs are cast to booleans. Precision is the share of truthy ``xx``
    rows that are also truthy in ``yy``; recall is the share of truthy ``yy``
    rows that are also truthy in ``xx``; the F1 score is their harmonic mean.
    When ``plot_figure`` is true, a summary line is printed and the confusion
    matrix of ``xx`` (rows) against ``yy`` (columns) is shown as a heatmap.

    Parameters
    ----------
    xx, yy : 1-D array-likes of equal length
        The two features to compare.
    xx_name, yy_name : str
        Names used in the printed summary and the axis labels.
    plot_figure : bool, default False
        Whether to print the summary and show the confusion matrix.

    Returns
    -------
    float
        The F1 score, or 0 when it is undefined (no truthy rows in common).
    """
    # Convert input arrays to boolean arrays for logical operations
    xx_bool = np.array(xx, dtype=bool)
    yy_bool = np.array(yy, dtype=bool)

    # Each count is needed more than once; compute them a single time
    both_true = np.sum(xx_bool & yy_bool)
    xx_true = np.sum(xx_bool)
    yy_true = np.sum(yy_bool)

    my_precision = both_true / xx_true if xx_true > 0 else 0
    my_recall = both_true / yy_true if yy_true > 0 else 0
    precision_plus_recall = my_precision + my_recall
    my_f1_score = 2 * (my_precision * my_recall) / precision_plus_recall if precision_plus_recall > 0 else 0

    if plot_figure:
        # confusion_matrix(xx, yy): rows follow xx, columns follow yy
        conf_matrix = confusion_matrix(xx, yy)

        # Unique labels from both xx and yy, in sorted order
        labels_xy = sorted(set(xx) | set(yy))
        labels_x = [f"{xx_name}={str(int(label))}" for label in labels_xy]
        labels_y = [f"{yy_name}={str(int(label))}" for label in labels_xy]

        my_title = f"Confusion Matrix of {xx_name} vs {yy_name}"
        print(my_title + f", F1 Score={my_f1_score:.2f}, Accuracy={accuracy_score(xx, yy):.2f}, AUC={roc_auc_score(xx, yy):.2f}")

        # px.imshow labels the matrix columns with `x` and the rows with `y`,
        # so the yy labels go on the x-axis and the xx labels on the y-axis.
        fig = px.imshow(conf_matrix, title=my_title,
                        x=labels_y, y=labels_x, text_auto=True, color_continuous_scale='Blues')
        fig.update_layout(font=dict(size=10), width=500, height=400)
        fig.show()

    return my_f1_score

In [27]:
# Flag each column of X as categorical (True) when it holds at most two
# distinct values, and as numeric (False) otherwise
is_feature_categorical = [
    np.unique(X[:, col_idx]).size <= 2
    for col_idx in range(X.shape[1])
]

In [None]:
# Initialize lists to store the (i, j, score) triples of each pair type
all_numeric_numeric_correlations = []
all_numeric_categorical_correlations = []
all_categorical_categorical_correlations = []
all_categorical_numeric_correlations = []

# Map (first feature is categorical, second feature is categorical) to the
# measure to compute, the label used when printing it, and the list that
# collects the results. The numeric -> categorical measure is the F1 score
# returned by plot_numeric_categorical, so it is labelled as such (it is not
# an ANOVA eta squared).
association_dispatch = {
    (False, False): (plot_numeric_numeric, "Pearson correlation",
                     all_numeric_numeric_correlations),
    (False, True): (plot_numeric_categorical, "Logistic regression F1 Score",
                    all_numeric_categorical_correlations),
    (True, True): (plot_categorical_categorical, "F1 Score",
                   all_categorical_categorical_correlations),
    (True, False): (plot_categorical_numeric, "ANOVA Eta Squared",
                    all_categorical_numeric_correlations),
}

# Score every ordered pair of distinct features; the diagonal stays at 0
n_features = X.shape[1]
all_by_all_associations = np.zeros((n_features, n_features))
for i in range(n_features):
    for j in range(n_features):
        if i == j:  # Ensure we are not comparing the same feature
            continue
        pair_kind = (is_feature_categorical[i], is_feature_categorical[j])
        measure, measure_label, results = association_dispatch[pair_kind]
        score = measure(X[:, i], X[:, j], feature_names[i], feature_names[j], plot_figure=False)
        print(f"{measure_label} between {feature_names[i]} and {feature_names[j]} is {score:.5f}")
        results.append((i, j, score))
        all_by_all_associations[i, j] = score

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated heatmap of the all-by-all association matrix, labelled with the
# feature names on both axes
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(
    all_by_all_associations,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    xticklabels=feature_names,
    yticklabels=feature_names,
    ax=ax,
)
ax.set_title("Feature Associations Matrix")
plt.show()


## Top numeric-numeric

In [None]:
# Pearson's r is symmetric, so (i, j) and (j, i) carry the same value; keep one
# triple per unordered pair so the top 3 are three different pairs.
unique_numeric_pairs = [triple for triple in all_numeric_numeric_correlations if triple[0] < triple[1]]
# Rank by absolute value: a strong negative correlation is as notable as a
# strong positive one.
top_numeric_numeric_correlations = sorted(unique_numeric_pairs, key=lambda triple: abs(triple[2]), reverse=True)
for i, j, corr in top_numeric_numeric_correlations[:3]:
    plot_numeric_numeric(X[:, i], X[:, j], feature_names[i], feature_names[j], plot_figure=True)

## Top numeric-categorical

In [None]:
# Order the numeric -> categorical triples from the highest to the lowest score
top_numeric_categorical_correlations = sorted(
    all_numeric_categorical_correlations,
    key=lambda triple: triple[2],
    reverse=True,
)
# Plot the three best-scoring pairs
best_three = top_numeric_categorical_correlations[:3]
for i, j, corr in best_three:
    plot_numeric_categorical(X[:, i], X[:, j], feature_names[i], feature_names[j], plot_figure=True)

## Top categorical-categorical

In [None]:
# The F1 score computed by plot_categorical_categorical is symmetric (swapping
# the inputs swaps precision and recall), so (i, j) and (j, i) carry the same
# value; keep one triple per unordered pair so the top 3 are different pairs.
unique_categorical_pairs = [triple for triple in all_categorical_categorical_correlations if triple[0] < triple[1]]
top_categorical_categorical_correlations = sorted(unique_categorical_pairs, key=lambda triple: triple[2], reverse=True)
for i, j, corr in top_categorical_categorical_correlations[:3]:
    plot_categorical_categorical(X[:, i], X[:, j], feature_names[i], feature_names[j], plot_figure=True)

## Top categorical-numeric

In [None]:
# Order the categorical -> numeric triples from the highest to the lowest score
top_categorical_numeric_correlations = sorted(
    all_categorical_numeric_correlations,
    key=lambda triple: triple[2],
    reverse=True,
)
# Plot the three best-scoring pairs
best_three = top_categorical_numeric_correlations[:3]
for i, j, corr in best_three:
    plot_categorical_numeric(X[:, i], X[:, j], feature_names[i], feature_names[j], plot_figure=True)

# Train a decision tree classifier

In [None]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Hold out 20% of the rows for testing; random_state fixes the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a decision tree classifier
# max_depth is the maximum depth of the tree
# random_state makes the fitted tree reproducible: scikit-learn permutes the
# features at every split, so ties between equally good splits can otherwise
# be broken differently from run to run
clf = DecisionTreeClassifier(max_depth=3, random_state=42)

clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
# Visualize the decision tree
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
plt.figure(figsize=(20, 10))
# Pass plain Python lists: some scikit-learn releases reject a pandas Index or
# a numpy array for these parameters. The class names are taken from the
# fitted classifier so their order always matches the classes it learned.
plot_tree(clf, filled=True, feature_names=list(feature_names),
          class_names=[str(label) for label in clf.classes_], fontsize=10)
plt.show()


In [None]:
from sklearn.tree import export_text
# Render the fitted tree as indented text rules. feature_names is converted to
# a plain list because some scikit-learn releases fail on a pandas Index here.
tree_rules = export_text(clf, feature_names=list(feature_names))
print(tree_rules)

In [None]:
# Feature importances exposed by the fitted decision tree
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier.feature_importances_
import pandas as pd

feature_importances = clf.feature_importances_

# Pair every feature name with its importance, most important first
feature_importances_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Show the ranked table
print(feature_importances_df)

# Horizontal bar chart with the most important feature at the top
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_importances_df['Feature'], feature_importances_df['Importance'])
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importances')
ax.invert_yaxis()
plt.show()
