In [1]:
# pandas is a well-known python library for data manipulation and analysis
# It is used to load and analyze data, usually stored in dataframes
# The dataframes are tables with rows and columns
# The rows are called samples or observations
# The columns are called features or variables
import pandas as pd

In [2]:
# Uncomment the following line to install the ucimlrepo package
# !pip install ucimlrepo

# First Dataset

In [None]:
# https://archive.ics.uci.edu/dataset/58/lenses

# ucimlrepo is a python package that allows us to fetch datasets from the UCI Machine Learning Repository
from ucimlrepo import fetch_ucirepo 
  
# Download the Lenses dataset (UCI repository id 58)
lenses = fetch_ucirepo(id=58)

# lenses.data exposes the feature table and the target table;
# keep them under the conventional names X (inputs) and y (labels)
X, y = lenses.data.features, lenses.data.targets

# Dataset-level metadata first, then the per-variable information
for dataset_info in (lenses.metadata, lenses.variables):
    print(dataset_info)

In [None]:
# Preview the feature table: the first 5 rows
X.head(n=5)

In [None]:
# Preview the target table: the first 5 rows
y.head(n=5)

In [None]:
# Lookup tables that translate the numeric codes of the dataset into
# human-readable category labels.
# feature_mappings is keyed by column name; each inner dict maps code -> label.
# NOTE(review): a feature column without an entry here keeps its numeric codes.
feature_mappings = dict(
    age=dict(enumerate(("young", "pre-presbyopic", "presbyopic"), start=1)),
    spectacle_prescription=dict(enumerate(("myope", "hypermetrope"), start=1)),
    astigmatic=dict(enumerate(("no", "yes"), start=1)),
)
print(feature_mappings)

# target_mapping is a flat code -> label dict for the class column
target_mapping = dict(
    enumerate(
        ("hard contact lenses", "soft contact lenses", "no contact lenses"),
        start=1,
    )
)
print(target_mapping)

In [None]:
# Translate the numeric codes into category labels.
# With a nested dict, replace() only looks at the columns named as keys.
Xcat = X.replace(to_replace=feature_mappings)
# A flat {old: new} dict is applied to the values of every column of y
ycat = y.replace(to_replace=target_mapping)

# Show the first rows of both relabelled tables
print("Features (Categorical):", Xcat.head(), sep="\n")
print("\nTargets (Categorical):", ycat.head(), sep="\n")

# Second Dataset

In [8]:
# # After running for the first dataset, uncomment the following lines to load the second dataset, and run the code after this
# weather = pd.read_csv('https://raw.githubusercontent.com/dataprofessor/data/refs/heads/master/weather-nominal-weka.csv')
# # Xcat are the features (all columns except the target, which is 'play')
# Xcat = weather.drop(columns=['play'])
# # ycat is the target (the column 'play')
# ycat = weather['play']
# # Make ycat a pandas dataframe (not a series). We need this to have a unified way of handling data with one or more columns
# ycat = ycat.to_frame()


In [None]:
# Report the container type of the features, then of the targets
for table in (Xcat, ycat):
    print(type(table))

In [None]:
# Display the categorical feature table
Xcat

In [None]:
# Display the categorical target table
ycat

In [None]:
# Shape of the feature table, then of the target table
for table in (Xcat, ycat):
    print(table.shape)

In [None]:
# Visualize the data
import seaborn as sns
import matplotlib.pyplot as plt
def _annotated_countplot(column, frame):
    """Draw a count plot of one column and label every bar with its height.

    Parameters
    ----------
    column : str
        Name of the column of ``frame`` to count.
    frame : pandas.DataFrame
        Table that contains ``column``.
    """
    # countplot is called exactly once per figure: calling it a second time on
    # the same axes would draw every bar twice and duplicate each annotation
    ax = sns.countplot(x=column, data=frame)
    # ax.patches are the bars; a bar's height is the count of its category
    for bar in ax.patches:
        ax.annotate(f'{bar.get_height()}', (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                    ha='center', va='baseline')
    plt.show()


# One figure per feature column
for feature in Xcat.columns:
    _annotated_countplot(feature, Xcat)
# One figure per target column
for target in ycat.columns:
    _annotated_countplot(target, ycat)


In [None]:
# One-hot encode: each categorical column is expanded into one indicator
# column per category.
# NOTE(review): get_dummies does not expand columns that are still numeric —
# confirm every feature was mapped to labels beforehand.
XcatEncoded = pd.get_dummies(data=Xcat)
# Preview the encoded table
XcatEncoded.head(n=5)

In [None]:
# Combine the features and the target
import numpy as np
# Place the one-hot encoded targets to the right of the encoded features
data_combined = pd.concat(objs=[XcatEncoded, pd.get_dummies(data=ycat)], axis="columns")
# Keep the column labels: they are not carried over into the numpy array
all_variable_names = data_combined.columns
# From here on data_combined is a 2-D numpy array, one column per variable
data_combined = data_combined.to_numpy()
for item in (data_combined, all_variable_names):
    print(item)


In [None]:
# Cast every entry of the combined array to an integer
data_combined = data_combined.astype(dtype=int)
# Show the resulting integer array
data_combined


# Associations between all features and targets

In [None]:
# accuracy_matrix[i, j] is the fraction of samples on which column i and
# column j of data_combined hold exactly the same value.
# The matrix is square, with one row and one column per combined variable.
n_variables = len(all_variable_names)
accuracy_matrix = np.zeros((n_variables, n_variables))
# np.ndindex walks all (i, j) positions in row-major order
for i, j in np.ndindex(accuracy_matrix.shape):
    # Element-wise equality yields a boolean vector (True where the two
    # columns agree); its mean is the share of agreeing samples
    accuracy_matrix[i, j] = np.mean(data_combined[:, i] == data_combined[:, j])
# Display the accuracy matrix
accuracy_matrix



In [None]:
# Heat map of the pairwise accuracies
n_variables = len(all_variable_names)
tick_positions = range(n_variables)
plt.imshow(accuracy_matrix, cmap='Blues', interpolation='nearest')
# Label both axes with the variable names
plt.xticks(tick_positions, all_variable_names, rotation=90)
plt.yticks(tick_positions, all_variable_names)
# Write each accuracy as a percentage into its cell; imshow puts row i on the
# y axis and column j on the x axis, hence text(j, i)
for i, j in np.ndindex(n_variables, n_variables):
    plt.text(j, i, f'{100*accuracy_matrix[i, j]:.0f}%', ha='center', va='center', color='black')
plt.colorbar()
plt.show()



# Top (i, j) pairs with the highest accuracy, excluding the main diagonal

In [None]:
# Keep the main diagonal and everything below it, and zero the entries above
# the diagonal. Note that this overwrites accuracy_matrix.
accuracy_matrix = np.tril(accuracy_matrix)
accuracy_matrix

# Get the indices and values of the top accuracy pairs

In [None]:
# Flatten the accuracy matrix into a 1-D array, row after row.
# flatten() returns a copy, so accuracy_matrix itself is not modified.
flattened_accuracy_matrix = accuracy_matrix.flatten()
flattened_accuracy_matrix

In [None]:
# argsort gives the positions that sort the flattened values in ascending
# order; flipping that result ranks the positions from highest to lowest value
sorted_indices = np.flip(np.argsort(flattened_accuracy_matrix))
sorted_indices

In [None]:
# Map every flat position back to its 2-D location in accuracy_matrix.
# The result is a tuple of two equally long arrays: the row indices and the
# column indices, in the same order as sorted_indices.
unraveled_indices = np.unravel_index(sorted_indices, accuracy_matrix.shape)
unraveled_indices

In [None]:
# Stack the row-index array and the column-index array depth-wise, so that
# each entry along the last axis is one (row, column) pair
stacked_indices = np.dstack(unraveled_indices)
stacked_indices

In [None]:
# np.dstack stacks arrays in sequence depth-wise (along the third axis), so
# two 1-D arrays of length N become a 3-D array of shape (1, N, 2):
# one "layer" or "batch" of N pairs, each containing two elements.
# Here N is the number of entries of accuracy_matrix, i.e. the number of
# combined variables squared.
stacked_indices.shape

In [None]:
# Take the single "layer" of the stacked indices: flat_indices is a 2-D array
# whose rows are the (row, column) pairs, ordered from the highest to the
# lowest accuracy
flat_indices = stacked_indices[0]
flat_indices

In [26]:
# Number of top-ranked variable pairs to report in the following cells
nof_pairs = 2

In [None]:
# Collect (row, column, accuracy) triples, ordered from the highest to the
# lowest accuracy (the order of flat_indices).
# Only the strict lower triangle (i > j) is kept. Filtering by position rather
# than by value skips the main diagonal (a variable compared with itself) and
# the entries above it, while still keeping two distinct variables that agree
# on every sample (accuracy exactly 1), which a "value < 1" test would drop.
top_pairs = [
    (i, j, accuracy_matrix[i, j])
    for i, j in flat_indices
    if i > j
]

print(top_pairs[:nof_pairs])


In [None]:
# top_pairs[0] is the highest-ranked (row, column, accuracy) triple.
# Its first element is the row index and its second element the column index
# into accuracy_matrix; each of them also selects one column of data_combined.
# NOTE(review): top_pairs[0] raises IndexError if top_pairs is empty.
print(f"the first pair of indices is {top_pairs[0][0]} and {top_pairs[0][1]}")
print(f"the first variable is\n{data_combined[:, top_pairs[0][0]]}")
print(f"and the second variable is\n{data_combined[:, top_pairs[0][1]]}")


In [None]:
# Recompute the agreement rate of the top pair directly from the data:
# the fraction of samples on which the two selected columns are equal
np.mean(data_combined[:, top_pairs[0][0]] == data_combined[:, top_pairs[0][1]])

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# Report each of the top-ranked pairs together with its confusion matrix
for i, j, acc in top_pairs[:nof_pairs]:
    print(f"{all_variable_names[i]} and {all_variable_names[j]} have an accuracy of {acc:.2f}")
    # Cross-tabulate the values of variable i (rows) against variable j (columns)
    cm = confusion_matrix(data_combined[:, i], data_combined[:, j])
    # Draw on an explicit Axes rather than on pyplot's implicit current axes
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    # Name the axes after the two variables being compared
    ax.set_xlabel(all_variable_names[j])
    ax.set_ylabel(all_variable_names[i])
    # The title shows the agreement rate of the pair
    ax.set_title(f"Accuracy: {accuracy_score(data_combined[:, i], data_combined[:, j]):.5f}")
    plt.show()

# Train a decision tree classifier

In [None]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Hold out 20% of the samples for testing; random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(XcatEncoded, ycat, test_size=0.2, random_state=42)

# Create a decision tree classifier
# max_depth is the maximum depth of the tree.
# random_state fixes the randomness used while building the tree (e.g. to
# break ties between equally good splits), so re-running the notebook
# produces the same tree.
clf = DecisionTreeClassifier(max_depth=3, random_state=42)

# Fit the tree on the training portion only
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the classifier: fraction of test samples predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

In [None]:
# Visualize the decision tree
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
plt.figure(figsize=(20,10))
# Class labels are taken from the fitted classifier (clf.classes_) so that they
# line up with the classes the tree actually learned; labels collected from the
# full target table could be shifted if a class is missing from the training split.
# Both name arguments are passed as plain lists, as in the export_text call.
plot_tree(clf, filled=True, feature_names=list(XcatEncoded.columns),
          class_names=[str(label) for label in clf.classes_], fontsize=10)
plt.show()


In [None]:
from sklearn.tree import export_text
# Plain-text rendering of the fitted tree's decision rules
tree_rules = export_text(clf, feature_names=XcatEncoded.columns.tolist())
print(tree_rules)