# Noisy-XOR Dataset Model Comparison

## Import libraries

In [1]:
import numpy as np
import pandas as pd
from prettytable import PrettyTable

## Generate the Noisy-XOR dataset

In [2]:
def generate_noisy_xor_dataset(
    n_samples, n_features, n_xor_features, noise_level, random_state=None
):
    rng = np.random.RandomState(random_state)

    # Generate random binary features
    X = rng.randint(0, 2, size=(n_samples, n_features))

    # Ensure n_xor_features is not greater than n_features
    if n_xor_features > n_features:
        raise ValueError("n_xor_features cannot be greater than n_features")

    # Calculate y_clean based on the XOR of the first n_xor_features
    if n_xor_features == 0:
        # If no features are designated for XOR, y_clean could be all zeros or random
        # For simplicity, let's make it random if n_xor_features is 0
        y_clean = rng.randint(0, 2, size=n_samples)
    else:
        y_clean = X[:, 0]
        for i in range(1, n_xor_features):
            y_clean = np.logical_xor(y_clean, X[:, i])
        y_clean = y_clean.astype(int)

    # Introduce noise
    n_noise = int(noise_level * n_samples)
    noise_indices = rng.choice(n_samples, size=n_noise, replace=False)

    y = np.copy(y_clean)
    y[noise_indices] = 1 - y[noise_indices]  # Flip the bits

    return X.astype(np.uint8), y.astype(np.uint32)

In [3]:
from sklearn.model_selection import train_test_split


# Generate the dataset with specified parameters
X_data, y_data = generate_noisy_xor_dataset(
    n_samples=1000, n_features=16, n_xor_features=2, noise_level=0.1, random_state=42
)

# Display dataset information
print("Dataset Information:")
print(f"X: {X_data.shape}, {np.unique(X_data)}")
print(f"y: {y_data.shape}, {np.unique(y_data)}")


# Display the first N_DISPLAY_SAMPLES samples
N_DISPLAY_SAMPLES = 5
for x_sample, y_sample in zip(X_data[:N_DISPLAY_SAMPLES], y_data[:N_DISPLAY_SAMPLES]):
    print(f"X: {x_sample}, y: {y_sample}")

# Split the dataset into training and testing sets
X_train_np, X_test_np, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, stratify=y_data, random_state=42
)

# Define feature names
n_features = X_data.shape[1]
feature_names = [f"feature_{i}" for i in range(n_features)]

# Convert NumPy arrays to Pandas DataFrames
X_train = pd.DataFrame(X_train_np, columns=feature_names)
X_test = pd.DataFrame(X_test_np, columns=feature_names)

# Print proportions of classes in training and testing sets
print("Training set class proportions:")
print(f"  Class 0: {np.mean(y_train == 0):.2f}, Class 1: {np.mean(y_train == 1):.2f}")
print("Testing set class proportions:")
print(f"  Class 0: {np.mean(y_test == 0):.2f}, Class 1: {np.mean(y_test == 1):.2f}")

Dataset Information:
X: (1000, 16), [0 1]
y: (1000,), [0 1]
X: [0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0], y: 1
X: [1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0], y: 0
X: [1 1 1 0 1 0 0 0 0 0 1 1 1 1 1 0], y: 0
X: [1 1 0 1 0 1 0 1 1 0 0 0 0 0 0 0], y: 0
X: [0 1 1 0 1 1 1 1 0 1 0 1 1 1 0 1], y: 1
Training set class proportions:
  Class 0: 0.53, Class 1: 0.47
Testing set class proportions:
  Class 0: 0.53, Class 1: 0.47


## Evaluate the models

In [4]:
import pickle
from time import perf_counter
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


def evaluate_model(model, X_train, y_train, X_test, y_test, name=None):
    # Fit the model
    start_time = perf_counter()
    model.fit(X_train, y_train)
    training_time = perf_counter() - start_time

    # Estimate the model size
    model_size = None
    try:
        if name in (
            "Green Tsetlin Classifier",
            "Green Tsetlin Sparse Classifier",
            "C Tsetlin Classifier",
            "GridSearch C Tsetlin Classifier",
            "C Tsetlin Sparse Classifier",
            "GridSearch C Tsetlin Sparse Classifier",
        ):
            if isinstance(model, GridSearchCV):
                model_size = model.best_estimator_.estimate_model_size()
            else:
                model_size = model.estimate_model_size()
        else:
            pickled_model = pickle.dumps(model)
            model_size = len(pickled_model)
    except Exception:
        pass

    # Predict on the training set
    start_time = perf_counter()
    y_train_pred = model.predict(X_train)
    prediction_train_time = perf_counter() - start_time

    # Predict on the test set
    start_time = perf_counter()
    y_test_pred = model.predict(X_test)
    prediction_test_time = perf_counter() - start_time

    # Calculate Accuracy
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # print(f"Model: {name}")
    # print(f"  Training Accuracy: {train_accuracy:.4f}")
    # print(f"  Test Accuracy: {test_accuracy:.4f}")

    if isinstance(model, GridSearchCV):
        print(f"Best parameters for {name}: \n  {model.best_params_}")

    return (
        name,
        train_accuracy,
        test_accuracy,
        training_time,
        prediction_train_time,
        prediction_test_time,
        model_size,
    )

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
from tsetlin_machine_py.green_tsetlin_clf import GreenTsetlinClassifier
from tsetlin_machine_py.green_tsetlin_sparse_clf import GreenTsetlinSparseClassifier
from tsetlin_machine_py.c_tsetlin_clf import CTsetlinClassifier
from tsetlin_machine_py.c_tsetlin_sparse_clf import CTsetlinSparseClassifier

models = [
    (LogisticRegression(max_iter=1000, random_state=42), "Logistic Regression"),
    (DecisionTreeClassifier(random_state=42), "Decision Tree"),
    (RandomForestClassifier(random_state=42), "Random Forest"),
    (MLPClassifier(max_iter=1000, solver="lbfgs", random_state=42), "MLP Classifier"),
    (SVC(random_state=42), "SVC"),
    (LinearSVC(random_state=42, dual="auto"), "Linear SVC"),  # type: ignore
    (KNeighborsClassifier(), "K-Nearest Neighbors"),
    (GaussianNB(), "Gaussian Naive Bayes"),
    (GradientBoostingClassifier(random_state=42), "Gradient Boosting"),
    (LGBMClassifier(random_state=42, verbose=-1), "LightGBM"),
]

# GridSearchCV for RandomForest
param_grid_rf = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
}
grid_search_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_rf,
    scoring="accuracy",
    n_jobs=-1,
)
models.append((grid_search_rf, "GridSearch Random Forest"))

# GridSearchCV for MLPClassifier
param_grid_mlp = {
    "hidden_layer_sizes": [(50,), (100,), (50, 50)],
    "activation": ["relu", "tanh"],
    "alpha": [0.0001, 0.001, 0.01],
}
grid_search_mlp = GridSearchCV(
    MLPClassifier(max_iter=1000, solver="lbfgs", random_state=42),
    param_grid_mlp,
    scoring="accuracy",
    n_jobs=-1,
)
models.append((grid_search_mlp, "GridSearch MLP Classifier"))

# Green Tsetlin Classifier
green_tsetlin_clf = GreenTsetlinClassifier(random_state=42)
models.append((green_tsetlin_clf, "Green Tsetlin Classifier"))

# Green Tsetlin Sparse Classifier
green_tsetlin_sparse_clf = GreenTsetlinSparseClassifier(random_state=42)
models.append((green_tsetlin_sparse_clf, "Green Tsetlin Sparse Classifier"))

# C Tsetlin Classifier
c_tsetlin_clf = CTsetlinClassifier(random_state=42)
models.append((c_tsetlin_clf, "C Tsetlin Classifier"))

# GridSearchCV for C Tsetlin Classifier
param_grid_ctsetlin = {
    "num_clauses": [64, 125, 500],
    "boost_true_positive_feedback": [False, True],
    "s": [3, 9],
}
grid_search_ctsetlin = GridSearchCV(
    CTsetlinClassifier(random_state=42),
    param_grid_ctsetlin,
    scoring="accuracy",
    n_jobs=-1,
)
models.append((grid_search_ctsetlin, "GridSearch C Tsetlin Classifier"))

# C Tsetlin Sparse Classifier
c_tsetlin_sparse_clf = CTsetlinSparseClassifier(random_state=42)
models.append((c_tsetlin_sparse_clf, "C Tsetlin Sparse Classifier"))

# GridSearchCV for C Tsetlin Sparse Classifier
param_grid_ctsetlin_sparse = {
    "num_clauses": [64, 125, 500],
    "boost_true_positive_feedback": [False, True],
    "s": [3, 9],
}
grid_search_ctsetlin_sparse = GridSearchCV(
    CTsetlinSparseClassifier(random_state=42),
    param_grid_ctsetlin_sparse,
    scoring="accuracy",
    n_jobs=-1,
)
models.append((grid_search_ctsetlin_sparse, "GridSearch C Tsetlin Sparse Classifier"))

# Evaluate each model
results_table = PrettyTable()
results_table.field_names = [
    "Model",
    "Training Accuracy",
    "Test Accuracy",
    "Training Time (s)",
    "Prediction Train Time (s)",
    "Prediction Test Time (s)",
    "Model Size (bytes)",
]
for i, (model, name) in enumerate(models):
    metrics = evaluate_model(model, X_train, y_train, X_test, y_test, name=name)
    results_table.add_row(
        [
            metrics[0],
            f"{metrics[1]:.4f}",
            f"{metrics[2]:.4f}",
            f"{metrics[3]:.4f}",
            f"{metrics[4]:.4f}",
            f"{metrics[5]:.4f}",
            f"{metrics[6]:,}" if metrics[6] else "N/A",
        ]
    )
    print(f"Evaluated {i + 1}/{len(models)}: {name}")

Evaluated 1/18: Logistic Regression
Evaluated 2/18: Decision Tree
Evaluated 3/18: Random Forest
Evaluated 4/18: MLP Classifier
Evaluated 5/18: SVC
Evaluated 6/18: Linear SVC
Evaluated 7/18: K-Nearest Neighbors
Evaluated 8/18: Gaussian Naive Bayes
Evaluated 9/18: Gradient Boosting
Evaluated 10/18: LightGBM
Best parameters for GridSearch Random Forest: 
  {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 50}
Evaluated 11/18: GridSearch Random Forest
Best parameters for GridSearch MLP Classifier: 
  {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (100,)}
Evaluated 12/18: GridSearch MLP Classifier
Evaluated 13/18: Green Tsetlin Classifier
Evaluated 14/18: Green Tsetlin Sparse Classifier
Evaluated 15/18: C Tsetlin Classifier
Best parameters for GridSearch C Tsetlin Classifier: 
  {'boost_true_positive_feedback': False, 'num_clauses': 64, 's': 3}
Evaluated 16/18: GridSearch C Tsetlin Classifier
Evaluated 17/18: C Tsetlin Sparse Classifier
Best parameters for GridSearch

In [6]:
# Display the results table
results_table.sortby = "Test Accuracy"
results_table.reversesort = True
# print(results_table)

# Get data from PrettyTable
columns = results_table.field_names
data = results_table.rows

# Create Pandas DataFrame
df = pd.DataFrame(data, columns=columns)

# print("\nResults for Noisy-XOR dataset:")
display(df)

Unnamed: 0,Model,Training Accuracy,Test Accuracy,Training Time (s),Prediction Train Time (s),Prediction Test Time (s),Model Size (bytes)
0,Logistic Regression,0.5487,0.475,0.0028,0.0005,0.0003,1122
1,Decision Tree,0.9962,0.735,0.002,0.0004,0.0004,33536
2,Random Forest,0.9962,0.87,0.079,0.0068,0.0032,4127036
3,MLP Classifier,0.9962,0.815,0.1067,0.0008,0.0004,35897
4,SVC,0.895,0.93,0.0192,0.0146,0.004,104823
5,Linear SVC,0.545,0.475,0.0013,0.0008,0.0004,1006
6,K-Nearest Neighbors,0.825,0.765,0.0008,0.0217,0.0062,20179
7,Gaussian Naive Bayes,0.5513,0.47,0.0012,0.0009,0.0004,1373
8,Gradient Boosting,0.8962,0.915,0.0767,0.0017,0.0006,139263
9,LightGBM,0.985,0.895,0.0502,0.0016,0.0007,350233
