# Noisy-XOR Dataset Model Comparison

## Import libraries

In [1]:
import numpy as np
import pandas as pd

## Generate the Noisy-XOR dataset

In [2]:
def generate_noisy_xor_dataset(
    n_samples, n_features, n_xor_features, noise_level, random_state=None
):
    """Generate a binary dataset whose label is a noisy XOR of the leading features.

    Every feature is drawn uniformly from {0, 1}. The clean label is the XOR
    (parity) of the first ``n_xor_features`` columns; the remaining columns
    do not enter the label. A fixed number of labels,
    ``int(noise_level * n_samples)``, is then flipped at distinct, randomly
    chosen positions.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    n_features : int
        Total number of binary feature columns.
    n_xor_features : int
        Number of leading columns XOR-ed together to form the clean label.
        Must satisfy ``0 <= n_xor_features <= n_features``. When 0, the clean
        label is drawn at random instead.
    noise_level : float
        Fraction of labels to flip, in ``[0, 1]``.
    random_state : int or None, optional
        Seed passed to ``np.random.RandomState`` for reproducibility.

    Returns
    -------
    X : np.ndarray of shape (n_samples, n_features), dtype uint8
        Binary feature matrix.
    y : np.ndarray of shape (n_samples,), dtype uint32
        Noisy binary labels.

    Raises
    ------
    ValueError
        If ``n_xor_features`` is negative or greater than ``n_features``, or
        if ``noise_level`` lies outside ``[0, 1]``.
    """
    # Validate before allocating anything. A negative n_xor_features used to
    # slip through and silently behave like n_xor_features == 1.
    if n_xor_features > n_features:
        raise ValueError("n_xor_features cannot be greater than n_features")
    if n_xor_features < 0:
        raise ValueError("n_xor_features cannot be negative")
    if not 0 <= noise_level <= 1:
        raise ValueError("noise_level must be between 0 and 1")

    rng = np.random.RandomState(random_state)

    # Generate random binary features. The order of RNG draws (features,
    # optional random labels, noise positions) is kept fixed so that a given
    # seed always yields the same dataset.
    X = rng.randint(0, 2, size=(n_samples, n_features))

    if n_xor_features == 0:
        # No features designated for XOR: fall back to random clean labels
        y_clean = rng.randint(0, 2, size=n_samples)
    else:
        # Parity of the first n_xor_features columns, computed row-wise
        y_clean = np.bitwise_xor.reduce(X[:, :n_xor_features], axis=1).astype(int)

    # Introduce noise: pick distinct positions and flip the labels there
    n_noise = int(noise_level * n_samples)
    noise_indices = rng.choice(n_samples, size=n_noise, replace=False)

    y = np.copy(y_clean)
    y[noise_indices] = 1 - y[noise_indices]  # Flip the bits

    return X.astype(np.uint8), y.astype(np.uint32)

In [3]:
from sklearn.model_selection import train_test_split


# Build the Noisy-XOR dataset: 1000 samples, 16 binary features, label taken
# from the first two features, noise_level of 0.1, fixed seed.
X_data, y_data = generate_noisy_xor_dataset(
    n_samples=1000,
    n_features=16,
    n_xor_features=2,
    noise_level=0.1,
    random_state=42,
)

# Report each array's shape together with the distinct values it contains
print("Dataset Information:")
print(f"X: {X_data.shape}, {np.unique(X_data)}")
print(f"y: {y_data.shape}, {np.unique(y_data)}")


# Preview the leading rows next to their labels
N_DISPLAY_SAMPLES = 5
preview_pairs = zip(X_data[:N_DISPLAY_SAMPLES], y_data[:N_DISPLAY_SAMPLES])
for x_sample, y_sample in preview_pairs:
    print(f"X: {x_sample}, y: {y_sample}")

# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class balance.
X_train_np, X_test_np, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    stratify=y_data,
    random_state=42,
)

# One column name per feature
n_features = X_data.shape[1]
feature_names = [f"feature_{i}" for i in range(n_features)]

# Wrap both splits in DataFrames carrying the feature names
X_train, X_test = (
    pd.DataFrame(split_np, columns=feature_names)
    for split_np in (X_train_np, X_test_np)
)

# Print the share of each class within the training and testing labels
for split_header, split_labels in (
    ("Training set class proportions:", y_train),
    ("Testing set class proportions:", y_test),
):
    print(split_header)
    print(
        f"  Class 0: {np.mean(split_labels == 0):.2f}, "
        f"Class 1: {np.mean(split_labels == 1):.2f}"
    )

Dataset Information:
X: (1000, 16), [0 1]
y: (1000,), [0 1]
X: [0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0], y: 1
X: [1 1 1 0 1 0 1 1 1 1 1 1 1 1 0 0], y: 0
X: [1 1 1 0 1 0 0 0 0 0 1 1 1 1 1 0], y: 0
X: [1 1 0 1 0 1 0 1 1 0 0 0 0 0 0 0], y: 0
X: [0 1 1 0 1 1 1 1 0 1 0 1 1 1 0 1], y: 1
Training set class proportions:
  Class 0: 0.53, Class 1: 0.47
Testing set class proportions:
  Class 0: 0.53, Class 1: 0.47


## Evaluate the models

In [4]:
from tsetlin_machine_py.c_tsetlin_clf import CTsetlinClassifier
from sklearn.metrics import accuracy_score

# Mini-batch size, and the number of leading training samples streamed through
# the classifier: the loop below walks sample offsets 0..steps in strides of
# batch_size, so `steps` counts samples, not batches.
batch_size = 10
steps = 800

# `steps` is a bound on sample offsets, so it must fit inside the training set.
# The message reports exactly what the condition checks.
assert X_train.shape[0] >= steps, (
    f"{steps=} exceeds training set size {X_train.shape[0]=}"
)

c_tsetlin_clf = CTsetlinClassifier(random_state=42)
classes = np.unique(y_train)
print(f"Classes: {classes}")
c_tsetlin_clf.init_empty_state(n_features=X_train.shape[1], classes=classes)

for start_idx in range(0, steps, batch_size):
    # Clamp the last batch to `steps` so the loop never trains on samples
    # beyond the asserted budget when steps is not a multiple of batch_size.
    end_idx = min(start_idx + batch_size, steps)
    X_train_batch = X_train.iloc[start_idx:end_idx]
    y_train_batch = y_train[start_idx:end_idx]

    # Update the classifier with this batch only
    c_tsetlin_clf.partial_fit(X_train_batch, y_train_batch, classes=classes, epochs=1)

    # Score on the full training and test sets after every batch
    y_train_pred = c_tsetlin_clf.predict(X_train)
    y_test_pred = c_tsetlin_clf.predict(X_test)

    print(
        f"Batch {start_idx // batch_size + 1}: "
        f"Train Accuracy: {accuracy_score(y_train, y_train_pred):.4f}, "
        f"Test Accuracy: {accuracy_score(y_test, y_test_pred):.4f}"
    )

Classes: [0 1]
Batch 1: Train Accuracy: 0.5413, Test Accuracy: 0.5350
Batch 2: Train Accuracy: 0.5212, Test Accuracy: 0.4850
Batch 3: Train Accuracy: 0.4725, Test Accuracy: 0.4700
Batch 4: Train Accuracy: 0.4850, Test Accuracy: 0.4950
Batch 5: Train Accuracy: 0.5000, Test Accuracy: 0.5200
Batch 6: Train Accuracy: 0.5500, Test Accuracy: 0.5450
Batch 7: Train Accuracy: 0.4763, Test Accuracy: 0.4700
Batch 8: Train Accuracy: 0.5625, Test Accuracy: 0.5350
Batch 9: Train Accuracy: 0.5813, Test Accuracy: 0.6050
Batch 10: Train Accuracy: 0.5663, Test Accuracy: 0.5600
Batch 11: Train Accuracy: 0.5837, Test Accuracy: 0.5100
Batch 12: Train Accuracy: 0.4838, Test Accuracy: 0.4800
Batch 13: Train Accuracy: 0.5950, Test Accuracy: 0.5750
Batch 14: Train Accuracy: 0.6012, Test Accuracy: 0.5850
Batch 15: Train Accuracy: 0.5825, Test Accuracy: 0.5650
Batch 16: Train Accuracy: 0.5875, Test Accuracy: 0.5800
Batch 17: Train Accuracy: 0.5138, Test Accuracy: 0.5050
Batch 18: Train Accuracy: 0.5750, Test Acc