# Continuous data : Tsetlin Machine trains on Iris dataset 

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import green_tsetlin as gt
import numpy as np

Load the **iris** dataset from sklearn.

In [31]:
iris = load_iris()

x = iris['data'].astype(np.uint8)
y = iris['target'].astype(np.uint32)

As features of this dataset is continuous, we will need to convert it to TM friendly binary data.

In [32]:
# get max and min for each feature
x_max = np.max(x, axis=0)
x_min = np.min(x, axis=0)

# each feature will have 4 intervals
intervals = (x_max - x_min) / 4

# Create 4 intervals for each feature
intervals_list = [[x_min[i] + k * intervals[i] for k in range(4)] for i in range(x.shape[1])]

# Create 4 bit representation for each feature
x_empty = np.zeros((x.shape[0], x.shape[1] * 4)).astype(np.uint8)

for i in range(x.shape[0]):
    for j in range(x.shape[1]):
        x_empty[i, j*4:(j+1)*4] = np.array([1 if x[i, j] >= intervals_list[j][k] else 0 for k in range(4)])

x = x_empty

In [33]:
train_x, val_x, train_y, val_y = train_test_split(x, y, test_size=0.2, random_state=42)

In [36]:
from green_tsetlin.hpsearch import HyperparameterSearch

hpsearch = HyperparameterSearch(s_space=(2.0, 30.0),
                                clause_space=(100, 1000),
                                threshold_space=(50, 1500),
                                max_epoch_per_trial=30,
                                literal_budget=(5, 10),
                                k_folds=4,
                                n_jobs=5,
                                seed=42,
                                minimize_literal_budget=False)

hpsearch.set_train_data(train_x, train_y)
hpsearch.set_eval_data(val_x, val_y)

hpsearch.optimize(n_trials=10, 
                study_name="IRIS hpsearch", 
                show_progress_bar=True, 
                storage=None)

[I 2024-05-10 19:27:58,214] A new study created in memory with name: IRIS hpsearch
Processing trial 9 of 10, best score: [1.0]: 100%|██████████| 10/10 [00:40<00:00,  4.07s/it]


In [37]:
params = hpsearch.best_trials[0].params
performance = hpsearch.best_trials[0].values

print("best paramaters: ", params)
print("best score: ", performance)

best paramaters:  {'s': 2.7694421614854194, 'n_clauses': 514, 'threshold': 501.0437283947865, 'literal_budget': 7}
best score:  [1.0]
