In [1]:
from warnings import filterwarnings
filterwarnings(action='ignore', category=FutureWarning, )

In [2]:
import pandas as pd

IRIS = '/kaggle/input/iris-dataset/iris.csv'

df = pd.read_csv(filepath_or_buffer=IRIS)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


We only have a few columns of data, so let's visualize all the pairs of features as scatter plots.

In [3]:
from plotly.express import scatter
for x_index in range(0, 3):
    x = df.columns[x_index]
    for y_index in range(x_index+1, 4):
        y = df.columns[y_index]
        scatter(data_frame=df, x=x, y=y, color='species').show()

All of our plots suggest that our species cluster together with some overlap between versicolor and virginica.

Let's introduce some dimnension reduction to see if the varieties cluster based on the available information.

In [4]:
from umap import UMAP
from arrow import now

time_start = now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=200)
plot_df = pd.concat(objs=[df, pd.DataFrame(data=umap.fit_transform(X=df.drop(columns=['species'])), columns=['x', 'y'])], axis=1)
print('done with UMAP in {}'.format(now() - time_start))

2024-02-21 16:19:38.840667: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-21 16:19:38.840813: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-21 16:19:39.023538: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(low_memory=False, n_epochs=200, n_jobs=1, random_state=2024, verbose=True)
Wed Feb 21 16:19:56 2024 Construct fuzzy simplicial set
Wed Feb 21 16:19:56 2024 Finding Nearest Neighbors
Wed Feb 21 16:20:01 2024 Finished Nearest Neighbor Search
Wed Feb 21 16:20:05 2024 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Wed Feb 21 16:20:06 2024 Finished embedding
done with UMAP in 0:00:10.279406


In [5]:
scatter(data_frame=plot_df, x='x', y='y', color='species')

This is really encouraging; we have a handful of tough cases, but otherwise the three species cluster pretty tightly. Let's build a model.

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['species']), df['species'], test_size=0.25, random_state=2024)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

print('accuracy: {:5.2f} pct'.format(100 * accuracy_score(y_test, model.predict(X_test))))

accuracy: 92.11 pct


In [7]:
from sklearn.metrics import classification_report
print(classification_report(y_true = y_test, y_pred = model.predict(X_test)))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        14
  versicolor       0.83      0.91      0.87        11
   virginica       0.92      0.85      0.88        13

    accuracy                           0.92        38
   macro avg       0.92      0.92      0.92        38
weighted avg       0.92      0.92      0.92        38



Not surprisingly we get the setosa right and have some issues with the other two species.

Let's try using a subset of our data and see if we can get better accuracy.

In [8]:
def get_accuracy(x: str, y: str) -> float:
    Xs_train, Xs_test, ys_train, ys_test = train_test_split(df[[x, y]], df['species'], test_size=0.25, random_state=2024)
    acc_model = LogisticRegression(max_iter=1000)
    acc_model.fit(Xs_train, ys_train)
    return accuracy_score(ys_test, acc_model.predict(Xs_test))

for x_index in range(0, 3):
    x = df.columns[x_index]
    for y_index in range(x_index+1, 4):
        y = df.columns[y_index]
        print('{} {} {:5.2f}'.format(x, y, get_accuracy(x=x, y=y)))

sepal_length sepal_width  0.79
sepal_length petal_length  0.87
sepal_length petal_width  0.97
sepal_width petal_length  0.87
sepal_width petal_width  1.00
petal_length petal_width  0.89


It turns out we can actually get 100% accuracy by choosing our data subset judiciously.

In [9]:
Xf_train, Xf_test, yf_train, yf_test = train_test_split(df[['sepal_width', 'petal_width']], df['species'], test_size=0.25, random_state=2024)
f_model = LogisticRegression(max_iter=1000)
f_model.fit(Xf_train, yf_train)
print(classification_report(y_true = yf_test, y_pred = f_model.predict(Xf_test)))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        14
  versicolor       1.00      1.00      1.00        11
   virginica       1.00      1.00      1.00        13

    accuracy                           1.00        38
   macro avg       1.00      1.00      1.00        38
weighted avg       1.00      1.00      1.00        38

