In [1]:
# Notebook-wide setup: the plotting module used by later cells, plus warning
# suppression so FutureWarning messages do not clutter cell outputs.
import warnings

from plotly import express

warnings.filterwarnings('ignore', category=FutureWarning)

In [2]:
import pandas as pd

# Path of the Iris CSV under the Kaggle input directory.
IRIS = '/kaggle/input/iris-dataset/iris.csv'

# Read the whole table into memory, then preview the first rows as the
# cell's displayed output.
df = pd.read_csv(IRIS)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
# Print the column names, non-null counts and dtypes to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [4]:
from umap import UMAP
from arrow import now

target = 'species'
# This cell adds the embedding columns to df further down. Excluding them here
# keeps the feature list stable if the cell is run more than once; otherwise a
# second run would pick up 'x' and 'y' as features and leak the embedding into
# every model that uses `columns`.
embedding_columns = ['x', 'y']
columns = [column for column in df.columns if column != target and column not in embedding_columns]

time_start = now()
reducer = UMAP(n_components=2, random_state=2024, transform_seed=2024, verbose=True, n_jobs=1, n_epochs=100)
# Build the embedding frame on df's own index: assigning a DataFrame aligns on
# index labels, so a default 0..n-1 index would only line up when df happens to
# carry a RangeIndex.
df[embedding_columns] = pd.DataFrame(data=reducer.fit_transform(X=df[columns]), index=df.index)
# reset_index() exposes the row label as an 'index' column for the hover text.
express.scatter(data_frame=df.reset_index(), x='x', y='y', color=target, hover_name='index', height=800, ).show()
print('UMAP done in {}'.format(now() - time_start))

2024-03-15 18:00:56.334212: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-15 18:00:56.334427: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-15 18:00:56.519186: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


UMAP(n_epochs=100, n_jobs=1, random_state=2024, transform_seed=2024, verbose=True)
Fri Mar 15 18:01:12 2024 Construct fuzzy simplicial set
Fri Mar 15 18:01:12 2024 Finding Nearest Neighbors
Fri Mar 15 18:01:17 2024 Finished Nearest Neighbor Search
Fri Mar 15 18:01:21 2024 Construct embedding


Epochs completed:   0%|            0/100 [00:00]

	completed  0  /  100 epochs
	completed  10  /  100 epochs
	completed  20  /  100 epochs
	completed  30  /  100 epochs
	completed  40  /  100 epochs
	completed  50  /  100 epochs
	completed  60  /  100 epochs
	completed  70  /  100 epochs
	completed  80  /  100 epochs
	completed  90  /  100 epochs
Fri Mar 15 18:01:22 2024 Finished embedding


UMAP done in 0:00:12.514162


This is good news: for the most part, each of our three species forms its own cluster, separated from the other two, when we look at the data using dimension reduction. Let's build a simple model.

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from arrow import now

time_start = now()
# Hold out 20% of the rows for testing; stratifying on the target keeps the
# class proportions the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(df[columns], df[target], test_size=0.2, random_state=2024, stratify=df[target])

regression = LogisticRegression(max_iter=1000, tol=1e-6)
regression.fit(X=X_train, y=y_train)
print('fit complete after {} iterations.'.format(regression.n_iter_[0]))
print('accuracy: {:5.4f}'.format(regression.score(X=X_test, y=y_test)))
# One bar per feature. A bar chart plots the coefficient values as they are,
# whereas a histogram would aggregate them and label the axis "sum of y".
# With three species, coef_[0] is the coefficient row for classes_[0], so the
# title names that class explicitly.
express.bar(
    x=columns,
    y=regression.coef_[0],
    title='Logistic regression coefficients for class {}'.format(regression.classes_[0]),
    labels={'x': 'feature', 'y': 'coefficient'},
).show()
print('done in {}'.format(now() - time_start))

fit complete after 102 iterations.
accuracy: 0.9667


done in 0:00:00.166883


An accuracy of 0.9667 on the held-out test set looks good; let's look a little deeper at the per-class results.

In [6]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the logistic regression on the test split.
print(
    classification_report(
        y_true=y_test,
        y_pred=regression.predict(X_test),
    )
)

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       0.91      1.00      0.95        10
   virginica       1.00      0.90      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



In [7]:
from sklearn.metrics import confusion_matrix

# Rows are the true species and columns the predicted species.
print(
    confusion_matrix(
        y_true=y_test,
        y_pred=regression.predict(X_test),
    )
)

[[10  0  0]
 [ 0 10  0]
 [ 0  1  9]]


We have misclassified just one instance: a single virginica sample was predicted as versicolor.

In [8]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from arrow import now

# Restart the timer for this cell. Without this, the elapsed time printed below
# would be measured from the time_start set in the logistic-regression cell and
# would include that cell's work plus any idle time between runs.
time_start = now()
qda = QuadraticDiscriminantAnalysis()
# Fit on the same train split as the logistic regression so the two models are
# compared on identical data.
qda.fit(X=X_train, y=y_train)
print('accuracy: {:5.4f}'.format(qda.score(X=X_test, y=y_test)))
print('done in {}'.format(now() - time_start))


accuracy: 1.0000
done in 0:00:00.338473


If instead we use quadratic discriminant analysis (QDA), we get all 30 test-set predictions correct.

In [9]:
# Predict the test split once and reuse the labels for both summaries.
qda_predictions = qda.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=qda_predictions))
print(confusion_matrix(y_true=y_test, y_pred=qda_predictions))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      1.00      1.00        10
   virginica       1.00      1.00      1.00        10

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

[[10  0  0]
 [ 0 10  0]
 [ 0  0 10]]
