In [1]:
import pandas as pd

# Path of the Iris CSV inside the Kaggle input directory.
IRIS = '/kaggle/input/iris-datset/IRIS.csv'

# Read the whole file into a DataFrame; the bare `df.head()` on the last line
# makes the notebook render the first rows as the cell output.
df = pd.read_csv(IRIS)
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.2,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


Let's use dimensionality reduction to plot our whole dataset.

In [2]:
from sklearn.manifold import TSNE

# Seed passed to the t-SNE reducers in this notebook.
RANDOM_STATE = 2025

# Project the four measurement columns onto two dimensions for plotting.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
reducer = TSNE(random_state=RANDOM_STATE)
embedding = reducer.fit_transform(df[feature_columns])

# One row per flower: 2-D coordinates plus the species label used for colouring.
plot_df = pd.DataFrame(embedding, columns=['x', 'y'])
plot_df['species'] = df['species'].tolist()

In [3]:
from plotly import express
from plotly.offline import init_notebook_mode

# Initialise plotly's notebook mode (connected=True) before showing any figure.
init_notebook_mode(connected=True)

# Scatter the 2-D embedding of the full dataset, one colour per species.
species_fig = express.scatter(data_frame=plot_df, x='x', y='y', color='species')
species_fig.show(renderer='iframe_connected')

What do we see? We see that Setosa irises are very different from the other two species, but we have some hard cases between the other two species. Let's build a model.

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Columns used as model inputs; `species` is the target.
feature_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Hold out 20% of the rows for testing, stratified on the species label.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns],
    df['species'],
    test_size=0.2,
    random_state=2024,
    stratify=df['species'],
)

logreg = LogisticRegression(max_iter=1000, tol=1e-12).fit(X_train, y_train)

# Predict once and reuse the result for every metric below, instead of
# re-running the model for each report.
y_pred = logreg.predict(X=X_test)

print('model fit in {} iterations'.format(logreg.n_iter_[0]))
print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=y_pred)))
print('f1: {:5.4f}'.format(f1_score(average='weighted', y_true=y_test, y_pred=y_pred)))
print(classification_report(y_true=y_test, y_pred=y_pred))

model fit in 103 iterations
accuracy: 0.9667
f1: 0.9666
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       0.91      1.00      0.95        10
 Iris-virginica       1.00      0.90      0.95        10

       accuracy                           0.97        30
      macro avg       0.97      0.97      0.97        30
   weighted avg       0.97      0.97      0.97        30



Let's visualize our model results with model probabilities.

In [5]:
# Separate t-SNE fit on the held-out rows only; perplexity=9 is set explicitly
# here (the test split has just 30 rows).
reducer_test = TSNE(random_state=RANDOM_STATE, perplexity=9)
test_embedding = reducer_test.fit_transform(X_test)

# Per-class probabilities for every test row; the row-wise maximum is the
# probability the model assigns to its own prediction.
test_probabilities = logreg.predict_proba(X_test)

# Assemble coordinates, confidence, and true/predicted labels in one frame.
plot_test_df = pd.DataFrame(test_embedding, columns=['x', 'y'])
plot_test_df['model probability'] = test_probabilities.max(axis=1)
plot_test_df['true'] = y_test.tolist()
plot_test_df['predicted'] = logreg.predict(X_test).tolist()
plot_test_df['correct'] = plot_test_df['true'].eq(plot_test_df['predicted'])

# Colour each test point by model confidence; hovering shows both labels.
test_fig = express.scatter(
    data_frame=plot_test_df,
    x='x',
    y='y',
    color='model probability',
    hover_name='true',
    hover_data=['predicted'],
)
test_fig.show(renderer='iframe_connected')

In [6]:
# Misclassified rows come first (False sorts before True); within each group
# the least-confident predictions are listed first.
least_confident = plot_test_df.sort_values(by=['correct', 'model probability'], ascending=True)
least_confident.head(5)

Unnamed: 0,x,y,model probability,true,predicted,correct
3,10.422758,9.499957,0.612915,Iris-virginica,Iris-versicolor,False
14,12.989719,10.54112,0.579409,Iris-versicolor,Iris-versicolor,True
21,9.666318,6.967516,0.694709,Iris-versicolor,Iris-versicolor,True
7,12.73192,9.756556,0.765967,Iris-versicolor,Iris-versicolor,True
4,12.098927,11.093219,0.807135,Iris-virginica,Iris-virginica,True


What do we see? Our only mis-prediction (model probability ≈ 0.61) is among the least-confident test points, but it is not the least confident one: the prediction with the lowest model probability (≈ 0.58) is actually correct.