# Multiclass Classification: Iris Flowers

In [25]:
# Import modules that will be used in the project.
import pandas as pd

# pandas reads CSV files straight into a DataFrame. Datasets may come as CSV
# (or in other formats), e.g. from Kaggle.
df = pd.read_csv("Iris.csv")

# Show the dataframe (last expression of the cell, so it is rendered).
df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica


In [26]:
# Summarise the columns: dtypes and non-null counts (a quick missing-value check).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [27]:
'''
Import train test split method from scikit-learn
'''
from sklearn.model_selection import train_test_split

In [None]:
# Split the data into training, validation and test sets.

# "Id" is only a row identifier and "Species" is the target, so neither is a feature.
X = df.drop(columns=["Id","Species"]) # features
y = df["Species"] # what needs to be classified

# Hold out 25% of the rows as the test set; the remaining 75% (temp) is the
# training plus validation data.
# random_state seeds the shuffle so the split is reproducible.
# stratify=y keeps the class proportions the same in both parts.
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.25, random_state=1, stratify=y)

# Now split the temp data into training (75%) and validation (25%),
# i.e. roughly 56% and 19% of the full dataset.
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp)

# Sanity check: the three splits together account for every row.
assert len(X_train) + len(X_val) + len(X_test) == len(X)


In [29]:
# Peek at the first rows of the test features (the index shows the original row numbers).
X_test.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
113,5.7,2.5,5.0,2.0
7,5.0,3.4,1.5,0.2
0,5.1,3.5,1.4,0.2
11,4.8,3.4,1.6,0.2
93,5.0,2.3,3.3,1.0


In [30]:
# Peek at the first rows of the training features.
X_train.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
61,5.9,3.0,4.2,1.5
41,4.5,2.3,1.3,0.3
25,5.0,3.0,1.6,0.2
72,6.3,2.5,4.9,1.5
33,5.5,4.2,1.4,0.2


In [31]:
# Peek at the first rows of the validation features.
X_val.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
117,7.7,3.8,6.7,2.2
67,5.8,2.7,4.1,1.0
64,5.6,2.9,3.6,1.3
27,5.2,3.5,1.5,0.2
30,4.8,3.1,1.6,0.2


In [32]:
# Turn the target values numeric.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Learn the class-name -> integer mapping from the training + validation
# labels only (y_temp still holds the original strings).
le.fit(y_temp)

# Encode from the original string labels in `y`, picked out by each split's
# row index, rather than re-encoding y_train / y_val / y_test in place.
# The result is the same on the first run, but the cell is now idempotent:
# re-running it no longer feeds already-encoded integers back into an
# encoder that was fitted on strings.
y_train = le.transform(y.loc[X_train.index])
y_val = le.transform(y.loc[X_val.index])
y_test = le.transform(y.loc[X_test.index])

In [None]:
'''
Import ML Models
'''
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report

In [34]:
# Train the candidate models and compare them on the validation set.

# The tree-based estimators use randomness internally, so they get a fixed
# seed (the same one used for the data splits) to make the scores below
# reproducible from run to run.
models = {
    "LogReg" : LogisticRegression(max_iter=200),
    "SVM" : SVC(),
    "KNN" : KNeighborsClassifier(),
    "DecTree" : DecisionTreeClassifier(random_state=1),
    "RandForest" : RandomForestClassifier(random_state=1)
}

for name, model in models.items():
    # Fit on the training split only; the validation split is used purely for scoring.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    print(f"{name} Validation Accuracy: {round(acc,4)}")

LogReg Validation Accuracy: 0.8214
SVM Validation Accuracy: 0.8214
KNN Validation Accuracy: 0.9286
DecTree Validation Accuracy: 0.8571
RandForest Validation Accuracy: 0.8571


In [35]:
# Pick the best model (KNN) and search for its best hyperparameter combination.

from sklearn.model_selection import GridSearchCV

# Candidate settings: neighbourhood size, vote weighting, and Minkowski power
# (p=1 Manhattan distance, p=2 Euclidean distance).
param_grid = {
    "n_neighbors" : [3,5,7,9],
    "weights" : ["uniform","distance"],
    "p" : [1,2]
}

# 5-fold cross-validation over every combination, using the training split only.
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)
grid.fit(X_train, y_train)

# NOTE: best_score_ is the mean cross-validated accuracy on the training
# split, so it is not directly comparable to the validation accuracies
# printed when the untuned models were compared.
print(f"Improved accuracy: {grid.best_score_}")

# Show which combination won, and score the refitted best estimator on the
# same validation split used for the untuned models so the comparison is
# like-for-like.
print(f"Best parameters: {grid.best_params_}")
print(f"Tuned KNN Validation Accuracy: {round(accuracy_score(y_val, grid.predict(X_val)),4)}")

Improved accuracy: 0.9882352941176471


In [36]:
# Final check: score the tuned KNN once on the held-out test set.
best_knn = grid.best_estimator_
y_test_pred = best_knn.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test Accuracy: {round(test_accuracy,4)}")
# Blank separator line, then per-class precision / recall / F1 reported
# under the original species names.
print("\n" + classification_report(y_test, y_test_pred, target_names=le.classes_))

Test Accuracy: 0.9737

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        12
Iris-versicolor       0.93      1.00      0.96        13
 Iris-virginica       1.00      0.92      0.96        13

       accuracy                           0.97        38
      macro avg       0.98      0.97      0.97        38
   weighted avg       0.98      0.97      0.97        38

