In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB

In [2]:
# Load the Iris data. Reading with header=None makes pandas assign integer
# column labels (0-4); descriptive names are attached afterwards.
iris_csv = "../DSBDAL_Exam_DataSets/Iris/Iris.csv"
df = pd.read_csv(iris_csv, header=None)
df

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
# Attach descriptive labels: four measurements followed by the class column.
column_names = [
    'SepalLength',
    'SepalWidth',
    'PetalLength',
    'PetalWidth',
    'Species',
]
df.columns = column_names
df

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [4]:
# Print each column's non-null count and dtype to spot missing or mistyped data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SepalLength  150 non-null    float64
 1   SepalWidth   150 non-null    float64
 2   PetalLength  150 non-null    float64
 3   PetalWidth   150 non-null    float64
 4   Species      150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [5]:
# Dimensions of the raw frame as (rows, columns).
df.shape

(150, 5)

# Data Cleaning

In [6]:
# Number of missing (NaN) entries in each column.
df.isnull().sum()

SepalLength    0
SepalWidth     0
PetalLength    0
PetalWidth     0
Species        0
dtype: int64

In [7]:
# Count literal '?' placeholders per column. Summing the boolean comparison
# mask yields a true count; indexing the frame with the mask and summing the
# surviving cell values (the previous form) adds up cell contents instead,
# which is not a count (and would concatenate '?' strings if any were present).
# NOTE: re-run the cell to refresh the displayed output (integer counts).
(df == '?').sum()

SepalLength    0.0
SepalWidth     0.0
PetalLength    0.0
PetalWidth     0.0
Species          0
dtype: object

In [8]:
# Treat any '?' placeholder as missing, then discard every row that
# contains a missing value.
df = df.replace('?', np.nan).dropna()

# Label Encoding and Outlier Detection

In [9]:
# Learn the mapping from species name to integer code; fit() returns the
# encoder itself, so construction and fitting can be chained.
lb = LabelEncoder().fit(df['Species'])
lb.classes_

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [10]:
# Swap the species names for the integer codes learned by the fitted encoder.
species_codes = lb.transform(df['Species'])
df = df.assign(Species=species_codes)
df

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [11]:
# Flag measurements lying more than Z_THRESHOLD standard deviations from
# their column mean (z-score rule).
#
# Only the feature columns are scored. 'Species' holds an encoded class
# label, not a measurement, so a z-score on it is meaningless and, with
# imbalanced classes, could cause rows of a rare class to be discarded.
# `lier` keeps the frame's row index, so row-wise filtering with
# `lier.any(axis=1)` continues to work.
# NOTE: re-run the cell to refresh the displayed output (no 'Species' row).
Z_THRESHOLD = 3
features = df.drop(columns='Species')
out = ((features - features.mean()) / features.std()).abs()
lier = out > Z_THRESHOLD
lier.sum()

SepalLength    0
SepalWidth     1
PetalLength    0
PetalWidth     0
Species        0
dtype: int64

In [12]:
# Keep only the rows in which no column was flagged as an outlier.
rows_with_outlier = lier.any(axis=1)
df = df.loc[~rows_with_outlier]
df

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [13]:
# Dimensions after outlier removal, as (rows, columns).
df.shape

(149, 5)

# Model

In [14]:
# Separate the four measurement columns (inputs) from the encoded class label
# (target), then hold out 30% of the rows for evaluation. The fixed
# random_state makes the split reproducible.
feature_cols = ['SepalLength', 'SepalWidth', 'PetalLength', 'PetalWidth']
X = df.loc[:, feature_cols]
Y = df.loc[:, 'Species']

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

# Logistic Regression

In [15]:
# Fit a logistic-regression classifier on the training split. max_iter is
# raised to 1000 to give the solver more iterations to converge.
lr = LogisticRegression(max_iter=1000).fit(X_train, Y_train)
lr

In [16]:
# Predict the encoded species for every row of the held-out test split.
pred_lr = lr.predict(X_test)
pred_lr

array([1, 0, 2, 1, 2, 0, 1, 2, 1, 2, 2, 0, 0, 0, 0, 1, 2, 2, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 2, 1, 1, 0,
       0])

In [17]:
# Fraction of test rows whose predicted class matches the true class.
pred_lr_acc = accuracy_score(y_true=Y_test, y_pred=pred_lr)
pred_lr_acc

0.9111111111111111

# Naive Bayes

In [18]:
# Fit a multinomial Naive Bayes classifier on the training split.
# NOTE(review): MultinomialNB is intended for count-like, non-negative
# features; the Iris measurements are continuous, so GaussianNB is the more
# natural choice -- confirm this model is wanted in the comparison.
m_NB = MultinomialNB().fit(X_train, Y_train)
m_NB

In [19]:
# Fit a Gaussian Naive Bayes classifier on the training split.
g_NB = GaussianNB().fit(X_train, Y_train)
g_NB

In [20]:
# Multinomial Naive Bayes predictions for the held-out test split.
pred_mnb = m_NB.predict(X_test)
pred_mnb

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 2, 2, 0, 0, 0, 0, 1, 2, 2, 1, 2, 0, 2,
       0, 2, 1, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0])

In [21]:
# Test-set accuracy of the multinomial Naive Bayes model.
pred_mnb_acc = accuracy_score(y_true=Y_test, y_pred=pred_mnb)
pred_mnb_acc

0.9333333333333333

In [22]:
# Gaussian Naive Bayes predictions for the held-out test split.
pred_gnb = g_NB.predict(X_test)
pred_gnb

array([1, 0, 2, 1, 2, 0, 1, 2, 1, 2, 2, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 1, 2, 0, 0, 0, 2, 1, 1, 0,
       0])

In [23]:
# Test-set accuracy of the Gaussian Naive Bayes model.
pred_gnb_acc = accuracy_score(y_true=Y_test, y_pred=pred_gnb)
pred_gnb_acc

0.8666666666666667