## Data Gathering

In [1]:
import pandas as pd 
import numpy as np

In [2]:
df=pd.read_csv('iris.csv')
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


## EDA

In [3]:
df.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

## Define x and y

In [5]:
x=df.drop(['species'],axis=1)
y=df['species']


## Preprocessing

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,LabelEncoder

In [7]:
num_pipe=Pipeline(steps=[('scaler',StandardScaler())])

In [8]:
x_pre=pd.DataFrame(num_pipe.fit_transform(x),columns=num_pipe.get_feature_names_out())
x_pre

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832


In [10]:
cols=num_pipe.get_feature_names_out()
cols

array(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'],
      dtype=object)

In [11]:
import pickle

In [12]:
with open('pipe1.pkl','wb') as file1:
    pickle.dump(num_pipe,file1)

In [14]:
le=LabelEncoder()

In [15]:
y1=le.fit_transform(y)
y1

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [16]:
with open('le.pkl','wb') as file2:
    pickle.dump(le,file2)

In [13]:
from sklearn.model_selection import train_test_split


In [25]:
x_train,x_test,y_train,y_test=train_test_split(x_pre,y1,test_size=0.2,random_state=21)

## Model

In [27]:
from sklearn.neighbors import KNeighborsClassifier
kn=KNeighborsClassifier()

In [28]:
from sklearn.model_selection import GridSearchCV

In [29]:
grid={'n_neighbors':[2,3,4,5,6,7,8]}

In [30]:
gs=GridSearchCV(kn,grid,cv=3)

In [31]:
gs.fit(x_train,y_train)

In [32]:
gs.best_params_

{'n_neighbors': 7}

In [33]:
best_kn=gs.best_estimator_

In [37]:
with open('model1','wb') as file3:
    pickle.dump(best_kn,file3)

In [34]:
best_kn

## Evaluation

In [35]:
from sklearn.metrics import accuracy_score,multilabel_confusion_matrix,classification_report


In [36]:
y_pred=best_kn.predict(x_test)

acc=accuracy_score(y_pred,y_test)
print('accuracy_score:',acc)

cnf=multilabel_confusion_matrix(y_pred,y_test)
print('multilabel_confusion_matrix:\n',cnf)

clf=classification_report(y_pred,y_test)
print('classification_report:\n',clf)

accuracy_score: 0.9
multilabel_confusion_matrix:
 [[[19  0]
  [ 0 11]]

 [[17  2]
  [ 1 10]]

 [[21  1]
  [ 2  6]]]
classification_report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.83      0.91      0.87        11
           2       0.86      0.75      0.80         8

    accuracy                           0.90        30
   macro avg       0.90      0.89      0.89        30
weighted avg       0.90      0.90      0.90        30



## Function

In [40]:
def predict_data(le,pipe,model):
    sepal_length=float(input('sepal_length:'))
    sepal_width=float(input('sepal_width:'))
    petal_length=float(input('petal_length:'))
    petal_width=float(input('petal_width:'))

    new_df=pd.DataFrame([sepal_length, sepal_width, petal_length, petal_width]).T
    new_df.columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

    print('Data before preprocessing:\n',new_df)

    x_pre=pd.DataFrame(pipe.transform(new_df))
    x_pre.columns=['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

    print('data after preprocessing:\n',x_pre)

    pred=model.predict(x_pre)

    pred_lb=le.inverse_transform(pred)[0]

    prob=model.predict_proba(x_pre).max()

    prediction=f'{pred_lb} with probability {prob:.4f} '

    print(prediction)

    return pred_lb,prob

In [41]:
predict_data(le,num_pipe,best_kn)

Data before preprocessing:
    sepal_length  sepal_width  petal_length  petal_width
0           5.1          3.5           1.4          0.2
data after preprocessing:
    sepal_length  sepal_width  petal_length  petal_width
0     -0.900681     1.019004     -1.340227    -1.315444
setosa with probability 1.0000 


('setosa', 1.0)