# Prediction on Glass dataset using KNeighborsClassifier

In [1]:
from pandas import read_csv
import pandas as pd
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
#Loading the Glass dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data"
column = ['Id_Number','RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','Type']
dataset = pd.read_csv(url, names = column)
dataset.head()

Unnamed: 0,Id_Number,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,2,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [3]:
#creating testing and training datasets
x = dataset.iloc[: , 1:10]
y = dataset.iloc[: , 10:]
print(x.head(),y.head())

        RI     Na    Mg    Al     Si     K    Ca   Ba   Fe
0  1.52101  13.64  4.49  1.10  71.78  0.06  8.75  0.0  0.0
1  1.51761  13.89  3.60  1.36  72.73  0.48  7.83  0.0  0.0
2  1.51618  13.53  3.55  1.54  72.99  0.39  7.78  0.0  0.0
3  1.51766  13.21  3.69  1.29  72.61  0.57  8.22  0.0  0.0
4  1.51742  13.27  3.62  1.24  73.08  0.55  8.07  0.0  0.0    Type
0     1
1     1
2     1
3     1
4     1


In [4]:
#creating gropus of training and testing
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=0.4,random_state=3)
print(x_test.head(),y_train.head())

          RI     Na    Mg    Al     Si     K     Ca   Ba    Fe
146  1.51769  13.65  3.66  1.11  72.77  0.11   8.60  0.0  0.00
25   1.51764  12.98  3.54  1.21  73.00  0.65   8.53  0.0  0.00
163  1.51514  14.01  2.68  3.50  69.89  1.68   5.87  2.2  0.00
108  1.52222  14.43  0.00  1.00  72.67  0.10  11.52  0.0  0.08
83   1.51594  13.09  3.52  1.55  72.87  0.68   8.05  0.0  0.09      Type
24      1
89      2
103     2
8       1
53      1


In [8]:
#selecting the KNeighborsClassifier model and training the dataset
model = KNeighborsClassifier(n_neighbors = 5)
model.fit(x_train,y_train)
pred = model.predict(x_test)
pred

  This is separate from the ipykernel package so we can avoid doing imports until


array([1, 1, 7, 5, 2, 7, 7, 2, 1, 7, 2, 2, 1, 1, 5, 5, 2, 7, 1, 2, 7, 2,
       2, 1, 1, 2, 7, 1, 1, 5, 1, 2, 5, 2, 1, 1, 2, 6, 6, 1, 6, 2, 2, 1,
       7, 1, 1, 1, 7, 5, 1, 2, 1, 1, 1, 7, 1, 1, 1, 2, 2, 2, 1, 5, 1, 1,
       2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 7, 2, 7, 5, 2, 2, 1],
      dtype=int64)

In [9]:
#looking at the confusion matrix
confusion_matrix(y_test,pred)

array([[21,  4,  0,  0,  0,  0],
       [ 8, 22,  0,  3,  1,  0],
       [ 6,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  4,  0,  1],
       [ 0,  0,  0,  1,  1,  0],
       [ 1,  1,  0,  0,  1, 11]], dtype=int64)

In [10]:
#Looking at the accuracy of our model
accuracy_score(y_test,pred)

0.686046511627907

In [12]:
#Now checking whose values effect the class of the iris
pred1 = model.predict([[1.52725,13.80,3.15,0.66,70.57,0.08,11.64,0.0,0.00]])
pred1

array([1], dtype=int64)

In [15]:
f1 = x_train.tail()
f2 = y_test.tail()
f1,f2

(          RI     Na    Mg    Al     Si     K     Ca    Ba    Fe
 200  1.51508  15.15  0.00  2.25  73.50  0.00   8.34  0.63  0.00
 184  1.51115  17.38  0.00  0.34  75.41  0.00   6.65  0.00  0.00
 131  1.52614  13.70  0.00  1.36  71.24  0.19  13.44  0.00  0.10
 152  1.51779  13.64  3.65  0.65  73.00  0.06   8.93  0.00  0.00
 106  1.53125  10.73  0.00  2.10  69.81  0.58  13.30  3.15  0.28,      Type
 193     7
 130     2
 118     2
 122     2
 78      2)

In [16]:
pred2 = model.predict(f1)
pred2

array([7, 7, 2, 1, 2], dtype=int64)