# Question2: KNN Classification in sklearn

a- Read the iris dataset from the following URL: https://raw.githubusercontent.com/mpourhoma/CS4661/master/iris.csv and assign it to a Pandas DataFrame as you learned in tutorial Lab2-3.

b- Split the dataset into testing and training sets with the following parameters: test_size=0.4, random_state=6

c- Instantiate a KNN object with K=3, train it on the training set and test it on the testing set. Then, calculate the accuracy of your prediction as you learned in Lab3.

In [1]:
# Import the KNeighborsClassifier class.
# KNeighborsClassifier is the sklearn class that performs KNN classification.
from sklearn.neighbors import KNeighborsClassifier

# Import the required packages and libraries.
# pandas is used right below to load the CSV; numpy is imported for later use
# (it is not referenced in the cells shown here).
import numpy as np
import pandas as pd


# Read a CSV file directly from the web and store it in a pandas DataFrame.
# "read_csv" is a pandas function that reads csv files from a URL or from the local device:
iris_df = pd.read_csv('https://raw.githubusercontent.com/mpourhoma/CS4661/master/iris.csv')


# Import iris from the datasets embedded in sklearn.
# The following line imports only the load_iris function from the sklearn library.
# This function returns an object containing the iris dataset.
from sklearn.datasets import load_iris 

# Call load_iris() to get an "object" containing the iris dataset:
iris = load_iris()

# The "data" attribute holds the iris dataset features:
X = iris.data  # X is the feature matrix
# print(X)



In [2]:
# The "feature_names" attribute holds the names of the feature columns:
print(iris.feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [3]:
print(X.shape) # prints the shape of iris.data (the iris feature matrix)

(150, 4)


In [4]:
# The "target" attribute holds the iris dataset labels.
# For the iris dataset embedded in sklearn, the labels are already numeric.
y = iris.target  # y is the label vector
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [5]:
print(y.shape) # prints the shape of iris.target (the label vector)

(150,)


In [6]:
# The "target_names" attribute holds the names behind the encoded labels: 0 = setosa, 1 = versicolor, 2 = virginica
print(iris.target_names)

['setosa' 'versicolor' 'virginica']


In [9]:
# "knn" is instantiated as an "object" of the KNeighborsClassifier "class" with k=1.
k = 1
knn = KNeighborsClassifier(n_neighbors=k) 

# The "fit" method of the object knn trains the model on the given features and labels.
# Here X and y are the complete sklearn iris arrays, so every sample is used for training.
knn.fit(X, y)

# The "predict" method of the *trained* object knn predicts labels for one or more new samples.
# Each sample lists its values in the order of iris.feature_names:
# sepal length, sepal width, petal length, petal width (cm).

X_Testing = [[6, 3, 5.9, 2.9]]

y_predict = knn.predict(X_Testing)

print(y_predict)


[2]


In [10]:
# The "predict" method of the *trained* object knn also accepts several samples in one call.
# Two new data samples:
X_Testing = [[6, 3, 5.9, 2.9],[3.2, 3, 1.9, 0.3]]

y_predict = knn.predict(X_Testing)

print(y_predict)

[2 0]


In [11]:
# Map the "categorical" species names onto "numerical" class labels.
# This step is optional, because the latest revision of sklearn accepts non-numerical labels too!

def categorical_to_numeric(x):
    """Return the integer code for an iris species name.

    'setosa' maps to 0, 'versicolor' to 1 and 'virginica' to 2.
    Any other value matches nothing and yields None.
    """
    species_in_code_order = ('setosa', 'versicolor', 'virginica')
    for code, species in enumerate(species_in_code_order):
        if x == species:
            return code
    return None

In [12]:
# Apply the function to the species column and add the corresponding numerical label column:
iris_df['label'] = iris_df['species'].apply(categorical_to_numeric)

# Check the dataset by taking every 10th row.
# NOTE: this bare expression is not the last one in the cell, so a notebook
# would presumably not display it - wrap it in print() to see it.

iris_df[0::10]


# Creating the Feature Matrix for the iris dataset:

# create a python list of the feature names to pick from the dataset:
feature_cols = ['sepal_length','sepal_width','petal_length','petal_width']

# use the above list to select the features from the original DataFrame
X = iris_df[feature_cols]  

# display the feature matrix (the whole DataFrame, not only its first rows)
X



Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


In [13]:
# checking the size of the Feature Matrix X:

print(X.shape)

(150, 4)


In [14]:
# select the Series of labels from the DataFrame
# y = iris_df['label'] # this is the numeric label column added to the DataFrame
# OR:
y = iris_df['species'] # this is the original categorical labels (the latest revision of sklearn accepts non-numerical labels)

# checking the label vector by showing every 10th value
y[::10]

0          setosa
10         setosa
20         setosa
30         setosa
40         setosa
50     versicolor
60     versicolor
70     versicolor
80     versicolor
90     versicolor
100     virginica
110     virginica
120     virginica
130     virginica
140     virginica
Name: species, dtype: object

In [15]:
# Instantiating another "object" of the KNeighborsClassifier "class", this time with k=3:
k = 3
my_knn_for_cs4661 = KNeighborsClassifier(n_neighbors=k) # name of the object is arbitrary!

# The "fit" method trains the model; here it is trained on the whole dataset (X, y).
my_knn_for_cs4661.fit(X, y)


# The "predict" method of the *trained* object predicts labels for one or more new samples.
# Prediction for two new data samples:
X_Testing = [[6, 3, 5.9, 2.9],[3.2, 3, 1.9, 0.3]]
y_predict = my_knn_for_cs4661.predict(X_Testing)
print(y_predict)
print('\n')

# Randomly split the original dataset into a training set and a testing set.
# The function "train_test_split" is imported from "sklearn.model_selection".
# "test_size=0.4" puts 40% of the samples in the testing set and the rest (60%) in the training set;
# "random_state=6" is the seed value given in the assignment.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=6)

# print the size of the training set:
print(X_train.shape)
print(y_train.shape)
print('\n')


# print the size of the testing set:
print(X_test.shape)
print(y_test.shape)
print('\n')



['virginica' 'setosa']


(90, 4)
(90,)


(60, 4)
(60,)




In [16]:
# Show the feature rows of the testing set (the row labels are the original DataFrame indices).
print(X_test)

     sepal_length  sepal_width  petal_length  petal_width
4             5.0          3.6           1.4          0.2
116           6.5          3.0           5.5          1.8
2             4.7          3.2           1.3          0.2
23            5.1          3.3           1.7          0.5
123           6.3          2.7           4.9          1.8
96            5.7          2.9           4.2          1.3
134           6.1          2.6           5.6          1.4
39            5.1          3.4           1.5          0.2
137           6.4          3.1           5.5          1.8
53            5.5          2.3           4.0          1.3
127           6.1          3.0           4.9          1.8
81            5.5          2.4           3.7          1.0
115           6.4          3.2           5.3          2.3
135           7.7          3.0           6.1          2.3
74            6.4          2.9           4.3          1.3
119           6.0          2.2           5.0          1.5
105           

In [17]:
# Show the actual labels of the testing set.
print(y_test)

4          setosa
116     virginica
2          setosa
23         setosa
123     virginica
96     versicolor
134     virginica
39         setosa
137     virginica
53     versicolor
127     virginica
81     versicolor
115     virginica
135     virginica
74     versicolor
119     virginica
105     virginica
51     versicolor
92     versicolor
32         setosa
37         setosa
120     virginica
44         setosa
0          setosa
55     versicolor
72     versicolor
87     versicolor
102     virginica
30         setosa
93     versicolor
45         setosa
59     versicolor
16         setosa
13         setosa
133     virginica
128     virginica
64     versicolor
146     virginica
95     versicolor
49         setosa
17         setosa
103     virginica
71     versicolor
61     versicolor
46         setosa
12         setosa
52     versicolor
27         setosa
34         setosa
54     versicolor
118     virginica
117     virginica
121     virginica
6          setosa
111     virginica
18        

In [18]:
# Training ONLY on the training set (this replaces the earlier fit on the full dataset):
my_knn_for_cs4661.fit(X_train, y_train)

# Testing on the testing set: predict a label for every held-out sample.
y_predict = my_knn_for_cs4661.predict(X_test)
print(y_predict)

['setosa' 'virginica' 'setosa' 'setosa' 'virginica' 'versicolor'
 'virginica' 'setosa' 'virginica' 'versicolor' 'virginica' 'versicolor'
 'virginica' 'virginica' 'versicolor' 'versicolor' 'virginica'
 'versicolor' 'versicolor' 'setosa' 'setosa' 'virginica' 'setosa' 'setosa'
 'versicolor' 'versicolor' 'versicolor' 'virginica' 'setosa' 'versicolor'
 'setosa' 'versicolor' 'setosa' 'setosa' 'versicolor' 'virginica'
 'versicolor' 'virginica' 'versicolor' 'setosa' 'setosa' 'virginica'
 'versicolor' 'versicolor' 'setosa' 'setosa' 'versicolor' 'setosa'
 'setosa' 'versicolor' 'virginica' 'virginica' 'virginica' 'setosa'
 'virginica' 'setosa' 'setosa' 'setosa' 'versicolor' 'virginica']


In [19]:
# The function "accuracy_score" from "sklearn.metrics" compares two label sequences element by element
# and returns the fraction of positions that match (a value between 0 and 1, not a percentage).

from sklearn.metrics import accuracy_score

# Example: positions 0 and 3 match, positions 1 and 2 do not, so 2 of 4 are correct.
y_pred    = [0, 2, 1, 1]
y_actual  = [0, 1, 2, 1]

score = accuracy_score(y_actual, y_pred)

print(score)

0.5


In [20]:
# Compare the "predicted labels" for the testing set with its "actual labels" to evaluate the accuracy.
# The function "accuracy_score" from "sklearn.metrics" performs the element-to-element comparison
# and returns the fraction of correct predictions.

from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_predict)

print(accuracy)

0.9666666666666667


In [23]:
# Evaluate the KNN test accuracy for several values of K on one fixed train/test split.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

listK = [1, 3, 5, 7, 11, 13, 15, 27, 59]
ScoreK = []     # accuracy of the fixed toy example below, recorded once per K
AccuracyK = []  # testing-set accuracy of the KNN model for each K

# Randomly split the dataset into a training set and a testing set.
# "test_size=0.4" puts 40% of the samples in the testing set and the rest (60%) in the training set.
# The split does not depend on K, and the fixed random_state gives the same
# split on every call, so it is computed once before the loop.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=6)

# Toy example of "accuracy_score": 2 of the 4 positions match.
# It does not depend on K either, so it is also computed once.
y_pred = [0, 2, 1, 1]
y_actual = [0, 1, 2, 1]
score = accuracy_score(y_actual, y_pred)

for k in listK:
    my_knn_for_cs4661 = KNeighborsClassifier(n_neighbors=k)  # name of the object is arbitrary!

    # Training ONLY on the training set:
    my_knn_for_cs4661.fit(X_train, y_train)

    # Testing on the testing set:
    y_predict = my_knn_for_cs4661.predict(X_test)

    ScoreK.append(score)

    # Compare the "predicted labels" for the testing set with its "actual labels":
    # accuracy_score returns the fraction of correct predictions.
    accuracy = accuracy_score(y_test, y_predict)
    AccuracyK.append(accuracy)

print('K value :')
print(listK)
print('\n')

print('Score : ')
print(ScoreK)
print('\n')

print('Accuracy : ')
print(AccuracyK)


K value :
[1, 3, 5, 7, 11, 13, 15, 27, 59]


Score : 
[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]


Accuracy : 
[0.95, 0.9666666666666667, 0.9833333333333333, 0.9666666666666667, 0.9666666666666667, 0.9666666666666667, 0.9333333333333333, 0.9166666666666666, 0.8166666666666667]


In [None]:
# Evaluate the KNN test accuracy for a single value of K (k=3) on the fixed train/test split.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

ScoreK = []     # accuracy of the fixed toy example below
AccuracyK = []  # testing-set accuracy of the KNN model

k = 3

my_knn_for_cs4661 = KNeighborsClassifier(n_neighbors=k)  # name of the object is arbitrary!

# Randomly split the dataset into a training set and a testing set.
# "test_size=0.4" puts 40% of the samples in the testing set and the rest (60%) in the training set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=6)

# Training ONLY on the training set:
my_knn_for_cs4661.fit(X_train, y_train)

# Testing on the testing set:
y_predict = my_knn_for_cs4661.predict(X_test)

# Toy example of "accuracy_score": 2 of the 4 positions match.
y_pred = [0, 2, 1, 1]
y_actual = [0, 1, 2, 1]
score = accuracy_score(y_actual, y_pred)
ScoreK.append(score)

# Compare the "predicted labels" for the testing set with its "actual labels":
# accuracy_score returns the fraction of correct predictions.
accuracy = accuracy_score(y_test, y_predict)
AccuracyK.append(accuracy)

# Report the K that was actually evaluated in this cell, so the label matches
# the single Score/Accuracy entry printed below.
print('K value :')
print([k])
print('\n')

print('Score : ')
print(ScoreK)
print('\n')

print('Accuracy : ')
print(AccuracyK)
