In [1]:
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 11 18:30:49 2017

@author: lphan
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# ---------------------------------------------------------------------------
# Fruit classification with k-NN: load -> preprocess -> train -> evaluate.
#
# Names defined here and reused by later cells: labelencoder_X, onehotencoder,
# sc_X, classifier, X_train, y_train (plus the imported KNeighborsClassifier).
# ---------------------------------------------------------------------------
# scikit-learn imports are gathered in one place so the dependencies of the
# notebook are visible at a glance.
# from sklearn.cross_validation import train_test_split (deprecated)
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

dataset = pd.read_csv('dataset.csv')
print(dataset.columns)

# A value of 0 stands for a missing measurement: turn it into NaN so the
# imputer below can fill it in. (np.nan is used because the np.NaN alias was
# removed in NumPy 2.0.)
dataset = dataset.replace(0, np.nan)

# Features are every column except the last one; the target is the last column.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Data preprocessing:
# replace the missing numeric values (columns 1 and 2) with the column mean.
# SimpleImputer replaces sklearn.preprocessing.Imputer, removed in 0.22.
# X is an object array (it also holds the colour strings), hence the float cast.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3].astype(float))

# Encoding categorical data (this case: color in column 0).
# Step 1: colour name -> integer code.
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

# Step 2: integer code -> one-hot columns. ColumnTransformer replaces the
# removed OneHotEncoder(categorical_features=[0]): the encoded colour columns
# come first and the remaining columns are passed through behind them.
# sparse_threshold=1.0 keeps the stacked result sparse, so the `.toarray()`
# calls in this notebook keep working exactly as with the old encoder.
onehotencoder = ColumnTransformer(
    [('color', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=1.0,
)
X = onehotencoder.fit_transform(X).toarray()

# Encoding the dependent variable (fruit class -> integer label).
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# Splitting the dataset into the training set and test set
# (fixed random_state so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Feature scaling: fit on the training set only, then apply to the test set.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Columns 3:5 of the scaled training matrix, kept aside for visualisation.
X_vis = X_train[:, 3:5]
print(type(X_vis))

# Fitting to the training set using k-NN with k=3
classifier = KNeighborsClassifier(n_neighbors=3, metric='euclidean', p=2)
classifier.fit(X_train, y_train)

# Predicting the test set results
y_pred = classifier.predict(X_test)

# Find total errors/ correct by using confusion matrix
cm = confusion_matrix(y_test, y_pred)

print("Confusion matrix\n", cm)
print("Predicted value y_pred", y_pred)
print("Actual value y_test", y_test)


Index(['Color', 'Radius (cm)', 'Weight (grams)', 'Fruit (class)'], dtype='object')
<class 'numpy.ndarray'>
Confusion matrix
 [[0 1 0]
 [0 3 0]
 [0 0 3]]
Predicted value y_pred [1 1 1 2 2 2 1]
Actual value y_test [1 1 0 2 2 2 1]


In [2]:
# ------------- Predicting the input set with k=1 -------------------------
# Second model with k=1, trained on the same scaled training data as the
# k=3 `classifier`, so both can be compared on the new samples.
classifier_k1 = KNeighborsClassifier(n_neighbors=1, metric='euclidean', p=2)
classifier_k1.fit(X_train, y_train)

# Mission: classify the inputdata into the right labels using above classifier k=1 & k=3 & visualize them
# The file is whitespace separated and has no header row, hence `names=`.
# Raw string: '\s' is not a valid escape sequence in a normal string literal.
inputdata = pd.read_csv('classify_data.txt', sep=r'\s+',
                        names=['Color', 'Radius (cm)', 'Weight (grams)'])

# importing data into object
X_input = inputdata.iloc[:, [0, 1, 2]].values

# Preprocessing: reuse the encoders that were FITTED ON THE TRAINING DATA.
# Fitting new encoders on the input file would derive the colour->code mapping
# and the one-hot column layout from whatever colours happen to be in that
# file, which need not match what the scaler and the classifiers were trained
# on. A colour that never occurred in training now raises an error instead of
# being silently mis-encoded.
X_input[:, 0] = labelencoder_X.transform(X_input[:, 0])
X_input = onehotencoder.transform(X_input).toarray()

# feature scaling (transform only: the scaler keeps the training statistics)
X_input = sc_X.transform(X_input)

print(X_input)

# Predicting the input set results
y_pred_input_k3 = classifier.predict(X_input)
print("Predicted value for input set k=3", y_pred_input_k3)

y_pred_input_k1 = classifier_k1.predict(X_input)
print("Predicted value for input set k=1", y_pred_input_k1)

[[ 1.64750894 -0.36115756 -1.26491106 -0.17956959  0.15577048]
 [-0.60697698 -0.36115756  0.79056942 -0.52041043 -0.97767191]
 [ 1.64750894 -0.36115756 -1.26491106  0.95656654 -0.00614986]
 [-0.60697698  2.76887462 -1.26491106 -0.52041043 -0.81575157]
 [-0.60697698 -0.36115756  0.79056942 -0.97486489 -0.84813564]]
Predicted value for input set k=3 [0 1 0 0 1]
Predicted value for input set k=1 [0 1 2 0 1]


In [1]:
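# Code part from Udemy course, left commented out.
# Visualising the Test set results
# NOTE(review): as written, this snippet does not fit the classifier above.
# The mesh grid passes only 2 columns to classifier.predict, whereas the model
# is fitted on every column of X_train (5 in the run shown above). Also,
# columns 0 and 1 of X_test are one-hot colour indicators, not Radius/Weight
# (compare X_vis = X_train[:, 3:5]). It would need a classifier fitted on those
# two columns only before it can be re-enabled.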
# from matplotlib.colors import ListedColormap
# X_set, y_set = X_test, y_test
# X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
#                      np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
# plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
#              alpha = 0.75, cmap = ListedColormap(('red', 'green', 'blue')))
# plt.xlim(X1.min(), X1.max())
# plt.ylim(X2.min(), X2.max())
# for i, j in enumerate(np.unique(y_set)):
#     plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
#                 c = ListedColormap(('red', 'green', 'blue'))(i), label = j)
# plt.title('KNN (Fruit Classification) ')
# plt.xlabel('Radius')
# plt.ylabel('Weight')
# plt.legend()
# plt.show()
