# KNN
sources: 
- [knn for image classfication](https://pyimagesearch.com/2016/08/08/k-nn-classifier-for-image-classification/)
- [Color Histogram for Image Searcher](https://pyimagesearch.com/2014/12/01/complete-guide-building-image-search-engine-python-opencv/)

In [1]:
import numpy as np
import cv2
cv2.__version__

'4.5.5'

# Define Color Descriptor aka Color Histogram features

In [2]:
class ColorDescriptor:

    def __init__(self, bins):
		# store the number of bins for the 3D histogram
        self.bins = bins
    
    def histogram(self,image, mask):
        	# extract a 3D color histogram from the masked region of the
		# image, using the supplied number of bins per channel
        hist = cv2.calcHist([image], [0, 1, 2], mask, self.bins, [0, 256, 0, 256, 0, 256])
		# normalize the histogram 
        hist = cv2.normalize(hist, hist).flatten() # help the difference in image dimension
		# return the histogram
        return hist
    def describe(self, image):
		# convert the image to the HSV color space and initialize
		# the features used to quantify the image
        image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        features = []
		# grab the dimensions and compute the center of the image
        (h, w) = image.shape[:2]
        (cX, cY) = (int(w * 0.5), int(h * 0.5))

        # Divide image into 5 regions (top-left,
        # top-right, bottom-right, bottom-left, center oval)
        segments = [(0, cX, 0, cY), (cX, w, 0, cY), (cX, w, cY, h),
            (0, cX, cY, h)]
        # construct an elliptical mask representing the center of the
        # image
        (axesX, axesY) = (int(w * 0.75) // 2, int(h * 0.75) // 2)
        ellipMask = np.zeros(image.shape[:2], dtype = "uint8") # center
        cv2.ellipse(ellipMask, (cX, cY), (axesX, axesY), 0, 0, 360, 255, -1)
        
        # loop over the segments
        for (startX, endX, startY, endY) in segments:
            # construct a mask for each corner of the image, subtracting
            # the elliptical center from it
            cornerMask = np.zeros(image.shape[:2], dtype = "uint8")
            cv2.rectangle(cornerMask, (startX, startY), (endX, endY), 255, -1)
            cornerMask = cv2.subtract(cornerMask, ellipMask)
            # extract a color histogram from the image, then update the
            # feature vector
            hist = self.histogram(image, cornerMask)
            features.extend(hist)
        # extract a color histogram from the elliptical region and
        # update the feature vector
        hist = self.histogram(image, ellipMask)
        features.extend(hist)
        # return the feature vector
        return features



## Load Images

In [3]:
from PIL import Image
import numpy as np

def load_data(labelNames, root):    
    images = list()
    labels = list()
    descriptor = ColorDescriptor(bins=[8,8,8])
    
    for label in labelNames:
            # get image directory
            img_dir = os.path.join(root, f"{label}")
            
            for img in os.listdir(img_dir):

                img = np.array(Image.open(os.path.join(img_dir, img),'r'))

                feature = descriptor.describe(img)

                images.append(feature)
                labels.append(int(label))
                
    return (images, labels)

In [4]:
train_path = "/Users/lap11353-local/Desktop/ML/A2/Image_classification_data/split_binary-task/train"
trainX, trainY = load_data([0,1], train_path)

In [5]:
val_path = "/Users/lap11353-local/Desktop/ML/A2/Image_classification_data/split_binary-task/val"
valX, valY = load_data(["0", "1"], root= val_path)

In [6]:
np.asarray(trainX).shape

(16223, 2560)

# PCA

### transfrom training data

In [7]:
from sklearn.decomposition import PCA

pca = PCA(30) # we need 2 principal components.
pca.fit(trainX)
transformed_trainX = pca.transform(trainX)
print(np.sum(pca.explained_variance_ratio_) )

0.8634682675087776


### process testing data

In [8]:
transformed_valX = pca.transform(valX)

# Model KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=2,
	n_jobs=-1, metric="manhattan")
model.fit(transformed_trainX, trainY)
acc = model.score(transformed_valX, valY)
print("[INFO] Validation accuracy: {:.2f}%".format(acc * 100))


[INFO] Validation accuracy: 83.39%


In [10]:
acc = model.score(transformed_trainX, trainY)

print("[INFO] Train accuracy: {:.2f}%".format(acc * 100))


[INFO] Train accuracy: 91.44%


# Model SVM

### Training (5-fold)

In [11]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
svclassifier = SVC()
scores = cross_val_score(svclassifier, transformed_trainX, trainY, cv=5)
scores

array([0.89029276, 0.88320493, 0.88197227, 0.88594328, 0.8905672 ])

### Testing

In [12]:
svclassifier.fit(transformed_trainX, trainY)
val_score = svclassifier.score(transformed_valX, valY)
val_score

0.8922849396105497

In [13]:
pred_y = svclassifier.predict(transformed_valX)
from sklearn.metrics import classification_report
print(classification_report(valY, pred_y))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92      2643
           1       0.87      0.82      0.84      1414

    accuracy                           0.89      4057
   macro avg       0.89      0.88      0.88      4057
weighted avg       0.89      0.89      0.89      4057



# Model Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(transformed_trainX, trainY)
rfc.score(transformed_valX, valY)

0.8893270889820064

# Stacking Models

In [15]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

## Stack 3 models: RF, SVM, and KNN

In [30]:
from sklearn.ensemble import StackingClassifier
submodels = [("rfc",rfc), ("knn",model)] 
stack_model = StackingClassifier(estimators= submodels, final_estimator= lr, stack_method="predict_proba")
score = cross_val_score(stack_model,transformed_trainX,trainY,cv = 5,scoring = 'f1_micro')
print(score)
print("The accuracy score of is:",score.mean())

[0.8853621  0.8816641  0.88104777 0.87607891 0.88286067]
The accuracy score of is: 0.881402708140571


In [31]:
submodels = [("svc",svclassifier), ("knn",model)] 
stack_model = StackingClassifier(estimators= submodels, final_estimator= lr, stack_method="predict")
score = cross_val_score(stack_model,transformed_trainX,trainY,cv = 5,scoring = 'f1_micro')
print(score)
print("The accuracy score of is:",score.mean())

[0.89029276 0.88320493 0.88197227 0.88594328 0.8905672 ]
The accuracy score of is: 0.8863960869325662
