In [1]:
import pandas as pd 
import numpy as np


In [2]:
# https://stackoverflow.com/questions/1401712/how-can-the-euclidean-distance-be-calculated-with-numpy
def nearestNeighbor(idx, data):
    """Classify row ``idx`` with the label of its nearest other row (1-NN).

    Distance is the Euclidean norm of the difference between entries of
    ``data.vector``. The query row itself is never a candidate, so calling
    this for every row gives a leave-one-out evaluation.

    Args:
        idx: Label of the query row. Rows are read as ``data.vector[idx]`` and
            candidates are the labels ``0 .. data.shape[0] - 1``, so ``data``
            is expected to carry a default 0..n-1 index.
        data: DataFrame with a ``vector`` column (NumPy arrays) and a
            ``classification`` column.

    Returns:
        The ``classification`` value of the closest row other than ``idx``.
        When several rows are equally close, the earliest one wins.

    Raises:
        ValueError: If there is no row other than ``idx`` to compare against.
    """
    candidates = [i for i in range(data.shape[0]) if i != idx]
    if not candidates:
        raise ValueError("nearestNeighbor needs at least one row other than idx")

    target = data.vector[idx]
    closest = None
    closestDistance = None

    for i in candidates:
        distance = np.linalg.norm(data.vector[i] - target)

        # Remember the best distance instead of recomputing it every iteration.
        # The strict '<' keeps the earliest row when distances tie.
        if closest is None or distance < closestDistance:
            closest = i
            closestDistance = distance

    # Return the classification of "closest"
    return data.classification[closest]

def accuracy(data):
    """Leave-one-out accuracy of the nearest-neighbor classifier on ``data``.

    Every row is classified by ``nearestNeighbor`` and the prediction is
    compared with that row's own ``classification`` value.

    Args:
        data: DataFrame accepted by ``nearestNeighbor`` (``vector`` and
            ``classification`` columns).

    Returns:
        Fraction of rows whose predicted label equals their actual label.
    """
    rowCount = data.shape[0]

    # Count one hit for every row whose nearest neighbor shares its label
    hits = sum(
        1
        for row in range(rowCount)
        if nearestNeighbor(row, data) == data.classification[row]
    )

    # Correct predictions divided by the total number of rows
    return hits / rowCount



def forwardSearch(data):
    """Greedy forward feature selection scored by ``accuracy``.

    Starting from no features, each level tries adding every unused feature
    to the current selection, scores the resulting vectors with ``accuracy``,
    and keeps the best-scoring one. This repeats until every feature has been
    added; the best-scoring selection seen along the way is remembered.
    Progress is printed for every candidate and every level.

    Args:
        data: DataFrame with a ``vector`` column of NumPy arrays (all the same
            length as ``data.vector[0]``) and a ``classification`` column.

    Returns:
        list[int]: Feature indices of the best-scoring selection, in the order
        they were added. A level whose best accuracy equals the running
        maximum counts as an improvement, so ties favor the larger selection.
    """
    featureCount = len(data.vector[0])
    features = []
    maxAccuracy = 0
    bestFeatures = []

    for _ in range(featureCount):
        print('Checking to add', len(features), 'th feature')

        # One score per feature index; features already chosen get -1 so that
        # argmax can never pick them again.
        accuracies = []
        for i in range(featureCount):

            if i not in features:
                print('Checking feature', i)

                candidate = features + [i]

                # Score a copy whose vectors only keep the candidate features
                new = data.copy(deep=True)
                new['vector'] = data.vector.map(lambda x: x[candidate])

                newacc = accuracy(new)
                accuracies.append(newacc)

                print("Accuracy of feature(s): ", candidate, " : ", newacc * 100, "%")
            else:
                accuracies.append((-1))

        # Store a plain int so the printed feature lists stay readable
        # regardless of how numpy formats its scalar types.
        bestIndex = int(np.argmax(accuracies))
        bestAccuracy = accuracies[bestIndex]
        features.append(bestIndex)

        if bestAccuracy < maxAccuracy:
            print("Using feature", bestIndex, " is the best option at this level.")
        else:
            maxAccuracy = bestAccuracy
            # Snapshot the selection; aliasing `features` would let later
            # appends silently change the recorded best set.
            bestFeatures = list(features)
            print("Using feature", bestIndex, " as well improves the accuracy!")

    return bestFeatures

In [3]:
# Rice (Cammeo and Osmancik) dataset. Sources:
#   https://archive.ics.uci.edu/ml/datasets/Rice+%28Cammeo+and+Osmancik%29
#   https://www.muratkoklu.com/datasets/
rice_dataset_path = './Rice_Osmancik_Cammeo_Dataset.xlsx'
df = pd.read_excel(rice_dataset_path)

# Preview the first ten rows
df.head(10)

Unnamed: 0,AREA,PERIMETER,MAJORAXIS,MINORAXIS,ECCENTRICITY,CONVEX_AREA,EXTENT,CLASS
0,15231,525.578979,229.749878,85.093788,0.928882,15617,0.572896,Cammeo
1,14656,494.311005,206.020065,91.730972,0.895405,15072,0.615436,Cammeo
2,14634,501.122009,214.106781,87.768288,0.912118,14954,0.693259,Cammeo
3,13176,458.342987,193.337387,87.448395,0.891861,13368,0.640669,Cammeo
4,14688,507.166992,211.743378,89.312454,0.906691,15262,0.646024,Cammeo
5,13479,477.015991,200.053055,86.650291,0.901328,13786,0.657897,Cammeo
6,15757,509.281006,207.296677,98.336136,0.880323,16150,0.589708,Cammeo
7,16405,526.570007,221.612518,95.436707,0.902521,16837,0.658888,Cammeo
8,14534,483.640991,196.650818,95.050682,0.875429,14932,0.649651,Cammeo
9,13485,471.570007,198.272644,87.727287,0.896789,13734,0.57232,Cammeo


In [4]:
# Standardize every column except the label (and a 'vector' column, if one
# exists) to zero mean and unit standard deviation.
columns_to_scale = [name for name in df.columns if name not in ('CLASS', 'vector')]
for name in columns_to_scale:
    values = df[name]
    df[name] = (values - values.mean()) / values.std()

In [5]:
# Display the frame to check the standardized column values
df

Unnamed: 0,AREA,PERIMETER,MAJORAXIS,MINORAXIS,ECCENTRICITY,CONVEX_AREA,EXTENT,CLASS
0,1.479635,2.004091,2.348238,-0.212915,2.018073,1.499463,-1.152770,Cammeo
1,1.147720,1.125705,0.988261,0.945444,0.409964,1.192761,-0.602000,Cammeo
2,1.135020,1.317041,1.451718,0.253854,1.212797,1.126356,0.405558,Cammeo
3,0.293398,0.115285,0.261405,0.198025,0.239720,0.233826,-0.275315,Cammeo
4,1.166191,1.486858,1.316269,0.523351,0.952096,1.299685,-0.205986,Cammeo
...,...,...,...,...,...,...,...,...
3805,-0.708122,-1.078211,-1.048185,-0.097238,-1.085140,-0.745367,0.246999,Osmancik
3806,-0.601909,-0.922805,-1.207050,0.549550,-1.970472,-0.590047,0.418760,Osmancik
3807,-0.133186,-0.329808,-0.298206,0.085208,-0.275063,-0.173045,-0.455671,Osmancik
3808,-1.608046,-1.740092,-1.580764,-1.414228,-0.598743,-1.606945,-0.037163,Osmancik


In [6]:
# Build one NumPy feature vector per row from every column except the label
# (and a 'vector' column, if one exists). A single to_numpy() call replaces
# the per-cell df[column][i] lookups, which did one scalar lookup per value.
feature_columns = [column for column in df.columns if column != 'CLASS' and column != 'vector']
vectors = list(df[feature_columns].to_numpy())
vectors[3808]

array([-1.60804635, -1.74009162, -1.58076367, -1.41422844, -0.59874275,
       -1.60694529, -0.03716269])

In [7]:
# Pair every feature vector with its class label
data = pd.Series(vectors).to_frame(name='vector')
data['classification'] = df.CLASS

# Keep the first 250 rows of each class and renumber them 0..n-1, because the
# classifier functions look rows up by the labels 0..n-1
cammeo_rows = data[data.classification == 'Cammeo'].iloc[:250]
osmancik_rows = data[data.classification == 'Osmancik'].iloc[:250]
subset = pd.concat([cammeo_rows, osmancik_rows]).reset_index(drop=True)
subset

Unnamed: 0,vector,classification
0,"[1.4796353175814465, 2.004091247145826, 2.3482...",Cammeo
1,"[1.147719640566751, 1.12570533498003, 0.988260...",Cammeo
2,"[1.135020258141841, 1.3170413760998338, 1.4517...",Cammeo
3,"[0.29339755016370844, 0.11528492632972509, 0.2...",Cammeo
4,"[1.1661914695484383, 1.4868581887472714, 1.316...",Cammeo
...,...,...
495,"[-1.676161215008711, -1.777735051629186, -1.63...",Osmancik
496,"[-0.46856539533089697, -0.6285957345426955, -0...",Osmancik
497,"[-0.3508074855726398, -0.43442286911250305, -0...",Osmancik
498,"[-0.5210946589975706, -0.699613217837939, -0.8...",Osmancik


In [8]:
# Leave-one-out nearest-neighbor accuracy on the subset, using all features
accuracy(subset)

0.892

In [9]:
# Greedy forward feature selection on the subset; progress is printed per candidate
forwardSearch(subset)

Checking to add 0 th feature
Checking feature 0
Accuracy of feature(s):  [0]  :  88.4 %
Checking feature 1
Accuracy of feature(s):  [1]  :  90.60000000000001 %
Checking feature 2
Accuracy of feature(s):  [2]  :  89.0 %
Checking feature 3
Accuracy of feature(s):  [3]  :  60.4 %
Checking feature 4
Accuracy of feature(s):  [4]  :  71.8 %
Checking feature 5
Accuracy of feature(s):  [5]  :  86.2 %
Checking feature 6
Accuracy of feature(s):  [6]  :  51.800000000000004 %
Using feature 1  as well improves the accuracy!
Checking to add 1 th feature
Checking feature 0
Accuracy of feature(s):  [1, 0]  :  88.6 %
Checking feature 2
Accuracy of feature(s):  [1, 2]  :  91.0 %
Checking feature 3
Accuracy of feature(s):  [1, 3]  :  90.4 %
Checking feature 4
Accuracy of feature(s):  [1, 4]  :  90.4 %
Checking feature 5
Accuracy of feature(s):  [1, 5]  :  89.2 %
Checking feature 6
Accuracy of feature(s):  [1, 6]  :  87.6 %
Using feature 2  as well improves the accuracy!
Checking to add 2 th feature
Check