# K-NEAREST NEIGHBOURS FROM SCRATCH


Imports

In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import random

#### Reading the files and shuffling 

Datasets contains duplicate elements. glass.csv contains 1, Concrete_Data_Yeh.csv contains 25. Duplicate keys removed because they causes wrong decisions and unstability.

In [2]:
df = pd.read_csv("glass.csv")
df = df.drop_duplicates() #removeing duplicate rows
df.drop(['Type'], 1)  # dropping the results

full_data = df.astype(float).values.tolist()  # converting data to list
random.shuffle(full_data)

df2 = pd.read_csv("Concrete_Data_Yeh.csv")
df2 = df2.drop_duplicates() #removeing duplicate rows
df2.drop(['csMPa'], 1)  # dropping the results
df2 = df2.drop_duplicates()

full_data_concrete = df2.astype(float).values.tolist()  # converting data to list
random.shuffle(full_data_concrete)

Finding max and min values in row for normalization calculation

In [3]:
def dataset_min_max(dataset):  # min and max finder
    minmax = list()
    for i in range(len(dataset[0])):
        col_values = [row[i] for row in dataset]
        value_min = min(col_values)
        value_max = max(col_values)
        minmax.append([value_min, value_max])
    return minmax

Applying normalization formula for rangeing values between 0-1

In [4]:
def normalization(dataset, min_max):
    for row in dataset:
        for i in range(len(row) - 1):
            row[i] = (row[i] - min_max[i][0]) / (min_max[i][1] - min_max[i][0])

Splits the data given number of folds. Returns all folds in in a list

In [5]:
def k_fold_split(dataset, k_fold): # splitting data into folds
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / k_fold)
    for _ in range(k_fold):
        fold = list()
        while len(fold) < fold_size:
            index = 0     #random.randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
            index += 1
        dataset_split.append(fold)
    return dataset_split


#### K-Nearest Neigbors Classification

We used Numpy's linear algebra norm for faster calculation. It performs better than standart euclidean distance
formula. In result they are same.
Counter.most_common function finds most common vote and function returns the vote

In [6]:
def k_nearest_neighbors(data, predict, k):
    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features) - np.array(predict)) # using numpy euclidian distance for faster calculation
            distances.append([euclidean_distance, group])

    votes = [i[1] for i in sorted(distances)[:k]]
    vote_result = Counter(votes).most_common(1)[0][0]
    return vote_result

#### Weighted K-Nearest Neigbors Classification

We used 1/distance method for creating weights. To avoid zero division error, if euclidean distance = 0, we assigned weight as 1 / 0.001.
We combined weights if they in same class, then result returned 

In [7]:
def weighted_k_nearest_neighbors(data, predict, k):

    distances = []
    for group in data:
        for features in data[group]:
            euclidean_distance = np.linalg.norm(np.array(features) - np.array(predict))
            #distances.append([1 / euclidean_distance, group])
            if(euclidean_distance == 0):
                distances.append([1/0.001, group])
            else:
                distances.append([1 / euclidean_distance, group])

    b = sorted(distances, key=lambda distances:distances[0], reverse=True)

    uniq = []
    array = b[:k]
    for i in array:
        if(len(uniq) == 0):
            uniq.append(i)
        else:
            for j in uniq:
                if (j[1] == i[1]):
                    j[0] += i[0]

            cond = i[1] in (k[1] for k in uniq)
            if(cond == False):
                uniq.append(i)

    c = sorted(uniq, key=lambda uniq:uniq[0], reverse=True)

    return c[0][1]

#### K-Nearest Neigbors Regression

Calculating the distances then according to distances, sums k closest points value and returns avg. of them

In [8]:
def k_nearest_neighbors_regression(data, predict, k):
    summary = 0
    distances = []
    for group in data:
        euclidean_distance = np.linalg.norm(np.array(group[:-1]) - np.array(predict))
        distances.append([euclidean_distance,group[-1]])

    b = sorted(distances, key=lambda distances:distances[0])
    for j in range(k):
        summary += b[j][1]

    return summary/k

#### Weighted K-Nearest Neigbors Regression

For this one, we used formula explained in class.
Sum of all weight x values / sum of all weights.
For sorting the 2d array, lambda used

In [9]:
def weighted_k_nearest_neighbors_regression(data, predict, k):
    sum1 = 0
    sum2 = 0
    distances = []
    for group in data:
        euclidean_distance = np.linalg.norm(np.array(group[:-1]) - np.array(predict))
        if(euclidean_distance == 0):
            distances.append([0, group[-1]])
        else:
            distances.append([1 / euclidean_distance, group[-1]])
    b = sorted(distances, key=lambda distances:distances[0], reverse=True)

    for i in b[:k]:
        sum1 += i[0]*i[1]
        sum2 += i[0]

    return sum1/sum2

Calculates accuracy for classification problem

Prepares train and test datas then calls knn functions. According the results, accuracy created

In [10]:
def accuracy_k_fold(train, test, k, style):

    train_set = {1: [], 2: [], 3: [], 5: [], 6: [], 7: []}
    test_set = {1: [], 2: [], 3: [], 5: [], 6: [], 7: []}
    train_data = train
    test_data = test

    for i in train_data:
        train_set[i[-1]].append(i[:-1])

    for i in test_data:
        test_set[i[-1]].append(i[:-1])

    correct = 0
    total = 0

    for group in test_set:
        for data in test_set[group]:

            if(style == "knn"):
                vote = k_nearest_neighbors(train_set, data, k)
            if(style == "wknn"):
                vote = weighted_k_nearest_neighbors(train_set, data, k)
            if group == vote:
                correct += 1
            total += 1
    print('Accuracy:', (correct / total)*100)

    return (correct / total)*100

Calculates Mean Absulute Error for regression problems

In [11]:
def accuracy_regression(train, test, k, style):

    train_set = []
    test_set = []
    train_data = train[:]
    test_data = test[:]

    for i in train_data:
        train_set.append(i[:-1])

    for i in test_data:
        test_set.append(i[:-1])

    mae = 0
    total = 0
    for group in test_set:

        if(style == "knnr"):
            prediction = k_nearest_neighbors_regression(train_data, group, k)
        elif(style == "wknnr"):
            prediction = weighted_k_nearest_neighbors_regression(train_data, group, k)

        for i in test_data:
            if group == i[:-1]:
                mae += abs(prediction-i[-1])

        total += 1

    print('Mean Absolute Error:', mae / total)

    return mae / total

Splitting glass and concrete data into 5 folds

In [12]:
dataset_fold_class = k_fold_split(full_data, 5)
dataset_fold = k_fold_split(full_data_concrete, 5)

Function for printing results

In [13]:
def printer(dataset, k , style):
    acc = 0
    for i in range(0, len(dataset), 1):
        train = []
        test = list(dataset[i])

        for j in range(0, len(dataset), 1):

            if(j != i):
              train.extend(dataset[j])
        if(style == "knn"):
            acc += accuracy_k_fold(train,test,k,style)
        elif(style == "wknn"):
            acc += accuracy_k_fold(train,test,k,style)
        elif(style == "knnr"):
            acc += accuracy_regression(train,test,k,style)
        elif(style == "wknnr"):
            acc += accuracy_regression(train,test,k,style)

    if(style == "knn"):
        print("Avg. Accuracy:", acc/5)
    elif(style == "wknn"):
        print("Avg. Accuracy:", acc/5)
    elif(style == "knnr"):
        print("Avg. Mean Absolute Value:", acc/5)
    elif(style == "wknnr"):
        print("Avg. Mean Absolute Value:", acc/5)

## Accuracies and Mean Absolute Errors

"knn" = K Nearest Neighbour Classification

"wknn" = Weighted K Nearest Neighbour Classification

"knnr" = K Nearest Neighbour Regression

"wknnr" = weighted K Nearest Neighbour Regression

Finding accuracies and Mae's *without normalizaiton* with k = (1,3,5,7,9) 

**Calculations for k = 1**

In [14]:
printer(dataset_fold_class, 1, "knn")

Accuracy: 64.28571428571429
Accuracy: 78.57142857142857
Accuracy: 66.66666666666666
Accuracy: 88.09523809523809
Accuracy: 73.80952380952381
Avg. Accuracy: 74.28571428571428


In [15]:
printer(dataset_fold_class, 1, "wknn")

Accuracy: 64.28571428571429
Accuracy: 78.57142857142857
Accuracy: 66.66666666666666
Accuracy: 88.09523809523809
Accuracy: 73.80952380952381
Avg. Accuracy: 74.28571428571428


In [16]:
printer(dataset_fold, 1, "knnr")

Mean Absolute Error: 7.15363184079602
Mean Absolute Error: 7.166666666666664
Mean Absolute Error: 6.590099502487559
Mean Absolute Error: 6.207363184079603
Mean Absolute Error: 6.066218905472639
Avg. Mean Absolute Value: 6.636796019900497


In [17]:
printer(dataset_fold, 1, "wknnr")

Mean Absolute Error: 7.26601990049751
Mean Absolute Error: 7.0584577114427836
Mean Absolute Error: 6.6379104477611905
Mean Absolute Error: 6.400746268656717
Mean Absolute Error: 6.336666666666671
Avg. Mean Absolute Value: 6.739960199004974


**Calculations for k = 3**

In [18]:
printer(dataset_fold_class, 3, "knn")

Accuracy: 47.61904761904761
Accuracy: 66.66666666666666
Accuracy: 66.66666666666666
Accuracy: 85.71428571428571
Accuracy: 71.42857142857143
Avg. Accuracy: 67.61904761904762


In [19]:
printer(dataset_fold_class, 3, "wknn")

Accuracy: 47.61904761904761
Accuracy: 66.66666666666666
Accuracy: 66.66666666666666
Accuracy: 85.71428571428571
Accuracy: 71.42857142857143
Avg. Accuracy: 67.61904761904762


In [20]:
printer(dataset_fold, 3, "knnr")

Mean Absolute Error: 7.2536152570480965
Mean Absolute Error: 6.4629021558872335
Mean Absolute Error: 6.728689883913763
Mean Absolute Error: 6.594311774461033
Mean Absolute Error: 6.51932006633499
Avg. Mean Absolute Value: 6.711767827529023


In [21]:
printer(dataset_fold, 3, "wknnr")

Mean Absolute Error: 6.367784239404023
Mean Absolute Error: 5.610455710835063
Mean Absolute Error: 5.9366202182843555
Mean Absolute Error: 5.93089492166478
Mean Absolute Error: 5.585276399188421
Avg. Mean Absolute Value: 5.8862062978753285


**Calculations for k = 5**

In [22]:
printer(dataset_fold_class, 5, "knn")

Accuracy: 54.761904761904766
Accuracy: 71.42857142857143
Accuracy: 64.28571428571429
Accuracy: 83.33333333333334
Accuracy: 69.04761904761905
Avg. Accuracy: 68.57142857142858


In [23]:
printer(dataset_fold_class, 5, "wknn")

Accuracy: 54.761904761904766
Accuracy: 71.42857142857143
Accuracy: 66.66666666666666
Accuracy: 83.33333333333334
Accuracy: 69.04761904761905
Avg. Accuracy: 69.04761904761905


In [24]:
printer(dataset_fold, 5, "knnr")

Mean Absolute Error: 7.619074626865677
Mean Absolute Error: 6.998557213930351
Mean Absolute Error: 7.097661691542283
Mean Absolute Error: 7.036626865671644
Mean Absolute Error: 6.938159203980096
Avg. Mean Absolute Value: 7.13801592039801


In [25]:
printer(dataset_fold, 5, "wknnr")

Mean Absolute Error: 6.244278707252336
Mean Absolute Error: 5.832250965498885
Mean Absolute Error: 5.9374626499907235
Mean Absolute Error: 6.062919475129407
Mean Absolute Error: 5.529355892942306
Avg. Mean Absolute Value: 5.921253538162732


**Calculations for k = 7**

In [26]:
printer(dataset_fold_class, 7, "knn")

Accuracy: 52.38095238095239
Accuracy: 59.523809523809526
Accuracy: 61.904761904761905
Accuracy: 78.57142857142857
Accuracy: 69.04761904761905
Avg. Accuracy: 64.28571428571429


In [27]:
printer(dataset_fold_class, 7, "wknn")

Accuracy: 50.0
Accuracy: 69.04761904761905
Accuracy: 69.04761904761905
Accuracy: 80.95238095238095
Accuracy: 69.04761904761905
Avg. Accuracy: 67.61904761904762


In [28]:
printer(dataset_fold, 7, "knnr")

Mean Absolute Error: 7.816289978678039
Mean Absolute Error: 7.34589196872779
Mean Absolute Error: 7.319758351101637
Mean Absolute Error: 7.459140014214639
Mean Absolute Error: 7.041975835110168
Avg. Mean Absolute Value: 7.396611229566455


In [29]:
printer(dataset_fold, 7, "wknnr")

Mean Absolute Error: 6.370502173827124
Mean Absolute Error: 6.053027124875923
Mean Absolute Error: 5.8907716741300735
Mean Absolute Error: 6.140923532594241
Mean Absolute Error: 5.551164608682939
Avg. Mean Absolute Value: 6.00127782282206


**Calculations for k = 9**

In [30]:
printer(dataset_fold_class, 9, "knn")

Accuracy: 52.38095238095239
Accuracy: 57.14285714285714
Accuracy: 61.904761904761905
Accuracy: 78.57142857142857
Accuracy: 66.66666666666666
Avg. Accuracy: 63.33333333333333


In [31]:
printer(dataset_fold_class, 9, "wknn")

Accuracy: 52.38095238095239
Accuracy: 66.66666666666666
Accuracy: 66.66666666666666
Accuracy: 80.95238095238095
Accuracy: 66.66666666666666
Avg. Accuracy: 66.66666666666666


In [32]:
printer(dataset_fold, 9, "knnr")

Mean Absolute Error: 7.926346047540074
Mean Absolute Error: 7.508501934770591
Mean Absolute Error: 7.486003316749586
Mean Absolute Error: 7.74017689331122
Mean Absolute Error: 7.0831122166943015
Avg. Mean Absolute Value: 7.548828081813154


In [33]:
printer(dataset_fold, 9, "wknnr")

Mean Absolute Error: 6.389159048653838
Mean Absolute Error: 6.162160652818876
Mean Absolute Error: 6.017385382531999
Mean Absolute Error: 6.237356914946192
Mean Absolute Error: 5.563302821717378
Avg. Mean Absolute Value: 6.073872964133656


### Using normalization

In [34]:
normalization(full_data, dataset_min_max(full_data))
normalization(full_data_concrete, dataset_min_max(full_data_concrete))
dataset_fold_class2 = k_fold_split(full_data, 5)
dataset_fold2 = k_fold_split(full_data_concrete, 5)

Finding accuracies and Mae's *with normalizaiton* with k = (1,3,5,7,9) 

**Calculations for k = 1**

In [35]:
printer(dataset_fold_class2, 1, "knn")

Accuracy: 69.04761904761905
Accuracy: 69.04761904761905
Accuracy: 66.66666666666666
Accuracy: 83.33333333333334
Accuracy: 64.28571428571429
Avg. Accuracy: 70.47619047619047


In [36]:
printer(dataset_fold_class2, 1, "wknn")

Accuracy: 69.04761904761905
Accuracy: 69.04761904761905
Accuracy: 66.66666666666666
Accuracy: 83.33333333333334
Accuracy: 64.28571428571429
Avg. Accuracy: 70.47619047619047


In [37]:
printer(dataset_fold2, 1, "knnr")

Mean Absolute Error: 7.5875621890547285
Mean Absolute Error: 7.544776119402984
Mean Absolute Error: 7.059950248756216
Mean Absolute Error: 6.601542288557214
Mean Absolute Error: 6.517014925373137
Avg. Mean Absolute Value: 7.062169154228856


In [38]:
printer(dataset_fold2, 1, "wknnr")

Mean Absolute Error: 7.69995024875622
Mean Absolute Error: 7.436567164179104
Mean Absolute Error: 7.107761194029849
Mean Absolute Error: 6.79492537313433
Mean Absolute Error: 6.787462686567167
Avg. Mean Absolute Value: 7.165333333333334


**Calculations for k = 3**

In [39]:
printer(dataset_fold_class2, 3, "knn")

Accuracy: 59.523809523809526
Accuracy: 64.28571428571429
Accuracy: 64.28571428571429
Accuracy: 80.95238095238095
Accuracy: 66.66666666666666
Avg. Accuracy: 67.14285714285715


In [40]:
printer(dataset_fold_class2, 3, "wknn")

Accuracy: 61.904761904761905
Accuracy: 66.66666666666666
Accuracy: 64.28571428571429
Accuracy: 80.95238095238095
Accuracy: 66.66666666666666
Avg. Accuracy: 68.0952380952381


In [41]:
printer(dataset_fold2, 3, "knnr")

Mean Absolute Error: 7.14398009950249
Mean Absolute Error: 6.886799336650082
Mean Absolute Error: 7.116749585406299
Mean Absolute Error: 6.522752902155893
Mean Absolute Error: 6.947761194029851
Avg. Mean Absolute Value: 6.9236086235489225


In [42]:
printer(dataset_fold2, 3, "wknnr")

Mean Absolute Error: 6.431391718955963
Mean Absolute Error: 6.155144292055073
Mean Absolute Error: 6.343146645568277
Mean Absolute Error: 6.198692429037064
Mean Absolute Error: 6.0275146952337035
Avg. Mean Absolute Value: 6.231177956170017


**Calculations for k = 5**

In [43]:
printer(dataset_fold_class2, 5, "knn")

Accuracy: 61.904761904761905
Accuracy: 59.523809523809526
Accuracy: 66.66666666666666
Accuracy: 83.33333333333334
Accuracy: 69.04761904761905
Avg. Accuracy: 68.0952380952381


In [44]:
printer(dataset_fold_class2, 5, "wknn")

Accuracy: 61.904761904761905
Accuracy: 61.904761904761905
Accuracy: 66.66666666666666
Accuracy: 83.33333333333334
Accuracy: 69.04761904761905
Avg. Accuracy: 68.57142857142858


In [45]:
printer(dataset_fold2, 5, "knnr")

Mean Absolute Error: 7.365074626865673
Mean Absolute Error: 7.157064676616917
Mean Absolute Error: 7.433940298507463
Mean Absolute Error: 7.089203980099504
Mean Absolute Error: 7.222736318407959
Avg. Mean Absolute Value: 7.253603980099503


In [46]:
printer(dataset_fold2, 5, "wknnr")

Mean Absolute Error: 6.496945776991979
Mean Absolute Error: 6.235458011638986
Mean Absolute Error: 6.51792121473374
Mean Absolute Error: 6.453532976353351
Mean Absolute Error: 6.0310550779286
Avg. Mean Absolute Value: 6.346982611529331


**Calculations for k = 7**

In [47]:
printer(dataset_fold_class2, 7, "knn")

Accuracy: 54.761904761904766
Accuracy: 57.14285714285714
Accuracy: 66.66666666666666
Accuracy: 80.95238095238095
Accuracy: 71.42857142857143
Avg. Accuracy: 66.19047619047619


In [48]:
printer(dataset_fold_class2, 7, "wknn")

Accuracy: 54.761904761904766
Accuracy: 61.904761904761905
Accuracy: 69.04761904761905
Accuracy: 80.95238095238095
Accuracy: 71.42857142857143
Avg. Accuracy: 67.61904761904762


In [49]:
printer(dataset_fold2, 7, "knnr")

Mean Absolute Error: 7.763240938166311
Mean Absolute Error: 7.400952380952379
Mean Absolute Error: 7.516744847192608
Mean Absolute Error: 7.275785358919691
Mean Absolute Error: 7.228898365316275
Avg. Mean Absolute Value: 7.437124378109452


In [50]:
printer(dataset_fold2, 7, "wknnr")

Mean Absolute Error: 6.607155945085234
Mean Absolute Error: 6.349072054672267
Mean Absolute Error: 6.525355768572585
Mean Absolute Error: 6.407200791309725
Mean Absolute Error: 6.058511417770322
Avg. Mean Absolute Value: 6.389459195482027


**Calculations for k = 9**

In [51]:
printer(dataset_fold_class2, 9, "knn")

Accuracy: 52.38095238095239
Accuracy: 50.0
Accuracy: 57.14285714285714
Accuracy: 80.95238095238095
Accuracy: 69.04761904761905
Avg. Accuracy: 61.904761904761905


In [52]:
printer(dataset_fold_class2, 9, "wknn")

Accuracy: 57.14285714285714
Accuracy: 61.904761904761905
Accuracy: 59.523809523809526
Accuracy: 78.57142857142857
Accuracy: 69.04761904761905
Avg. Accuracy: 65.23809523809523


In [53]:
printer(dataset_fold2, 9, "knnr")

Mean Absolute Error: 8.160657822001108
Mean Absolute Error: 7.550160309563298
Mean Absolute Error: 7.571376451077941
Mean Absolute Error: 7.576495301271421
Mean Absolute Error: 7.273432835820896
Avg. Mean Absolute Value: 7.626424543946934


In [54]:
printer(dataset_fold2, 9, "wknnr")

Mean Absolute Error: 6.845825328152691
Mean Absolute Error: 6.421073055942643
Mean Absolute Error: 6.5968591644634795
Mean Absolute Error: 6.550139009907824
Mean Absolute Error: 6.021961675794203
Avg. Mean Absolute Value: 6.487171646852168


# Report

##### Introduction

In this assignment, we are expected to implement k nearest neighbor algorithm to predict glass type and concrete strenght . We experienced different types of KNN(Classification, Regression). We also extented our KNN algorithm as weighted
KNN.


*(N) = With Normalization*

#### Classification



| Style | k=1 | k=3| k=5 | k=7 | k=9 |
|----|----|----|----|----|----|
|K Nearest Neighbour Classification|74.28571428571428|67.61904761904762|68.57142857142858|64.28571428571429|63.33333333333333|
|Weighted K Nearest Neighbour Classification|74.28571428571428|67.61904761904762|69.04761904761905|67.61904761904762|66.66666666666666|
|(N)K Nearest Neighbour Classification|70.47619047619047|67.14285714285715|68.0952380952381|66.19047619047619|61.904761904761905|
|(N)Weighted K Nearest Neighbour Classification|70.47619047619047|68.0952380952381|68.57142857142858|67.61904761904762|65.23809523809523|

Analysing glass.csv 

In [55]:
df['Type'].value_counts()

2    76
1    69
7    29
3    17
5    13
6     9
Name: Type, dtype: int64

Data is pretty unbalanced. We have a few samples of types 3, 5, 6. When we split data to %20 vs %80 possibility of this types very low. This creates harder decisions to make.

In [56]:
df.describe()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
count,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0,213.0
mean,1.518348,13.404085,2.679202,1.449484,72.65507,0.498873,8.954085,0.175869,0.057277,2.788732
std,0.003033,0.816662,1.443691,0.495925,0.773998,0.653185,1.425882,0.498245,0.097589,2.10513
min,1.51115,10.73,0.0,0.29,69.81,0.0,5.43,0.0,0.0,1.0
25%,1.51652,12.9,2.09,1.19,72.28,0.13,8.24,0.0,0.0,1.0
50%,1.51768,13.3,3.48,1.36,72.79,0.56,8.6,0.0,0.0,2.0
75%,1.51915,13.81,3.6,1.63,73.09,0.61,9.15,0.0,0.1,3.0
max,1.53393,17.38,4.49,3.5,75.41,6.21,16.19,3.15,0.51,7.0


Only Si has a different range from other attributes. Normalization doesn't effect the accuracy much. Sometimes performs worse. I think the reason is RI has a range 1.533930(max) - 1.511150(min) = *0.02278* and Fe has a range 0.510000(max) - 0.00000(min) = *0.510000*. We are increasing the ranges between 0-1(Older max - min value = 0.02278 now max - min value = 1).

In K Nearest Neighbour Classification when k increase, the accuracy lowers (Small accuracy increase on k=5). Also Weighted K Nearest Neighbour Classification performs same. But while k > 3 weighted knn gives us a better result standart knn. There is no best k value in knn. Finding the best knn is related to data not the knn algorithm we choose. Experiencing different k values on data then finding the sweet spot.  

Sample hard to classify 1.51316,13.02,0,3.04,70.48,6.21,6.96,0,0,5

Because Al, K and Ca attributes of this sample very different than other Type 5 glass attributes.

### Regression


| Style | k=1 | k=3| k=5 | k=7 | k=9 |
|----|----|----|----|----|----|
|K Nearest Neighbour Regression|6.636796019900497|6.711767827529023|7.13801592039801|7.396611229566455|7.548828081813154|
|Weighted K Nearest Neighbour Regression|6.739960199004974|5.8862062978753285|5.921253538162732|6.00127782282206|6.073872964133656|
|(N)K Nearest Neighbour Regression|7.062169154228856|6.9236086235489225|7.253603980099503|7.437124378109452|7.626424543946934|
|(N)Weighted K Nearest Neighbour Regression|7.165333333333334|6.231177956170017|6.346982611529331|6.389459195482027|6.487171646852168|

In [57]:
df2.describe()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
count,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0,1005.0
mean,278.631343,72.043483,55.536318,182.075323,6.033234,974.376816,772.688259,45.856716,35.250378
std,104.344261,86.170807,64.207969,21.339334,5.919967,77.579667,80.340435,63.734692,16.284815
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,190.7,0.0,0.0,166.6,0.0,932.0,724.3,7.0,23.52
50%,265.0,20.0,0.0,185.7,6.1,968.0,780.0,28.0,33.8
75%,349.0,142.5,118.3,192.9,10.0,1031.0,822.2,56.0,44.87
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


For k=1 K Nearest Neighbour Regression and Weighted K Nearest Neighbour Regression MAE's must be same but they are not. Because dataset has a samples with same feautures and different csMPa's. Dropping duplicate feautures will generates better results. But choosing which one to drop is an another question. 

Dataset has a different ranges but difference is not big. So applying normalization gives a similar results. Also the problem mentioned above creates unstability so we can't see the normalization actually works. 

In K Nearest Neighbour Regression when k increase, error also increase. For Weighted K Nearest Neighbour Regression when k =3 ,5 gives us a better results. Weighted knn for regression is generates better results than knn regression. 