In [1]:
import numpy as np
import pandas as pd

In [2]:
from google.colab import files
data = files.upload()

Saving Housing.csv to Housing (1).csv


In [3]:
col_names = ['price', 'area', 'bedrooms', 'bathrooms', 'stories', 'mainroad', 'guestrooms', 'basement', 'hotwaterheating', 'airconditioning', 'parking', 'prefarea', 'furnishingstatus']
dataset = pd.read_csv(r"/content/Housing.csv", skiprows=1, header=None, names=col_names)
dataset.head(10)

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestrooms,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished
5,10850000,7500,3,3,1,yes,no,yes,no,yes,2,yes,semi-furnished
6,10150000,8580,4,3,4,yes,no,no,no,yes,2,yes,semi-furnished
7,10150000,16200,5,3,2,yes,no,no,no,no,0,no,unfurnished
8,9870000,8100,4,1,2,yes,yes,yes,no,yes,2,yes,furnished
9,9800000,5750,3,2,4,yes,yes,no,no,yes,1,yes,unfurnished


In [4]:
dataset.replace({'mainroad': {'yes': 1, 'no': 0},
                 'guestrooms': {'yes': 1, 'no': 0},
                 'basement': {'yes': 1, 'no': 0},
                 'hotwaterheating': {'yes': 1, 'no': 0},
                 'airconditioning': {'yes': 1, 'no': 0},
                 'prefarea': {'yes': 1, 'no': 0},
                 'furnishingstatus' : {'unfurnished': 0, 'semi-furnished': 1, 'furnished': 3}}, inplace=True)


In [5]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(dataset)
dataset = pd.DataFrame(dataset)

In [6]:
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.0,0.396564,0.6,0.333333,0.666667,1.0,0.0,0.0,0.0,1.0,0.666667,1.0,1.0
1,0.909091,0.502405,0.6,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
2,0.909091,0.571134,0.4,0.333333,0.333333,1.0,0.0,1.0,0.0,0.0,0.666667,1.0,0.333333
3,0.906061,0.402062,0.6,0.333333,0.333333,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0
4,0.836364,0.396564,0.6,0.0,0.333333,1.0,1.0,1.0,0.0,1.0,0.666667,0.0,1.0


In [7]:
#Seperating the input features and output labels
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 12].values
#converting text labels to numeric form
labels, unique = pd.factorize(y)

In [8]:
#splitting data in test and train segments
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size = 0.40)

In [9]:
def KNNClassify(X_test, Y_train = y_train,X_train = X_train, k = 10):

    min_dist = []
    #for every example in the training set, calculate eucledien distance against the test example
    for i,point in enumerate(X_train):
        d1 = (point[0]-X_test[0])**2
        d2 = (point[1]-X_test[1])**2
        d3 = (point[2]-X_test[2])**2
        d4 = (point[3]-X_test[3])**2
        dist = np.sqrt(d1+d2+d3+d4)
        #append the calculated distance in a list
        min_dist.append((i,dist))
    #sort distances in ascending order
    min_dist.sort(key = takeSecond)

    #get top k nearest neighbours
    neighbours = min_dist[:k]
    #get index of the minimum distances
    idx = []
    for tup in neighbours:
        idx.append(tup[0])
    #check which label has majority
    output = Y_train[idx]
    values, counts = np.unique(output, return_counts=True)
    #return label with majority occurence
    max_idx = np.argmax(counts)
    return values[max_idx]

#Creating a helper function
def takeSecond(elem):
   return elem[1]

In [10]:
#getting predicted values using our algorithm
predictions = list(map(KNNClassify, X_test))

In [11]:
def accuracy(pred , y_test):
    count = 0
    for i in range(len(pred)):
        if pred[i] == y_test[i]:
            count +=1

    return print("Accuracy =", (count/len(pred))*100, "%")

def precision(pred, y_test):
    true_positives = sum(1 for pred_val, true_val in zip(pred, y_test) if pred_val == 1 and true_val == 1)
    predicted_positives = sum(pred)
    return true_positives / predicted_positives if predicted_positives != 0 else 0

def recall(pred, y_test):
    true_positives = sum(1 for pred_val, true_val in zip(pred, y_test) if pred_val == 1 and true_val == 1)
    actual_positives = sum(1 for true_val in y_test if true_val == 1)
    return true_positives / actual_positives if actual_positives != 0 else 0

def f1_score(pred, y_test):
    prec = precision(pred, y_test)
    rec = recall(pred, y_test)
    return 2 * (prec * rec) / (prec + rec) if (prec + rec) != 0 else 0


In [12]:
#calling the accuracy function
accuracy(predictions, y_test)

prec = precision(predictions, y_test)
rec = recall(predictions, y_test)
f1 = f1_score(predictions, y_test)

print("Precision =", prec)
print("Recall =", rec)
print("F1-score =", f1)

Accuracy = 43.57798165137615 %
Precision = 0.23949579831932774
Recall = 0.6333333333333333
F1-score = 0.3475609756097561
