In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from math import sqrt

def RSS(y_test, y_pred):
    temp = np.zeros(y_pred.size, dtype = np.float64)
    for i in range(temp.size):
        temp[i] = (y_test[i] - y_pred[i])*(y_test[i] - y_pred[i])
    sum = 0.0
    for i in range(temp.size):
        sum += temp[i]
    return sum

def rse(y_test, y_pred):
    rss = RSS(y_test, y_pred)
    n = y_test.size
    p = len(features)
    return sqrt(rss / (n-p-1.0))

def r_squared(y_test, y_pred):
    rss = RSS(y_test, y_pred)
    mean = np.mean(y_test)

    #TSS
    temp = np.zeros(y_test.size, dtype = np.float64)
    for i in range(temp.size):
        temp[i] = (y_test[i] - mean)*(y_test[i] - mean)

    #TSS sum
    sum = 0.0
    for i in range(temp.size):
        sum += temp[i]

    return 1 - (rss/sum)

def euclidean_distance(x1, x2):
    return sqrt(np.sum((x1 - x2)**2))

def get_neighbors(X_train, y_train, x_test, k):
    euc = [0] * X_train.shape[0]
    indexes = [0] * k
    results = [0] * k

    for i in range(X_train.shape[0]):
        euc[i] = euclidean_distance(x_test, X_train[i])

    for i in range(k):
        indexes[i] = euc.index(min(euc))
        euc[indexes[i]] += 100

    for i in range(k):
        results[i] = y_train[i]
        
    return results

def predict(X_train, y_train, x_test, k):
    ans = get_neighbors(X_train, y_train, x_test, k)
    return sum(ans) / k

def evaluate(X_train, y_train, X_test, y_test, k):
    pred = np.zeros(X_test.shape[0], dtype = np.int64)
    for i in range (X_test.shape[0]):
        pred[i] = predict(X_train, y_train, X_test[i], k)
    return pred

In [2]:
df = pd.read_csv('./data/winequality-white.csv', delimiter = ';')
features = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
            'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
            'pH', 'sulphates', 'alcohol']

X = df[features].values
Y = df['quality'].values
X_train, X_test, y_train, y_test = train_test_split(X, Y, 
                                                    test_size=0.3,
                                                    random_state=43,
                                                    shuffle=True)

In [3]:
pred = evaluate(X_train, y_train, X_test, y_test, 5)

print(f'rse:  {rse(y_test, pred):.4f}')
print(f'rsq: {r_squared(y_test, pred):.4f}')

num_correct_predictions = (pred == y_test).sum()
accuracy = num_correct_predictions / y_test.shape[0]
print(f'acc:  {accuracy*100:.2f}%')

rse:  0.9034
rsq: -0.0127
acc:  45.37%
