In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from math import sqrt

In [3]:
#Statistics Functions

#Residual Sum of Squares
def RSS(y_test, y_pred):
    return np.sum((y_test - y_pred) ** 2)

#Residual Standard Error
def rse(y_test, y_pred, numFeatures):
    rss = RSS(y_test, y_pred)
    n = y_test.size
    p = numFeatures
    return sqrt(rss / (n-p-1.0))

#R-Squared
def r_squared(y_test, y_pred):
    rss = RSS(y_test, y_pred)

    #Total Sum of Squares 
    tss = np.sum((y_test - np.mean(y_test)) ** 2)

    return 1 - (rss/tss)

In [4]:
#K-Nearest-Neighbor Regression Model

#Uses Euclidean Distance metric
def euclidean_distance(x1, x2):
    return sqrt( np.sum((x1 - x2) ** 2) )

#Finds k nearest neighbors based on distance
def get_neighbors(X_train, y_train, x_test, k):
    euc = [0] * X_train.shape[0]
    for i in range(X_train.shape[0]):
        euc[i] = euclidean_distance(x_test, X_train[i])

    sorted_indexes = np.argsort(euc)
    nearest_indexes = sorted_indexes[:k]

    return np.array([y_train[i] for i in nearest_indexes])

#Returns predicted value based off neighbors found
def predict(X_train, y_train, x_test, k):
    ans = get_neighbors(X_train, y_train, x_test, k)
    return np.mean(ans)

#builds numpy array of predictions based off input and returns
def evaluate(X_train, y_train, X_test, k):
    pred = np.zeros(X_test.shape[0], dtype = np.float64)
    for i in range (X_test.shape[0]):
        pred[i] = predict(X_train, y_train, X_test[i], k)
    return pred

In [6]:
#reads wine dataset
df = pd.read_csv('./data/winequality-white.csv', delimiter = ';')

#features from the white wine dataset
features = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
            'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
            'pH', 'sulphates', 'alcohol']

#Split training and testing data from the dataset
X = df[features].values
Y = df['quality'].values
X_train, X_test, y_train, y_test = train_test_split(X, Y, 
                                                    test_size=0.3,
                                                    random_state=43,
                                                    shuffle=True)

#fit model and predicts values, k = 5
#aka 5 nearest neighbors used to predict
pred = evaluate(X_train, y_train, X_test, 5)

#accuracy calculations
num_correct_predictions = (pred == y_test).sum()
accuracy = num_correct_predictions / y_test.shape[0]

#printing statistics
print( f'residual standard error: { rse(y_test, pred, len(features)) :.4f}' )
print( f'r squared: { r_squared(y_test, pred) :.4f}' )
print( f'accuracy:  {accuracy * 100 :.2f}%' )

residual standard error: 0.8302
r squared: 0.1449
accuracy:  11.36%
