# In [29]:
import numpy as np
import matplotlib.pyplot as plt

from data_utils import load_dataset
# Load the six arrays returned by load_dataset for the chosen dataset
# (train / validation / test inputs and targets, going by the names).
x_train, x_valid, x_test, y_train, y_valid, y_test = load_dataset('mauna_loa')     # dataset name; change this to load a different dataset

# Stack the validation rows on top of the training rows so both are used as a
# single training set (presumably because knn_5fold makes its own folds -- confirm).
x_train = np.vstack([x_valid, x_train])
y_train = np.vstack([y_valid, y_train])
# Seed NumPy's global random generator so later random draws are repeatable.
np.random.seed(0)

def dist(xtest, xtrain, l):        #Take in test point and compare to training set of points.
    """Return the distance from one query point to every training point.

    Parameters
    ----------
    xtest : array_like
        A single query point; it must broadcast against the rows of ``xtrain``.
    xtrain : array_like, 2-D
        Training points, one per row (distances are reduced over axis 1).
    l : int
        Metric selector:
        ``1`` -> Euclidean distance (2-norm),
        ``2`` -> Minkowski distance of order 3 (cube root of summed cubed
        absolute differences).
        NOTE(review): the selector values do not correspond to the "L1"/"L2"
        norms of the same number -- confirm this mapping is intended.

    Returns
    -------
    numpy.ndarray
        One distance per row of ``xtrain``.

    Raises
    ------
    ValueError
        If ``l`` is neither 1 nor 2.
    """
    # Fail loudly on an unknown selector instead of handing a sentinel string
    # back to numeric code that would only crash later and less clearly.
    if l not in (1, 2):
        raise ValueError("unsupported distance selector l={!r}; expected 1 or 2".format(l))

    diff = np.abs(np.asarray(xtrain) - np.asarray(xtest))
    if l == 1:
        return np.sqrt(np.sum(np.square(diff), axis=1))
    return np.sum(diff ** 3, axis=1) ** (1 / 3)
    
def knn_reg(xtest,xtrain,ytrain,k,l):      #perform KNN with x_test to produce y_test predictions
    """Predict a target for each query point as the mean of its k nearest neighbours.

    Parameters
    ----------
    xtest : sequence of points
        Query points; each element is passed to ``dist`` as a single point.
    xtrain : array_like
        Training points, forwarded unchanged to ``dist``.
    ytrain : array_like
        Training targets, indexed along axis 0 in step with ``xtrain``.
    k : int
        Number of neighbours to average; must satisfy 1 <= k <= len(xtrain).
    l : int
        Distance selector forwarded to ``dist``.

    Returns
    -------
    list
        One prediction per query point, each the axis-0 mean of the selected
        ``ytrain`` entries.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1 or larger than the number of training points.
    """
    n_train = len(xtrain)
    if not 1 <= k <= n_train:
        raise ValueError(
            "k must be between 1 and the number of training points ({}), got {}".format(n_train, k)
        )

    targets = np.asarray(ytrain)
    predictions = []
    for query in xtest:
        distances = dist(query, xtrain, l)
        # kth=k-1 places the k smallest distances in the first k slots and,
        # unlike kth=k, stays in bounds when k equals the training-set size.
        nearest = np.argpartition(distances, kth=k - 1)[:k]
        predictions.append(targets[nearest].mean(axis=0))
    return predictions

def knn_5fold(xtrain,ytrain,metrics=(2,)):      #perform 5 fold validation on training set to estimate k value and distance metric
    """Choose k (and distance selector) for k-NN regression by 5-fold cross-validation.

    The row indices are shuffled with a fixed seed and cut into five folds of
    ``len(xtrain) // 5`` rows each; leftover rows are never held out and always
    stay in the training part. For every selector in ``metrics`` and every
    candidate k in ``1 .. int(sqrt(fold size)) - 1``, the model is evaluated on
    each held-out fold using the remaining rows as training data. The RMSE
    (taken over all held-out values) is averaged across the five folds, and the
    combination with the lowest mean RMSE is printed.

    Parameters
    ----------
    xtrain : array_like
        Training inputs, one sample per row.
    ytrain : array_like
        Training targets, indexed along axis 0 in step with ``xtrain``.
    metrics : iterable of int, optional
        Distance selectors (as understood by ``knn_reg``/``dist``) to try.
        Defaults to ``(2,)``, the only selector searched previously.

    Returns
    -------
    int
        Always 0; the selected k, its mean RMSE and the selector are printed.

    Raises
    ------
    ValueError
        If the folds are too small to yield any candidate k.

    Notes
    -----
    Reseeds NumPy's global random generator with 0 so the fold assignment is
    reproducible.
    """
    n_folds = 5
    xtrain = np.asarray(xtrain)
    ytrain = np.asarray(ytrain)

    np.random.seed(0)
    # Shuffle the indices of the data actually passed in, not a module global.
    shuffled = np.random.permutation(np.arange(len(xtrain)))
    fold_size = len(shuffled) // n_folds

    k_values = list(range(1, int(np.sqrt(fold_size))))
    if not k_values:
        raise ValueError(
            "not enough samples ({}) to cross-validate any k with {} folds".format(len(xtrain), n_folds)
        )

    ko = []
    ro = []
    lo = []
    for l in metrics:
        fold_rmse = np.empty((n_folds, len(k_values)))
        for fold in range(n_folds):
            held_out = np.arange(fold_size * fold, fold_size * (fold + 1))
            val_idx = shuffled[held_out]
            train_idx = np.delete(shuffled, held_out)
            xtr, ytr = xtrain[train_idx], ytrain[train_idx]
            # Validation data are the held-out rows, with targets from ytrain.
            xts, yts = xtrain[val_idx], ytrain[val_idx]
            truth = yts.reshape(len(yts), -1)
            for col, k in enumerate(k_values):
                pred = np.asarray(knn_reg(xts, xtr, ytr, k, l)).reshape(len(yts), -1)
                # Root of the mean squared error (a true square root).
                fold_rmse[fold, col] = np.sqrt(np.mean((pred - truth) ** 2))
        # Score each k by its average over all folds rather than by a single
        # lucky fold.
        mean_rmse = fold_rmse.mean(axis=0)
        for col, k in enumerate(k_values):
            ko.append(k)
            ro.append(mean_rmse[col])
            lo.append(l)

    ind = int(np.argmin(ro))
    print("k=", ko[ind])
    print("RMSE=", ro[ind])
    print("lo=", lo[ind])
    return 0
    
def knn_err(xtest,ytest,xtrain,ytrain,k,l):     #perform KNN and compare predicted ytest values to actual y_test values
    """Return the RMSE between k-NN predictions for ``xtest`` and ``ytest``.

    Parameters
    ----------
    xtest : sequence of points
        Query points passed to ``knn_reg``.
    ytest : array_like
        True targets for ``xtest``; only the first ``len(xtest)`` entries are used.
    xtrain, ytrain : array_like
        Training inputs and targets passed to ``knn_reg``.
    k : int
        Number of neighbours.
    l : int
        Distance selector forwarded to ``knn_reg``.

    Returns
    -------
    numpy.ndarray or numpy.floating
        Root-mean-square error, reduced over the samples (axis 0), so
        multi-column targets give one value per column.

    Raises
    ------
    ValueError
        If ``xtest`` is empty.
    """
    predictions = np.asarray(knn_reg(xtest, xtrain, ytrain, k, l))
    if len(predictions) == 0:
        raise ValueError("xtest must contain at least one point")

    # Align the targets with the predictions so the subtraction is element-wise.
    targets = np.asarray(ytest)[:len(predictions)].reshape(predictions.shape)
    # Square root of the mean squared error (power 1/2, not a factor of 1/2),
    # so the result is in the same units as the targets.
    return np.sqrt(np.mean((predictions - targets) ** 2, axis=0))                                #return RMSE between predicted and actual y values