 Write a wrapper function for pcaCompare that finds the number of PCs giving the best runtime while keeping the loss no greater than maxLoss. The function call should be pcaOptimize(X,Y,rows,k,pcs,maxLoss), where X, Y, rows, k, and pcs are the same as above, and maxLoss is the maximum acceptable average loss expressed as a multiple of the loss obtained without PCA (e.g., maxLoss=1.1 means the loss may be up to 10% higher).

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that files
# stored on the drive can be opened through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
# Location of the data file on the mounted Google Drive.
my_path = '/content/drive/My Drive/YearPredictionMSD.txt'
# header=None: the file's first line is read as data, so the columns get
# integer labels (0, 1, 2, ...) instead of names.
ms = pd.read_csv(my_path, header=None)

In [None]:
def findOverallLoss(ypreds, Y):
    """Return the mean squared difference between ``ypreds`` and ``Y``.

    Both arguments are pandas objects.  Their indexes are reset first, so the
    comparison is positional rather than label-based.  ``ypreds`` is
    transposed and the squeezed values of ``Y`` are subtracted from it; the
    differences are squared and averaged along each row of the transposed
    frame.

    Returns a pandas Series with one mean squared error per column of
    ``ypreds``.
    """
    actual = Y.reset_index(drop=True).values.squeeze()
    predicted = ypreds.reset_index(drop=True).T
    squared_error = (predicted - actual) ** 2
    return squared_error.mean(axis=1)

def random_samples_generator(X, Y, rows):
    """Pick ``rows`` distinct positions at random and return the matching data.

    Positions are drawn without replacement from ``range(len(Y))`` and looked
    up with ``.iloc`` in both ``X`` and ``Y``.

    Returns a tuple ``(sampled_X, sampled_Y)``: ``sampled_X`` is a DataFrame
    built from the selected rows of ``X`` and ``sampled_Y`` is a plain list
    holding the selected entries of ``Y`` in the same order.
    """
    import random

    # random.sample never repeats a position.
    picked = random.sample(range(0, len(Y)), rows)
    sampled_X = pd.DataFrame([X.iloc[pos] for pos in picked])
    sampled_Y = [Y.iloc[pos] for pos in picked]
    return sampled_X, sampled_Y

def pcs_to_compare(X, Y, rows, k, pcs):
    """Compare KNN regression on PCA-reduced data against KNN on all features.

    A random sample of ``rows`` rows is drawn from ``(X, Y)`` and used as the
    evaluation set.  A KNN regressor fitted on the full ``X`` provides the
    baseline loss.  Then, for every number of principal components from 1 to
    ``pcs``, PCA is fitted on ``X``, a KNN regressor is fitted on the
    projected data, and the rounded predictions for the projected sample are
    scored.

    Parameters
    ----------
    X : feature table used to fit PCA and the KNN regressors.
    Y : targets matching ``X``.
    rows : number of rows in the random evaluation sample.
    k : ``n_neighbors`` for every KNN regressor.
    pcs : largest number of principal components to try.

    Returns
    -------
    dict
        ``{n_components: [loss_ratio, predict_seconds]}`` where
        ``loss_ratio`` is the loss with PCA divided by the baseline loss
        without PCA, and ``predict_seconds`` is the time spent in
        ``knn.predict`` only (fitting and projecting are not timed).
    """
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.decomposition import PCA
    import time

    # The data is ordered, so evaluate on randomly chosen rows.
    # NOTE(review): the sample is taken from the same X the models are fitted
    # on, so every evaluation row is also a training row.  The baseline loss
    # may therefore be very small or zero, which makes the ratio below
    # inf/nan -- confirm this is intended.
    newx, newy = random_samples_generator(X, Y, rows)
    actual = pd.DataFrame(newy)

    # Baseline: KNN on all original features.
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X, Y)
    preds = knn.predict(newx).round()
    overallLoss_noPCA = findOverallLoss(pd.DataFrame(preds), actual)

    PC_dict = {}
    for n_components in range(1, pcs + 1):
        pca = PCA(n_components=n_components)
        pca.fit(X)
        X_pca = pca.transform(X)
        newx_pca = pca.transform(newx)

        knn = KNeighborsRegressor(n_neighbors=k)
        knn.fit(X_pca, Y)

        # perf_counter is a monotonic, high-resolution clock; predictions here
        # take around a millisecond, too short for the wall clock to measure
        # reliably.
        start_time = time.perf_counter()
        preds = knn.predict(newx_pca).round()
        total_time = time.perf_counter() - start_time

        overallLoss = findOverallLoss(pd.DataFrame(preds), actual)
        Loss_as_percent = overallLoss[0] / overallLoss_noPCA[0]
        PC_dict[n_components] = [Loss_as_percent, total_time]
    return PC_dict

In [None]:
def pcaOptimize(X, Y, rows, k, pcs, maxLoss, *, comparison=None):
    """Return the number of PCs with the fastest prediction within ``maxLoss``.

    Parameters
    ----------
    X, Y, rows, k, pcs : passed unchanged to ``pcs_to_compare``.
    maxLoss : largest acceptable loss ratio (loss with PCA divided by loss
        without PCA); for example 1.1 accepts up to 10% more loss.
    comparison : optional precomputed mapping
        ``{n_components: [loss_ratio, predict_seconds]}`` in the format
        returned by ``pcs_to_compare``.  When given, it is used as-is and
        ``X``, ``Y``, ``rows``, ``k`` and ``pcs`` are ignored; this avoids
        recomputing the comparison on a different random sample.

    Returns
    -------
    The key of ``comparison`` whose loss ratio is ``<= maxLoss`` and whose
    prediction time is smallest.  If several candidates share the smallest
    time, the smallest number of components wins.

    Raises
    ------
    IndexError
        If no number of components has a loss ratio within ``maxLoss``.
    """
    if comparison is None:
        comparison = pcs_to_compare(X, Y, rows, k, pcs)

    # Keep the PC counts themselves as candidates (not their timings), so two
    # candidates with identical timings cannot overwrite one another.
    acceptable = [n for n in comparison if comparison[n][0] <= maxLoss]
    if not acceptable:
        raise IndexError(
            "no number of principal components has a loss ratio <= maxLoss "
            "(maxLoss=%r)" % (maxLoss,)
        )
    return min(acceptable, key=lambda n: (comparison[n][1], n))

In [None]:
# Column 0 of the loaded data is used as the target; all remaining columns
# are the features.
X = ms.iloc[:,1:]
Y = ms.iloc[:,0]
# Size of the random evaluation sample.
rows = 10
# Number of neighbours for the KNN regressor.
k = 10
# Try 1..pcs principal components.
pcs = 10
# Accept a loss up to 1% above the loss obtained without PCA.
maxLoss = 1.01
compare = pcs_to_compare(X,Y,rows,k,pcs)
print('compare:', compare)
# NOTE: pcaOptimize calls pcs_to_compare itself, which draws a new random
# sample and re-times the predictions, so `optimal` is not derived from the
# `compare` table printed above and need not agree with it.
optimal = pcaOptimize(X,Y,rows,k,pcs,maxLoss)
print('optimal:',optimal)

compare: {1: [1.0632911392405062, 0.0011074542999267578], 2: [1.0928270042194093, 0.0011005401611328125], 3: [1.5, 0.00124359130859375], 4: [1.35126582278481, 0.0013890266418457031], 5: [1.0748945147679325, 0.0019865036010742188], 6: [1.0242616033755274, 0.002978086471557617], 7: [1.089662447257384, 0.004187345504760742], 8: [1.6782700421940928, 0.006232023239135742], 9: [1.1729957805907174, 0.008955001831054688], 10: [1.3892405063291138, 0.011487245559692383]}
optimal: 4
