In [2]:
################################################################
# Exploring Collaborative Filtering based on KNN
################################################################
# 1. Use User data with clicked items and user_portraits
# 2. train KNN algorithm
# 3. for a test observation, find K nearest neighbors
# 4. find the most common items from the neighbors to recommend
# 5. Use cross-validation to calibrate K

In [3]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from DataPrep import getUserFeaturesTrainSet, getPurchasedItemsTrainSet, getUserFeaturesTestSet
# Training inputs for KNNModel, loaded through the project-local DataPrep helpers.
# NOTE(review): KNNModel looks up purchaseData[i] for the i-th row of the fitted
# training data, so PurchasedItems is presumably row-aligned with TrainSet —
# confirm against DataPrep.
TrainSet = getUserFeaturesTrainSet()
PurchasedItems = getPurchasedItemsTrainSet()

class KNNModel:
    """
    Neighbour-based item recommender.

    Fits a NearestNeighbors index on user feature rows and, for each query row,
    recommends the items purchased by its nearest training users, ranked by
    inverse-distance-weighted frequency.
    """
    def __init__(self, TrainData, purchaseData, K_neighbors):
        """
        train KNN model on TrainData
        TrainData: feature rows of the training users (passed to NearestNeighbors.fit)
        purchaseData: list of length len(TrainData), each element is a list of purchased itemID
            (element i must belong to row i of TrainData)
        K_neighbors: KNN parameter
        """
        self.model = NearestNeighbors(n_neighbors = K_neighbors)
        self.model.fit(TrainData)
        self.purchaseData = purchaseData
        self.K_neighbors = K_neighbors
    def predict(self, newPoint, topN = 9):
        """
        newPoint should have the same columns as TrainData, any number of row
        first find the nearest neighbors
        then score their purchased items: every neighbor adds 1/distance to each
        item it purchased (a distance of exactly 0 is treated as 1e-7)
        topN: maximum number of items to return per row (default 9)
        return: list with length = nrow of newPoint
            each element of list is a list of at most topN itemIDs, best score
            first; it is shorter when the neighbors purchased fewer distinct items
        """
        neighborDist, neighborIDs = self.model.kneighbors(newPoint)
        output = []
        for rowDist, rowIDs in zip(neighborDist, neighborIDs):
            # Scores must start empty for every query row; sharing one dict
            # across rows would leak earlier rows' neighbors into later rankings.
            itemScore = {}
            for uID, dist in zip(rowIDs, rowDist):
                # avoid division by zero when the query coincides with a training row
                if dist == 0:
                    dist = 1e-7
                weight = 1/dist
                for itemID in self.purchaseData[uID]:
                    itemScore[itemID] = itemScore.get(itemID, 0) + weight
            # sorted() is stable, so equally scored items keep first-seen order
            rankedItems = sorted(itemScore, key=itemScore.get, reverse=True)
            output.append(rankedItems[:topN])
        return output
    
# main
# fit the recommender on the training users with K = 50 neighbors
model = KNNModel(TrainSet, PurchasedItems, 50)
# get test set
userIDs, TestSet = getUserFeaturesTestSet()
# make prediction
recommendedItems = model.predict(TestSet)
# format data according to submission format and write to file:
# header 'id,itemids', then one 'userID,item item item ...' line per test user
outFile = '/tf/shared/track2_output.csv'
# the with-block closes (and therefore flushes) the file even if a row fails to
# format; a bare open() left the handle open for the lifetime of the kernel
with open(outFile, "w") as f:
    f.write('id,itemids')
    for i in range(len(userIDs)):
        itemString = ' '.join(str(j) for j in recommendedItems[i])
        # newline goes before each row so the file has no trailing newline
        f.write('\n' + str(userIDs[i]) + ',' + itemString)


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
