-
Notifications
You must be signed in to change notification settings - Fork 0
/
m_KNNAlgorithm.py
85 lines (64 loc) · 2.98 KB
/
m_KNNAlgorithm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from surprise import AlgoBase
from surprise import PredictionImpossible
from movies import MoviesContent
import numpy as np
import math
import heapq
#Data KNN Algorithm.
class DataKNN(AlgoBase):
    """Content-based KNN rating predictor built on genre similarity.

    ``fit`` builds an item-by-item similarity matrix from the genre vectors
    supplied by ``MoviesContent``; ``estimate`` predicts a rating as the
    similarity-weighted average of the user's ratings on the ``k`` items
    most similar to the target item.
    """

    def __init__(self, k=40):
        """Store the neighbourhood size.

        Args:
            k: Maximum number of the user's rated items used per prediction.
        """
        AlgoBase.__init__(self)
        self.k = k

    def fit(self, trainset):
        """Compute the item similarity matrix based on content attributes.

        Args:
            trainset: Training set handed to ``AlgoBase.fit``; its inner item
                ids index the rows/columns of ``self.similarities``.

        Returns:
            self, so calls can be chained.
        """
        AlgoBase.fit(self, trainset)

        # Load up genre vectors for every movie.
        # NOTE(review): assumes getGenres() returns a mapping from integer
        # movie id to a numeric genre vector -- confirm against MoviesContent.
        movies = MoviesContent(False, False)
        genres = movies.getGenres()

        print("Computing content-based similarity matrix...")

        n_items = self.trainset.n_items

        # Symmetric n_items x n_items matrix; the diagonal and any pair with
        # a missing genre vector stay at 0.
        self.similarities = np.zeros((n_items, n_items))

        # Resolve every inner item id to its raw movie id once, instead of
        # repeating the conversion for each pair inside the nested loop.
        movieIDs = [int(self.trainset.to_raw_iid(innerID))
                    for innerID in range(n_items)]

        for thisRating in range(n_items):
            if thisRating % 100 == 0:
                print(thisRating, " of ", self.trainset.n_items)
            thisMovieID = movieIDs[thisRating]
            # Only the upper triangle is computed; it is mirrored below.
            for otherRating in range(thisRating + 1, n_items):
                otherMovieID = movieIDs[otherRating]
                if len(genres[thisMovieID]) > 0 and len(genres[otherMovieID]) > 0:
                    genreSimilarity = self.computeGenreSimilarity(
                        thisMovieID, otherMovieID, genres)
                    self.similarities[thisRating, otherRating] = genreSimilarity
                    self.similarities[otherRating, thisRating] = genreSimilarity

        print("...done.")

        return self

    def computeGenreSimilarity(self, movie1, movie2, genres):
        """Return the cosine similarity of two movies' genre vectors.

        Args:
            movie1: Key of the first movie in ``genres``.
            movie2: Key of the second movie in ``genres``.
            genres: Indexable collection of numeric genre vectors.

        Returns:
            ``sum(x*y) / sqrt(sum(x*x) * sum(y*y))`` over paired components,
            or 0.0 when either vector has zero magnitude (the ratio is
            undefined there and would otherwise divide by zero).

        Note:
            Components are paired positionally, so the vectors are expected
            to have equal length; a longer vector's extra entries are ignored.
        """
        sumxx, sumxy, sumyy = 0, 0, 0
        for x, y in zip(genres[movie1], genres[movie2]):
            sumxx += x * x
            sumyy += y * y
            sumxy += x * y

        denominator = math.sqrt(sumxx * sumyy)
        if denominator == 0:
            # All-zero genre vector: treat the pair as having no similarity.
            return 0.0

        return sumxy / denominator

    def estimate(self, u, i):
        """Predict the rating of inner user ``u`` for inner item ``i``.

        Args:
            u: Inner user id.
            i: Inner item id.

        Returns:
            Similarity-weighted average of the user's ratings over the top-k
            most similar items with a strictly positive similarity.

        Raises:
            PredictionImpossible: If the user or item is not in the trainset,
                or none of the top-k neighbours has positive similarity.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unkown.')

        # Build up similarity scores between this item and everything the
        # user rated, as (similarity, rating) pairs.
        neighbors = [(self.similarities[i, ratedItem], userRating)
                     for ratedItem, userRating in self.trainset.ur[u]]

        # Extract the top-K most-similar ratings.
        k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])

        # Average the neighbours' ratings, weighted by similarity; neighbours
        # with non-positive similarity contribute nothing.
        simTotal = weightedSum = 0
        for (simScore, rating) in k_neighbors:
            if simScore > 0:
                simTotal += simScore
                weightedSum += simScore * rating

        if simTotal == 0:
            raise PredictionImpossible('No neighbors')

        predictedRating = weightedSum / simTotal

        return predictedRating