In [20]:
import numpy as np
import pickle
from tqdm import tqdm_notebook
import time
import random
import sys

In [85]:
def corrected_mean(matrix):
    
    if not np.isclose(np.count_nonzero(matrix), 0):
        return np.sum(matrix)/np.count_nonzero(matrix)
    else:
        return 2.5

In [106]:
def normalize_ratings(ratingsMatrix):
    
    normalizedRatingsMatrix = ratingsMatrix.copy()
    
    for i, userRatings in enumerate(ratingsMatrix):
        
        if np.count_nonzero(userRatings) == 0:
            residue = corrected_mean(ratingsMatrix)
        else:
            residue = np.sum(userRatings)/np.count_nonzero(userRatings)
        
        
        for j,rating in enumerate(userRatings):
            if rating != 0:
                normalizedRatingsMatrix[i][j] -= residue
            if normalizedRatingsMatrix[i][j] == 0:
                normalizedRatingsMatrix[i][j] = 1e-8
        
    return normalizedRatingsMatrix

In [87]:
def pearson_coeff(x, y):
    
    return x

In [150]:
def cosine_dist(x, y):
    
    if np.isclose(np.dot(x,y), 0):
        return 0.0
    
    return np.dot(x, y)/(np.linalg.norm(x)*np.linalg.norm(y))

In [154]:
def get_similar_users(ratingsMatrix, userIdx, k):
    
    similarUsers = []
    thisUser = ratingsMatrix[userIdx].copy()
    
    for i, userRatings in (enumerate(ratingsMatrix)):
        
        if i == userIdx:
            continue
        
        # similarity = pearson_coeff(userRatings, thisUser)
        similarity = cosine_dist(userRatings, thisUser)
        
        similarUsers.append((i, similarity))
        
    similarUsers = sorted(similarUsers, key = lambda x: x[1], reverse = True)
    
    return similarUsers[:k]

In [165]:
def predict(ratingsMatrix, normalizedRatingsMatrix, userIdx, itemIdx, baseline = False):
    
    k = 10
    similarUsers = get_similar_users(normalizedRatingsMatrix, userIdx, k)
    
    predictedRating = 0.0
    normalizationFactor = 0.0
    mu = corrected_mean(ratingsMatrix)
    cnt = 0
    b = 0.0
    
    for (user, similarity) in similarUsers:
        
        if ratingsMatrix[user][itemIdx] > 0 and similarity > 0: 
            
            # check with and without similarity > 0
            
            # user's rating should be considered only if they have rated this item
            # otherwise the missing value (if considered as zero) may cause problems
            # it can be imputed by taking the user's mean rating or item's mean rating, etc.
                
            if baseline == True:
                b = corrected_mean(ratingsMatrix[user,:])
            
            predictedRating += (ratingsMatrix[user][itemIdx] - b) * similarity
            normalizationFactor += similarity

    if not np.isclose(normalizationFactor, 0):
        predictedRating /= normalizationFactor
    else:
        return max(1, min(corrected_mean(ratingsMatrix[userIdx,:]) + corrected_mean(ratingsMatrix[:,itemIdx]) - mu, 5.0))
    
    if baseline == True:
        b = corrected_mean(ratingsMatrix[userIdx,:])
        predictedRating += b
    
    return max(1, min(predictedRating, 5.0))

In [112]:
def train_test_split(ratingsMatrix, cnt = 1000):
    
    (rows, cols) = ratingsMatrix.shape
    
    completeSet = []
    for i in (range(rows)):
        for j in range(cols):
            if ratingsMatrix[i][j] != 0:
                completeSet.append((i,j))
    random.shuffle(completeSet)
    
    testSet = random.sample(completeSet, cnt)
    trainSet = list(set(completeSet) - set(testSet))
    
    return trainSet, testSet

In [168]:
def evaluate(ratingsMatrix, normalizedRatingsMatrix, testSet, originalMatrix):
    
    N = len(testSet)
    rmse = 0
    mae = 0
    rmseB = 0
    maeB = 0
    
    for i in tqdm_notebook(range(N)):
        
        predicted = predict(ratingsMatrix, normalizedRatingsMatrix, testSet[i][0], testSet[i][1])
        baselinePredicted = predict(ratingsMatrix, normalizedRatingsMatrix, testSet[i][0], testSet[i][1], True)
        actualValue = originalMatrix[testSet[i][0]][testSet[i][1]]
        
        print(i)
        print('Predicted: ', predicted)
        print('Predicted (with baseline): ',baselinePredicted)
        print('Actual: ', actualValue)
        print()
        
        bError = abs(actualValue - baselinePredicted)
        error = abs(actualValue - predicted)
        
        rmse += error**2
        rmseB += bError**2
        mae += error
        maeB += bError
        
    rmse /= N
    rmseB /= N
    mae /= N
    maeB /= N
    
    return mae, maeB, rmse, rmseB

In [175]:
# ratingsMatrix = np.array([[4,0,0,5,1,0,0],[5,5,4,0,0,0,0],[0,0,0,2,4,5,0],[0,3,0,0,0,0,3]], dtype = float)
# ratingsMatrix = [[4, 0, 0, 5],[5, 5, 4, 0],[0, 0, 0, 2],[0, 3, 0, 0],[1, 2, 0, 1],[0, 4, 3, 1]]
# textMatrix = np.array([[4,0,0,5,1,0,0],[5,5,4,0,0,0,0],[0,0,0,2,4,5,0],[0,3,0,0,0,0,3]], dtype = float)
# normalized = normalize_ratings(textMatrix)

print(predict(textMatrix, normalized, 0, 0, True))

random.seed(1)

with open('ratingsMatrix_noZeros.pickle', 'rb') as file:
    ratingsMatrix = pickle.load(file)
ratingsMatrix = np.array(ratingsMatrix, dtype = float)

originalMatrix = ratingsMatrix.copy()
trainSet, testSet = train_test_split(originalMatrix, 1000) #10s

# < 1s
for index in testSet:
    ratingsMatrix[index[0]][index[1]] = 0.0

normalizedRatingsMatrix = normalize_ratings(ratingsMatrix)   # 18s

In [176]:
print(evaluate(ratingsMatrix, normalizedRatingsMatrix, testSet, originalMatrix))

HBox(children=(IntProgress(value=0), HTML(value='')))

0
Predicted:  4.111340998967339
Predicted (with baseline):  4.111340998967339
Actual:  4.0

1
Predicted:  3.686897730265939
Predicted (with baseline):  3.7844999532880115
Actual:  5.0

2
Predicted:  3.0
Predicted (with baseline):  3.1845694799658997
Actual:  3.0

3
Predicted:  5.0
Predicted (with baseline):  4.7766794063974105
Actual:  5.0

4
Predicted:  2.3708572471560494
Predicted (with baseline):  2.106190549033097
Actual:  4.0

5
Predicted:  2.0660584377255256
Predicted (with baseline):  2.0660584377255256
Actual:  2.0

6
Predicted:  3.002072395997649
Predicted (with baseline):  2.983394662376536
Actual:  2.0

7
Predicted:  4.599320234541106
Predicted (with baseline):  4.179356233198271
Actual:  2.0

8
Predicted:  3.552650653805472
Predicted (with baseline):  3.862791661522972
Actual:  4.0

9
Predicted:  2.208540879028093
Predicted (with baseline):  2.5288198552823093
Actual:  2.0

10
Predicted:  3.6119784760539506
Predicted (with baseline):  4.108333950645369
Actual:  2.0

11
Pred

91
Predicted:  3.6297481765767374
Predicted (with baseline):  4.564432285020802
Actual:  5.0

92
Predicted:  4.0
Predicted (with baseline):  4.263669530594505
Actual:  3.0

93
Predicted:  2.7327035216780775
Predicted (with baseline):  2.8349240412986654
Actual:  4.0

94
Predicted:  2.0006672434639046
Predicted (with baseline):  3.05084582764571
Actual:  4.0

95
Predicted:  3.6422063546877426
Predicted (with baseline):  3.6422063546877426
Actual:  4.0

96
Predicted:  3.3419397684183783
Predicted (with baseline):  3.547884001225614
Actual:  4.0

97
Predicted:  2.238280971626766
Predicted (with baseline):  2.5943240236649054
Actual:  2.0

98
Predicted:  2.9999999999999996
Predicted (with baseline):  3.003383462082334
Actual:  3.0

99
Predicted:  3.7380480440184134
Predicted (with baseline):  4.498217786460082
Actual:  5.0

(0.7332229591390438, 0.653945381833294, 0.8997287069437572, 0.7658410654834132)
