## Statistical Learning Theory Lab: Recommender System

René Gesele 
168039
rene.gesele@uni-jena.de
02.08.2019

In [12]:
import numpy as np

# read train set and qualifying set
Xq = np.genfromtxt("data/data_qualifying_blanc.csv", delimiter=",", dtype=np.int)
Xt = np.genfromtxt("data/data_train.csv", delimiter=",", dtype=np.int)
print(Xq)

# predict ratings
mean = np.mean(Xt[:, 2])
Xq_mean = np.append(Xq, np.full((Xq.shape[0], 1), mean), axis=1)

# save in right format
np.savetxt("qualifying_mean.csv", Xq_mean,
           delimiter=",", newline="\n", encoding="utf-8")

[[3728  815]
 [1714  398]
 [1781  757]
 ...
 [  13 1188]
 [1899  432]
 [ 314  847]]


In [13]:
sparsematrix = np.zeros((max(Xt[:, 0]+1), max(Xt[:, 1])+1))
spcol = len(sparsematrix[0,:])
sprow = len(sparsematrix[:, 0])
print(spcol)
print(sprow)

2080
5499


In [14]:
for i in range(len(Xt[:, ])):
    user = Xt[i, 0]
    item = Xt[i, 1]
    rating = Xt[i, 2]
    sparsematrix[user, item] = rating+1

In [98]:
# euclidic norm of a vector
def euklidnorm(vec):
    tmp = 0
    for i in vec:
        tmp+=i**2
    return np.sqrt(tmp)

# Cosine-similarity
def cosim(user1, user2):
    return np.dot(user1, user2)/(euklidnorm(user1)*euklidnorm(user2))

# returns mean of given raitings by a user
def mean_of_given_ratings(user):
    sum_of_none_zero = 0
    count = 0
    for i in user:
        if i != 0:
            count+=1
            sum_of_none_zero += i
    return sum_of_none_zero/count
            
            
            
# Pearson-similarity
def pearson(user1, user2):
    user1_mean = mean_of_given_ratings(user1)
    user2_mean = mean_of_given_ratings(user2)
    set_of_items = []
    for i in range(len(user1)):
        if (user1[i]!=0) & (user2[i] != 0):
            set_of_items.append(i)
    
    sum1 = 0
    sum2 = 0
    sum3 = 0
    for i in set_of_items:
        sum1 += (user1[i] - user1_mean)*(user2[i] - user2_mean)
        sum2 += (user1[i] - user1_mean)**2
        sum3 += (user2[i] - user2_mean)**2
    return sum1/(np.sqrt(sum2)*np.sqrt(sum3))

# deletes rows and columns that are 0 only
# returns three matrices, one desparsed matrix and two matrices that safe indices of kept entries
def desparse(user_item_matrix):
    copy = user_item_matrix
    
    row_index = np.zeros((user_item_matrix.shape[0], user_item_matrix.shape[1]))
    column_index = row_index
    for i in range(row_index.shape[0]):
        row_index[i,:] = i
    for i in range(column_index.shape[1]):
        column_index[:,i] = i
    
    i = 0 
    j = 0
    br0 = copy.shape[0]
    br1 = copy.shape[1]
    
    while i < br0 : 
        if all(copy[i]==0):
            copy = np.delete(copy, i, 0)
            row_index = np.delete(row_index, i, 0)
            column_index = np.delete(column_index, i, 0)
            br0-=1
        else:
            
            i+=1
    print("halftime")
    while j < br1 :
        if all(copy[:,j]==0):
            copy = np.delete(copy, j, 1)
            row_index = np.delete(row_index, j, 1)
            column_index = np.delete(column_index, j, 1)
            br1-=1
            
        else:
            
            j+=1
    return copy, row_index, column_index


def user_distance_matrix(user_item_matrix, sim):
    user_dist = np.zeros((user_item_matrix.shape[0], user_item_matrix.shape[0]))
    for i in range(len(user_item_matrix)):
        for j in range(i, len(user_item_matrix)):
            
            if i == j:
                user_dist[i][j] = 0
            else:
                user_dist[i][j] = sim(user_item_matrix[i], user_item_matrix[j])
    return user_dist



def fill_zeros_with_mean(user_item_matrix):
    copy = user_item_matrix
    for i in range(copy.shape[0]):
        for j in range(copy.shape[1]):
            if copy[i,j] == 0:
                copy[i,j] = mean
            
    return copy

#macht aus der halben user_distance_matrix eine symmetrische um zeilenweise die Distanzen von usern extrahieren zu können
def make_symmetric_matrix(half_matrix):
    matrix = np.transpose(half_matrix)
    for i in range(matrix.shape[0]):
        for j in range(i, matrix.shape[0]):
            matrix[i][j] = matrix[j][i]
    return matrix;
                
#knn returns list of indices of the user_item_matrix
def knn(user_index,  user_distance_matrix, k):
    nn = []
    for i in range(user_distance_matrix.shape[0]):
        if user_index != i:
            tmp = user_distance_matrix[user_index][i]
            
            if len(nn)<k:
                nn.append((i, tmp))
                
            else:
                if tmp > nn[-1][1]:
                    nn[-1] = (i, tmp)
            nn = sorted(nn, reverse = True,  key = lambda el: el[1])
    return [x[0] for x in nn]



#returns indices of knn 
def matrix_of_knn(user_distance_matrix):
    matrix = []
    for i in range(user_distance_matrix.shape[0]):
        tmp = knn(i, user_distance_matrix, user_distance_matrix.shape[0])
        matrix.append(tmp)
    return matrix

#returns a user to each indices in list_of_indices 
def index_to_user(list_of_indices, user_item_matrix):
    copy = []
    for index in list_of_indices:
        copy.append(user_item_matrix[index])
    return copy

#calculate expected ratings for one user 

def predict_rating(user_index, list_of_indices, distance_matrix, user_item_matrix, k):
    copy = user_item_matrix
    user = user_item_matrix[user_index]
    
    r_hat = mean_of_given_ratings(user)
    for i in range(len(user)):
        if user_item_matrix[user_index][i]==0:
            sum_of_rating = 0
            sum_of_sim = 0
            tlist = []
            for j in list_of_indices:
                if (user_item_matrix[user_index][j]>0) & (len(tlist)<=k):
                    tlist.append(i)
            list_of_users = np.array(index_to_user(tlist, user_item_matrix))                                              
            for j in range(len(list_of_users)):
                simuv = distance_matrix[user_index][j]
                
                sum_of_rating += simuv*(list_of_users[j][i] - np.mean(list_of_users[j]))
                sum_of_sim += np.abs(simuv)
            user[i] = r_hat + sum_of_rating/sum_of_sim
    return copy



#calculates all rating for each user and item combination that is not given
def prediction_matrix(user_item_matrix, distance_matrix, matrix_of_knn_indices, k):
    copy = user_item_matrix
    for i in range(user_item_matrix.shape[0]):
        copy = predict_rating(i, matrix_of_knn_indices[i], distance_matrix, user_item_matrix, k)
    return copy


        
# inputs 0-rows and columns to get original shape      
def rebuild_user_item_matrix(user_item_matrix, sparsematrix, row_index, column_index):
    copy = sparsematrix
    for i in range(user_item_matrix.shape[0]):
        for j in range(user_item_matrix.shape[1]):
            copy[row_index[i][j]][column_index[i][j]] = user_item_matrix[i][j]
    return copy

# gets the prediction values for all user item combinations in test_data 
def get_predictions_for_test_data(user_item_matrix, test_data):
    tmp = []
    for i in range(len(test_data)):
        tmp.append(user_item_matrix[test_data[i][0]-1][test_data[i][1]-1])
           
    tmp = np.asarray(tmp)
    print(tmp.shape)
    
    test_data = np.append(test_data, np.reshape(tmp, (test_data.shape[0], 1)), axis = 1)
    for i in range(len(test_data)):
        if test_data[i][2]>= 0:
            test_data[i][2] = test_data[i][2]-1
        else: 
            test_data[i][2] = 0
    print(test_data.shape)
    return(test_data)


Die Methoden hier drüber hatten zunächst nicht nach den k nächsten Nachbar mit eingetragenem Rating gesucht, sonder immer die gleich top-k verwendet. 
Hier drunter ist der notdürftige versuch das zu fixen. (Rechenzeit geht durch die Decke)

In [93]:
# calculate knn under the condition that for an item the rating of the neighbours is not zero
def knn_none_zero(user_index,  user_distance_matrix, k, user_item_matrix, item):
    nn = []
    for i in range(user_distance_matrix.shape[0]):
        if (user_index != i) & (user_item_matrix[user_index][item]!=0):
            tmp = user_distance_matrix[user_index][i]
            
            if len(nn)<k:
                nn.append((i, tmp))
                
            else:
                if tmp > nn[-1][1]:
                    nn[-1] = (i, tmp)
            nn = sorted(nn, reverse = True,  key = lambda el: el[1])
    return [x[0] for x in nn]

#calculate an expected rating for one user with the condition that each neighbour rated the item
def predict_rating_zero(user_index, distance_matrix, user_item_matrix, k):
    copy = user_item_matrix
    user = user_item_matrix[user_index]
    r_hat = np.mean(user)
    for i in range(len(user)):
        list_of_users = index_to_user(knn_none_zero(user_index, distance_matrix, k,user_item_matrix, i), user_item_matrix)
        if user_item_matrix[user_index][i]==0:
            sum_of_rating = 0
            sum_of_sim = 0
            for j in range(len(list_of_users)):
                simuv = distance_matrix[user_index][j]
                
                sum_of_rating += simuv*(list_of_users[j][i] - np.mean(list_of_users[j]))
                sum_of_sim += np.abs(simuv)
            user[i] = r_hat + sum_of_rating/sum_of_sim
    return copy

# has the condition that neighbours rated the item
def prediction_matrix_zero(user_item_matrix, distance_matrix, k):
    copy = user_item_matrix
    for i in range(user_item_matrix.shape[0]):
        copy = predict_rating_zero(i, distance_matrix, user_item_matrix, k)
    return copy

In [94]:
user_item_matrix, deleted_rows, deleted_columns = desparse(sparsematrix)


halftime


In [95]:
#user_distances = user_distance_matrix(user_item_matrix, pearson)

In [100]:
user_mknn = matrix_of_knn(user_distances)
#np.savetxt("matrix_of_knn.csv", user_mknn, delimiter=",")

In [22]:
user_distances = make_symmetric_matrix(user_distances)
np.savetxt("user_item_distance.csv", user_distances, delimiter=",")

In [None]:
#user_distances = np.genfromtxt("user_item_distance.csv", delimiter=",", dtype=np.float)

In [99]:
Xq = np.genfromtxt("data/data_qualifying_blanc.csv", delimiter=",", dtype=np.int)
user_based = prediction_matrix(user_item_matrix, user_distances, user_mknn, 4)
rebuild_uim = rebuild_user_item_matrix(user_based, sparsematrix, deleted_rows.astype(int), deleted_columns.astype(int))
filled_rebuild = fill_zeros_with_mean(rebuild_uim)
Xq = get_predictions_for_test_data(filled_rebuild, Xq)
np.savetxt("qualifying_mean.csv", Xq, delimiter=",", newline="\n", encoding="utf-8")

IndexError: list index out of range

In [88]:
#Xq2 = np.genfromtxt("data/data_qualifying_blanc.csv", delimiter=",", dtype=np.int)
#user_based_prediction = prediction_matrix_zero(user_item_matrix, user_distances,5)
#rebuild_next_level = rebuild_user_item_matrix(user_based_prediction, sparsematrix, deleted_rows.astype(int), deleted_columns.astype(int))
#filled_rebuild2 = fill_zeros_with_mean(rebuild_next_level)
#Xq2 = get_predictions_for_test_data(filled_rebuild2, Xq2)
#np.savetxt("qualifying_mean_2.csv", Xq2, delimiter=",", newline="\n", encoding="utf-8")


In [89]:
#for i in range(len(Xq)):
#    print(Xq[i][2])

for i in range(len(Xq)):
    if (Xq[i][2] == 5.0):
        print("5.0: ", i)
    if (Xq[i][2] == None):
        print("nan: ", i )

5.0:  651
5.0:  1407
5.0:  1481
5.0:  2092
5.0:  2811
5.0:  3173
5.0:  3430
5.0:  3762
5.0:  5025
5.0:  5150
5.0:  5482
5.0:  6105
5.0:  7072
5.0:  7563
5.0:  7885
5.0:  8824
5.0:  9021
5.0:  9655
5.0:  9914
5.0:  10141
5.0:  10254
5.0:  11981
5.0:  12053
5.0:  12199
5.0:  12421
5.0:  12546
5.0:  15206
5.0:  15357
5.0:  15478
5.0:  15941
5.0:  16667
5.0:  16738
5.0:  16908
5.0:  17801
5.0:  18004
5.0:  19072
5.0:  20182
5.0:  20208
5.0:  21121
5.0:  21147
5.0:  21773
5.0:  22279
5.0:  22793
5.0:  23200
5.0:  23870
5.0:  24956
5.0:  25251
5.0:  26533
5.0:  27116
5.0:  28237
5.0:  28824
5.0:  28847
5.0:  29245
5.0:  29973
5.0:  32106
5.0:  32115
5.0:  32266
5.0:  33289
5.0:  34109
5.0:  34124
5.0:  34249
5.0:  34365
5.0:  35089
5.0:  35253
5.0:  35294
5.0:  35402
5.0:  35608
5.0:  36325
5.0:  36390
5.0:  36821
5.0:  36911
5.0:  37276
5.0:  37626
5.0:  37630
5.0:  37826
5.0:  38061
5.0:  38239
5.0:  38290
5.0:  38885
5.0:  39276
5.0:  39918
5.0:  39990
5.0:  40289
5.0:  40541
5.0:  41881


In [90]:
np.savetxt("qualifying_mean.csv", Xq, delimiter=",", newline="\n", encoding="utf-8")

In [91]:
print(np.mean(Xq[:,2]))

0.5976466901476328
