In [8]:
import h5py
import pandas as pd
import numpy as np
import json, time
from sklearn.metrics import mean_absolute_error as MAE

In [9]:
f = open('./epinions_subset/user_mapping.json')
user_mapping = json.loads(f.read())
f.close()
f = open('./epinions_subset/item_mapping.json')
item_mapping = json.loads(f.read())
f.close()

In [10]:
def five_fold_split(matrix):
    split_matrices = []
    valid_indices = matrix.nonzero()
    test_set_size = int(0.2 * len(valid_indices[0]))
    for i in range(5):
        start = int(i * 0.2 * len(valid_indices[0]))
        end = int((i+1) * 0.2 * len(valid_indices[0]))
        test_indices = (valid_indices[0][start:end], valid_indices[1][start:end])
        
        if start > 0 and end < len(valid_indices[0]):
            train_indices = (np.append(valid_indices[0][0:start], valid_indices[0][end:len(valid_indices[0])]),
                                       np.append(valid_indices[1][0:start], valid_indices[1][end:len(valid_indices[0])]) )
        elif start > 0:
            train_indices = (valid_indices[0][0:start], valid_indices[1][0:start])
        elif end < len(valid_indices[0]):
            train_indices = (valid_indices[0][end:len(valid_indices[0])], valid_indices[1][end:len(valid_indices[0])])
        
        train_matrix = np.zeros(matrix.shape)
        train_matrix[train_indices] = matrix[train_indices]
        test_matrix = np.zeros(matrix.shape)
        test_matrix[test_indices] = matrix[test_indices]
        split_matrices.append((train_matrix, test_matrix))
    count = 0
    for tup in split_matrices:
        print (len(valid_indices[0]), len(tup[0].nonzero()[0]) + len(tup[1].nonzero()[0]))
    return split_matrices

In [11]:
def train_item_based(user_by_item):
    print (user_by_item.shape)
    mask = np.ones(user_by_item.shape)
    mask[user_by_item.nonzero()] = 0
    masked_matrix = np.ma.masked_array(user_by_item, mask=mask)
    return np.ma.corrcoef(masked_matrix, rowvar=False).data

In [12]:
def train_user_based(user_by_item):
    item_by_user = user_by_item.T
    print (item_by_user.shape)
    mask = np.ones(item_by_user.shape)
    mask[item_by_user.nonzero()] = 0
    masked_matrix = np.ma.masked_array(item_by_user, mask=mask)
    return np.ma.corrcoef(masked_matrix, rowvar=False).data

In [13]:
user_item_matrix = np.loadtxt('./epinions_subset/user_item_matrix.txt')
split_matrices = five_fold_split(user_item_matrix)

68640 68640
68640 68640
68640 68640
68640 68640
68640 68640


In [7]:
for i in range(5):
    print (i)
    item_similarity = train_item_based(split_matrices[i][0])
    np.savetxt('./epinions_subset/item_similarity_%d.txt'%i, item_similarity)

0
(3000, 1000)
1
(3000, 1000)
2
(3000, 1000)
3
(3000, 1000)
4
(3000, 1000)


In [14]:
def nmae_item_item(test_matrix, train_matrix, similarity_matrix):
    indices = test_matrix.nonzero()
    y_true = []
    y_pred = []
    for i,j in zip(indices[0], indices[1]):
        y_true.append(test_matrix[i][j])
        item_sum = np.sum(similarity_matrix[j]) - 1
        if item_sum == 0:
            predicted_rating = np.mean(train_matrix[:,j][train_matrix[:,j].nonzero()])
        else:
            predicted_rating = np.mean(train_matrix[:,j][train_matrix[:,j].nonzero()]) + np.dot(similarity_matrix[j].flatten(), train_matrix[i].flatten())/item_sum
        y_pred.append(np.round(predicted_rating))
    return MAE(y_true, y_pred)/4

In [15]:
##prediction item item
for i in range(5):
    similarity = np.loadtxt('./epinions_subset/item_similarity_%d.txt'%i)
    print (nmae_item_item(split_matrices[i][1], split_matrices[i][0], similarity))

0.211137820513
0.218440413753
0.210099796037
0.213414189977
0.215417395105


In [26]:
for i in range(5):
    print (i)
    user_similarity = train_user_based(split_matrices[i][0])
    np.savetxt('./epinions_subset/user_similarity_%d.txt'%i, user_similarity)

0
(1000, 3000)


KeyboardInterrupt: 

In [32]:
np.seterr(all='raise')
def nmae_user_user(test_matrix, train_matrix, similarity_matrix):
    indices = test_matrix.nonzero()
    y_true = []
    y_pred = []
    for i,j in zip(indices[0], indices[1]):
        y_true.append(test_matrix[i][j])
        user_sum = np.sum(similarity_matrix[i]) - 1
        if user_sum == 0:
            predicted_rating = np.mean(train_matrix[i,:][train_matrix[i,:].nonzero()])
        elif len(train_matrix[i,:][train_matrix[i,:].nonzero()]) == 0:
            predicted_rating = np.mean(train_matrix[:,j][train_matrix[:,j].nonzero()])
        else:
            numerator = 0
            for k in range(test_matrix.shape[0]):
                if k != i and train_matrix[k][j] != 0:
                    if len(train_matrix[k,:][train_matrix[k,:].nonzero()]) == 0:
                        print (train_matrix[k,:][train_matrix[k,:].nonzero()])
                    u_a_mean = np.mean(train_matrix[k,:][train_matrix[k,:].nonzero()])
                    numerator += similarity_matrix[i][k] * (train_matrix[k][j] - u_a_mean)
            predicted_rating = np.mean(train_matrix[i,:][train_matrix[i,:].nonzero()]) + numerator/user_sum
        y_pred.append(np.round(predicted_rating))
    return MAE(y_true, y_pred)/4

In [33]:
##prediction user user
for i in range(5):
    similarity = np.loadtxt('./epinions_subset/user_similarity_%d.txt'%i)
    print (nmae_user_user(split_matrices[i][1], split_matrices[i][0], similarity))

0.211119609557
0.218895687646
0.20915282634
0.213395979021
0.215362762238


In [40]:
def nmae_trust(test_matrix, train_matrix, similarity_matrix, trust_matrix):
    similarity_matrix = 0.1*similarity_matrix + 0.9*trust_matrix
    indices = test_matrix.nonzero()
    y_true = []
    y_pred = []
    for i,j in zip(indices[0], indices[1]):
        y_true.append(test_matrix[i][j])
        user_sum = np.sum(similarity_matrix[i]) - 1
        if user_sum == 0:
            predicted_rating = np.mean(train_matrix[i,:][train_matrix[i,:].nonzero()])
        elif len(train_matrix[i,:][train_matrix[i,:].nonzero()]) == 0:
            predicted_rating = np.mean(train_matrix[:,j][train_matrix[:,j].nonzero()])
        else:
            numerator = 0
            for k in range(test_matrix.shape[0]):
                if k != i and train_matrix[k][j] != 0:
                    if len(train_matrix[k,:][train_matrix[k,:].nonzero()]) == 0:
                        print (train_matrix[k,:][train_matrix[k,:].nonzero()])
                    u_a_mean = np.mean(train_matrix[k,:][train_matrix[k,:].nonzero()])
                    numerator += similarity_matrix[i][k] * (train_matrix[k][j] - u_a_mean)
            predicted_rating = np.mean(train_matrix[i,:][train_matrix[i,:].nonzero()]) + numerator/user_sum
        y_pred.append(np.round(predicted_rating))
    return MAE(y_true, y_pred)/4

In [38]:
trustMat_E = None
trustMat_D = None
with h5py.File('trust_pairs.h5', 'r') as hf:
    trustMat_E = hf['E'][:]
    trustMat_D = hf['D'][:]

In [41]:
##prediction trust user
for i in range(5):
    similarity = np.loadtxt('./epinions_subset/user_similarity_%d.txt'%i)
    print (nmae_trust(split_matrices[i][1], split_matrices[i][0], similarity, trustMat_D))

0.211119609557
0.218895687646
0.20915282634
0.213395979021
0.215362762238


In [60]:
trust_embedding = np.loadtxt("./epinions_subset/epinions_embed_1", skiprows=1)
trust_embedding.sort(axis=0)
# padding = np.zeros((3000-trust_embedding.shape[0], trust_embedding.shape[1])) + 1
# trust_embedding = np.concatenate((trust_embedding,padding))[:,1:]
trust_embedding = trust_embedding[:,1:]
trust_embedding.shape

(3000, 128)

In [65]:
def train_embeddings(user_by_movie):
    similarity_matrix = np.zeros(user_by_movie.shape)
    print (user_by_movie.shape)

    count = 0
    for i in range(user_by_movie.shape[0]):
        user1_movies = np.array(user_by_movie[i].T).flatten()
        mean_1 = np.mean(user1_movies)
#         mean_1 = np.average(user1_movies[user1_movies.nonzero()])
        start = time.clock()

        for j in range(user_by_movie.shape[0]):
            if i != j:
                user2_movies = np.array(user_by_movie[j].T).flatten()
#                 bitmap1 = user1_movies.nonzero()[0]
#                 bitmap2 = user2_movies.nonzero()[0]
#                 if bitmap2.shape[0] == 0:
#                     continue
#                 if bitmap1.shape[0] > bitmap2.shape[0]:
#                     bitmap2 = np.concatenate( (bitmap2, np.array(np.zeros( (1, bitmap1.shape[0] - bitmap2.shape[0]) ) ).flatten() ), axis = 0)
#                 else:
#                     bitmap1 = np.concatenate( (bitmap1, np.array(np.zeros( (1, bitmap2.shape[0] - bitmap1.shape[0]) ) ).flatten() ), axis = 0)
#                 bitmap = np.intersect1d(np.array(bitmap1).flatten(), np.array(bitmap2).flatten())
#                 # print bitmap
#                 common_ratings_user1 = np.array(user1_movies[bitmap.astype(int)]).flatten()
#                 common_ratings_user2 = np.array(user2_movies[bitmap.astype(int)]).flatten()

#                 if len(bitmap) > 0:
#                     mean_2 = np.average(user2_movies[user2_movies.nonzero()])
                mean_2 = np.mean(user2_movies)
                try:
                    user1_movies -= mean_1
                    user2_movies -= mean_2
                    sq_1 = np.array(np.square(user1_movies)).flatten()
                    sq_2 = np.array(np.square(user2_movies)).flatten()
                    s_score = np.sum( user1_movies * user2_movies )/ np.sqrt( np.sum(sq_1) * np.sum(sq_2) )
                    similarity_matrix[i][j] = s_score
                except Exception as e:
                    similarity_matrix[i][j] = 0
#                 else:
#                     similarity_matrix[i][j] = 0

        print ('Time Taken: ' + str(time.clock() - start))

        count += 1
        # if count % 10 == 0:
        print ('\t%d/%d'%(count,user_max))
    return similarity_matrix
    
embedding_similarity = train_embeddings(trust_embedding)

(3000, 128)


NameError: name 'bitmap' is not defined