### Imports

In [1]:
from collections import defaultdict
import random
import numpy as np   
import scipy.linalg

### Read in data

In [2]:
def read_txt(filename, start=0, end=100000000):
  
    u_i = defaultdict(set)
    i_u = defaultdict(set)
    labels = defaultdict(int)

    with open(filename, 'r') as f:
        count = 0
        for l in f:
            if count < start or count >= end:
                count += 1
                continue

            l = l.strip("\n").split()
            u = l[0]
            i = l[1]
            r = l[2]
            u_i[u].add((i,r))
            i_u[i].add((u,r))
            labels[(u,i)] = r
            count += 1

    return u_i, i_u, labels

#read in all data
b_u_i, b_i_u, b_labels = read_txt('books.txt')
e_u_i, e_i_u, e_labels = read_txt('electronics.txt')
m_u_i, m_i_u, m_labels = read_txt('movies.txt')

# get users and items list for all 3 datasets 
b_users = list(b_u_i.keys())
b_items = list(b_i_u.keys())

e_users = list(e_u_i.keys())
e_items = list(e_i_u.keys())

m_users = list(m_u_i.keys())
m_items = list(m_i_u.keys())

# randomly shuffle users and items 
random.shuffle(b_users)
random.shuffle(b_items)
random.shuffle(e_users)
random.shuffle(e_items)
random.shuffle(m_users)
random.shuffle(m_items)

# sort users and items by most popular 
b_u_counts = [(len(b_u_i[u]), str(u)) for u in b_users]
b_u_counts.sort(reverse=True)

b_i_counts = [(len(b_i_u[i]), i) for i in b_items]
b_i_counts.sort(reverse=True)

e_u_counts = [(len(e_u_i[u]), u) for u in e_users]
e_u_counts.sort(reverse=True)

e_i_counts = [(len(e_i_u[i]), i) for i in e_items]
e_i_counts.sort(reverse=True)

m_u_counts = [(len(m_u_i[u]), u) for u in m_users]
m_u_counts.sort(reverse=True)

m_i_counts = [(len(m_i_u[i]), i) for i in m_items]
m_i_counts.sort(reverse=True)

### Create training data 

In [3]:
M = 500
N = 1000

# select 500 most popular users and 1000 most popular items 
b_u_train = [u[1] for u in b_u_counts[:M]]
b_i_train = [i[1] for i in b_i_counts[:N]]
e_u_train = [u[1] for u in e_u_counts[:M]]
e_i_train = [i[1] for i in e_i_counts[:N]]
m_u_train = [u[1] for u in m_u_counts[:M]]
m_i_train = [i[1] for i in m_i_counts[:N]]

### Ratings matrix & Train/test split 

In [4]:
# create 0 filled ratings matrices 
D_b = np.zeros((M,N))
D_e = np.zeros((M,N))
D_m = np.zeros((M,N))

b_pairs = []
e_pairs = []
m_pairs = []

# create user,item rating pairs and populate ratings matrices
for a,u in enumerate(b_u_train): 
    for b,i in enumerate(b_i_train): 
        if (u,i) in b_labels: 
            b_pairs.append((u,i, a, b, float(b_labels[(u,i)])))
        D_b[a,b] = float(b_labels[(u,i)])
        
for a,u in enumerate(e_u_train): 
    for b,i in enumerate(e_i_train): 
        if (u,i) in e_labels: 
            e_pairs.append((u,i, a, b, float(e_labels[(u,i)])))
        D_e[a,b] = float(e_labels[(u,i)])
        
for a,u in enumerate(m_u_train): 
    for b,i in enumerate(m_i_train):
        if (u,i) in m_labels: 
            m_pairs.append((u,i, a, b, float(m_labels[(u,i)])))
        D_m[a,b] = float(m_labels[(u,i)])
        
print("total number of book (user,item) pairs = %d" % (len(b_pairs)))
print("total number of electronics (user,item) pairs = %d" % (len(e_pairs)))
print("total number of movies (user,item) pairs = %d" % (len(m_pairs)))

# randomly shuffle user,item rating pairs 
random.shuffle(b_pairs)
random.shuffle(e_pairs)
random.shuffle(m_pairs)

# split rating pairs into train/test 
b_train_pairs = b_pairs[:int(len(b_pairs)*0.8)]
b_test_pairs = b_pairs[int(len(b_pairs)*0.8):]

e_train_pairs = e_pairs[:int(len(e_pairs)*0.8)]
e_test_pairs = e_pairs[int(len(e_pairs)*0.8):]

m_train_pairs = m_pairs[:int(len(m_pairs)*0.8)]
m_test_pairs = m_pairs[int(len(m_pairs)*0.8):]


# remove test pair ratings from training rating matrix 
for b in b_test_pairs:  
    D_b[b[2],b[3]] = 0
for e in e_test_pairs: 
    D_e[e[2],e[3]] = 0
for m in m_test_pairs: 
    D_m[m[2],m[3]] = 0

total number of book (user,item) pairs = 14655
total number of electronics (user,item) pairs = 9762
total number of movies (user,item) pairs = 43155


### Train cluster level latent factor model 

In [5]:
# calculate mse based on predictions matrix and user,item rating pair 
def calc_mse(predictions, pairs): 
    error = 0
    count = 0
    for p in pairs: 
        error += (predictions[p[2],p[3]] - p[4])**2
        
        count += 1
    mse = error/count
    return mse

# train cluster level latent factor model
def train(K, L, T, D1, D2, epochs, train_pairs, test_pairs): 

    # K, number of user clusters
    # L, number of item clusters 
    # T, shared subspace
    # D1, ratings from domain 1: M1 x N1 
    # D2, ratings from domain 2: M2 x N2 
    
    # set mask matrices 
    W1 = (D1!=0).astype(int)
    W2 = (D2!=0).astype(int)
    
    M1 = D1.shape[0] # number of users in domain 1
    M2 = D2.shape[0] # number of users in domain 2
    N1 = D1.shape[1] # number of items in domain 1
    N2 = D1.shape[1] # number of items in domain 2

    # item clusters 
    L1 = L
    L2 = L

    # randomly initialize U, S, and V
    U1 = np.random.rand(M1, K) # M1 x K
    U2 = np.random.rand(M2, K) # M2 x K

    V1 = np.random.rand(N1, L1) # N1 x L1
    V2 = np.random.rand(N2, L2) # N2 x L2 

    S0 = np.random.rand(K, T) # K x T
    S1 = np.random.rand(K, L1-T) # K x L1-T
    S2 = np.random.rand(K, L2-T) # K x L2-T

    
    # train loop 
    for epoch in range(epochs): 
        
        # S1 update equation 
        # V11.T is the laste L1-T columns of V1

        S1 = S1 * np.sqrt((U1.T.dot(np.multiply(D1,W1)).dot(V1[:,T:]) )/
                                     (U1.T.dot(np.multiply(U1.dot(S0).dot(V1[:,:T].T),W1) ).dot(V1[:,T:]) 
                                      + U1.T.dot(np.multiply(U1.dot(S1).dot(V1[:,T:].T),W1) ).dot(V1[:,T:])))
        # S2 update equation 
        S2 = S2 * np.sqrt((U2.T.dot(np.multiply(D2,W2)).dot(V2[:,T:]) )/
                          (U2.T.dot(np.multiply(U2.dot(S0).dot(V2[:,:T].T),W2) ).dot(V2[:,T:]) 
                                      + U2.T.dot(np.multiply(U2.dot(S2).dot(V2[:,T:].T),W2) ).dot(V2[:,T:])))

        # S0 update equation 
        A = (U1.T.dot(np.multiply(U1.dot(S0).dot(V1[:,:T].T),W1) ).dot(V1[:,:T]) 
                                      + U1.T.dot(np.multiply(U1.dot(S1).dot(V1[:,T:].T),W1) ).dot(V1[:,:T]))
        B = (U2.T.dot(np.multiply(U2.dot(S0).dot(V2[:,:T].T),W2) ).dot(V2[:,:T]) 
                                      + U2.T.dot(np.multiply(U2.dot(S2).dot(V2[:,T:].T),W2) ).dot(V2[:,:T]))
        S0 = S0 * np.sqrt( (U1.T.dot(np.multiply(D1,W1)).dot(V1[:,:T]) ) + (U2.T.dot(np.multiply(D2,W2)).dot(V2[:,:T]) ) / (A+B))

        # U1 update equation 
        a = np.multiply(D1, W1).dot(V1).dot(np.concatenate((S0,S1), axis=1).T)
        b = np.multiply(U1.dot(np.concatenate((S0,S1), axis=1)).dot(V1.T),W1).dot(V1).dot(np.concatenate((S0,S1),axis=1).T)
        U1 = U1 * np.sqrt(np.divide(a, b, out=np.zeros_like(a), where=b!=0))

        # U2 update equation 
        U2 = U2 * np.sqrt(np.multiply(D2, W2).dot(V2).dot(np.concatenate((S0,S2), axis=1).T) /
                         np.multiply(U2.dot(np.concatenate((S0,S2), axis=1)).dot(V2.T),W2).dot(V2).dot(np.concatenate((S0,S2),axis=1).T))


        # V1 update equation 
        a = np.concatenate((S0,S1),axis=1).T.dot(U1.T).dot(np.multiply(D1,W1))
        b = np.concatenate((S0,S1),axis=1).T.dot(U1.T).dot(np.multiply(U1.dot(np.concatenate((S0,S1),axis=1)).dot(V1.T),W1))
        V1 = V1 * np.sqrt(np.divide(a,b, out=np.zeros_like(a), where=b!=0)).T

        # V2 update equation 
        a = np.concatenate((S0,S2),axis=1).T.dot(U2.T).dot(np.multiply(D2,W2))
        b = np.concatenate((S0,S2),axis=1).T.dot(U2.T).dot(np.multiply(U2.dot(np.concatenate((S0,S2),axis=1)).dot(V2.T),W2))
        V2 = V2 * np.sqrt(np.divide(a,b, out=np.zeros_like(a), where=b!=0)).T
        
        # D1_predictions = U1.dot(np.concatenate((S0,S1), axis=1)).dot(V1.T)
        
        # make predictions on D2 
        D2_predictions = U2.dot(np.concatenate((S0,S2), axis=1)).dot(V2.T)
        
        # get mse for train and test set 
        train_mse = calc_mse(D2_predictions, train_pairs)
        test_mse = calc_mse(D2_predictions, test_pairs)
        
        # report results 
        print("epoch = %d, train mse = %.4f, test mse = %.4f" % (epoch, train_mse, test_mse))
        
    return U1, U2, S0, S1, S2, V1, V2

In [11]:
K = 50 # user clusters 
L = 50 # number of item clusters 
T = 40 # shared subspace 

U1, U2, S0, S1, S2, V1, V2 = train(K, L, T, D_b, D_e, 200, e_train_pairs, e_test_pairs)


epoch = 0, train mse = 1237.3494, test mse = 1219.8556
epoch = 1, train mse = 43.3873, test mse = 43.3918
epoch = 2, train mse = 2.3058, test mse = 2.6804
epoch = 3, train mse = 0.5786, test mse = 0.9417
epoch = 4, train mse = 0.5346, test mse = 0.8949
epoch = 5, train mse = 0.5370, test mse = 0.9001
epoch = 6, train mse = 0.5354, test mse = 0.9019
epoch = 7, train mse = 0.5334, test mse = 0.9028
epoch = 8, train mse = 0.5319, test mse = 0.9036
epoch = 9, train mse = 0.5310, test mse = 0.9043
epoch = 10, train mse = 0.5304, test mse = 0.9050
epoch = 11, train mse = 0.5300, test mse = 0.9055
epoch = 12, train mse = 0.5297, test mse = 0.9059
epoch = 13, train mse = 0.5294, test mse = 0.9062
epoch = 14, train mse = 0.5292, test mse = 0.9064
epoch = 15, train mse = 0.5290, test mse = 0.9066
epoch = 16, train mse = 0.5287, test mse = 0.9067
epoch = 17, train mse = 0.5285, test mse = 0.9067
epoch = 18, train mse = 0.5283, test mse = 0.9068
epoch = 19, train mse = 0.5281, test mse = 0.9068
ep

epoch = 168, train mse = 0.4339, test mse = 0.9024
epoch = 169, train mse = 0.4323, test mse = 0.9026
epoch = 170, train mse = 0.4307, test mse = 0.9028
epoch = 171, train mse = 0.4291, test mse = 0.9031
epoch = 172, train mse = 0.4275, test mse = 0.9034
epoch = 173, train mse = 0.4258, test mse = 0.9036
epoch = 174, train mse = 0.4242, test mse = 0.9040
epoch = 175, train mse = 0.4225, test mse = 0.9043
epoch = 176, train mse = 0.4208, test mse = 0.9046
epoch = 177, train mse = 0.4191, test mse = 0.9050
epoch = 178, train mse = 0.4174, test mse = 0.9054
epoch = 179, train mse = 0.4157, test mse = 0.9058
epoch = 180, train mse = 0.4140, test mse = 0.9062
epoch = 181, train mse = 0.4122, test mse = 0.9067
epoch = 182, train mse = 0.4104, test mse = 0.9072
epoch = 183, train mse = 0.4087, test mse = 0.9077
epoch = 184, train mse = 0.4069, test mse = 0.9082
epoch = 185, train mse = 0.4051, test mse = 0.9087
epoch = 186, train mse = 0.4032, test mse = 0.9093
epoch = 187, train mse = 0.4014