In [50]:
import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, losses
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.models import Model

In [51]:
import numpy as np
import pickle
# from cdl import CDL


def preprocess_data(data_dir=r"./data/citeulike/citeulike-a"):
    """Build the item bag-of-words matrix and the user-item rating matrix, then pickle both.

    Reads ``vocabulary.dat``, ``mult.dat`` and ``users.dat`` from ``data_dir`` and
    writes ``item_bow.pickle`` and ``rating_matrix.pickle`` into the same directory.

    Args:
        data_dir: Directory containing the raw ``.dat`` files. Defaults to the
            location used throughout this notebook.

    Returns:
        None. The two matrices are persisted to disk, not returned.

    Raises:
        OSError: If any of the input files cannot be opened or the pickles
            cannot be written.
        ValueError: If a token in ``mult.dat`` / ``users.dat`` is not in the
            expected ``index:count`` / integer form.
        IndexError: If a vocabulary or item index exceeds the matrix bounds.
    """
    # Vocabulary size == number of lines in vocabulary.dat == width of the BoW matrix.
    with open(f"{data_dir}/vocabulary.dat") as vocabulary_file:
        embedding_size = len(vocabulary_file.readlines())

    # Read the item file exactly once. Calling readlines() a second time on the
    # same handle returns [] (the cursor is already at EOF), which previously
    # left item_bow filled with zeros.
    with open(f"{data_dir}/mult.dat") as item_info_file:
        sentences = item_info_file.readlines()

    # One row per item, one column per vocabulary entry.
    item_size = len(sentences)
    item_bow = np.zeros((item_size, embedding_size))
    for index, sentence in enumerate(sentences):
        # The first token of each line is skipped; every remaining token is
        # "vocabulary_index:count". split() (no argument) tolerates blank lines
        # and repeated whitespace.
        for word in sentence.split()[1:]:
            vocabulary_index, number = word.split(":")
            item_bow[index, int(vocabulary_index)] = float(number)

    # One line per user; read once for both the user count and the ratings.
    with open(f"{data_dir}/users.dat") as rating_file:
        lines = rating_file.readlines()

    # Binary matrix: rating_matrix[u, i] == 1 when item id i appears on user u's line.
    user_size = len(lines)
    rating_matrix = np.zeros((user_size, item_size))
    for index, line in enumerate(lines):
        # NOTE(review): unlike mult.dat, the first token is NOT skipped here, so
        # it is treated as an item id. If users.dat also starts each line with
        # a count, this marks a spurious item - confirm against the data format.
        for item in line.split():
            rating_matrix[index, int(item)] = 1

    # Cache both matrices so later runs can skip the parsing above.
    with open(f"{data_dir}/item_bow.pickle", 'wb') as handle:
        pickle.dump(item_bow, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(f"{data_dir}/rating_matrix.pickle", 'wb') as handle:
        pickle.dump(rating_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)


# Locations of the cached matrices written by preprocess_data().
ITEM_BOW_PICKLE = r'./data/citeulike/citeulike-a/item_bow.pickle'
RATING_MATRIX_PICKLE = r'./data/citeulike/citeulike-a/rating_matrix.pickle'


def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # SECURITY: pickle.load can execute arbitrary code. Only use it on cache
    # files produced locally by preprocess_data(), never on downloaded pickles.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


try:
    print('Loading item data...')
    item_matrix = _load_pickle(ITEM_BOW_PICKLE)
    print('Loading rating matrix...')
    rating_matrix = _load_pickle(RATING_MATRIX_PICKLE)
except (OSError, EOFError, pickle.UnpicklingError):
    # Cache missing or unreadable: rebuild it from the raw .dat files and load
    # again. Anything else (KeyboardInterrupt, MemoryError, ...) propagates
    # instead of silently triggering a rebuild.
    print('preprocessing data...')
    preprocess_data()
    item_matrix = _load_pickle(ITEM_BOW_PICKLE)
    rating_matrix = _load_pickle(RATING_MATRIX_PICKLE)


Loading item data...
Loading rating matrix...


In [None]:
class Autoencoder(Model):
    """Autoencoder made of one Dense encoder layer and one Dense decoder layer.

    The encoder maps inputs to ``latent_dim`` units and the decoder maps the
    encoding back to ``input_dim`` units.
    """

    def __init__(self, input_dim, latent_dim, encoder_activation, decoder_activation):
        """Store the configuration and build the encoder / decoder stacks.

        Args:
            input_dim: Number of units produced by the decoder.
            latent_dim: Number of units produced by the encoder.
            encoder_activation: Activation passed to the encoder Dense layer.
            decoder_activation: Activation passed to the decoder Dense layer.
        """
        super().__init__()
        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.encoder_activation = encoder_activation
        self.decoder_activation = decoder_activation

        # The encoder is created before the decoder: code that unpacks
        # get_weights() positionally depends on this order.
        self.encoder = tf.keras.Sequential(
            [layers.Dense(latent_dim, activation=self.encoder_activation)]
        )
        self.decoder = tf.keras.Sequential(
            [layers.Dense(input_dim, activation=self.decoder_activation)]
        )

    def encode(self, x):
        """Return the encoder output for ``x``."""
        return self.encoder(x)

    def decode(self, x):
        """Return the decoder output for the encoding ``x``."""
        return self.decoder(x)

    def call(self, x):
        """Forward pass: encode ``x`` and decode the result."""
        return self.decoder(self.encoder(x))


In [52]:
class SDAE(Model):
    """Stack of Autoencoder models trained greedily, one layer pair at a time.

    ``ae_layers`` lists the layer widths, e.g. ``[8000, 64, 16]`` produces two
    autoencoders: 8000 -> 64 and 64 -> 16.
    """

    def __init__(self, ae_layers):
        """Remember the layer widths; the autoencoders are built by ``make()``."""
        super(SDAE, self).__init__()
        self.ae_layers = ae_layers
        self.models = []

    def make(self):
        """Create one Autoencoder per consecutive pair of widths in ``ae_layers``.

        Each autoencoder uses a 'relu' encoder and a 'sigmoid' decoder and is
        appended to ``self.models``.
        """
        for in_dim, out_dim in zip(self.ae_layers[:-1], self.ae_layers[1:]):
            print('building layer input {} output {}'.format(in_dim, out_dim))
            # Build each autoencoder exactly once (a second, discarded instance
            # used to be created here for every layer pair).
            self.models.append(Autoencoder(in_dim, out_dim, 'relu', 'sigmoid'))

    def call(self, train, test, epochs):
        """Train every autoencoder in order, feeding each one the previous encoding.

        Note: this is a training routine invoked directly as ``sdae.call(...)``;
        it is not a Keras forward pass and returns nothing.

        Args:
            train: Training inputs for the first autoencoder.
            test: Validation inputs for the first autoencoder.
            epochs: Number of epochs used to fit each autoencoder.
        """
        train_set = train
        test_set = test

        for m in self.models:
            m.compile(optimizer='adam', loss=losses.MeanSquaredError())
            # Reconstruction objective: the input is also the target.
            m.fit(train_set, train_set, epochs=epochs, shuffle=True, validation_data=(test_set, test_set))
            # The next autoencoder is trained on this one's encodings.
            train_set = m.encode(train_set)
            test_set = m.encode(test_set)

    def get_layers(self):
        """Return the weights of every autoencoder as a list of dicts.

        Each dict maps 'w1', 'b1', 'w2', 'b2' to the first four arrays returned
        by ``get_weights()`` (presumably encoder kernel/bias followed by decoder
        kernel/bias, given how Autoencoder is built - verify if its layout changes).
        """
        model_layers = []
        for m in self.models:
            w = m.get_weights()
            model_layers.append({
                'w1': w[0],
                'b1': w[1],
                'w2': w[2],
                'b2': w[3],
            })
        return model_layers

In [53]:
from cdl import CDL

# Fraction of items used to pretrain the SDAE; the remainder is used for validation.
SPLIT = 0.8 #80/20
split = int(item_matrix.shape[0] * SPLIT)

# Sequential (unshuffled) split over the rows of the item matrix.
x_train2 = item_matrix[:split]
x_test2 = item_matrix[split:]

# The first width must equal the number of item features, so take it from the
# data instead of hard-coding 8000 (identical when the vocabulary has 8000 entries).
ae_layers = [item_matrix.shape[1], 64, 16]
sdae = SDAE(ae_layers)
sdae.make()
sdae.call(x_train2, x_test2, epochs=2)
trained_model = sdae.get_layers()

# NOTE(review): K and batch are not defined in any cell shown in this notebook,
# and dropout is only assigned in a later cell. On a fresh kernel
# (Restart & Run All) the call below raises NameError unless they are defined
# before this cell - confirm the intended values and define them above.
result_directory = 'results/test3'
cdl = CDL(rating_matrix, item_matrix, lambda_u=1, lambda_v=10, lambda_w=10, lv=0.01, K=K, epochs=10, batch=batch, 
        dir_save=result_directory, dropout=dropout, recall_m=100, trained_matrix=trained_model, pretrain=1
    )
cdl.build_model()
cdl.training(rating_matrix)

building layer input 8000 output 64
building layer input 64 output 16
Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2


NameError: name 'CDL' is not defined

In [None]:
#### VARIABLES AS DECLARED IN MATLAB CODE FOR CDL ####
# Named layer_sizes rather than `layers`: assigning to `layers` would rebind the
# `layers` module imported from tensorflow.keras and break every later
# `layers.Dense(...)` call (e.g. when an Autoencoder is constructed).
layer_sizes = [8000, 200, 50]
lv = 10
lu = 1
ln = 1e3
pretrain = 1
dropout = 0.1
sdae_epochs = 10 # may be a min epochs
minibatch = 128 #or 256
tanh = 1 # used for all layers except the first

# sdae parameters for pretrain
pretrain_learning_rate = 1e-5 # if tanh, else lr = 1e-1
use_adadelta = 0
learning_rate0 = 5000 # not sure what this is
weight_decay = 1e-4
min_epochs = 10
sparsity_cost = 0.1
epsilon = 1e-8
momentum = 0.99

# main cdl
learning_rate = 1e-6 #if tanh, else lr = 1e-1
# The next two repeat the values already assigned in the pretrain section above.
learning_rate0 = 5000
use_adadelta = 0

# use dropout on all but the bottleneck layer
dropout = 0.1 #paper (same value as assigned near the top of this cell)
noise = 0.3 #paper
