In [1]:
import json
import os
import re
from itertools import chain
from collections import Counter
from tqdm.auto import tqdm

import numpy as np
import pandas as pd
import scipy.sparse as spr
import tensorflow as tf

tf.__version__

'2.2.0'

In [2]:
train = pd.read_json('/Users/michelle/Data Science/Melon Playlist Continuation/train.json', typ = 'frame')
test = pd.read_json('/Users/michelle/Data Science/Melon Playlist Continuation/val.json', typ= 'frame')

In [3]:
train['istrain'] = 1
test['istrain'] = 0

n_train = len(train)
n_test = len(test)

# train + test
plylst = pd.concat([train, test], ignore_index=True)

# playlist id
plylst["nid"] = range(n_train + n_test)

# id <-> nid
plylst_id_nid = dict(zip(plylst["id"],plylst["nid"]))
plylst_nid_id = dict(zip(plylst["nid"],plylst["id"]))

In [4]:
plylst_tag = plylst['tags']
tag_counter = Counter([tg for tgs in plylst_tag for tg in tgs])
tag_dict = {x: tag_counter[x] for x in tag_counter}

tag_id_tid = dict()
tag_tid_id = dict()
for i, t in enumerate(tag_dict):
  tag_id_tid[t] = i
  tag_tid_id[i] = t

n_tags = len(tag_dict)

plylst_song = plylst['songs']
song_counter = Counter([sg for sgs in plylst_song for sg in sgs])
song_dict = {x: song_counter[x] for x in song_counter}

song_id_sid = dict()
song_sid_id = dict()
for i, t in enumerate(song_dict):
  song_id_sid[t] = i
  song_sid_id[i] = t

n_songs = len(song_dict)

In [5]:
plylst['songs_id'] = plylst['songs'].map(lambda x: [song_id_sid.get(s) for s in x if song_id_sid.get(s) != None])
plylst['tags_id'] = plylst['tags'].map(lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None])

In [6]:
plylst_use = plylst.loc[:,['istrain','nid','updt_date','songs_id','tags_id']]
plylst_use.loc[:,'num_songs'] = plylst_use['songs_id'].map(len)
plylst_use.loc[:,'num_tags'] = plylst_use['tags_id'].map(len)
plylst_use = plylst_use.set_index('nid')

In [7]:
row = np.repeat(range(n_train+n_test), plylst_use['num_songs'])
col = [song for songs in plylst_use['songs_id'] for song in songs]
dat = np.repeat(1, plylst_use['num_songs'].sum())
all_songs = spr.csr_matrix((dat, (row, col)), shape=(n_train+n_test, n_songs))

In [8]:
all_songs.shape

(138086, 638336)

In [9]:
len(train) + len(test)

138086

In [10]:
train_songs = all_songs[:len(train), :]

# DAE modeling

In [33]:
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    # coo = coo.tolil()  ## tolil로 먼저 변경하기
    indices = np.mat([coo.row, coo.col]).transpose()
    return tf.SparseTensor(indices, coo.data, coo.shape)

In [34]:
train_x = convert_sparse_matrix_to_sparse_tensor(train_songs)
# kernel dies: train_x = tf.sparse_tensor_to_dense(train_x)
train_y = train_x

In [35]:
train_x.shape

TensorShape([115071, 638336])

In [11]:
class SparseSequence(tf.keras.utils.Sequence):
    def __init__(self, x_vals, y_vals, batch_size = 32):
        self.x_vals = x_vals
        self.y_vals = y_vals
        self.inds = list(range(x_vals.shape[0]))
        shuffle(self.inds)
        self.batch_size = batch_size
    def __getitem__(self, item):
        from_ind = self.batch_size * item
        to_ind = self.batch_size * (item + 1)
        return (self.x_vals[self.inds[from_ind:to_ind], :].todense(),
                y_vals[self.inds[from_ind:to_ind]])
    def on_epoch_end(self):
        shuffle(self.inds)
    def __len__(self):
        return math.ceil(self.x_vals.shape[0] / self.batch_size)

In [12]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.layers import Dropout

In [13]:
## layer parameters
noise_level = 1.0
n_inputs = 638336
n_hidden1 = 256  # encoder
n_hidden2 = 128  # coding units
n_hidden3 = n_hidden1
n_outputs = n_inputs

## train parameters
dropout_rate = 0.3
learning_rate = 0.01
n_epochs = 5
batch_size = 1

In [14]:
model = Sequential([
    Dropout(dropout_rate, input_shape=(1, n_inputs)),
    Dense(n_hidden1, activation='sigmoid', name='hidden1'),
    Dense(n_hidden2, activation='sigmoid', name='hidden2'),
    Dense(n_hidden3, activation='sigmoid', name='hidden3'),
    Dense(n_outputs, name='outputs')
                   ])

In [31]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout (Dropout)            (None, 1, 638336)         0         
_________________________________________________________________
hidden1 (Dense)              (None, 1, 1024)           653657088 
_________________________________________________________________
hidden2 (Dense)              (None, 1, 512)            524800    
_________________________________________________________________
hidden3 (Dense)              (None, 1, 1024)           525312    
_________________________________________________________________
outputs (Dense)              (None, 1, 638336)         654294400 
Total params: 1,309,001,600
Trainable params: 1,309,001,600
Non-trainable params: 0
_________________________________________________________________


In [15]:
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['accuracy']
)

In [19]:
from random import shuffle
import math

In [21]:
def data_generator(x_vals, y_vals):
    inds = list(range(x_vals.shape[0]))
    shuffle(inds)
    for ind in inds:
        yield (x_vals[ind, :].todense(), y_vals[ind])

In [23]:
print("Fit model on training data")
autoencoder = model.fit(
    data_generator(train_songs, train_songs),
    batch_size = batch_size,
    epochs = n_epochs,
    # We pass some validation for monitoring validation loss and metrics at the end of each epoch
    # validation_data=(x_val, y_val)
)

Fit model on training data
Epoch 1/5


InvalidArgumentError:  TypeError: `generator` yielded an element that could not be converted to the expected type. The expected type was int64, but the yielded element was   (0, 1840)	1
  (0, 2183)	1
  (0, 2204)	1
  (0, 2205)	1
  (0, 4659)	1
  (0, 18050)	1
  (0, 21613)	1
  (0, 37266)	1.
TypeError: int() argument must be a string, a bytes-like object or a number, not 'csr_matrix'


The above exception was the direct cause of the following exception:


Traceback (most recent call last):

  File "/Users/michelle/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 801, in generator_py_func
    ret, dtype=dtype.as_numpy_dtype))

  File "/Users/michelle/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 203, in _convert
    result = np.asarray(value, dtype=dtype, order="C")

  File "/Users/michelle/opt/anaconda3/lib/python3.7/site-packages/numpy/core/_asarray.py", line 83, in asarray
    return array(a, dtype, copy=False, order=order)

ValueError: setting an array element with a sequence.


During handling of the above exception, another exception occurred:


Traceback (most recent call last):

  File "/Users/michelle/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 243, in __call__
    ret = func(*args)

  File "/Users/michelle/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/autograph/impl/api.py", line 309, in wrapper
    return func(*args, **kwargs)

  File "/Users/michelle/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 806, in generator_py_func
    "element was %s." % (dtype.name, ret)), sys.exc_info()[2])

  File "/Users/michelle/opt/anaconda3/lib/python3.7/site-packages/six.py", line 702, in reraise
    raise value.with_traceback(tb)

  File "/Users/michelle/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 801, in generator_py_func
    ret, dtype=dtype.as_numpy_dtype))

  File "/Users/michelle/opt/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/script_ops.py", line 203, in _convert
    result = np.asarray(value, dtype=dtype, order="C")

  File "/Users/michelle/opt/anaconda3/lib/python3.7/site-packages/numpy/core/_asarray.py", line 83, in asarray
    return array(a, dtype, copy=False, order=order)

TypeError: `generator` yielded an element that could not be converted to the expected type. The expected type was int64, but the yielded element was   (0, 1840)	1
  (0, 2183)	1
  (0, 2204)	1
  (0, 2205)	1
  (0, 4659)	1
  (0, 18050)	1
  (0, 21613)	1
  (0, 37266)	1.


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]] [Op:__inference_train_function_856]

Function call stack:
train_function


In [None]:
# a = train_songs.toarray()
# kernel dies: ex = tf.convert_to_tensor(a, dtype=tf.float32)

In [15]:
np.random.seed(11)
tf.random.set_seed(11)
batch_size = 1
max_epochs = 50
learning_rate = 1e-3
momentum = 8e-1
hidden_dim = 1024
original_dim = 638336

In [20]:
# Encoder 부분을 Denoising Encoder로 바꾸기 위해 dropout을 추가한다
class Encoder(tf.keras.layers.Layer):
    def __init__(self, hidden_dim):
        super(Encoder, self).__init__()
        #self.dropout_layer = tf.keras.layers.Dropout(rate=0.3)
        self.hidden_layer = tf.keras.layers.Dense(units=hidden_dim, activation=tf.nn.relu)
    
    def call(self, input_features):
        #dropped = self.dropout_layer(input_features)
        #activation = self.hidden_layer(dropped)
        activation = self.hidden_layer(input_features)
        return activation

In [17]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, hidden_dim, original_dim):
        super(Decoder, self).__init__()
        self.output_layer = tf.keras.layers.Dense(units=original_dim, activation=tf.nn.relu)
  
    def call(self, encoded):
        activation = self.output_layer(encoded)
        return activation

In [18]:
class Autoencoder(tf.keras.Model):
    def __init__(self, hidden_dim, original_dim):
        super(Autoencoder, self).__init__()
        self.loss = []
        self.encoder = Encoder(hidden_dim=hidden_dim)
        self.decoder = Decoder(hidden_dim=hidden_dim, original_dim=original_dim)

    def call(self, input_features):
        encoded = self.encoder(input_features)
        reconstructed = self.decoder(encoded)
        return reconstructed

In [None]:
model = Autoencoder(hidden_dim=hidden_dim, original_dim=original_dim)

model.compile(loss='mse', optimizer='adam')

loss = model.fit(train_x, train_y, epochs=max_epochs, batch_size=batch_size)
# noisy data를 따로 만들지 않고 dropout을 시켜도록 수정

**막힌 부분: input을 어떻게 만들어야 하는가? 각 row를 train 하나하나로 넣으려면?  
그리고 그렇게 하면 각 input이 (1, 638336) sparse vector인데, train 시간이 너무 오래 걸리지 않을까?**

# DAE codes

**질문: class 잘 쓸 줄 모름...__init___은 무슨 함수이고, 여기서 self와 conf은 무엇인가?**

In [40]:
class DAE_tied():
    def __init__(self, conf):
        self.save_dir = conf.save

        self.n_batch = conf.batch
        self.n_input = conf.n_input
        self.n_hidden = conf.hidden
        self.learning_rate = conf.lr
        self.reg_lambda = conf.reg_lambda

        self.x_positions = tf.placeholder(dtype=tf.int64,shape=[None,2])
        self.x_ones = tf.placeholder(dtype=tf.float32)

        self.y_positions = tf.placeholder(dtype=tf.int64,shape=[None,2])
        self.y_ones = tf.placeholder(dtype=tf.float32)
        
        self.keep_prob = tf.placeholder(tf.float32, shape=[])
        self.input_keep_prob = tf.placeholder(tf.float32, shape=[])

        with tf.device("/cpu:0"):
            x_sparse = tf.SparseTensor(indices=self.x_positions,
                                       values=self.x_ones,dense_shape=[self.n_batch,self.n_input])
            self.x = tf.sparse_tensor_to_dense(x_sparse, validate_indices=False)
            y_sparse = tf.SparseTensor(indices=self.y_positions,
                                       values=self.y_ones,dense_shape=[self.n_batch,self.n_input])
            self.y = tf.sparse_tensor_to_dense(y_sparse, validate_indices=False)

        x_dropout = tf.nn.dropout(self.x, keep_prob=self.input_keep_prob)  # dropout 찾아보기
        self.reduce_sum = tf.reduce_sum(x_dropout, 1, keepdims=True)
        self.x_dropout = tf.divide(x_dropout, self.reduce_sum + 1e-10)

        self.y_pred = None
        self.cost = None
        self.optimizer = None
        self.init_op = None

        self.weights = {}
        self.biases = {}
        self.d_params = []

    def init_weight(self):
        self.weights['encoder_h'] = tf.get_variable("encoder_h", shape=[self.n_input, self.n_hidden],
                                                    initializer=tf.contrib.layers.xavier_initializer())
        self.biases['encoder_b'] = tf.get_variable(name="encoder_b", shape=[self.n_hidden],
                                                   initializer=tf.zeros_initializer())
        self.biases['decoder_b'] = tf.get_variable(name="decoder_b", shape=[self.n_input],
                                                   initializer=tf.zeros_initializer())
        self.d_params = [self.weights['encoder_h'], self.weights['encoder_h'],
                         self.biases['encoder_b'], self.biases['decoder_b']]

    # Building the encoder
    def encoder(self, x):
        # Encoder Hidden layer with sigmoid activation #1         
        layer = tf.add(tf.matmul(x, self.weights['encoder_h']), self.biases['encoder_b'])
        layer = tf.nn.sigmoid(layer)
        layer = tf.nn.dropout(layer, self.keep_prob)

        return layer

    # Building the decoder
    def decoder(self, x):
        # Decoder Hidden layer with sigmoid activation #1
        layer = tf.nn.sigmoid(tf.add(tf.matmul(x, tf.transpose(self.weights['encoder_h'])),
                                   self.biases['decoder_b']))
        return layer

    def l2_loss(self):
        l2 = tf.nn.l2_loss(self.weights['encoder_h']) + tf.nn.l2_loss(self.biases['decoder_b']) + \
             tf.nn.l2_loss(self.biases['encoder_b'])
        return l2

    def fit(self):
        # Construct model
        with tf.device("/cpu:0"):  #CPU
            self.init_weight()

        encoder_op = self.encoder(self.x_dropout)
        with tf.device("/gpu:1"):  #GPU1
            self.y_pred = self.decoder(encoder_op)

        with tf.device("/cpu:0"):  #CPU
            l2 = self.l2_loss()
            
        # Define loss and optimizer, minimize the squared error
        with tf.device("/gpu:1"): ##SHOULD BE GPU1
            L = -tf.reduce_sum(self.y*tf.log(self.y_pred+1e-10) + 
                               0.55*(1 - self.y)* tf.log(1 - self.y_pred+1e-10),axis = 1)
            self.cost = tf.reduce_mean(L) + self.reg_lambda * l2

        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.cost)

        # Initialize the variables (i.e. assign their default value)
        self.init_op = tf.global_variables_initializer()
        
    def save_model(self, sess):
        param = sess.run(self.d_params)
        output = open(self.save_dir, 'wb')
        pickle.dump(param, output)
        output.close()

In [41]:
class DAE(DAE_tied):
    def __init__(self, conf):
        DAE_tied.__init__(self, conf)
        self.initval_dir = conf.initval

    def init_weight(self):
        if self.initval_dir == 'NULL':
            self.weights['encoder_h'] = tf.get_variable("encoder_h", shape=[self.n_input, self.n_hidden],
                                                        initializer=tf.contrib.layers.xavier_initializer())
            self.weights['decoder_h'] = tf.get_variable("decoder_h", shape=[self.n_input, self.n_hidden],
                                                        initializer=tf.contrib.layers.xavier_initializer())
            self.biases['encoder_b'] = tf.get_variable(name="encoder_b", shape=[self.n_hidden],
                                                       initializer=tf.zeros_initializer())
            self.biases['decoder_b'] = tf.get_variable(name="decoder_b", shape=[self.n_input],
                                                       initializer=tf.zeros_initializer())
        else:
            with open(self.initval_dir, 'rb') as f:
                emb = pickle.load(f)
            self.weights['encoder_h'] = tf.get_variable("encoder_h", initializer=tf.constant(emb[0]))
            self.weights['decoder_h'] = tf.get_variable("decoder_h", initializer=tf.constant(emb[1]))
            self.biases['encoder_b'] = tf.get_variable(name="encoder_b", initializer=tf.constant(emb[2]))
            self.biases['decoder_b'] = tf.get_variable(name="decoder_b", initializer=tf.constant(emb[3]))

        self.d_params = [self.weights['encoder_h'], self.weights['decoder_h'],
                         self.biases['encoder_b'], self.biases['decoder_b']]

    # Building the decoder
    def decoder(self, x):
        # Decoder Hidden layer with sigmoid activation #1
        layer = tf.nn.sigmoid(tf.add(tf.matmul(x, tf.transpose(self.weights['decoder_h'])),
                                     self.biases['decoder_b']))
        return layer

    def l2_loss(self):
        l2 = tf.nn.l2_loss(self.weights['encoder_h']) + tf.nn.l2_loss(self.biases['decoder_b']) + \
             tf.nn.l2_loss(self.biases['encoder_b']) + tf.nn.l2_loss(self.weights['decoder_h'])
        return l2