In [0]:
# Mount Google Drive inside the Colab VM at /content/drive.
# Running this prompts for an authorization code (see the cell output).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
cd drive/My Drive/Recsys-2019

/content/drive/My Drive/Recsys-2019


In [0]:
import argparse
import logging
import numpy as np
from time import time

# Root logging setup: INFO and above, each record prefixed with time and level.
# No filename is given, so records go to the default stream handler.
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
    # filename='out.log',
)
logger = logging.getLogger(__name__)

In [0]:
import pickle  
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    Args:
        obj: Any picklable Python object.
        name: Destination path without the ``.pkl`` extension.

    The highest pickle protocol available to the running interpreter is used.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)


def save_we(vocab, weights, domain):
    """Pickle a word -> embedding-vector mapping for one domain.

    Args:
        vocab: Iterable of words.
        weights: Iterable of embedding vectors, paired with ``vocab`` position by
            position (pairing stops at the shorter of the two).
        domain: ``'amazon_electronics'`` saves under ``data/word_emb_lt``; any other
            value saves under ``data/word_emb_res``.
    """
    word_emb = dict(zip(vocab, weights))
    target = 'data/word_emb_lt' if domain == 'amazon_electronics' else 'data/word_emb_res'
    save_obj(word_emb, target)

In [0]:
import codecs
import operator
import numpy as np
import re

# Matches a token that is entirely an optionally signed integer or decimal
# (e.g. "42", "-3.5", "7."). Written as a raw string because "\." in a plain
# literal is an invalid escape sequence.
num_regex = re.compile(r'^[+-]?[0-9]+\.?[0-9]*$')

def create_vocab(domain, maxlen=0, vocab_size=0):
    """Build a word -> index vocabulary from ``data/doc_level/<domain>_text.txt``.

    Lines are split on whitespace. Tokens matching ``num_regex`` are not counted.
    Indices 0, 1 and 2 are reserved for ``<pad>``, ``<unk>`` and ``<num>``; the
    remaining words are numbered from 3 upwards in order of decreasing frequency
    (ties keep first-seen order).

    Args:
        domain: Dataset name used to locate the text file.
        maxlen: If > 0, lines with more than ``maxlen`` tokens are ignored.
        vocab_size: If > 0, only the ``vocab_size`` most frequent words are kept
            (in addition to the three reserved entries); otherwise all are kept.

    Returns:
        dict mapping each word to its integer index.
    """
    print('Creating vocab ...')

    f = 'data/doc_level/%s_text.txt'%(domain)

    total_words = 0
    word_freqs = {}

    # The context manager closes the file even if reading or decoding fails midway.
    with codecs.open(f, 'r', 'utf-8') as fin:
        for line in fin:
            words = line.split()
            if maxlen > 0 and len(words) > maxlen:
                continue

            for w in words:
                if not num_regex.match(w):
                    word_freqs[w] = word_freqs.get(w, 0) + 1
                    total_words += 1

    unique_words = len(word_freqs)

    print ('  %i total words, %i unique words' % (total_words, unique_words))
    sorted_word_freqs = sorted(word_freqs.items(), key=operator.itemgetter(1), reverse=True)

    vocab = {'<pad>':0, '<unk>':1, '<num>':2}
    kept_words = sorted_word_freqs[:vocab_size] if vocab_size > 0 else sorted_word_freqs
    # Regular words start right after the reserved indices.
    for index, (word, _) in enumerate(kept_words, start=len(vocab)):
        vocab[word] = index
    if vocab_size > 0:
        print (' keep the top %i words' % vocab_size)

    #Write vocab to a txt file
    # vocab_file = codecs.open(domain+'_vocab', mode='w', encoding='utf8')
    # sorted_vocab = sorted(vocab.items(), key=operator.itemgetter(1))
    # for word, index in sorted_vocab:
    #     vocab_file.write(word+'\t'+str(index)+'\n')
    # vocab_file.close()

    return vocab


def create_data(vocab, text_path, label_path, domain, skip_top, skip_len, replace_non_vocab):
    """Encode reviews as vocabulary-index lists and map their scores to class ids.

    The text and label files are read in lockstep, one review and one numeric
    score per line.

    Args:
        vocab: Word -> index mapping; must contain ``'<num>'`` and, when
            ``replace_non_vocab`` is truthy, ``'<unk>'``.
        text_path: UTF-8 file with one whitespace-tokenised review per line.
        label_path: UTF-8 file with one numeric score per line.
        domain: Dataset name; only printed with the class counts.
        skip_top: If > 0, in-vocabulary words with index < ``skip_top + 3`` are dropped.
        skip_len: If > 0, reviews with more than ``skip_len`` tokens are skipped
            together with their label.
        replace_non_vocab: If truthy, out-of-vocabulary words become ``'<unk>'``;
            otherwise they are dropped.

    Returns:
        Tuple ``(data, label, max_len)``: array of index lists, array of class ids
        (0 = score > 3, 1 = score < 3, 2 = otherwise) and the length of the longest
        encoded review.
    """
    data = []
    label = [] # {pos: 0, neg: 1, neu: 2}
    num_hit, unk_hit, total = 0., 0., 0.
    pos_count, neg_count, neu_count = 0, 0, 0
    max_len = 0

    # Both handles are released even if a line fails to decode or a score fails to parse.
    with codecs.open(text_path, 'r', 'utf-8') as f, codecs.open(label_path, 'r', 'utf-8') as f_l:
        for line, score in zip(f, f_l):
            word_indices = []
            words = line.split()
            if skip_len > 0 and len(words) > skip_len:
                continue

            score = float(score.strip())
            if score < 3:
                neg_count += 1
                label.append(1)
            elif score > 3:
                pos_count += 1
                label.append(0)
            else:
                neu_count += 1
                label.append(2)

            for word in words:
                total += 1
                if num_regex.match(word):
                    word_indices.append(vocab['<num>'])
                    num_hit += 1
                elif word in vocab:
                    word_ind = vocab[word]
                    # Keep the word unless it falls in the skipped low-index range.
                    if not (skip_top > 0 and word_ind < skip_top + 3):
                        word_indices.append(word_ind)
                else:
                    if replace_non_vocab:
                        word_indices.append(vocab['<unk>'])
                    unk_hit += 1

            max_len = max(max_len, len(word_indices))
            data.append(word_indices)

    # Avoid ZeroDivisionError when no token was read (empty file or every line skipped).
    denom = total or 1.
    print('  <num> hit rate: %.2f%%, <unk> hit rate: %.2f%%' % (100*num_hit/denom, 100*unk_hit/denom))

    print (domain)
    print( 'pos count: ', pos_count )
    print( 'neg count: ', neg_count )
    print( 'neu count: ', neu_count )

    # Reviews differ in length. Recent NumPy releases raise on implicit ragged arrays,
    # so ask for an object array of lists explicitly in that case.
    is_ragged = len({len(seq) for seq in data}) > 1
    data_array = np.array(data, dtype=object) if is_ragged else np.array(data)

    return data_array, np.array(label), max_len



def prepare_data(domain, vocab_size, skip_top=0, skip_len=0, replace_non_vocab=1):
    """Create the vocabulary for ``domain`` and encode its document-level data.

    Args:
        domain: ``'amazon_electronics'`` or ``'yelp14'``.
        vocab_size: Forwarded to ``create_vocab`` as its ``vocab_size``.
        skip_top: Forwarded to ``create_data``.
        skip_len: Used as ``create_vocab``'s ``maxlen`` and as ``create_data``'s
            ``skip_len``.
        replace_non_vocab: Forwarded to ``create_data``.

    Returns:
        Tuple ``(vocab, data, label, max_len)``.
    """
    supported_domains = ['amazon_electronics', 'yelp14']
    assert domain in supported_domains

    reviews_file = 'data/doc_level/%s_text.txt'%(domain)
    scores_file = 'data/doc_level/%s_label.txt'%(domain)

    vocab = create_vocab(domain, skip_len, vocab_size)
    encoded, classes, longest = create_data(
        vocab, reviews_file, scores_file, domain, skip_top, skip_len, replace_non_vocab)

    return vocab, encoded, classes, longest


### For doc-level/amazon_electronics

In [0]:
# Build the Amazon Electronics vocabulary (top 10000 words plus the reserved entries)
# and encode its reviews and labels.
vocab_am, data_list_am, label_list_am, overall_maxlen_am = prepare_data('amazon_electronics', 10000)

Creating vocab ...
  3440972 total words, 39122 unique words
 keep the top 10000 words
  <num> hit rate: 1.04%, <unk> hit rate: 1.56%
amazon_electronics
pos count:  10000
neg count:  10000
neu count:  10000


In [0]:
# Inverse lookup: vocabulary index -> word.
idx_words_am = {index: word for word, index in vocab_am.items()}

###  For doc-level/yelp_2014

In [0]:
# Build the Yelp 2014 vocabulary (top 10000 words plus the reserved entries)
# and encode its reviews and labels.
vocab_yelp, data_list_yelp, label_list_yelp, overall_maxlen_yelp = prepare_data('yelp14', 10000)

Creating vocab ...
  3829257 total words, 43607 unique words
 keep the top 10000 words
  <num> hit rate: 0.87%, <unk> hit rate: 2.05%
yelp14
pos count:  10000
neg count:  10000
neu count:  10000


In [0]:
# Inverse lookup: vocabulary index -> word.
idx_words_yelp = {index: word for word, index in vocab_yelp.items()}

### Model

In [0]:
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM, Input
from keras.models import Model

In [0]:
###################################
## Create Model
#
# Sentence classifier: token ids -> 300-d embedding -> LSTM(300) -> dropout -> 3-way softmax.
# NOTE(review): vocab_size comes from vocab_yelp, yet this single `model` object is the
# one trained by both the Amazon and the Yelp training loops further down; running both
# in one session continues from the first loop's weights. Confirm whether the model is
# meant to be rebuilt per domain.

dropout = 0.5    
recurrent_dropout = 0.1    
#vocab_size = len(vocab_am)
vocab_size = len(vocab_yelp)

##### Inputs #####
# Variable-length sequences of integer word indices.
sentence_input = Input(shape=(None,), dtype='int32', name='sentence_input')

# mask_zero=True: index 0 is the '<pad>' entry of the vocabulary and is treated as padding.
word_emb = Embedding(vocab_size, 300, mask_zero=True, name='word_emb')
output = word_emb(sentence_input)

print ('use a rnn layer')
# return_sequences=False: only the final LSTM output is passed on.
output = LSTM(300, return_sequences=False, dropout=dropout, recurrent_dropout=recurrent_dropout, name='lstm')(output)

print ('use 0.5 dropout layer')
# NOTE(review): rate is hard-coded here rather than taken from `dropout`; they agree today.
output = Dropout(0.5)(output)

# Three outputs, one per sentiment class.
densed = Dense(3, name='dense')(output)
probs = Activation('softmax')(densed)
model = Model(inputs=[sentence_input], outputs=probs)

Instructions for updating:
Colocations handled automatically by placer.


use a rnn layer


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


use 0.5 dropout layer


In [0]:
import keras.optimizers as opt

In [0]:
# RMSprop with gradient-norm clipping at 10.
# clipvalue is deliberately not passed: a literal 0 is ignored only by optimizer
# implementations that test `clipvalue > 0`; others would clamp every gradient to zero.
optimizer = opt.RMSprop(lr=0.001, rho=0.9, epsilon=1e-06, clipnorm=10)
# Labels are one-hot encoded, hence categorical cross-entropy / categorical accuracy.
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [0]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sentence_input (InputLayer)  (None, None)              0         
_________________________________________________________________
word_emb (Embedding)         (None, None, 300)         3000900   
_________________________________________________________________
lstm (LSTM)                  (None, 300)               721200    
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense (Dense)                (None, 3)                 903       
_________________________________________________________________
activation_1 (Activation)    (None, 3)                 0         
Total params: 3,723,003
Trainable params: 3,723,003
Non-trainable params: 0
_________________________________________________________________


In [0]:
def batch_generator(data1, data2, batch_size):
    """Yield random mini-batches forever.

    Each batch draws ``batch_size`` positions with ``np.random.choice`` (sampling
    with replacement), so there is no notion of an epoch inside the generator.

    Args:
        data1: Array of sequences, indexable by an integer index array.
        data2: Array of targets aligned with ``data1``.
        batch_size: Number of examples per yielded batch.

    Yields:
        Tuple ``(x, y)`` where ``x`` is the selected sequences padded by
        ``sequence.pad_sequences`` to the longest sequence in the batch and ``y``
        holds the matching rows of ``data2``.
    """
    n_examples = len(data1)
    while True:
        picked = np.random.choice(n_examples, batch_size)
        batch_seqs, batch_targets = data1[picked], data2[picked]

        longest = max(len(seq) for seq in batch_seqs)
        yield sequence.pad_sequences(batch_seqs, longest), batch_targets

### Training and validation set

In [0]:
from tqdm import tqdm

### For amazon_electronics

In [0]:
from keras.preprocessing import sequence
from keras.utils.np_utils import to_categorical


In [0]:
# One shared random permutation keeps reviews and labels aligned after shuffling.
rand_am = np.arange(len(data_list_am))
np.random.shuffle(rand_am)

# One-hot encode only while the labels are still 1-D class ids. Without this guard,
# re-running the cell would feed an already one-hot matrix back into to_categorical.
# num_classes is pinned to the 3 outputs of the model's dense layer.
if np.ndim(label_list_am) == 1:
    label_list_am = to_categorical(label_list_am, num_classes=3)

data_list_am = data_list_am[rand_am]
label_list_am = label_list_am[rand_am]
data_size_am = len(data_list_am)

# The first 1000 shuffled examples form the dev set; the rest are used for training.
dev_x_am = data_list_am[0:1000]
dev_y_am = label_list_am[0:1000]
train_x_am = data_list_am[1000:]
train_y_am = label_list_am[1000:]

# The dev set is padded once, to its own longest sequence.
maxlen_am = np.max([len(d) for d in dev_x_am])
dev_x_am = sequence.pad_sequences(dev_x_am, maxlen_am)

# Words ordered by vocabulary index, so position i matches embedding row i.
import operator
vocab_list_am = [x for (x, _) in sorted(vocab_am.items(), key=operator.itemgetter(1))]

In [0]:
# Endless generator of random, padded training batches of 50 examples.
train_gen_am = batch_generator(train_x_am, train_y_am, batch_size=50)
# Whole number of batches per epoch. The training loop runs int(batches_per_ep_am)
# batches and divides the running loss/metric by this value, so a fractional count
# would bias the reported epoch averages.
batches_per_ep_am = len(train_x_am) // 50

In [0]:
# Train for 10 epochs on Amazon Electronics. After each epoch, evaluate on the dev
# set and pickle the embedding, LSTM and dense weights whenever dev accuracy improves.
domain = 'amazon_electronics'
best_acc = 0
best_loss = 100
for ii in range(10):
    t0 = time()
    loss, metric = 0., 0.

    for batch_idx in tqdm(range(int(batches_per_ep_am))):
        batch_x, batch_y = next(train_gen_am)
        loss_, metric_ = model.train_on_batch([batch_x], batch_y)
        # Running averages over the epoch.
        loss += loss_ / batches_per_ep_am
        metric += metric_ / batches_per_ep_am

    tr_time = time() - t0

    dev_loss, dev_metric = model.evaluate([dev_x_am], dev_y_am, batch_size=50)

    logger.info('Epoch %d, train: %is', ii, tr_time)
    logger.info('[Train] loss: %.4f, metric: %.4f', loss, metric)
    logger.info('[Dev] loss: %.4f, metric: %.4f', dev_loss, dev_metric)

    # Nothing to save unless this epoch beats the best dev accuracy so far.
    if not dev_metric > best_acc:
        continue

    best_acc = dev_metric
    word_emb = model.get_layer('word_emb').get_weights()[0]
    lstm_weights = model.get_layer('lstm').get_weights()
    dense_weights = model.get_layer('dense').get_weights()

    save_we(vocab_list_am, word_emb, domain)

    lstm_path, dense_path = (
        ('data/lstm_weights_lt', 'data/dense_weights_lt')
        if domain == 'amazon_electronics'
        else ('data/lstm_weights_res', 'data/dense_weights_res')
    )
    save_obj(lstm_weights, lstm_path)
    save_obj(dense_weights, dense_path)

    print ('------- Saved Weights -------')

Instructions for updating:
Use tf.cast instead.
100%|██████████| 580/580 [15:09<00:00,  1.66s/it]




2019-04-06 21:23:33,334 INFO Epoch 0, train: 909s
2019-04-06 21:23:33,340 INFO [Train] loss: 0.8933, metric: 0.5795
2019-04-06 21:23:33,342 INFO [Dev] loss: 0.8037, metric: 0.6110
  0%|          | 0/580 [00:00<?, ?it/s]

------- Saved Weights -------


100%|██████████| 580/580 [14:59<00:00,  1.51s/it]




2019-04-06 21:38:48,160 INFO Epoch 1, train: 899s
2019-04-06 21:38:48,162 INFO [Train] loss: 0.7343, metric: 0.6819
2019-04-06 21:38:48,163 INFO [Dev] loss: 0.7622, metric: 0.6660
  0%|          | 0/580 [00:00<?, ?it/s]

------- Saved Weights -------


100%|██████████| 580/580 [15:00<00:00,  1.27s/it]




2019-04-06 21:54:02,915 INFO Epoch 2, train: 900s
2019-04-06 21:54:02,916 INFO [Train] loss: 0.6735, metric: 0.7171
2019-04-06 21:54:02,924 INFO [Dev] loss: 0.7839, metric: 0.6330
 93%|█████████▎| 540/580 [14:06<01:08,  1.72s/it]

### For Yelp_2014

In [0]:
# One shared random permutation keeps reviews and labels aligned after shuffling.
rand_yelp = np.arange(len(data_list_yelp))
np.random.shuffle(rand_yelp)

# One-hot encode only while the labels are still 1-D class ids. Without this guard,
# re-running the cell would feed an already one-hot matrix back into to_categorical.
# num_classes is pinned to the 3 outputs of the model's dense layer.
if np.ndim(label_list_yelp) == 1:
    label_list_yelp = to_categorical(label_list_yelp, num_classes=3)

data_list_yelp = data_list_yelp[rand_yelp]
label_list_yelp = label_list_yelp[rand_yelp]
data_size_yelp = len(data_list_yelp)

# The first 1000 shuffled examples form the dev set; the rest are used for training.
dev_x_yelp = data_list_yelp[0:1000]
dev_y_yelp = label_list_yelp[0:1000]
train_x_yelp = data_list_yelp[1000:]
train_y_yelp = label_list_yelp[1000:]

# The dev set is padded once, to its own longest sequence.
maxlen_yelp = np.max([len(d) for d in dev_x_yelp])
dev_x_yelp = sequence.pad_sequences(dev_x_yelp, maxlen_yelp)

# Words ordered by vocabulary index, so position i matches embedding row i.
import operator
vocab_list_yelp = [x for (x, _) in sorted(vocab_yelp.items(), key=operator.itemgetter(1))]

In [0]:
# Endless generator of random, padded training batches of 50 examples.
train_gen_yelp = batch_generator(train_x_yelp, train_y_yelp, batch_size=50)
# Whole number of batches per epoch. The training loop runs int(batches_per_ep_yelp)
# batches and divides the running loss/metric by this value, so a fractional count
# would bias the reported epoch averages.
batches_per_ep_yelp = len(train_x_yelp) // 50

In [0]:
# Train for 10 epochs on Yelp 2014. After each epoch, evaluate on the dev set and
# pickle the embedding, LSTM and dense weights whenever dev accuracy improves.
domain = 'yelp14'
best_acc = 0
best_loss = 100
for ii in range(10):
    t0 = time()
    loss, metric = 0., 0.

    for batch_idx in tqdm(range(int(batches_per_ep_yelp))):
        batch_x, batch_y = next(train_gen_yelp)
        loss_, metric_ = model.train_on_batch([batch_x], batch_y)
        # Running averages over the epoch.
        loss += loss_ / batches_per_ep_yelp
        metric += metric_ / batches_per_ep_yelp

    tr_time = time() - t0

    dev_loss, dev_metric = model.evaluate([dev_x_yelp], dev_y_yelp, batch_size=50)

    logger.info('Epoch %d, train: %is', ii, tr_time)
    logger.info('[Train] loss: %.4f, metric: %.4f', loss, metric)
    logger.info('[Dev] loss: %.4f, metric: %.4f', dev_loss, dev_metric)

    # Nothing to save unless this epoch beats the best dev accuracy so far.
    if not dev_metric > best_acc:
        continue

    best_acc = dev_metric
    word_emb = model.get_layer('word_emb').get_weights()[0]
    lstm_weights = model.get_layer('lstm').get_weights()
    dense_weights = model.get_layer('dense').get_weights()

    save_we(vocab_list_yelp, word_emb, domain)

    lstm_path, dense_path = (
        ('data/lstm_weights_lt', 'data/dense_weights_lt')
        if domain == 'amazon_electronics'
        else ('data/lstm_weights_res', 'data/dense_weights_res')
    )
    save_obj(lstm_weights, lstm_path)
    save_obj(dense_weights, dense_path)

    print ('------- Saved Weights -------')

Instructions for updating:
Use tf.cast instead.
100%|██████████| 580/580 [16:29<00:00,  1.49s/it]




2019-04-07 07:46:55,493 INFO Epoch 0, train: 989s
2019-04-07 07:46:55,501 INFO [Train] loss: 0.7760, metric: 0.6606
2019-04-07 07:46:55,503 INFO [Dev] loss: 0.6562, metric: 0.7190
  0%|          | 0/580 [00:00<?, ?it/s]

------- Saved Weights -------


100%|██████████| 580/580 [15:56<00:00,  1.87s/it]




2019-04-07 08:03:06,841 INFO Epoch 1, train: 956s
2019-04-07 08:03:06,845 INFO [Train] loss: 0.6060, metric: 0.7519
2019-04-07 08:03:06,846 INFO [Dev] loss: 0.6630, metric: 0.7050
100%|██████████| 580/580 [16:26<00:00,  1.31s/it]




2019-04-07 08:19:47,893 INFO Epoch 2, train: 986s
2019-04-07 08:19:47,895 INFO [Train] loss: 0.5439, metric: 0.7796
2019-04-07 08:19:47,903 INFO [Dev] loss: 0.6130, metric: 0.7490
  0%|          | 0/580 [00:00<?, ?it/s]

------- Saved Weights -------


100%|██████████| 580/580 [16:15<00:00,  1.67s/it]




2019-04-07 08:36:17,500 INFO Epoch 3, train: 975s
2019-04-07 08:36:17,501 INFO [Train] loss: 0.4892, metric: 0.8074
2019-04-07 08:36:17,507 INFO [Dev] loss: 0.5719, metric: 0.7610
  0%|          | 0/580 [00:00<?, ?it/s]

------- Saved Weights -------


100%|██████████| 580/580 [15:51<00:00,  1.83s/it]




2019-04-07 08:52:24,018 INFO Epoch 4, train: 951s
2019-04-07 08:52:24,020 INFO [Train] loss: 0.4421, metric: 0.8263
2019-04-07 08:52:24,026 INFO [Dev] loss: 0.5914, metric: 0.7570
100%|██████████| 580/580 [16:05<00:00,  1.66s/it]




2019-04-07 09:08:44,404 INFO Epoch 5, train: 965s
2019-04-07 09:08:44,411 INFO [Train] loss: 0.4004, metric: 0.8468
2019-04-07 09:08:44,413 INFO [Dev] loss: 0.5939, metric: 0.7610
100%|██████████| 580/580 [16:00<00:00,  1.47s/it]




2019-04-07 09:24:59,704 INFO Epoch 6, train: 960s
2019-04-07 09:24:59,706 INFO [Train] loss: 0.3674, metric: 0.8598
2019-04-07 09:24:59,707 INFO [Dev] loss: 0.6055, metric: 0.7690
  0%|          | 0/580 [00:00<?, ?it/s]

------- Saved Weights -------


100%|██████████| 580/580 [16:23<00:00,  1.78s/it]




2019-04-07 09:41:38,383 INFO Epoch 7, train: 983s
2019-04-07 09:41:38,385 INFO [Train] loss: 0.3264, metric: 0.8795
2019-04-07 09:41:38,392 INFO [Dev] loss: 0.6629, metric: 0.7430
100%|██████████| 580/580 [16:40<00:00,  1.66s/it]




2019-04-07 09:58:34,159 INFO Epoch 8, train: 1000s
2019-04-07 09:58:34,164 INFO [Train] loss: 0.2954, metric: 0.8916
2019-04-07 09:58:34,166 INFO [Dev] loss: 0.6792, metric: 0.7600
100%|██████████| 580/580 [16:29<00:00,  1.69s/it]




2019-04-07 10:15:19,267 INFO Epoch 9, train: 989s
2019-04-07 10:15:19,268 INFO [Train] loss: 0.2879, metric: 0.8964
2019-04-07 10:15:19,273 INFO [Dev] loss: 0.6725, metric: 0.7650
