In [None]:
from google.colab import drive
from tqdm import tqdm_notebook as tqdm

# Mount Google Drive under /content/gdrive/ (Colab only) so later cells can read
# the project files from it; force_remount=True is passed on every run.
drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


In [None]:
# Make the project folder on Drive the working directory: later cells use
# relative paths (binary_data/*.csv, ./glove.6B.100d.txt).
%cd /content/gdrive/MyDrive/'10701-project'

/content/gdrive/MyDrive/10701-project


In [None]:
import pickle
from collections import defaultdict

import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from keras.layers import Concatenate, Input
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
def get_volcabulary_and_list_words(data):
    """Tokenise every review on whitespace and count word occurrences.

    Args:
        data: Mapping-like object (e.g. a DataFrame) whose ``"text"`` entry
            iterates over review strings.

    Returns:
        A ``(volcabulary, reviews_words)`` tuple. ``volcabulary`` is a
        ``defaultdict(int)`` mapping each word to its total number of
        occurrences; ``reviews_words`` is a list with one token list per
        review, in input order.
    """
    # Plain str.split() tokenisation: no lower-casing or stop-word removal.
    reviews_words = [text.split() for text in data["text"]]

    volcabulary = defaultdict(int)
    for tokens in reviews_words:
        for token in tokens:
            volcabulary[token] += 1

    return volcabulary, reviews_words

def get_reviews_word_index(reviews_words, volcabulary, max_words, max_length, word2index=None):
    """Encode tokenised reviews as fixed-length integer id sequences.

    Every review becomes ``[start, id_1, id_2, ...]`` and is then padded with
    0 / truncated (both at the end) to exactly ``max_length`` entries.

    Reads the module-level constants ``start``, ``oov`` and ``index_from``.

    Args:
        reviews_words: List of token lists, one per review.
        volcabulary: Mapping word -> occurrence count. Used to build the
            word index when ``word2index`` is not supplied.
        max_words: Vocabulary cap. Only words whose index is below this value
            get their own id; all others are encoded as ``oov``. This keeps
            every id below ``max_words + index_from``.
        max_length: Length of each returned sequence.
        word2index: Optional mapping word -> 0-based integer index. When
            ``None``, the ``max_words`` most frequent words of
            ``volcabulary`` are ranked by descending count and the rank is
            used as the index.

    Returns:
        The result of ``sequence.pad_sequences``: one row of ``max_length``
        ids per review.
    """
    if word2index is None:
        # Rank words by frequency (most frequent first) and keep the top max_words.
        ranked = sorted(volcabulary.items(), key=lambda item: item[1], reverse=True)[:max_words]
        word2index = {word: rank for rank, (word, _count) in enumerate(ranked)}

    def encode(word):
        index = word2index.get(word)
        # Unknown words and words beyond the vocabulary cap share the OOV id.
        if index is None or index >= max_words:
            return oov
        # Shift past the reserved ids (padding, start, oov).
        return index + index_from

    reviews_words_index = [[start] + [encode(word) for word in review] for review in reviews_words]
    # padding with 0, each review has max_length now.
    return sequence.pad_sequences(reviews_words_index, maxlen=max_length, padding='post', truncating='post')

In [None]:
# Text preprocessing: vocabulary cap and padded/truncated review length (in tokens).
max_words, max_length = 5000, 50

# Reserved token ids; real word indices are shifted up by `index_from`.
# padding = 0
start, oov, index_from = 1, 2, 3

# model training parameters
batch_size, nb_epoch = 32, 5
embedding_dims = 100               # width of each word vector
nb_filter, filter_length = 256, 3  # Conv1D filter count / kernel size
hidden_dims = 256                  # dense layer applied after concatenation
meta_hidden_size = 512             # dense layer of the meta encoder
num_classes = 2 # Change to 5 for 5-star

In [None]:
# Load the three splits and build one corpus so the vocabulary and the encoded
# sequences cover train, validation and test in that order.
train_data = pd.read_csv('binary_data/train.csv')
val_data = pd.read_csv('binary_data/val.csv')
test_data = pd.read_csv('binary_data/test.csv')
data = pd.concat([train_data, val_data, test_data])
print('get volcabulary...')
volcabulary, reviews_words = get_volcabulary_and_list_words(data)
print('get reviews_words_index...')

# word -> frequency rank (0 = most frequent) for the max_words most common words.
# This single mapping drives both the token ids and the embedding rows, so the
# two can never disagree.
top_words = sorted(volcabulary.items(), key=lambda item: item[1], reverse=True)[:max_words]
word2index = {word: rank for rank, (word, _count) in enumerate(top_words)}

# Parse the pre-trained GloVe file: each line is "<word> <v1> ... <vN>".
word_embed_index = {}
with open('./glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word_embed_index[values[0]] = np.asarray(values[1:], dtype='float32')
print('Loaded %s word vectors.' % len(word_embed_index))

# create a weight matrix for words in training docs
# Row (rank + index_from) holds the vector of the word encoded with that id.
# Rows of the reserved ids (below index_from) and of words without a
# pre-trained vector stay all-zero.
word_embed_matrix = np.zeros((max_words + index_from, embedding_dims))
for word, rank in word2index.items():
    word_embed_vector = word_embed_index.get(word)
    if word_embed_vector is not None:
        word_embed_matrix[rank + index_from] = word_embed_vector

# Pass the integer word index (not the word -> vector dict) to the encoder.
reviews_words_index = get_reviews_word_index(reviews_words, volcabulary, max_words, max_length, word2index)

labels = data["stars"].values

In [None]:
# Split the concatenated arrays back into train / validation / test.
# The split sizes are captured BEFORE `train_data` and `test_data` are rebound
# to [text_ids, dense_feats] lists; after rebinding, len(train_data) would be 2.
# NOTE(review): this cell is not re-runnable without re-running the loading
# cell, because it overwrites the `train_data` / `test_data` DataFrames.
# NOTE(review): assumes the "dense_feats" column already holds numeric feature
# vectors usable as model input — TODO confirm how it is stored in the CSV.
n_train, n_val = len(train_data), len(val_data)
val_end = n_train + n_val
sentiment_scores = data["sentiment_score"].values

train_data = [reviews_words_index[:n_train], train_data["dense_feats"].values]
valid_data = [reviews_words_index[n_train:val_end], val_data["dense_feats"].values]
test_data = [reviews_words_index[val_end:], test_data["dense_feats"].values]

# Each label list is [star labels, sentiment scores], one entry per model output.
train_labels = [labels[:n_train], sentiment_scores[:n_train]]
valid_labels = [labels[n_train:val_end], sentiment_scores[n_train:val_end]]
test_labels = [labels[val_end:], sentiment_scores[val_end:]]

In [None]:
# Two-input / two-output network: a CNN encoder over the review tokens and a
# dense encoder over the metadata features are concatenated and feed a star
# classification head plus a sentiment regression head.
text_input = Input(shape=(max_length,), name='text_input')
# NOTE(review): `dense_size` is not assigned in any visible cell; it has to be
# defined before this cell runs and must match the width of the second model
# input — TODO confirm and add it to the configuration cell.
meta_input = Input(shape=(dense_size,), name='meta_input')
# The initial weights must have shape (max_words + index_from, embedding_dims).
text_embed = Embedding(max_words + index_from, embedding_dims,
                       input_length=max_length, weights=[word_embed_matrix])(text_input)
# Text encoder
text_conv = Convolution1D(filters=nb_filter,
                          kernel_size=filter_length,
                          padding='valid',
                          activation='relu',
                          strides=1)(text_embed)
text_pool = MaxPooling1D(pool_size=2)(text_conv)
text_flatten = Flatten()(text_pool)
# Meta encoder
meta_dense = Dense(meta_hidden_size, activation="relu")(meta_input)
meta_repr = Dropout(0.25)(meta_dense)

# Concatenate: instantiate the layer first, then call it on the list of tensors.
concat = Concatenate()([meta_repr, text_flatten])
concat_dense = Dense(hidden_dims, activation="relu")(concat)
concat_repr = Dropout(0.25)(concat_dense)
# MTL heads
star_classify = Dense(num_classes, activation='softmax', name="star_classify")(concat_repr)
sent_classify = Dense(1, name="sent_classify")(concat_repr)
model = Model([text_input, meta_input], [star_classify, sent_classify])

In [None]:
# Multi-task objective: classification loss on the star head (weight 0.9) plus
# mean squared error on the sentiment head (weight 0.1).
# The star head is a `num_classes`-way softmax and its targets are a 1-D label
# vector, so the matching loss is sparse categorical cross-entropy. With
# 'binary_crossentropy' the single label is broadcast over both softmax
# outputs, which makes the loss independent of the label.
# NOTE(review): assumes the "stars" labels are integer class ids in
# [0, num_classes) — TODO confirm against the CSV files.
model.compile(loss={'star_classify': 'sparse_categorical_crossentropy', 'sent_classify': 'mean_squared_error'},
              loss_weights={'star_classify': 0.9, 'sent_classify': 0.1},
              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
              metrics={'star_classify': 'accuracy'})

model.fit(train_data, train_labels, batch_size=batch_size,
          epochs=nb_epoch,
          validation_data=(valid_data, valid_labels))

In [None]:
# The model has two outputs, so predict() returns [star probabilities,
# sentiment predictions] and test_labels is [star labels, sentiment scores].
# The classification metrics below are computed for the star head only.
preds = model.predict(test_data)
star_probs = preds[0]
star_true = test_labels[0]
# Turn the per-class probabilities into predicted class ids.
star_pred = np.argmax(star_probs, axis=1)

print(confusion_matrix(star_true, star_pred))
print("Score:",round(accuracy_score(star_true, star_pred)*100,2))
print("Classification Report:",classification_report(star_true, star_pred))