In [142]:
import numpy as np
import string
import pandas as pd
import nltk
import keras
import io
# Fetch the NLTK resources used below: the 'punkt' tokenizer models (needed by
# nltk.word_tokenize) and the 'stopwords' corpus (read via stopwords.words).
nltk.download('punkt')
nltk.download('stopwords')
from sklearn import random_projection
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords

from google.colab import auth
from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from keras.models import Sequential, Input, Model
from keras.layers import Embedding, Dense, Dropout, LSTM, Concatenate, CuDNNLSTM, Conv2D, Reshape, MaxPool2D, Flatten
from keras.optimizers import SGD
from keras import metrics
import datetime

# Tokens dropped by tokenize(): the English stop words plus every single
# character of string.punctuation.
stop_words = set(stopwords.words('english') + list(string.punctuation))

# Authenticate the Colab user and build a Google Drive v3 API client.
auth.authenticate_user()
drive_service = build('drive', 'v3')

# Drive folder whose files are copied into the current working directory.
folder_id = '1pQymPM0o_hYssoMvmLonRi5SNpUeOp0j'
params = {}  # NOTE(review): not referenced by any visible cell.
# A single list request for the entries whose parent is the folder; any
# further result pages are not fetched.
children = drive_service.files().list(q="'" + folder_id + "' in parents").execute()
for child in children.get('files', []):
  file_id = child['id']
  request = drive_service.files().get_media(fileId=file_id)
  # The whole file is buffered in memory while it is downloaded chunk by chunk.
  fh = io.BytesIO()
  downloader = MediaIoBaseDownload(fh, request)
  done = False
  while done is False:
    status, done = downloader.next_chunk()
    print("Download %d%%." % int(status.progress()*100))
  # Write the buffered bytes to a local file named after the Drive entry.
  with open(child['name'], 'wb') as x:
    x.write(fh.getvalue())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Download 100%.
Download 100%.
Download 100%.
Download 100%.


In [0]:
def tokenize(text):
    '''
    Split a document into lower-cased word tokens and drop the noise tokens.

    :param text: a doc with multiple sentences, type: str
    return the kept tokens in their original order, type: list

    Tokens are produced by nltk.word_tokenize and lower-cased. A token is
    discarded when it is in the notebook-level stop_words set or when
    str.isnumeric() is true for it.
    https://textminingonline.com/dive-into-nltk-part-ii-sentence-tokenize-and-word-tokenize
    e.g. (with stop_words = English stop words + punctuation)
    Input: 'It is a nice day. I am happy.'
    Output: ['nice', 'day', 'happy']
    '''
    lowered = (token.lower() for token in nltk.word_tokenize(text))
    return [token for token in lowered
            if not (token in stop_words or token.isnumeric())]

In [0]:
def get_sequence(data, seq_length, vocab_dict):
    '''
    Encode tokenized documents as a fixed-width matrix of word indices.

    :param data: tokenized documents, one sequence of words per document
    :param seq_length: number of columns of the result, type: int
    :param vocab_dict: a dict from words to indices, type: dict
    return an integer array of shape (len(data), seq_length). Documents longer
    than seq_length are truncated, positions past the end of a shorter
    document keep the value 0, and words missing from vocab_dict become 1.
    '''
    unknown_idx = 1  # index used for words that are not in vocab_dict
    encoded = np.zeros((len(data), seq_length), dtype=int)
    for row, words in enumerate(data):
        # zip stops at whichever ends first: the document or the row width
        for col, word in zip(range(seq_length), words):
            encoded[row, col] = vocab_dict.get(word, unknown_idx)
    return encoded


In [0]:
def read_data(file_name, input_length, vocab=None):
    """
    Load a review CSV and convert it into the arrays used by the model.

    https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html

    :param file_name: path of a CSV file providing the columns 'review_id',
        'text', 'stars', 'cool', 'funny' and 'useful'
    :param input_length: number of word indices kept per review, type: int
    :param vocab: collection of known words; when None it is built from the
        tokens of this file
    return a tuple of
        - the 'review_id' column,
        - the star ratings as integers shifted to start at 0,
        - the word-index matrix of shape (n_reviews, input_length),
        - the vocabulary (the one passed in, or the newly built set),
        - a (n_reviews, 3) array with the min-max scaled 'cool', 'funny' and
          'useful' columns, in that order.
    """
    df = pd.read_csv(file_name, engine='python')
    df['words'] = df['text'].apply(tokenize)

    if vocab is None:
        # Iterate the column directly: df.iloc[i] would build a row Series
        # for every single review.
        vocab = set()
        for words in df['words']:
            vocab.update(words)

    # Index 0 is the padding signal and 1 the unknown word; real words start
    # at 2. Words are numbered in sorted order because the iteration order of
    # a set of strings changes between interpreter runs (hash randomisation),
    # which would make the word -> index mapping irreproducible.
    vocab_dict = {'<pad>': 0, '<unk>': 1}
    for word_idx, word in enumerate(sorted(vocab), start=2):
        vocab_dict[word] = word_idx

    data_matrix = get_sequence(df['words'], input_length, vocab_dict)
    stars = df['stars'].apply(int) - 1

    # NOTE(review): each column is scaled with the min/max of THIS file only,
    # so different files are scaled independently of each other.
    scaled_columns = [
        np.array(preprocessing.minmax_scale(df[column].tolist()))
        for column in ('cool', 'funny', 'useful')
    ]
    features = np.vstack(scaled_columns).T

    return df['review_id'], stars, data_matrix, vocab, features

In [0]:
# Sequence, embedding and training settings
input_length = 300  # word indices kept per review (width of the model input)
embedding_size = 300  # output dimension of the Embedding layer
hidden_size = 100  # units of the recurrent (LSTM) layer
batch_size = 100  # batch size used for fit and evaluate
dropout_rate = 0.5  # rate passed to the Dropout layers
learning_rate = 0.1  # learning rate of the SGD optimizer
total_epoch = 10  # number of training epochs

# Convolution branch settings
filters = 150  # filters per Conv2D layer
padding = 'valid'  # Conv2D padding mode
activation = 'relu'  # Conv2D activation
strides = 1  # Conv2D stride, applied along both axes
pool_size = 2  # NOTE(review): not referenced by any visible cell
kernel_sizes = [3, 4, 5]  # kernel heights; one Conv2D branch is built per entry

In [0]:
# Load training data and vocab
# No vocab is passed here, so read_data builds the vocabulary from the
# training text.
train_id_list, train_data_label, train_data_matrix, vocab, train_features = read_data("train.csv", input_length)
# Number of classes, derived from the largest zero-based label.
K = max(train_data_label)+1  # labels begin with 0

In [0]:
# Validation split, encoded with the training vocabulary. read_data returns
# the vocab it was given, so `vocab` still refers to the training vocabulary.
valid_id_list, valid_data_label, valid_data_matrix, vocab, valid_features = read_data("valid.csv", input_length, vocab=vocab)

In [0]:
# Test split, encoded with the training vocabulary; the returned labels and
# vocab are discarded.
test_id_list, _, test_data_matrix, _, test_features = read_data("test.csv", input_length, vocab=vocab)

In [0]:
# Report the vocabulary size and the size/shape of every loaded split.
for _label, _value in (
    ("Vocabulary Size:", len(vocab)),
    ("Training Set Size:", len(train_id_list)),
    ("Validation Set Size:", len(valid_id_list)),
    ("Test Set Size:", len(test_id_list)),
    ("Training Set Shape:", train_data_matrix.shape),
    ("Validation Set Shape:", valid_data_matrix.shape),
    ("Testing Set Shape:", test_data_matrix.shape),
    ("Training Features:", train_features.shape),
    ("Valid Features:", valid_features.shape),
):
    print(_label, _value)

In [0]:
# One-hot encode the integer labels into K columns. The label variables are
# rebound to the encoded arrays, so re-running this cell without reloading the
# data would encode the already one-hot arrays again.
train_data_label = keras.utils.to_categorical(train_data_label, num_classes=K)
valid_data_label = keras.utils.to_categorical(valid_data_label, num_classes=K)

In [0]:
N = train_data_matrix.shape[0]  # number of training reviews
K = train_data_label.shape[1]  # number of classes (width of the one-hot labels)

# Embedding vocabulary size: the known words plus the <pad> and <unk> indices.
input_size = len(vocab) + 2
output_size = K

In [0]:
# Word-index input: input_length indices per review.
main_input = Input(shape=(input_length,), name='main_input')

# Map every word index to an embedding_size-dimensional vector.
e = Embedding(input_dim=input_size, output_dim=embedding_size, input_length=input_length)(main_input)

e_d = Dropout(dropout_rate)(e)

# Regroup each embedding vector into embedding_size//3 groups of 3 values so
# the result is a 3-D per-sample tensor that Conv2D accepts (the trailing 3 is
# the channel axis under Keras' default channels_last layout — TODO confirm the
# backend setting). Requires embedding_size to be divisible by 3.
e_d = Reshape((input_length, embedding_size//3, 3))(e_d)



In [0]:
conv_blocks = []
for kernel_size in kernel_sizes:
    # One convolution branch per kernel height; every branch uses a kernel
    # that is embedding_size//6 wide.
    conv = Conv2D(
        filters=filters,
        kernel_size=(kernel_size, embedding_size//6),
        padding=padding,
        activation=activation,
        strides=(strides, strides),
    )(e_d)
    # Pool over the full output height of the unpadded convolution.
    pooled_height = (input_length-kernel_size)//strides+1
    maxpooling = MaxPool2D(pool_size=(pooled_height, 1))(conv)
    flatten = Flatten()(maxpooling)
    conv_blocks.append(flatten)

# Several branches are joined into one feature vector; a single branch is
# used as it is.
if len(kernel_sizes) > 1:
    c = Concatenate()(conv_blocks)
else:
    c = conv_blocks[0]
c_d = Dropout(dropout_rate)(c)

In [0]:
# A recurrent layer needs a 3-D (batch, timesteps, features) input, but c_d is
# the 2-D output of Flatten/Concatenate, so calling CuDNNLSTM on it directly
# raises an input-rank error. Present each kernel-size branch as one timestep.
# NOTE(review): lstm_out is not consumed by any visible cell; the classifier
# below is built from c_d.
lstm_in = Reshape((len(kernel_sizes), -1))(c_d)
lstm_out = CuDNNLSTM(units=hidden_size)(lstm_in)

In [0]:
# Second model input: three values per review, fed with the scaled
# cool/funny/useful features under the name 'features'.
auxiliary_input = Input(shape=(3,), name='features')

In [0]:
# Append the auxiliary features to the dropped-out convolution features.
x = keras.layers.concatenate([c_d, auxiliary_input])

# Softmax classifier over the K classes.
main_output = Dense(K, activation='softmax', name='main_output')(x)

In [0]:
# Two-input model: word indices and auxiliary features in, class probabilities out.
model = Model(inputs=[main_input, auxiliary_input], outputs=[main_output])

In [0]:
# SGD with Nesterov momentum (0.9) and a learning-rate decay of 1e-6.
optimizer = SGD(lr=learning_rate, decay=1e-6, momentum=0.9, nesterov=True)
# Cross-entropy against the one-hot labels; accuracy is tracked as the metric.
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [0]:
# Train on the training split; inputs and targets are keyed by layer name.
# No validation data is passed to fit here.
model.fit({'main_input':train_data_matrix, 'features':train_features}, {'main_output': train_data_label}, epochs=total_epoch, batch_size=batch_size)

In [0]:
# Scratch check: parse a sample day/month/year 12-hour timestamp and return
# the full weekday name of that date.
# NOTE(review): not used by the model cells; datetime is also imported in the
# first cell of the notebook.
import datetime
datetime.datetime.strptime('28/10/2017  2:42:40 PM', '%d/%m/%Y %I:%M:%S %p').strftime('%A')

In [0]:
# Evaluate on the validation split. The model is compiled with one loss and
# the 'accuracy' metric, so valid_score is [loss, accuracy].
valid_score = model.evaluate({'main_input':valid_data_matrix, 'features':valid_features}, {'main_output': valid_data_label}, batch_size=batch_size)
print('Validation Loss: {}\n Validation Accuracy: {}\n'.format(valid_score[0], valid_score[1]))