# Download Data

In [None]:
!wget -q https://hkustconnect-my.sharepoint.com/:u:/g/personal/nnanda_connect_ust_hk/EfREjZqiZTlPqhqUPICBbPABdlgPumlaUVxPncm-_9aWIw?download=1 -O "Project 1 - data.zip"
!unzip -q "Project 1 - data.zip"

# Import Libraries

In [None]:
# Install the third-party keras-layer-normalization package (imported below as
# keras_layer_normalization); -q keeps pip's output short.
!pip -q install keras-layer-normalization

  Building wheel for keras-layer-normalization (setup.py) ... [?25l[?25hdone


In [None]:
import os
import nltk
import math
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, BatchNormalization,\
    Activation, Input, Add, Concatenate, Embedding, Conv1D, MaxPool1D,\
    Flatten, LSTM, Bidirectional, MaxPooling1D, SimpleRNN, GRU, SpatialDropout1D
from keras_layer_normalization import LayerNormalization
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV


In [None]:
# Fetch the NLTK resources this notebook relies on: the stopword corpus and the
# "punkt" tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Build the English stopword set and the stemmer shared by the feature helpers.
# The corpus reader is imported here under a private alias: the name
# `stopwords` is rebound to the resulting set, which would otherwise shadow the
# imported reader and make this cell fail on re-run
# ("'set' object has no attribute 'words'").
from nltk.corpus import stopwords as _stopwords_corpus

stopwords = set(_stopwords_corpus.words("english"))
ps = PorterStemmer()

# Data Loader

In [None]:
def load_data(split_name='train', columns=['text', 'stars']):
    """
    Read one split of the dataset from ``data_2021_spring/{split_name}.csv``.

    :param split_name: name of the split, i.e. the CSV file stem, type: str
    :param columns: names of the columns to keep, type: list(str)
                    (the default list is only read, never mutated)
    return a DataFrame restricted to ``columns`` when every requested column
    exists in the file; otherwise a DataFrame with all columns of the file
    raise whatever ``pd.read_csv`` raises (e.g. FileNotFoundError) when the
    file itself cannot be read -- that error is no longer masked by a retry
    """
    try:
        wanted = list(columns)
        print(f"select [{', '.join(wanted)}] columns from the {split_name} split")
    except TypeError:
        # columns is None / not iterable / holds non-string names:
        # nothing sensible to select, so fall back to the full frame below
        wanted = None

    # read the file exactly once; both outcomes are derived from this frame
    df = pd.read_csv(f'data_2021_spring/{split_name}.csv')

    if wanted is not None and all(col in df.columns for col in wanted):
        df = df.loc[:, wanted]
        print("succeed!")
        return df

    print("Failed, then try to ")
    print(f"select all columns from the {split_name} split")
    return df

In [None]:
# Load every split with all of its columns. Judging by the output below,
# 'full' is not an actual column, so load_data takes its fallback path and
# returns the complete frame (hence the "Failed, then try to" messages).
train_df, valid_df, test_df = [
    load_data(split, columns=['full']) for split in ('train', 'valid', 'test')
]

select [full] columns from the train split
Failed, then try to 
select all columns from the train split
select [full] columns from the valid split
Failed, then try to 
select all columns from the valid split
select [full] columns from the test split
Failed, then try to 
select all columns from the test split


# Feature Extractor

In [None]:
def tokenize(text):
    """
    Split a document into word-level tokens with NLTK's word tokenizer.

    :param text: a doc with multiple sentences, type: str
    return the tokens in document order, type: list
    e.g.
    Input: 'Text mining is to identify useful information.'
    Output: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    """
    words = nltk.word_tokenize(text)
    return words

def stem(tokens):
    """
    Stem every token with the shared Porter stemmer and lower-case the result.

    :param tokens: a list of tokens, type: list
    return the stemmed, lower-cased tokens in the same order, type: list
    e.g.
    Input: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    Output: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    """
    stemmed = []
    for word in tokens:
        stemmed.append(ps.stem(word).lower())
    return stemmed

# Just for testing, was not used in tutorial - removing stopwords doesn't help much
def filter_stopwords(tokens):
    """
    Drop stopwords and purely numeric tokens, keeping everything else in order.

    :param tokens: a list of tokens, type: list
    return the tokens that survive the filter, type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    Output: ['text', 'mine', 'identifi', 'use', 'inform', '.']
    """
    kept = []
    for tok in tokens:
        # skip stopwords first; the numeric check only runs for non-stopwords
        if tok in stopwords or tok.isnumeric():
            continue
        kept.append(tok)
    return kept

In [None]:
def get_feats_dict(feats, min_freq=-1, max_freq=-1, max_size=-1):
    """
    Build the vocabulary: a mapping from feature to a dense integer index.

    Index 0 is always "<pad>" and index 1 is always "<unk>"; the remaining
    features follow in decreasing order of frequency.

    :param feats: an iterable of features (all documents flattened), type: iterable
    :param min_freq: features occurring fewer than min_freq times are filtered out (-1 disables), type: int
    :param max_freq: features occurring more than max_freq times are filtered out (-1 disables), type: int
    :param max_size: the max size of the feature dict, reserved entries included (-1 disables), type: int
    return a feature dict that maps features to indices 0..len-1, sorted by frequencies
    # Counter document: https://docs.python.org/3.6/library/collections.html#collections.Counter
    """
    reserved = ["<pad>", "<unk>"]

    # count all features
    feat_cnt = Counter(feats) # ["text", "text", "mine"] --> {"text": 2, "mine": 1}
    # A literal "<pad>"/"<unk>" in the data must not be listed a second time:
    # the duplicate would overwrite the reserved index and leave gaps in the
    # index range (get_index_vector hard-codes 0 and 1 for these entries).
    for token in reserved:
        feat_cnt.pop(token, None)

    if max_size > 0 and min_freq == -1 and max_freq == -1:
        # only a size cap: take the most frequent features directly
        top_n = max(max_size - len(reserved), 0)
        valid_feats = reserved + [f for f, _ in feat_cnt.most_common(top_n)]
    else:
        valid_feats = list(reserved)
        for f, cnt in feat_cnt.most_common():
            if (min_freq == -1 or cnt >= min_freq) and \
                (max_freq == -1 or cnt <= max_freq):
                valid_feats.append(f)
    if max_size > 0 and len(valid_feats) > max_size:
        valid_feats = valid_feats[:max_size]
    print("Size of features:", len(valid_feats))

    # build a mapping from features to indices
    feats_dict = {feat: idx for idx, feat in enumerate(valid_feats)}
    return feats_dict

def get_index_vector(feats, feats_dict, max_len):
    """
    Encode one document as a fixed-length vector of feature indices.

    :param feats: the features of one document, type: list
    :param feats_dict: a dict from features to indices, type: dict
    :param max_len: length of the returned vector, type: int
    return an int64 vector of length max_len: features beyond max_len are
    dropped, unknown features map to 1 (<unk>) and unused trailing
    positions stay 0 (<pad>)
    """
    unk_index = 1
    # start from all padding, then overwrite the leading positions
    vector = np.zeros(max_len, dtype=np.int64)
    # zip with range(max_len) stops after at most max_len features
    for position, feature in zip(range(max_len), feats):
        vector[position] = feats_dict.get(feature, unk_index)
    return vector

# Create Input Features

In [None]:
# Features seen fewer than this many times in the training split are dropped
# from the vocabulary.
min_freq = 3

# raw texts and star ratings of the two labelled splits
train_texts, train_labels = train_df["text"], train_df["stars"]
valid_texts, valid_labels = valid_df["text"], valid_df["stars"]

# tokenize every document ...
train_tokens = list(map(tokenize, train_texts))
valid_tokens = list(map(tokenize, valid_texts))

# ... then stem (and lower-case) the tokens
train_stemmed = list(map(stem, train_tokens))
valid_stemmed = list(map(stem, valid_tokens))

# The stemmed tokens are used as features directly. Passing them through
# filter_stopwords(...) first is possible, but it didn't help much.
train_feats, valid_feats = train_stemmed, valid_stemmed

# build a mapping from features to indices, using the training split only
feats_dict = get_feats_dict(
    (feat for doc_feats in train_feats for feat in doc_feats),
    min_freq=min_freq,
)

Size of features: 9357


In [None]:
# Every document is cut / zero-padded to this many feature indices.
max_len = 100


def _stack_index_vectors(docs):
    """Turn each document's feature list into an index vector and stack them row-wise."""
    rows = [get_index_vector(doc_feats, feats_dict, max_len) for doc_feats in docs]
    return np.vstack(rows)


# one row of feature indices per document
train_feats_matrix = _stack_index_vectors(train_feats)
valid_feats_matrix = _stack_index_vectors(valid_feats)

# The largest training label doubles as the number of classes; labels are
# shifted down by one before one-hot encoding (presumably ratings start at 1).
num_classes = max(train_labels)
train_label_matrix = keras.utils.to_categorical(
    train_labels - 1, num_classes=num_classes)
valid_label_matrix = keras.utils.to_categorical(
    valid_labels - 1, num_classes=num_classes)

In [None]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Keras text vectorizer: vocabulary capped at 20000 tokens, every output
# sequence padded/truncated to 100 token ids.
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=100)
# Learn the vocabulary from the raw training texts, fed in batches of 128.
text_ds = tf.data.Dataset.from_tensor_slices(train_texts).batch(128)
vectorizer.adapt(text_ds)

In [None]:
# Peek at the first five vocabulary entries learned by the vectorizer.
vectorizer.get_vocabulary()[:5]

['', '[UNK]', 'the', 'and', 'i']

In [None]:
# Sanity check: vectorize one sentence and show the ids at its first six positions.
output = vectorizer([["the cat sat on the mat"]])
output.numpy()[0, :6]

array([   2, 2116,  465,   21,    2, 7576])

In [None]:
# Token -> integer id lookup that follows the vectorizer's vocabulary order.
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [None]:
# Look the same words up through word_index; the ids should match the
# vectorizer output shown above.
test = ["the", "cat", "sat", "on", "the", "mat"]
[word_index[w] for w in test]

[2, 2116, 465, 21, 2, 7576]

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2021-04-03 06:47:10--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-04-03 06:47:10--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-04-03 06:47:10--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [None]:
# Parse the 100-dimensional GloVe file into a {word: vector} lookup.
path_to_glove_file = "glove.6B.100d.txt"

embeddings_index = {}
# The GloVe text files are UTF-8; be explicit so parsing does not depend on
# the platform's default encoding.
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        # each line is "<word> <coef_1> <coef_2> ..."; split off the word only
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [None]:
embedding_dim = 100
num_tokens = len(voc) + 2
hits = misses = 0

# Prepare embedding matrix: row i holds the GloVe vector of the word with id i.
# Rows of words without a GloVe vector stay all-zeros; this includes the
# representation for "padding" and "OOV".
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 17209 words (2791 misses)


In [None]:
from tensorflow.keras.layers import Embedding

# Embedding layer whose weights start from the GloVe matrix and keep being
# updated during training (trainable=True).
glove_initializer = keras.initializers.Constant(embedding_matrix)
embedding_layer = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=glove_initializer,
    trainable=True,
)

In [None]:
# Vectorize the raw texts into integer id matrices. Each string is wrapped in
# its own single-element list, so the vectorizer receives one row per text.
x_train = vectorizer(np.array([[s] for s in train_texts])).numpy()
x_valid = vectorizer(np.array([[s] for s in valid_texts])).numpy()

In [None]:
# Features and one-hot labels must have the same number of rows.
x_train.shape, train_label_matrix.shape

((10000, 100), (10000, 5))

In [None]:
from tensorflow.keras import layers

# int_sequences_input = keras.Input(shape=(None,), dtype="int64")
# embedded_sequences = embedding_layer(int_sequences_input)
# x = layers.Conv1D(128, 5, activation="relu", padding="same")(embedded_sequences)
# x = layers.MaxPooling1D(5)(x)
# x = layers.Conv1D(128, 5, activation="relu", padding="same")(x)
# x = layers.MaxPooling1D(5)(x)
# x = layers.Conv1D(128, 5, activation="relu", padding="same")(x)
# x = layers.GlobalMaxPooling1D()(x)
# x = layers.Dense(128, activation="relu")(x)
# x = layers.Dropout(0.5)(x)
# preds = layers.Dense(num_classes, activation="softmax")(x)
# model = keras.Model(int_sequences_input, preds)
# model.summary()

# Bidirectional-LSTM classifier on top of the GloVe-initialised embedding layer.
# (The Conv1D / pooling layers of the commented-out variant above are not used.)
int_sequences_input = keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)

# Hidden layers, applied in this order to the embedded sequence.
hidden_stack = [
    layers.SpatialDropout1D(0.2),
    Bidirectional(LSTM(200)),
    layers.Dropout(0.2),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.2),
]
x = embedded_sequences
for hidden_layer in hidden_stack:
    x = hidden_layer(x)

# one softmax unit per class
preds = layers.Dense(num_classes, activation="softmax")(x)
model = keras.Model(int_sequences_input, preds)
model.summary()

Model: "model_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_34 (InputLayer)        [(None, None)]            0         
_________________________________________________________________
embedding_11 (Embedding)     (None, None, 100)         2000200   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, None, 100)         0         
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 400)               481600    
_________________________________________________________________
dropout_35 (Dropout)         (None, 400)               0         
_________________________________________________________________
dense_44 (Dense)             (None, 128)               51328     
_________________________________________________________________
dropout_36 (Dropout)         (None, 128)               0  

In [None]:
model.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["acc"],
)

# Train for 20 epochs, with 20% of the training rows held out for validation.
model.fit(
    x_train,
    train_label_matrix,
    epochs=20,
    batch_size=128,
    shuffle=True,
    validation_split=0.2,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fb40f7985d0>

# LSTM + CNN

In [None]:
# Hyper-parameters for the BiLSTM + CNN model built in the next cell.
# number of token positions per input sequence
input_length = max_len
# rows of the embedding table (same expression as num_tokens above)
vocab_size = len(voc) + 2
# NOTE(review): the model cell passes embedding_dim to Embedding, not this name — confirm which is intended
embedding_size = 100
# LSTM units per direction
hidden_size = 100
# Conv1D filters, window width and stride
num_filters = 100
kernel_size = 2
strides = 1
# width of the softmax output layer
output_size = num_classes
# used for the Dropout layer after the embedding and for the LSTM's dropout argument
dropout_rate = 0.5
# NOTE(review): not referenced by the model cell visible below — confirm whether it should be passed to the LSTM
recurrent_dropout_rate = 0.5

In [None]:
# BiLSTM + CNN classifier: embeddings -> BiLSTM -> concat -> Conv1D -> max-pool -> softmax.
x = Input(shape=(input_length,))

# GloVe-initialised, trainable embeddings followed by dropout
glove_embedding = Embedding(
    input_dim=vocab_size,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=True,
)
emb = glove_embedding(x)
emb = Dropout(dropout_rate)(emb)

# bidirectional LSTM that keeps one output per time step
bilstm = Bidirectional(
    LSTM(
        hidden_size,
        kernel_initializer=keras.initializers.glorot_uniform(seed=0),
        recurrent_initializer=keras.initializers.Orthogonal(gain=1.0, seed=0),
        return_sequences=True,
        dropout=dropout_rate,
        recurrent_activation="sigmoid",
    )
)
rec = bilstm(emb)

# each time step sees its (dropped-out) embedding next to its BiLSTM output
h = Concatenate()([emb, rec])

conv = Conv1D(
    filters=num_filters,
    kernel_size=kernel_size,
    padding="valid",
    strides=strides,
    activation="relu",
)(h)

# The pool window spans the whole "valid" convolution output, so this keeps
# the maximum over time for every filter.
conv_output_length = (input_length - kernel_size) // strides + 1
maxpool = MaxPool1D(pool_size=conv_output_length)(conv)
maxpool = Flatten()(maxpool)

y = Dense(
    output_size,
    activation="softmax",
    kernel_initializer=keras.initializers.he_normal(seed=0),
    bias_initializer="zeros",
)(maxpool)

model = Model(x, y)
model.compile(
    optimizer="Adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Model: "model_25"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_37 (InputLayer)           [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding_14 (Embedding)        (None, 100, 100)     2000200     input_37[0][0]                   
__________________________________________________________________________________________________
dropout_39 (Dropout)            (None, 100, 100)     0           embedding_14[0][0]               
__________________________________________________________________________________________________
bidirectional_13 (Bidirectional (None, 100, 200)     160800      dropout_39[0][0]                 
___________________________________________________________________________________________

In [None]:
# Train for 30 epochs, with 10% of the training rows held out for validation.
model.fit(
    x_train,
    train_label_matrix,
    batch_size=128,
    epochs=30,
    validation_split=0.1,
    shuffle=True,
    verbose=1,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7fb40b66ef90>

In [None]:
# Loss and accuracy of the trained model on the full training set and on the
# separate validation split.
train_score = model.evaluate(x_train, train_label_matrix, batch_size=128)
valid_score = model.evaluate(x_valid, valid_label_matrix, batch_size=128)

for split_label, score in (("training", train_score), ("valid", valid_score)):
    # score[0] is the loss, score[1] the accuracy metric set at compile time
    print(f"{split_label} loss:", score[0], f"{split_label} accuracy", score[1])

training loss: 0.36045441031455994 training accuracy 0.8910999894142151
valid loss: 1.2000067234039307 valid accuracy 0.5634999871253967
