In [2]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from os import listdir
import sys
from collections import Counter
import string
from numpy import array
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D
from matplotlib import pyplot

from gensim.models import Word2Vec

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def load_doc(file):
    with open(file) as f:
        text = f.read()
        return text

def clean_doc(text):
    # text = text.lower()
    token = text.split(" ")

    #remove punctuations
    table = str.maketrans('','',string.punctuation)
    token = [word.translate(table) for word in token]

    stop_words = set(stopwords.words('english'))
    #remove stopwords and non-alphabet character
    # token = [word for word in token if word.isalpha() and word not in stop_words ]
    token = [word for word in token if word.isalpha() and word not in stop_words]
    # token = [word for word in token if word not in stop_words]


    return token

def generate_vocab(directory, is_train = True):
    vocab = Counter()
    for file in listdir(directory):
        if is_train and file.startswith("cv9"):
            continue
        if not is_train and not file.startswith("cv9"):
            continue
        path = "/".join([directory, file])
        doc = load_doc(path)
        tokens = clean_doc(doc)
        vocab.update(tokens)
    return vocab

#Load the word1vec file 
def load_embedding(filename, is_glove = False):
    embedding = dict() 
    with open(filename) as file:
        if is_glove:
            lines = file.readlines()#glove format doesn't have header file so take all lines
        else:
            lines = file.readlines()[1:]# ignore the first line, which is 25435, 100 ( vocab_size, vector_size)
        for line in lines:
            parts = line.split(" ")
            embedding[parts[0]] = array(parts[1:], dtype='float32')
    return embedding

def embedding_to_weight_matrix(embedding, vocab, vector_size = 100):

    # vocab size + 0 for unknown words
    vocab_size = len(vocab) + 1 

    matrix = np.zeros((vocab_size, vector_size)) 

    for word, index in vocab.items():
        vector = embedding.get(word)
        if vector is not None:
            matrix[index] = vector 

    return matrix

def process_features(features, labels, tokenizer, maxlen, is_train = True):
    if is_train:
        tokenizer.fit_on_texts(features)
    encoded_dataset = tokenizer.texts_to_sequences(features)

    #pad sequences to the max length sequence
    x_train = pad_sequences(encoded_dataset, maxlen=maxlen, padding="post")
    y_train = array(labels)
    return x_train, y_train

def run():
    vocab1 = generate_vocab('./datasets/movie_review/txt_sentoken/neg')
    vocab2 = generate_vocab('./datasets/movie_review/txt_sentoken/pos')

    vocab = vocab1 + vocab2

    print('Most common words ', vocab.most_common(10))

    #Produce total tokens, ignore tokesn with count < 2
    tokens = [word for word,count in vocab.items() if count >= 2]
    print(len(tokens))

    #save tokens to a file

    with open('vocab.txt', 'w') as file:
        data = "\n".join(tokens)
        file.write(data)  
# run()


### Train embedding layer

In [4]:
def load_dataset(dataset, label, vocab, is_training = True):
    x = list()
    y = list()
    for file in listdir(dataset):
        if is_training and file.startswith("cv9"):
            continue
        if is_training == False and file.startswith("cv9") == False:
            continue
        path = '/'.join([dataset, file])
        text = load_doc(path)

        tokens = clean_doc(text)
        tokens = [word for word in tokens if word in vocab ]

        x.append(tokens)
        y.append(label)
    
    return x, y

vocab = load_doc('vocab.txt')

vocab = vocab.split()
vocab = set(vocab)

positives, y_pos = load_dataset('./datasets/movie_review/txt_sentoken/pos', 0, vocab)
negatives, y_neg = load_dataset('./datasets/movie_review/txt_sentoken/neg', 1, vocab)

test_positives, test_y_pos = load_dataset('./datasets/movie_review/txt_sentoken/pos', 0, vocab, False)
test_negatives, test_y_neg = load_dataset('./datasets/movie_review/txt_sentoken/neg', 1, vocab, False)

training_dataset = positives + negatives
training_labels = y_pos + y_neg
print('training dataset: ', len(training_dataset))

test_dataset = test_positives + test_negatives
test_labels = test_y_pos + test_y_neg
print('test dataset: ', len(test_dataset))


training dataset:  1800
test dataset:  200


In [5]:
tokenizer = Tokenizer()
max_len_token = max([len(s) for s in training_dataset]) 
print(max_len_token)
x_train, y_train = process_features(training_dataset, training_labels, tokenizer, max_len_token)
print('training size: ', len(x_train))
print('training feature sample: ', x_train[0])
print('training label sample:', y_train[0])

x_test, y_test = process_features(test_dataset, test_labels, tokenizer, max_len_token, is_train=False)
print('test size: ', len(x_test))
print('test feature sample: ', x_test[0])
print('test label sample:', y_test[0])

1291
training size:  1800
training feature sample:  [   9 2952  238 ...    0    0    0]
training label sample: 0
test size:  200
test feature sample:  [ 449  159 9140 ...    0    0    0]
test label sample: 0


### Define a model

In [6]:
def summarize_diagnostic(history, path = ''):
    pyplot.figure(figsize=(8,8))
    pyplot.figtext(0.5, 1, path, ha='center', fontsize=18)
    pyplot.subplot(211)
    pyplot.title("Loss")
    pyplot.plot(history.history['loss'], color="blue", label="train loss")
    pyplot.plot(history.history['val_loss'], color="red", label="validation loss")
    pyplot.legend()
    pyplot.subplot(212)
    pyplot.title("Accuracy")
    pyplot.plot(history.history['accuracy'], color="blue", label="train accuracy")
    pyplot.plot(history.history['val_accuracy'], color="red", label="validation accuracy")
    pyplot.legend()
    if(path!= ''):
        pyplot.savefig(path)


def compile_model(model):
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def training_routine(model, args):
    return model.fit(
        args['x'],
        args['y'],
        epochs=args['epoch'],
        verbose = args['verbose']
    )

In [54]:
def basic_model(vocab_size, max_length_token):
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length_token))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(2))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model

model = basic_model(len(tokenizer.word_index) + 1, max_len_token)
model.summary()
model = compile_model(model)

history = training_routine(model, {
    'x': x_train,
    'y': y_train,
    'epoch': 10,
    'verbose': 1
})

# summarize_diagnostic(history)

loss, acc = model.evaluate(x_test, y_test)
print('Test accuracy ', acc, '. Test loss ', loss)

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 1291, 100)         2543600   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 1284, 32)          25632     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 642, 32)           0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 20544)             0         
_________________________________________________________________
dense_6 (Dense)              (None, 100)               2054500   
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 101       
Total params: 4,623,833
Trainable params: 4,623,833
Non-trainable params: 0
____________________________________________

### Train embedding layer with word2vec

In [46]:
print(len(tokenizer.word_index))

25435


In [7]:
#Load trained word 2 vector
embedding = load_embedding('word_2_vec.txt')

weight_matrix = embedding_to_weight_matrix(embedding, tokenizer.word_index )
print(weight_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.45786524  0.54656488  0.05135894 ... -0.76674253  0.59616333
   0.18149421]
 [-0.5772382   0.81545228  0.08127926 ... -0.75325519  0.21177313
   0.22589795]
 ...
 [-0.00179134  0.02266374  0.01850034 ... -0.0237722   0.02090832
   0.02317982]
 [ 0.00094989  0.02554806  0.00635505 ... -0.00704258  0.01889366
   0.01335397]
 [-0.00352503  0.02271733  0.00564215 ... -0.01373077  0.00766795
   0.00602894]]


In [8]:
glove = load_embedding('./datasets/glove.6B.100d.txt', is_glove=True)
glove_weight_matrix = embedding_to_weight_matrix(glove, tokenizer.word_index )
print(glove_weight_matrix)

[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.19915999 -0.049702    0.24579    ... -0.068109    0.017651
   0.06455   ]
 [ 0.38251001  0.14821     0.60601002 ...  0.058921    0.091112
   0.47283   ]
 ...
 [-0.078809   -0.73105001 -0.12292    ... -0.63562    -0.48644999
   0.094265  ]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]
 [ 0.12185    -0.45809001 -0.043794   ...  0.37827    -0.28586999
   0.3994    ]]


In [9]:
def get_embedding_w2v_model(weight_matrix, is_embedding_trainable = False):
    model = Sequential()
    model.add(Embedding(len(vocab) + 1, 100, weights=[weight_matrix], input_length=max_len_token, trainable=is_embedding_trainable ))
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    model.add(MaxPooling1D(2))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    return model

model = get_embedding_w2v_model(weight_matrix)
model = compile_model(model)

history = training_routine(model, {
    'x': x_train,
    'y': y_train,
    'epoch': 10,
    'verbose': 1
})

# summarize_diagnostic(history)

loss, acc = model.evaluate(x_test, y_test)
print('Test accuracy ', acc, '. Test loss ', loss)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy  0.5249999761581421 . Test loss  1.4315271377563477


### use pretrained GloVe

In [12]:
model = get_embedding_w2v_model(glove_weight_matrix, True)

model = compile_model(model)

history = training_routine(model, {
    'x': x_train,
    'y': y_train,
    'epoch': 10,
    'verbose': 1
})

# summarize_diagnostic(history)

loss, acc = model.evaluate(x_test, y_test)
print('Test accuracy ', acc, '. Test loss ', loss)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy  0.800000011920929 . Test loss  0.5300505757331848
