In [48]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from os import listdir
import sys
from collections import Counter
import string
from numpy import array
import numpy as np

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D
from matplotlib import pyplot

from gensim.models import Word2Vec

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [41]:
def load_doc(file):
    with open(file) as f:
        text = f.read()
        return text

def clean_doc(text):
    # text = text.lower()
    token = text.split(" ")

    #remove punctuations
    table = str.maketrans('','',string.punctuation)
    token = [word.translate(table) for word in token]

    stop_words = set(stopwords.words('english'))
    #remove stopwords and non-alphabet character
    # token = [word for word in token if word.isalpha() and word not in stop_words ]
    token = [word for word in token if word.isalpha() and word not in stop_words]
    # token = [word for word in token if word not in stop_words]


    return token

def generate_vocab(directory, is_train = True):
    vocab = Counter()
    for file in listdir(directory):
        if is_train and file.startswith("cv9"):
            continue
        if not is_train and not file.startswith("cv9"):
            continue
        path = "/".join([directory, file])
        doc = load_doc(path)
        tokens = clean_doc(doc)
        vocab.update(tokens)
    return vocab

def run():
    vocab1 = generate_vocab('./datasets/movie_review/txt_sentoken/neg')
    vocab2 = generate_vocab('./datasets/movie_review/txt_sentoken/pos')

    vocab = vocab1 + vocab2

    print('Most common words ', vocab.most_common(10))

    #Produce total tokens, ignore tokesn with count < 2
    tokens = [word for word,count in vocab.items() if count >= 2]
    print(len(tokens))

    #save tokens to a file

    with open('vocab.txt', 'w') as file:
        data = "\n".join(tokens)
        file.write(data)  
# run()


### Train embedding layer

In [42]:
def load_dataset(dataset, label, is_training = True):
    x = list()
    y = list()
    for file in listdir(dataset):
        if is_training and file.startswith("cv9"):
            continue
        if is_training == False and file.startswith("cv9") == False:
            continue
        path = '/'.join([dataset, file])
        text = load_doc(path)

        tokens = clean_doc(text)

        x.append(tokens)
        y.append(label)
    
    return x, y

vocab = load_doc('vocab.txt')

vocab = vocab.split()
vocab = set(vocab)

positives, y_pos = load_dataset('./datasets/movie_review/txt_sentoken/pos', 0)
negatives, y_neg = load_dataset('./datasets/movie_review/txt_sentoken/neg', 1)

test_positives, test_y_pos = load_dataset('./datasets/movie_review/txt_sentoken/pos', 0, False)
test_negatives, test_y_neg = load_dataset('./datasets/movie_review/txt_sentoken/neg', 1, False)

training_dataset = positives + negatives
training_labels = y_pos + y_neg
print('training dataset: ', len(training_dataset))

test_dataset = test_positives + test_negatives
test_labels = test_y_pos + test_y_neg
print('test dataset: ', len(test_dataset))


training dataset:  1800
test dataset:  200


In [44]:
print(training_dataset[0])

['films', 'adapted', 'comic', 'books', 'plenty', 'success', 'whether', 'theyre', 'superheroes', 'batman', 'superman', 'spawn', 'geared', 'toward', 'kids', 'casper', 'arthouse', 'crowd', 'ghost', 'world', 'theres', 'never', 'really', 'comic', 'book', 'like', 'hell', 'starters', 'created', 'alan', 'moore', 'eddie', 'campbell', 'brought', 'medium', 'whole', 'new', 'level', 'mid', 'series', 'called', 'watchmen', 'say', 'moore', 'campbell', 'thoroughly', 'researched', 'subject', 'jack', 'ripper', 'would', 'like', 'saying', 'michael', 'jackson', 'starting', 'look', 'little', 'odd', 'book', 'graphic', 'novel', 'pages', 'long', 'includes', 'nearly', 'consist', 'nothing', 'footnotes', 'words', 'dont', 'dismiss', 'film', 'source', 'get', 'past', 'whole', 'comic', 'book', 'thing', 'might', 'find', 'another', 'stumbling', 'block', 'hells', 'directors', 'albert', 'allen', 'hughes', 'hughes', 'brothers', 'direct', 'seems', 'almost', 'ludicrous', 'casting', 'carrot', 'top', 'well', 'anything', 'riddl

In [43]:

def process_features(features, labels, tokenizer, maxlen, is_train = True):
    if is_train:
        tokenizer.fit_on_texts(features)
    encoded_dataset = tokenizer.texts_to_sequences(features)

    #pad sequences to the max length sequence
    x_train = pad_sequences(encoded_dataset, maxlen=maxlen, padding="post")
    y_train = array(labels)
    return x_train, y_train

tokenizer = Tokenizer()
max_len_token = max([len(s) for s in training_dataset]) 
print(max_len_token)
x_train, y_train = process_features(training_dataset, training_labels, tokenizer, max_len_token)
print('training size: ', len(x_train))
print('training feature sample: ', x_train[0])
print('training label sample:', y_train[0])

x_test, y_test = process_features(test_dataset, test_labels, tokenizer, max_len_token, is_train=False)
print('test size: ', len(x_test))
print('test feature sample: ', x_test[0])
print('test label sample:', y_test[0])


1354
training size:  1800
training feature sample:  [   9 2952  238 ...    0    0    0]
training label sample: 0
test size:  200
test feature sample:  [ 449  159 9140 ...    0    0    0]
test label sample: 0


### Define a model

In [67]:

def summarize_diagnostic(history, path = ''):
    pyplot.figure(figsize=(8,8))
    pyplot.figtext(0.5, 1, path, ha='center', fontsize=18)
    pyplot.subplot(211)
    pyplot.title("Loss")
    pyplot.plot(history.history['loss'], color="blue", label="train loss")
    pyplot.plot(history.history['val_loss'], color="red", label="validation loss")
    pyplot.legend()
    pyplot.subplot(212)
    pyplot.title("Accuracy")
    pyplot.plot(history.history['accuracy'], color="blue", label="train accuracy")
    pyplot.plot(history.history['val_accuracy'], color="red", label="validation accuracy")
    pyplot.legend()
    if(path!= ''):
        pyplot.savefig(path)

def compile_model(model):
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def basic_model(vocab_size, max_length_token):
    model = Sequential()
    model.add(Embedding(vocab_size, 100, input_length=max_length_token))
    model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(2))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model

def training_routine(model, args):
    return model.fit(
        args['x'],
        args['y'],
        epochs=args['epoch'],
        verbose = args['verbose']
    )

model = basic_model(len(tokenizer.word_index) + 1, max_len_token)
model.summary()
model = compile_model(model)

history = training_routine(model, {
    'x': x_train,
    'y': y_train,
    'epoch': 10,
    'verbose': 1
})

# summarize_diagnostic(history)

loss, acc = model.evaluate(x_test, y_test)
print('Test accuracy ', acc, '. Test loss ', loss)

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 1354, 100)         4390800   
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 1347, 32)          25632     
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 673, 32)           0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 21536)             0         
_________________________________________________________________
dense_16 (Dense)             (None, 100)               2153700   
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 101       
Total params: 6,570,233
Trainable params: 6,570,233
Non-trainable params: 0
____________________________________________

### Train embedding layer with word2vec

In [35]:
def process_doc_for_w2v(doc, vocab):
    sentences = list()
    for line in doc.splitlines():
        table  = str.maketrans('', '', string.punctuation)
        line = line.translate(table)
        tokens = line.split()
        tokens = [word for word in tokens if word in vocab]
        sentences.append(tokens)
    return sentences

def load_dataset_w2v(directory, vocab, is_train = True):
    features = list()
    for file in listdir(directory):
        if is_train and file.startswith('cv9'):
            continue
        if is_train == False and not file.startswith('cv9'):
            continue
        with open('/'.join([directory, file])) as f:
            doc = f.read()
            sentences = process_doc_for_w2v(doc, vocab) 
            features += sentences
    return features 

#Load the word2vec file 
def load_embedding(filename):
    embedding = dict() 
    with open(filename) as file:
        lines = file.readlines()[1:] # ignore the first line, which is 25435, 100 ( vocab_size, vector_size)
        for line in lines:
            parts = line.split(" ")
            embedding[parts[0]] = array(parts[1:], dtype='float32')
    return embedding

def embedding_to_weight_matrix(embedding, vocab):
    # vocab size + 1 for unknown words
    vocab_size = len(vocab) + 1

    matrix = np.zeros((vocab_size, 100)) 

    for word, index in vocab.items():
        matrix[index] = embedding.get(word)

    return matrix
    
#Load the vocab
vocab = load_doc('vocab.txt')
vocab = set(vocab.split())
print('vocab size = ', len(vocab))

pos_train = load_dataset_w2v('./datasets/movie_review/txt_sentoken/pos', vocab)
neg_train = load_dataset_w2v('./datasets/movie_review/txt_sentoken/neg', vocab)

sentences = pos_train + neg_train
print("training sentences: ", len(x_train))


vocab size =  25435
training sentences:  58109


In [36]:
w2v = Word2Vec(x_train, vector_size=100, window=5, workers=8, min_count=1)
words = list(w2v.wv.key_to_index)
print('Total number of words ',len(words))
filename = 'word_2_vec.txt'
w2v.wv.save_word2vec_format(filename, binary=False)

Total number of words  25435


In [44]:
embedding = load_embedding('word_2_vec.txt')
weight_matrix = embedding_to_weight_matrix(embedding, tokenizer.word_index)
print(weight_matrix)


[[ 0.          0.          0.         ...  0.          0.
   0.        ]
 [-0.67745185  0.434652    0.17125511 ... -0.70728588  0.86512297
  -0.0304953 ]
 [-0.69186592  0.69287962  0.20050693 ... -0.83230758  0.37071192
   0.03749175]
 ...
 [        nan         nan         nan ...         nan         nan
          nan]
 [        nan         nan         nan ...         nan         nan
          nan]
 [        nan         nan         nan ...         nan         nan
          nan]]


In [50]:
def get_embedding_w2v_model(weight_matrix):
    model = Sequential()
    model.add(Embedding(len(vocab) + 1, 100, weights=[weight_matrix], input_length=max_len_token, trainable=False ))
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    model.add(MaxPooling1D(2))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    return model

model = get_embedding_w2v_model(weight_matrix)
history = training_routine(model, {
    'x': x_train,
    'y': y_train,
    'epoch': 10,
    'verbose': 1
})

# summarize_diagnostic(history)

loss, acc = model.evaluate(x_test, y_test)
print('Test accuracy ', acc, '. Test loss ', loss)

ValueError: Layer weight shape (25436, 100) not compatible with provided weight shape (43908, 100)