In [1]:
import glob
import os
from random import shuffle
from nltk.tokenize import TreebankWordTokenizer

In [2]:
from gensim.models.keyedvectors import KeyedVectors
# Location of the pre-trained GoogleNews word2vec binary. Set the
# WORD2VEC_PATH environment variable to point at your own copy; the default
# is the original author's local path.
WORD2VEC_PATH = os.environ.get(
    'WORD2VEC_PATH',
    'C:/Users/markn/Artificial_Intelligence/NLP/'
    'GoogleNews-vectors-negative300.bin.gz',
)
# limit=1000000 caps how many word vectors are read from the file.
word_vectors = KeyedVectors.load_word2vec_format(
    WORD2VEC_PATH, binary=True, limit=1000000)

In [3]:
def pre_process_data(filepath):
    """Load labelled review texts from ``filepath`` and return them shuffled.

    Every ``*.txt`` file under ``<filepath>/pos`` is read with label 1 and
    every ``*.txt`` file under ``<filepath>/neg`` with label 0.

    Args:
        filepath: Directory containing the ``pos`` and ``neg`` sub-directories.

    Returns:
        A list of ``(label, text)`` tuples in random order (shuffled in place
        with ``random.shuffle``). Empty if no matching files exist.
    """
    pos_label = 1
    neg_label = 0
    dataset = []

    for label, subdir in ((pos_label, 'pos'), (neg_label, 'neg')):
        pattern = os.path.join(filepath, subdir, '*.txt')
        for filename in glob.glob(pattern):
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's default encoding; undecodable bytes are dropped.
            with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
                dataset.append((label, f.read()))

    shuffle(dataset)

    return dataset

In [4]:
def tokenize_and_vectorize(dataset):
    """Convert each sample's text into a list of word vectors.

    The text (element 1 of each sample) is split with NLTK's
    ``TreebankWordTokenizer`` and every token is looked up in the
    module-level ``word_vectors``. Tokens that raise ``KeyError`` on lookup
    are skipped, so a sample's list may be shorter than its token count
    (or empty).

    Args:
        dataset: Iterable of samples whose element 1 is the text.

    Returns:
        A list with one list of vectors per sample, in input order.
    """
    treebank = TreebankWordTokenizer()
    all_vectors = []
    for record in dataset:
        vectors = []
        for word in treebank.tokenize(record[1]):
            try:
                vector = word_vectors[word]
            except KeyError:
                # Out-of-vocabulary token: leave it out of the sequence.
                continue
            vectors.append(vector)
        all_vectors.append(vectors)
    return all_vectors

In [5]:
def collect_expected(dataset):
    """Return the label (element 0) of every sample, preserving order."""
    return [record[0] for record in dataset]

In [6]:
# Load the labelled reviews from ./train (pre_process_data reads its 'pos'
# and 'neg' sub-directories and returns shuffled (label, text) tuples).
dataset = pre_process_data('train')

In [7]:
# Features: one list of word vectors per review.
vectorized_data = tokenize_and_vectorize(dataset)
# Targets: the label of each review, in the same order as vectorized_data.
expected = collect_expected(dataset)

In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation; the fixed random_state makes
# the split itself repeatable for a given ordering of the inputs.
X_train, X_test, y_train, y_test = train_test_split(
    vectorized_data,
    expected,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Hyperparameters shared by the cells below.
maxlen = 400  # number of vectors per sample after pad_trunc
embedding_dims = 300  # last dimension used when reshaping the inputs
epochs = 2  # passed to model.fit
batch_size = 32  # passed to model.fit

In [10]:
def pad_trunc(data, maxlen):
    """Return a copy of ``data`` with every sample exactly ``maxlen`` long.

    Samples longer than ``maxlen`` are truncated; shorter ones are padded at
    the end with zero vectors. The input samples are never modified: each
    output sample is a new list.

    Args:
        data: Sequence of samples, each a sequence of equal-length vectors.
        maxlen: Target number of vectors per sample.

    Returns:
        A list of lists, one per sample, each of length ``maxlen``. Returns
        an empty list when ``data`` is empty.
    """
    # Take the vector width from the first sample that actually contains a
    # vector, so an empty first sample (or empty data) does not raise.
    vector_len = next(
        (len(sample[0]) for sample in data if len(sample) > 0), 0)
    zero_vector = [0.0] * vector_len

    new_data = []
    for sample in data:
        # list(...) copies, so the padding below cannot leak into the caller's data.
        temp = list(sample[:maxlen])
        temp.extend([zero_vector] * (maxlen - len(temp)))
        new_data.append(temp)
    return new_data

In [None]:
import numpy as np
# Training inputs: force every sample to maxlen vectors, then pack into a
# (samples, maxlen, embedding_dims) array.
X_train = pad_trunc(X_train, maxlen)
X_train = np.reshape(X_train, (len(X_train), maxlen, embedding_dims))

# Same treatment for the held-out inputs.
X_test = pad_trunc(X_test, maxlen)
X_test = np.reshape(X_test, (len(X_test), maxlen, embedding_dims))

# Labels as numpy arrays.
y_train = np.array(y_train)
y_test = np.array(y_test)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, SimpleRNN
# Units in the recurrent layer added in the next cell.
num_neurons = 50
# Empty layer stack; layers are appended with model.add() below.
model = Sequential()

In [None]:
# Recurrent layer over inputs of shape (maxlen, embedding_dims).
# return_sequences=True emits the output of every time step, not just the
# last one.
model.add(SimpleRNN(num_neurons,
                   return_sequences=True,
                   input_shape=(maxlen, embedding_dims)))

In [None]:
# Dropout with rate 0.2 on the recurrent outputs.
model.add(Dropout(0.2))
# Collapse the per-time-step outputs into one flat vector per sample.
model.add(Flatten())
# Single sigmoid unit: one value in (0, 1) per sample.
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Print the layer-by-layer summary of the model built so far.
model.summary()

In [None]:
# Binary classification: RMSprop optimizer, binary cross-entropy loss,
# accuracy reported as a metric.
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Print the model summary again, this time after compile().
model.summary()

In [None]:
# Train the SimpleRNN classifier, evaluating on the held-out split after
# each epoch. No callbacks are passed: none are defined in this notebook.
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
          validation_data=(X_test, y_test))

## LSTM

In [None]:
# Hyperparameters for the LSTM section (same values as the SimpleRNN cells).
maxlen = 400  # number of vectors per sample after pad_trunc
embedding_dims = 300  # last dimension used when reshaping the inputs
batch_size = 32
epochs = 2
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LSTM
# Same architecture as the SimpleRNN model, with the recurrent layer
# swapped for an LSTM.
num_neurons = 50
model = Sequential()
model.add(LSTM(num_neurons,
               return_sequences=True,
               input_shape=(maxlen, embedding_dims)))
model.add(Dropout(0.2))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
import numpy as np

In [None]:
# Reload and re-vectorize the data for the LSTM section. pre_process_data
# shuffles, so the sample order differs from the earlier load.
dataset = pre_process_data('train')
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

In [None]:
# Hyperparameters used by the LSTM cells below.
maxlen = 400  # number of vectors per sample after pad_trunc
batch_size = 32
embedding_dims = 300  # last dimension used when reshaping the inputs
epochs = 2

In [None]:
# Split the freshly vectorized data. Without this the cell would re-process
# the X_train/X_test arrays left over from the SimpleRNN section and ignore
# the vectorized_data/expected that were just rebuilt.
X_train, X_test, y_train, y_test = train_test_split(
    vectorized_data, expected, test_size=0.2, random_state=42)

# Force every sample to maxlen vectors, then pack into
# (samples, maxlen, embedding_dims) arrays.
X_train = pad_trunc(X_train, maxlen)
X_test = pad_trunc(X_test, maxlen)
X_train = np.reshape(X_train, (len(X_train), maxlen, embedding_dims))
X_test = np.reshape(X_test, (len(X_test), maxlen, embedding_dims))
y_train = np.array(y_train)
y_test = np.array(y_test)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Flatten, Dropout
# Units in the LSTM layer.
num_neurons = 50
model = Sequential()
# return_sequences=True emits the output of every time step, which Flatten
# below collapses into one vector per sample.
model.add(LSTM(num_neurons, return_sequences=True,
              input_shape=(maxlen, embedding_dims)))
# Dropout with rate 0.2 on the LSTM outputs.
model.add(Dropout(0.2))
model.add(Flatten())

In [None]:
# Single sigmoid unit: one value in (0, 1) per sample.
model.add(Dense(1, activation='sigmoid'))
model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])
# Pass batch_size explicitly so the configured hyperparameter is actually
# used, matching the SimpleRNN training cell.
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs,
                    validation_data=(X_test, y_test))

In [None]:
# Print the layer-by-layer summary of the trained LSTM model.
model.summary()

In [None]:
# Persist the model in two parts: the architecture as JSON...
model_structure = model.to_json()
with open("lstm_model.json", "w") as json_file:
    json_file.write(model_structure)

# ...and the weights as a separate HDF5 file.
model.save_weights("lstm_weights.h5")

In [None]:
from keras.models import model_from_json
# Rebuild the model from the files written by the previous cell:
# architecture from the JSON file first...
with open("lstm_model.json", 'r') as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)

# ...then the weights into the rebuilt model.
model.load_weights('lstm_weights.h5')

In [None]:
# Hand-written example text to run through the loaded model.
sample1 = """I hate that the dismal weather had me down for so long,
when will it break! Ugh, when does happiness return? The sun is blinding and
the puffy clouds are too thin. I cant wait for the weekend"""

In [None]:
# tokenize_and_vectorize expects (label, text) samples and only reads the
# text, so the label 1 here is a placeholder.
vec_list = tokenize_and_vectorize([(1, sample1)])
# Pad/truncate to maxlen and shape as a batch of one for model.predict.
test_vec_list = pad_trunc(vec_list, maxlen)
test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, 
                                      embedding_dims))

In [None]:
# Threshold the sigmoid output at 0.5 to get the class label. This replaces
# Sequential.predict_classes, which is no longer available in current Keras.
# The labels are 1 (positive) and 0 (negative), matching pre_process_data.
print("Sample's sentiment, 1-pos, 0-neg: {}".format(
    (model.predict(test_vec) > 0.5).astype("int32")))

In [None]:
# Show the model's unthresholded prediction for the sample.
print("RAW output of the sigmoid function: {}".format(model.predict(test_vec)))

In [None]:
def test_len(data, maxlen):
    total_len = truncated = exact = padded = 0
    for sample in data:
        total_len += len(sample)
        if len(sample) > maxlen:
            truncated += 1
        elif len(sample) < len(sample):
            padded += 1
        else:
             exact += 1
    print("Padded: {}".format(padded))
    print("Equal: {}".format(exact))
    print("Truncated: {}".format(truncated))
    print("Average Length: {}".format(total_len/len(data)))

In [None]:
# Report length statistics of the vectorized samples against a length of 400.
test_len(vectorized_data, 400)

In [None]:
# Rebind maxlen (previously 400) to 210 for anything run after this cell.
# NOTE(review): presumably chosen from the length statistics printed by
# test_len above; confirm.
maxlen = 210