In [1]:
import glob
import os

from random import shuffle

def pre_process_data(filepath):
    """Load labelled review texts from ``filepath`` and return them shuffled.

    ``filepath`` must contain a ``pos`` and a ``neg`` sub-directory. Every
    ``*.txt`` file under ``pos`` is labelled 1 and every ``*.txt`` file under
    ``neg`` is labelled 0.

    Args:
        filepath: Directory that holds the ``pos`` and ``neg`` folders.

    Returns:
        A list of ``(label, text)`` tuples in random order.
    """
    # (label, sub-directory) pairs; positives are read first, as before.
    labelled_dirs = ((1, 'pos'), (0, 'neg'))
    dataset = []

    for label, subdir in labelled_dirs:
        pattern = os.path.join(filepath, subdir, '*.txt')
        for filename in glob.glob(pattern):
            # Decode explicitly as UTF-8 so the result does not depend on the
            # platform's default encoding (e.g. cp1252 on Windows).
            # NOTE(review): assumes the review files are UTF-8 - confirm.
            # Undecodable bytes are still dropped, as in the original.
            with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
                dataset.append((label, f.read()))

    # Mix positive and negative samples so later splits are not ordered by label.
    shuffle(dataset)

    return dataset

In [2]:
# Load and shuffle every labelled review found under ./train/pos and ./train/neg.
dataset = pre_process_data('train')
# Peek at the first (label, text) pair.
dataset[0]

(1,
 'When I first heard about the show, I heard a lot about it, and it was getting some good reviews. I watched the first episode of this "forensic fairy tale", as it so proclaims itself, and I really got hooked on it. I have loved it since. This show has a good sense of humour and it\'s fun to see a good show like this. The cast is excellent as their characters, and I wouldn\'t want to change them in any way.<br /><br />For those unfamiliar with this show, Pushing Daisies centers around a man named Ned (aka The Pie Maker, played by Lee Pace) who discovered a special gift when he was a boy: He could bring the dead back to life with the touch of a finger. He first did so with his dog, Digby. However, there is the catch: If he keeps a dead person alive for more than one minute, someone else dies. He learned this when he brought his mother back to life, and his childhood crush\'s father died in Ned\'s mother\'s place. The other catch is if he touches the person again, they\'re dead again

In [9]:
import os

from gensim.models.keyedvectors import KeyedVectors

# Location of the pretrained GoogleNews word2vec file. Set the WORD2VEC_PATH
# environment variable to point at your own copy instead of editing this cell;
# the default is the original author's local path so existing setups still work.
WORD2VEC_PATH = os.environ.get(
    'WORD2VEC_PATH',
    'C:/Users/markn/Artificial_Intelligence/NLP/GoogleNews-vectors-negative300.bin.gz')

# binary=True reads the binary word2vec format; limit caps how many vectors
# are read from the file.
word_vectors = KeyedVectors.load_word2vec_format(
    WORD2VEC_PATH, binary=True, limit=2000000)

In [7]:
from nltk.tokenize import TreebankWordTokenizer
def tokenize_and_vectorize(dataset):
    """Convert the text of every sample into a list of word vectors.

    Args:
        dataset: Iterable of samples whose element at index 1 is the text to
            tokenize. Index 0 is not read by this function.

    Returns:
        A list with one entry per sample. Each entry is the list of
        ``word_vectors[token]`` lookups, in token order, for the tokens that
        are present in the module-level ``word_vectors`` mapping.
    """
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    for sample in dataset:
        sample_vecs = []
        for token in tokenizer.tokenize(sample[1]):
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                # Token is not in the embedding vocabulary: skip it on purpose
                # so unknown words simply contribute nothing to the sample.
                continue

        vectorized_data.append(sample_vecs)
    return vectorized_data

In [5]:
def collect_expected(dataset):
    """Return the label (element 0) of every sample in ``dataset``, in order."""
    return [sample[0] for sample in dataset]

In [6]:
# Features: one list of word vectors per review.
vectorized_data = tokenize_and_vectorize(dataset)
# Targets: the label of each review, in the same order as the features.
expected = collect_expected(dataset)

In [7]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation; the fixed random_state makes the
# split repeatable for a given input order.
X_train, X_test, y_train, y_test = train_test_split(
    vectorized_data,
    expected,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Hyperparameters shared by the preprocessing and model cells.
maxlen = 400          # number of token vectors per sample after pad_trunc
batch_size = 32       # samples per gradient update, passed to fit()
embedding_dims = 300  # length of each word vector (last axis of the input)
filters = 250         # number of Conv1D filters
kernel_size = 3       # Conv1D window width, in tokens
hidden_dims = 250     # units in the hidden Dense layer
epochs = 2            # passes over the training data, passed to fit()

In [10]:
def pad_trunc(data, maxlen):
    """Force every sample in ``data`` to contain exactly ``maxlen`` vectors.

    Samples longer than ``maxlen`` are cut to their first ``maxlen`` vectors.
    Shorter samples are extended at the end with zero vectors of the same
    length as the real vectors. The input samples are never modified: every
    returned sample is a new list.

    Args:
        data: Sequence of samples, each a sequence of equal-length vectors.
        maxlen: Number of vectors every returned sample must contain.

    Returns:
        A new list with one list of ``maxlen`` vectors per input sample.
        An empty ``data`` yields an empty list.

    Raises:
        ValueError: If a sample needs padding but no sample in ``data``
            contains a vector, so the zero-vector length is unknown.
    """
    # Take the vector length from the first sample that has any vectors; the
    # first sample itself may be empty (e.g. no in-vocabulary tokens).
    vector_dim = next(
        (len(sample[0]) for sample in data if len(sample) > 0), None)

    new_data = []
    for sample in data:
        missing = maxlen - len(sample)
        if missing > 0:
            if vector_dim is None:
                raise ValueError(
                    'cannot pad: no sample contains a vector to infer the '
                    'vector length from')
            # Copy before extending so the caller's sample is left untouched.
            padded = list(sample)
            padded.extend([0.0] * vector_dim for _ in range(missing))
        else:
            # Exact length or too long: keep (at most) the first maxlen vectors.
            padded = list(sample[:maxlen])
        new_data.append(padded)
    return new_data

In [10]:
# Number of training samples produced by the split.
len(X_train)

20000

In [11]:
import numpy as np
# Bring every sample to exactly maxlen vectors (truncate or zero-pad).
X_train = pad_trunc(X_train, maxlen)
X_test = pad_trunc(X_test, maxlen)

# Convert the nested lists into a (samples, maxlen, embedding_dims) array.
X_train = np.reshape(X_train, (len(X_train), maxlen, embedding_dims))

In [12]:
# Check the array layout: (samples, maxlen, embedding_dims).
X_train.shape

(20000, 400, 300)

In [13]:
# The sample count is unchanged by the reshape.
len(X_train)

20000

In [14]:
# Convert the training labels from a Python list to a NumPy array.
y_train = np.array(y_train)
# Convert the padded test samples into a (samples, maxlen, embedding_dims) array.
X_test = np.reshape(X_test, (len(X_test), maxlen, embedding_dims))

In [15]:
# Convert the test labels from a Python list to a NumPy array.
y_test = np.array(y_test)

In [16]:
from keras.models import Sequential

Using TensorFlow backend.


In [17]:
# Progress marker before the model is assembled in the following cells.
print('Build model..')

Build model..


In [18]:
from keras.layers import Conv1D
# Start a layer-by-layer model with a 1D convolution over the token axis.
model2 = Sequential()
model2.add(Conv1D(
    filters,
    kernel_size,
    padding='valid',  # no padding: output length is maxlen - kernel_size + 1
    activation='relu',
    strides=1,  # slide the window one token at a time
    input_shape=(maxlen, embedding_dims)))  # one sample: maxlen vectors of embedding_dims values

In [19]:
# Inspect the output shape and parameter count after the convolution layer.
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 398, 250)          225250    
Total params: 225,250
Trainable params: 225,250
Non-trainable params: 0
_________________________________________________________________


In [20]:
from keras.layers import GlobalAveragePooling1D

In [21]:
# Average each filter's activations over all positions: one value per filter.
model2.add(GlobalAveragePooling1D())

In [22]:
# Confirm the pooled output shape; pooling adds no trainable parameters.
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 398, 250)          225250    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 250)               0         
Total params: 225,250
Trainable params: 225,250
Non-trainable params: 0
_________________________________________________________________


In [23]:
from keras.layers import Dense, Dropout, Activation

In [24]:
# Fully connected hidden layer with hidden_dims units.
model2.add(Dense(hidden_dims))
# Dropout with rate 0.2, applied to the Dense output before the activation.
model2.add(Dropout(0.2))
model2.add(Activation('relu'))

In [25]:
# Inspect the model after adding the hidden Dense / Dropout / ReLU stack.
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 398, 250)          225250    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_1 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation_1 (Activation)    (None, 250)               0         
Total params: 288,000
Trainable params: 288,000
Non-trainable params: 0
_________________________________________________________________


In [26]:
# Output layer: a single unit squashed by a sigmoid, giving one score in (0, 1)
# per sample.
model2.add(Dense(1))
model2.add(Activation('sigmoid'))

In [27]:
# Inspect the complete architecture.
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 398, 250)          225250    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 250)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_1 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation_1 (Activation)    (None, 250)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 251       
_________________________________________________________________
activation_2 (Activation)    (None, 1)                

In [29]:
# Binary cross-entropy matches the single sigmoid output and the 0/1 labels;
# accuracy is reported alongside the loss during training.
model2.compile(loss='binary_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

In [30]:
# Train the model. The held-out split (X_test, y_test) is passed as validation
# data, so it is evaluated after every epoch.
model2.fit(X_train, y_train, 
          batch_size= batch_size,
          epochs=epochs,
          validation_data=(X_test, y_test))

Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x28a0a501c48>

In [38]:
# Save the model architecture as JSON; the weights are saved separately.
model_structure = model2.to_json()
with open("cnn_model.json", 'w') as json_file:
    json_file.write(model_structure)

In [39]:
# Save the trained weights to an HDF5 file next to the JSON architecture.
model2.save_weights("cnn_weights.h5")

In [1]:
import numpy as np
# Seed NumPy's global random number generator.
# NOTE(review): the execution counter restarts at 1 here, so this looks like a
# fresh kernel session - confirm that the earlier definitions are re-run first.
np.random.seed(1337)

In [2]:
from keras.models import model_from_json

Using TensorFlow backend.


In [3]:
# Rebuild the model architecture from the JSON file written earlier.
with open("cnn_model.json", 'r') as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)

In [4]:
# Restore the trained weights into the rebuilt architecture.
model.load_weights('cnn_weights.h5')

In [5]:
# Hand-written text used to try the reloaded model on unseen input.
sample1 = """I hate that the dismal weather had me down for so long,
when will it break! Ugh, when does happiness return? The sun is blinding
and the puffy clouds are too thin. I can't wait for the weekend."""

In [12]:
# Wrap the text in a one-sample dataset. The label 1 is only a placeholder:
# tokenize_and_vectorize reads the text at index 1 and ignores index 0.
vec_list = tokenize_and_vectorize([(1, sample1)])

In [13]:
# Pad or truncate the single sample to the maxlen the network expects.
test_vec_list = pad_trunc(vec_list, maxlen)

In [15]:
# Sanity check: the list holds exactly one sample.
len(vec_list)

1

In [19]:
# Shape the padded sample as a (samples, maxlen, embedding_dims) batch.
test_vec = np.reshape(
    test_vec_list,
    (len(test_vec_list), maxlen, embedding_dims),
)

In [21]:
# Raw sigmoid score for the sample, a value between 0 and 1.
model.predict(test_vec)

array([[0.23705384]], dtype=float32)

In [22]:
# Hard 0/1 class for the sample. `Sequential.predict_classes` has been removed
# from current Keras/TensorFlow releases; thresholding the sigmoid score at 0.5
# is the documented equivalent for a single-unit binary classifier and also
# works on the older Keras version used here.
(model.predict(test_vec) > 0.5).astype('int32')

array([[0]])