In [1]:
from keras.models import Sequential  
from keras.layers.core import Dense, Activation
from keras.layers.recurrent import LSTM
from keras.layers import Flatten
from keras.layers import Embedding

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import numpy as np

In [53]:
# Load data

def load_list(filename):
    vocabulary = []
    with open(filename, 'r') as f:
        for l in f:
            vocabulary.append(l.strip())
    return np.asarray(vocabulary)

def load_csv(filename):
    import csv
    
    sentence = []
    with open(filename, 'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            sentence.append(str(row))
    return np.asarray(sentence).flatten()
    
pos_related = load_csv('./sentence_data/pos_related.csv')
neg_related = load_csv('./sentence_data/neg_related.csv')
pos_unrelated = load_csv('./sentence_data/pos_unrelated.csv')
neg_unrelated = load_csv('./sentence_data/neg_unrelated.csv')

In [54]:
print(pos_related.shape)
print(neg_related.shape)
print(pos_unrelated.shape)
print(neg_unrelated.shape)

(466,)
(198,)
(388,)
(149,)


In [55]:
# Stack the data

related_set = np.hstack((pos_related, neg_related))
print(related_set.shape)
y_related = np.ones(related_set.shape)
print(y_related.shape)

unrelated_set = np.hstack((pos_unrelated, neg_unrelated))
print(unrelated_set.shape)
y_unrelated = np.zeros(unrelated_set.shape)
print(y_unrelated.shape)

X_stack = np.hstack((related_set, unrelated_set))
y_stack = np.hstack((y_related, y_unrelated))

X_stack.shape

(664,)
(664,)
(537,)
(537,)


(1201,)

In [81]:
import numpy as np
from random import sample 

random.seed(42)
param = 0.7
partition = int(len(X_stack) * param)
indices = sample(range(len(X_stack)),partition)

X_train_corpus = np.asarray(X_stack[indices])
X_test_corpus = np.asarray(np.delete(X_stack,indices))
y_train = y_stack[indices]
y_test = np.delete(y_stack, indices)

In [57]:
# import random
# random.seed(42)

# param = 0.7
# train_size = int(param * len(X_stack))
# test_size = len(X_stack) - train_size
# train_indices = random.sample(range(0, len(X_stack)), train_size)
# test_indices = []

# for i in range(0,len(X_stack)):
#     flag = 0
#     for j in train_indices:
#         if i == j:
#             flag = 1
#     if flag == 0:
#         test_indices.append(i)
            

In [58]:
# X_train_corpus = X_stack[train_indices]
# X_test_corpus = X_stack[test_indices]
# y_train = y_stack[train_indices]
# y_test = y_stack[test_indices]

In [59]:
print(len(X_train_corpus))
print(len(X_test_corpus))

840
361


In [60]:
print(X_stack[1000:])

["['what is very french about this film is the time taken to establish the two leading character']"
 "['dear god where do i begin']"
 "['legend of zu is possibly 6hours condensed into 1h40']"
 "['i started watching it about two and a half year ago']"
 "['he views the earth and nature with such admiration and respect that it is primitive in a good sense']"
 "['this is the second movie based on the life and time of ultra hung porn star, john curtis estes, better known as john holme']"
 '[\'when i was six yo, i learned about a series called "los campeone", and even if i was just a kid i did everything i need to convince my pare not to let me watch "the champion" and "the avenger" once every week\']'
 "['how much to russia, how much to britian']"
 "['hispanic and asian in particular seem cursed to playing villain in western and action movie']"
 "['i am a big fan of film where person get conned']"
 "['but good there is also a pause-button']"
 "['i was a junior in high school when this sitco

In [61]:
from sklearn.feature_extraction.text import CountVectorizer

token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=1, binary=False, token_pattern=token)
tf_vectorizer.set_params(ngram_range=(1,1))

# X_vectorized = tf_vectorizer.fit_transform(X_stack)

# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y_stack, test_size=0.33, random_state=42)

tf_vectorizer.fit(X_test_corpus)

word_dict = tf_vectorizer.vocabulary_

In [62]:
word_dict

{'with': 1738,
 'more': 1026,
 'laugh': 894,
 'than': 1553,
 'any': 101,
 'other': 1121,
 'third': 1570,
 'in': 796,
 'a': 29,
 'disney': 451,
 'series': 1378,
 'movie': 1033,
 'hakuna': 712,
 'matata': 979,
 'is': 830,
 'worth': 1753,
 'watching': 1700,
 'if': 788,
 'only': 1112,
 'for': 623,
 'the': 1556,
 'hot': 770,
 'tub': 1618,
 'scene': 1346,
 'which': 1722,
 'still': 1477,
 'funny': 647,
 'despite': 422,
 'being': 177,
 'little': 927,
 'bit': 195,
 'predictable': 1198,
 'it': 832,
 'touched': 1603,
 'me': 984,
 'way': 1701,
 'that': 1554,
 'even': 538,
 'all': 74,
 'these': 1564,
 'year': 1765,
 'later': 893,
 'affects': 60,
 'anyway': 105,
 'story': 1484,
 'line': 922,
 'was': 1695,
 'although': 80,
 'simple': 1406,
 'but': 245,
 'very': 1666,
 'real': 1254,
 'and': 94,
 'touching': 1604,
 'well': 1713,
 'made': 957,
 'stylish': 1499,
 'while': 1723,
 'ultimately': 1626,
 'making': 964,
 'sense': 1374,
 'this': 1572,
 'thriller': 1581,
 'would': 1754,
 'work': 1747,
 'better':

In [67]:
# Make a function call for this (to call the padding sequence for train and test)

def generate_token_sequence(X_corpus):
    from textblob import TextBlob
    import re

    token_pattern = re.compile(token)
    X = []
    i=0
    for sentence in X_corpus:
        split = token_pattern.findall(sentence)
        seq = []
        for word in split:
            try:
                seq.append(word_dict[word])
            except KeyError:
                continue
        X.append(seq)

#     X = np.asarray(X)

#     print(X.shape)
    return np.asarray(X)

In [68]:
X_train = generate_token_sequence(X_train_corpus)
X_test = generate_token_sequence(X_test_corpus)

In [65]:
y_stack[260]

1.0

In [31]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y_stack, test_size=0.33, random_state=42)

In [69]:
from keras.preprocessing.sequence import pad_sequences

print('Pad sequences (samples x time)')
maxlen=100
x_train = pad_sequences(X_train, maxlen=maxlen)
x_test = pad_sequences(X_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (840, 100)
x_test shape: (361, 100)


In [71]:
dict_len = len(tf_vectorizer.get_feature_names())
batch_size = 32
print(dict_len)

1779


In [72]:
hidden_neurons = 10

In [88]:
print('Build model...')
model = Sequential()
model.add(Embedding(dict_len, 500))
model.add(LSTM(output_dim=hidden_neurons, input_dim=500))
model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam')   

Build model...


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


In [89]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 500)         889500    
_________________________________________________________________
lstm_2 (LSTM)                (None, 10)                20440     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 11        
Total params: 909,951
Trainable params: 909,951
Non-trainable params: 0
_________________________________________________________________


In [90]:
from keras.callbacks import TensorBoard
from time import time

# tensorboard --logdir=logs/
tensorboard = TensorBoard(log_dir="logs/{}".format(time()), write_graph=True, write_images=True)

In [93]:

print('Train...')
hist = model.fit(x_train, y_train, epochs=5, verbose=1, callbacks=[tensorboard])

Train...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [94]:

y_predict = model.predict(x_test)

y = []

for pred in y_predict:
    if pred > 0.5:
        y.append(1)
    else:
        y.append(0)

        
np.sum(y_test == y)/len(y)

0.7562326869806094

In [80]:
y_test

array(["['with more laugh than any other third-in-a-disney-series movie, hakuna matata is worth watching - if only for the hot tub scene which is still funny despite being a little bit predictable.']",
       "['it touched me in a way that, even all these year later, still affects me.']",
       "['anyway the story line was although simple, but still very real and touching']",
       "['well made and stylish while still ultimately making sense this thriller would work better for non giallo fan to get interested in the genre than the later argento entry which go overboard in all directions']",
       "['since i like that sort of film i enjoyed this']",
       "['i wanted to see it again as soon as it was over.']",
       "['this film hits home as one of the most powerful and emotionally affecting film in recent times']",
       "['more like spinal tap than anything else, the film is clever from the start']",
       "['overall, this is a pretty good movie']",
       "['if anyone loves th

In [12]:
# define the model
model = Sequential()
# model.add(Embedding(voc, 32, input_length=max_length))
# model.add(Flatten())
model.add(LSTM(output_dim=hidden_neurons, input_dim=in_neurons))

model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'], class_mode="binary")
# summarize the model
print(model.summary())

  """
  """


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 10)                147840    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 147,851
Trainable params: 147,851
Non-trainable params: 0
_________________________________________________________________
None
