In [2]:
from keras.models import Sequential  
from keras.layers.core import Dense, Activation
from keras.layers.recurrent import LSTM
from keras.layers import Flatten
from keras.layers import Embedding

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
import numpy as np

In [4]:
# Load data

def load_list(filename):
    vocabulary = []
    with open(filename, 'r') as f:
        for l in f:
            vocabulary.append(l.strip())
    return np.asarray(vocabulary)

pos_related = load_list('./sentence_data/pos_related.txt')
neg_related = load_list('./sentence_data/neg_related.txt')
pos_unrelated = load_list('./sentence_data/pos_unrelated.txt')
neg_unrelated = load_list('./sentence_data/neg_unrelated.txt')

In [5]:
print(pos_related.shape)
print(neg_related.shape)
print(pos_unrelated.shape)
print(neg_unrelated.shape)

(466,)
(83,)
(388,)
(34,)


In [13]:
# Stack the data

related_set = np.hstack((pos_related, neg_related))
print(related_set.shape)
y_related = np.ones(related_set.shape)
print(y_related.shape)

unrelated_set = np.hstack((pos_unrelated, neg_unrelated))
print(unrelated_set.shape)
y_unrelated = np.zeros(unrelated_set.shape)
print(y_unrelated.shape)

X_stack = np.hstack((related_set, unrelated_set))
y_stack = np.hstack((y_related, y_unrelated))

X_stack.shape

(549,)
(549,)
(422,)
(422,)


(971,)

'not perfect by a long shot, but definitely good for a smile on a bad day.'

In [103]:
from sklearn.feature_extraction.text import CountVectorizer

token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=1, binary=False, token_pattern=token)
tf_vectorizer.set_params(ngram_range=(1,1))

# X_vectorized = tf_vectorizer.fit_transform(X_stack)

# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y_stack, test_size=0.33, random_state=42)

tf_vectorizer.fit(X_stack)

word_dict = tf_vectorizer.vocabulary_

In [104]:
word_dict

{'not': 2244,
 'perfect': 2393,
 'by': 506,
 'a': 72,
 'long': 1919,
 'shot': 2901,
 'but': 502,
 'definitely': 851,
 'good': 1408,
 'for': 1293,
 'smile': 2967,
 'on': 2295,
 'bad': 318,
 'day': 825,
 'the': 3249,
 'whole': 3591,
 'cast': 548,
 'was': 3541,
 'great': 1430,
 'each': 1012,
 'character': 579,
 'had': 1463,
 'their': 3252,
 'own': 2337,
 'personality': 2408,
 'and': 199,
 'charm': 583,
 'even': 1120,
 'though': 3279,
 'it': 1725,
 'has': 1492,
 'one': 2297,
 'of': 2277,
 'standard': 3059,
 'revenge': 2716,
 'price': 2515,
 'plot': 2455,
 'this': 3271,
 'film': 1241,
 'is': 1721,
 'my': 2168,
 'favorite': 1208,
 'vincent': 3501,
 "price's": 2516,
 'work': 3629,
 'i': 1615,
 'really': 2628,
 'enjoyed': 1076,
 'movie': 2144,
 'fun': 1336,
 'to': 3309,
 'watch': 3546,
 'get': 1374,
 'elvira': 1050,
 'into': 1702,
 'all': 169,
 'these': 3260,
 'adventure': 134,
 'she': 2884,
 'just': 1772,
 'with': 3614,
 'more': 2132,
 'laugh': 1845,
 'than': 3242,
 'any': 218,
 'other': 2321

In [120]:
from textblob import TextBlob
import re

token_pattern = re.compile(token)
X = []
i=0
for sentence in X_stack[:50]:
    split = token_pattern.findall(sentence)
#     mat = tf_vectorizer.transform(split.words).todense()
    seq = []
    print(split)
    for word in split:
        seq.append(word_dict[word])
#     print(seq)
    X.append(seq)
    print(i)
    i=i+1
    
X = np.asarray(X)

print(X.shape)

['not', 'perfect', 'by', 'a', 'long', 'shot', 'but', 'definitely', 'good', 'for', 'a', 'smile', 'on', 'a', 'bad', 'day']
0
['the', 'whole', 'cast', 'was', 'great', 'each', 'character', 'had', 'their', 'own', 'personality', 'and', 'charm']
1
['even', 'though', 'it', 'has', 'one', 'of', 'the', 'standard', 'revenge', 'price', 'plot', 'this', 'film', 'is', 'my', 'favorite', 'of', 'vincent', "price's", 'work']
2
['i', 'really', 'enjoyed', 'this', 'movie', 'it', 'is', 'really', 'fun', 'to', 'watch', 'get', 'elvira', 'into', 'all', 'these', 'adventure', 'she', 'is', 'just', 'great']
3
['with', 'more', 'laugh', 'than', 'any', 'other', 'third', 'in', 'a', 'disney', 'series', 'movie', 'hakuna', 'matata', 'is', 'worth', 'watching', 'if', 'only', 'for', 'the', 'hot', 'tub', 'scene', 'which', 'is', 'still', 'funny', 'despite', 'being', 'a', 'little', 'bit', 'predictable']
4
['it', 'is', 'really', 'a', 'wonderful', 'thriller', 'i', 'enjoyed', 'very', 'much']
5
['when', 'my', 'sister', 'said', 'this'

KeyError: 'Â'

In [121]:
X_stack[48]

'after some of the negative review i heard on this movie, i was doubtful of giving it a go, but i had Â£3.99 in my wallet and thought id gamble on buying a budget like movie and saw this and gave it ago and i am glad i did, i enjoyed it.'

In [122]:
y_stack[48]

1.0

In [88]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_stack, test_size=0.33, random_state=42)

In [89]:
print('Pad sequences (samples x time)')
maxlen=100
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)

Pad sequences (samples x time)
x_train shape: (650,)
x_test shape: (321,)


matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [8]:
voc = len(tf_vectorizer.get_feature_names())

In [9]:
# The X should be 3d matrix, each instance is a 2d. 
# We are only using one hot encoder at each time t. 
# It also could be done using embedding or word2vec. 

# e = Embedding(voc, 32, input_length=50)

In [10]:
max_length = 0
for i in X_stack:
    token = i.split(" ")
    if max_length < len(token):
        max_length = len(token)
print(max_length)

109


In [11]:
in_neurons = voc 

hidden_neurons = 10

In [19]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X_stack, y_stack, test_size=0.33, random_state=42)

from keras.preprocessing import sequence
maxlen=10
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

Pad sequences (samples x time)


TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

In [12]:
# define the model
model = Sequential()
# model.add(Embedding(voc, 32, input_length=max_length))
# model.add(Flatten())
model.add(LSTM(output_dim=hidden_neurons, input_dim=in_neurons))

model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'], class_mode="binary")
# summarize the model
print(model.summary())

  """
  """


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 10)                147840    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 147,851
Trainable params: 147,851
Non-trainable params: 0
_________________________________________________________________
None
