In [128]:
from keras.models import Sequential  
from keras.layers.core import Dense, Activation
from keras.layers.recurrent import LSTM
from keras.layers import Flatten
from keras.layers import Embedding

In [129]:
import numpy as np

In [205]:
# Load data

def load_list(filename):
    vocabulary = []
    with open(filename, 'r') as f:
        for l in f:
            vocabulary.append(l.strip())
    return np.asarray(vocabulary)

def load_csv(filename):
    import csv
    
    sentence = []
    with open('./sentence_data/pos_related.csv', 'r') as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            sentence.append(row)
    return np.asarray(sentence).flatten()
    
pos_related = load_csv('./sentence_data/pos_related.csv')
neg_related = load_csv('./sentence_data/neg_related.csv')
pos_unrelated = load_csv('./sentence_data/pos_unrelated.csv')
neg_unrelated = load_csv('./sentence_data/neg_unrelated.csv')

In [206]:
print(pos_related.shape)
print(neg_related.shape)
print(pos_unrelated.shape)
print(neg_unrelated.shape)

(466,)
(466,)
(466,)
(466,)


In [207]:
# Stack the data

related_set = np.hstack((pos_related, neg_related))
print(related_set.shape)
y_related = np.ones(related_set.shape)
print(y_related.shape)

unrelated_set = np.hstack((pos_unrelated, neg_unrelated))
print(unrelated_set.shape)
y_unrelated = np.zeros(unrelated_set.shape)
print(y_unrelated.shape)

X_stack = np.hstack((related_set, unrelated_set))
y_stack = np.hstack((y_related, y_unrelated))

X_stack.shape

(932,)
(932,)
(932,)
(932,)


(1864,)

In [208]:
from sklearn.feature_extraction.text import CountVectorizer

token = r"(?u)\b[\w\'/]+\b"
tf_vectorizer = CountVectorizer(lowercase=True, max_df=1.0, min_df=1, binary=False, token_pattern=token)
tf_vectorizer.set_params(ngram_range=(1,1))

# X_vectorized = tf_vectorizer.fit_transform(X_stack)

# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y_stack, test_size=0.33, random_state=42)

tf_vectorizer.fit(X_stack)

word_dict = tf_vectorizer.vocabulary_

In [209]:
word_dict

{'not': 1072,
 'perfect': 1140,
 'by': 234,
 'a': 27,
 'long': 914,
 'shot': 1406,
 'but': 232,
 'definitely': 381,
 'good': 672,
 'for': 612,
 'smile': 1433,
 'on': 1099,
 'bad': 146,
 'day': 369,
 'the': 1578,
 'whole': 1757,
 'cast': 251,
 'was': 1729,
 'great': 681,
 'each': 451,
 'character': 269,
 'had': 694,
 'their': 1581,
 'own': 1121,
 'personality': 1150,
 'and': 95,
 'charm': 272,
 'even': 515,
 'though': 1601,
 'it': 824,
 'has': 705,
 'one': 1101,
 'of': 1089,
 'standard': 1483,
 'revenge': 1306,
 'price': 1206,
 'plot': 1178,
 'this': 1595,
 'film': 584,
 'is': 821,
 'my': 1034,
 'favorite': 564,
 'vincent': 1708,
 "price's": 1207,
 'work': 1779,
 'i': 770,
 'really': 1264,
 'enjoyed': 487,
 'movie': 1023,
 'fun': 636,
 'to': 1620,
 'watch': 1731,
 'get': 657,
 'elvira': 472,
 'into': 810,
 'all': 78,
 'these': 1588,
 'adventure': 58,
 'she': 1400,
 'just': 844,
 'with': 1769,
 'more': 1015,
 'laugh': 878,
 'than': 1573,
 'any': 104,
 'other': 1114,
 'third': 1593,
 'in'

In [211]:
from textblob import TextBlob
import re

token_pattern = re.compile(token)
X = []
i=0
for sentence in X_stack:
    split = token_pattern.findall(sentence)
    seq = []
    for word in split:
        seq.append(word_dict[word])
    X.append(seq)
    
X = np.asarray(X)

print(X.shape)

(1864,)


In [196]:
y_stack[260]

1.0

In [221]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_stack, test_size=0.33, random_state=42)

In [248]:
print('Pad sequences (samples x time)')
maxlen=100
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (1248, 100)
x_test shape: (616, 100)


In [249]:
dict_len = len(tf_vectorizer.get_feature_names())
batch_size = 32
print(dict_len)

1806


In [258]:
print('Build model...')
model = Sequential()
model.add(Embedding(dict_len, 500))
model.add(LSTM(output_dim=hidden_neurons, input_dim=500))
model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy', optimizer='adam')   

Build model...


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


In [257]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, None, 500)         903000    
_________________________________________________________________
lstm_19 (LSTM)               (None, 10)                20440     
_________________________________________________________________
dense_19 (Dense)             (None, 1)                 11        
Total params: 923,451
Trainable params: 923,451
Non-trainable params: 0
_________________________________________________________________


In [259]:

print('Train...')
model.fit(x_train, y_train,
          epochs=50)
#           validation_data=(x_test, y_test))


Train...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1ce977ecf98>

In [267]:

y_predict = model.predict(x_test)

y = []

for pred in y_predict:
    if pred > 0.5:
        y.append(1)
    else:
        y.append(0)

        
np.sum(y_test == y)/len(y)

0.21915584415584416

In [228]:
in_neurons = voc 

hidden_neurons = 10

In [12]:
# define the model
model = Sequential()
# model.add(Embedding(voc, 32, input_length=max_length))
# model.add(Flatten())
model.add(LSTM(output_dim=hidden_neurons, input_dim=in_neurons))

model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'], class_mode="binary")
# summarize the model
print(model.summary())

  """
  """


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 10)                147840    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 147,851
Trainable params: 147,851
Non-trainable params: 0
_________________________________________________________________
None
