In [1]:
import keras.backend as K
import multiprocessing
import tensorflow as tf
import pandas as pd
import numpy as np

from math import ceil

from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

from sklearn.model_selection import train_test_split

from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv1D
from keras.optimizers import Adam

from keras.preprocessing.text import text_to_word_sequence

import os.path
import gc

from time import gmtime, strftime

import seaborn as sns
import matplotlib.pyplot as plt

gc.collect()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


0

In [2]:
from keras.preprocessing.text import text_to_word_sequence

stop_words = pd.read_csv('../data/stopwords.csv')['words'].values

def remove_stop_words(text):
    word_tokens = text_to_word_sequence(text) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    return ' '.join(filtered_sentence)

print(stop_words)

['only' 'y' 'by' 'am' 'most' 'me' 'same' 'these' 'so' 'some' 'why' 'down'
 'had' 'd' 'at' 'having' 'those' 'has' 'few' 'theirs' "you've" 'more' 'i'
 'than' 'through' 'be' 'what' 'where' 'myself' 'which' 'doing' 'ours'
 'will' 'in' 'both' 'do' 'it' 'o' 'on' 'yours' 'once' 'ourselves' 'here'
 'about' "it's" 'my' 'for' 'her' 'then' 'after' "should've" 'from' 'each'
 'when' 'does' 'now' 'off' 'don' 'are' 'we' 'itself' 'should' 'his'
 'between' 'our' 'were' 'under' 'other' 'all' 'she' 'won' 'been' "you're"
 'how' 'did' 'yourself' 'they' 'into' 'there' 've' 'such' 't' 's' 'and'
 'over' 'to' 'just' 'was' 'being' 'because' 'if' 'who' 'further' 'the'
 'any' "that'll" 'themselves' 'as' 'again' "you'd" 'until' 'he' 'him'
 'this' 'or' 'of' 'below' 'an' "she's" 'weren' 'm' 'their' 'ma' 'up' 'll'
 'whom' 'hers' 'can' 'you' 'them' 'very' 'a' 'herself' 'before' 'too'
 'himself' 'during' 're' 'out' 'its' 'above' 'own' 'have' 'while'
 'yourselves' 'that' 'with' "you'll" 'is' 'your']


In [3]:
use_gpu = True

config = tf.ConfigProto(intra_op_parallelism_threads=multiprocessing.cpu_count(), 
                        inter_op_parallelism_threads=multiprocessing.cpu_count(), 
                        allow_soft_placement=True, 
                        device_count = {'CPU' : 1, 
                                        'GPU' : 1 if use_gpu else 0})

session = tf.Session(config=config)
K.set_session(session)

In [4]:
data = pd.read_csv('../data/twitter-airline-sentiment.csv')

In [5]:
data['text'] = list(map(remove_stop_words, data['text'].values))
data.head()

Unnamed: 0,text,pos,neg
0,said,0.0,0.0
1,plus youve added commercials experience tacky,1.0,0.0
2,didnt today must mean need take another trip,0.0,0.0
3,really aggressive blast obnoxious entertainmen...,0.0,1.0
4,really big bad thing,0.0,1.0


In [6]:
corpus = [text_to_word_sequence(y) for y in [x[0] for x in data[['text']].values]]
labels = [np.array(x[[0, 1]]) for x in data[['pos', 'neg']].values]
    
print('Corpus size: {}'.format(len(corpus)))

Corpus size: 13871


In [7]:
max_sentence_length = 35
vector_size = 300

In [8]:
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors

word2vec = Word2Vec(sentences=corpus,
                    size=vector_size, 
                    window=10, 
                    negative=20,
                    iter=50,
                    seed=1000,
                    workers=multiprocessing.cpu_count())

vecs_x = word2vec.wv
gc.collect()

7

In [9]:
def pad_vec_data(corpus):
    gc.collect()
    input_matrix = np.zeros((len(corpus), max_sentence_length, vector_size), dtype=K.floatx())
    for i in range(len(corpus)):
        for t, token in enumerate(corpus[i]):
            if t >= max_sentence_length:
                break
            if token not in vecs_x:
                continue
            input_matrix[i, t, :] = vecs_x[token]
    return input_matrix

In [10]:
# Keras convolutional model
gc.collect()
batch_size = 16
nb_epochs = 20

model = Sequential()

model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same', input_shape=(max_sentence_length, vector_size)))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Dropout(0.25))

model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Dropout(0.25))

model.add(Flatten())

model.add(Dense(256, activation='tanh'))
model.add(Dense(256, activation='tanh'))
model.add(Dropout(0.5))

model.add(Dense(2, activation='softmax'))


model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=0.0001, decay=1e-6),
              metrics=['accuracy'])

In [11]:
gc.collect()

train_x, test_x, train_y, test_y = train_test_split(
            pad_vec_data(corpus), 
            labels, 
            test_size=0.2, 
            random_state=3945)

train_x = np.array(train_x)
test_x = np.array(test_x)
train_y = np.array(train_y)
test_y = np.array(test_y)
gc.collect()

history = model.fit(train_x, train_y,
          batch_size=batch_size,
          shuffle=True,
          epochs=nb_epochs,
          validation_data=(test_x, test_y),
#                   verbose=0,
          callbacks=[
#                       EarlyStopping(min_delta=0.000025, patience=10),
          ])    

Train on 11096 samples, validate on 2775 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [12]:
mess = [
    "hello there, my name is mike",
    "not something i wanted to happen but i think i need to add more words for it to see",
    "this was something i wanted i'm grateful for this app and i am looking forward to more things from you guys",
    "i don't fucking like you man",
    "considering how the usual process for this is tedious, having an app like this solves a lot of problems",
    "thank you that was some wonderful service that really helped me get from point A to point B",
    "this is amazing and was able to shorten the amount of time i needed to take to achieve this",
    "this is nothing short of amazing, I cannot believe it",
    "how are you",
    "i really like you",
    "i don't actually like this product!",
    "it's useless if you can't use it!",
    "i can't believe i never heard of this",
    "this is awesome, didn't know I needed this",
    "so what am i supposed to use this for?",
    "what do I need this for?",
    "this is good",
    "this is not good",
    "although the movie was great, it lacked impact",
    "the movie wasnt that nice",
    "the movie was nice",
    "this is not acceptable, I lost everything using your app",
    "that was kinda stupid",
    "the instructions were unlear and is not friendly for non-techy people",
    "this is really useful i would definitely tell everyone about it",
    "i need to try this!",
    "which is your favourite harry potter filmsorcerers stonechamber of secretsprisoner of azkabangoblet of firei like them all equallyi hate harry potter and think this is a stupid question",
    "sitting in the third row of the imax cinema at sydney s darling harbour  but i sometimes felt as though i was in the tiny two seater plane that carried the giant camera around australia  sweeping and gliding  banking and hovering over some of the most not",
]

pred = model.predict(pad_vec_data(list(map(remove_stop_words, mess))))
output = ''

for i ,m in enumerate(mess):
    output += ('{} {} {}\n'.format('POSITIVE:' if pred[i][0] > 0.5 else 'NEGATIVE:', m, pred[i]))

print(output)

NEGATIVE: hello there, my name is mike [0.00110999 0.99889   ]
NEGATIVE: not something i wanted to happen but i think i need to add more words for it to see [0.00417716 0.9958228 ]
NEGATIVE: this was something i wanted i'm grateful for this app and i am looking forward to more things from you guys [0.33732253 0.66267747]
NEGATIVE: i don't fucking like you man [0.01512365 0.9848763 ]
NEGATIVE: considering how the usual process for this is tedious, having an app like this solves a lot of problems [0.12617864 0.8738213 ]
POSITIVE: thank you that was some wonderful service that really helped me get from point A to point B [0.99155957 0.00844036]
POSITIVE: this is amazing and was able to shorten the amount of time i needed to take to achieve this [0.79982466 0.20017536]
POSITIVE: this is nothing short of amazing, I cannot believe it [0.91864926 0.0813507 ]
NEGATIVE: how are you [0.23402707 0.765973  ]
NEGATIVE: i really like you [9.1120700e-04 9.9908876e-01]
NEGATIVE: i don't actually like 