In [1]:
import keras.backend as K
import multiprocessing
import tensorflow as tf
import pandas as pd
import numpy as np

from math import ceil

from gensim.models.word2vec import Word2Vec

from sklearn.model_selection import train_test_split

from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv1D
from keras.optimizers import Adam

from gensim.models import KeyedVectors

from keras.preprocessing.text import text_to_word_sequence

import os.path
import gc

from time import gmtime, strftime

from IPython.display import clear_output

import seaborn as sns
import matplotlib.pyplot as plt

def print_time():
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

gc.collect()
print_time()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


2018-11-25 05:29:43


In [2]:
print_time()
# Select whether using Keras with or without GPU support
# See: https://stackoverflow.com/questions/40690598/can-keras-with-tensorflow-backend-be-forced-to-use-cpu-or-gpu-at-will
use_gpu = True

config = tf.ConfigProto(intra_op_parallelism_threads=multiprocessing.cpu_count(), 
                        inter_op_parallelism_threads=multiprocessing.cpu_count(), 
                        allow_soft_placement=True, 
                        device_count = {'CPU' : 1, 
                                        'GPU' : 1 if use_gpu else 0})

session = tf.Session(config=config)
K.set_session(session)
print_time()

2018-11-25 05:29:43
2018-11-25 05:29:46


In [20]:
print_time()
data = pd.read_json('data/processed.json')
print_time()
print(len(data))
data = data.dropna()
print(len(data))
data = data.drop_duplicates()
print(len(data))
data = data[data[0] != '']
print(len(data))
data = data[data[3] != 'word-list']
data = data[data[3] != 'standford']
data = data[data[3] != 'imdb']
data = data[data[3] != 'reviews']
data = data[data[3] != 'twitter-airline-sentiment']
data = data[data[3] != 'tweets']
# data = data[data[3] == 'sentiment-analysis-dataset']
data = data[data[3] == 'stanford']

print(len(data))

2018-11-25 06:16:26
2018-11-25 06:16:36
2040155
2040154
1868325
1868325
208455


In [21]:
data[1].value_counts()

0.0    116603
1.0     91852
Name: 1, dtype: int64

In [22]:
data[2].value_counts()

0.0    133070
1.0     75385
Name: 2, dtype: int64

In [23]:
counts = dict(data[3].value_counts())

for k in counts:
    filtered = data[data[3] == k]
    total = np.int(counts[k])
    print('Total of {} is {}'.format(k, total))
    print('Positive {}'.format(len(filtered[filtered[1] == 1])/total))  
    print('Negative {}'.format(len(filtered[filtered[2] == 1])/total))
    print('Text max {}'.format(max([x[0] for x in np.array(filtered[[0]])], key=len)))   

    print('\n\n')

Total of stanford is 208455
Positive 0.44063227075387973
Negative 0.36163680410640187
Text max sitting in the third row of the imax cinema at sydney s darling harbour  but i sometimes felt as though i was in the tiny two seater plane that carried the giant camera around australia  sweeping and gliding  banking and hovering over some of the most not





In [28]:
print_time()
corpus = [text_to_word_sequence(y) for y in [x[0] for x in data[[0]].values]]
labels = [np.array(x[[0, 1]]) for x in data[[1, 2]].values]
    
print('Corpus size: {}'.format(len(corpus)))
print_time()

2018-11-25 06:17:41
Corpus size: 208455
2018-11-25 06:17:45


In [8]:
print_time()
# Gensim Word2Vec model
vector_size = 300
window_size = 10

# word2vec_name = 'w2v.bin'
word2vec_name = 'GoogleNews-vectors-negative300.bin'
word2vec = None

# Create Word2Vec
if os.path.isfile(word2vec_name): 
    print("Loading...")
#     word2vec = Word2Vec.load(word2vec_name)
    word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

else:
    print("Computing...")
    word2vec = Word2Vec(sentences=corpus,
                        size=vector_size, 
                        window=window_size, 
                        negative=20,
                        iter=50,
                        seed=1000,
                        workers=multiprocessing.cpu_count())
    word2vec.save(word2vec_name)

# Take vectors of tokens and discard 
vecs_x = word2vec.wv
del word2vec

gc.collect()
print_time()

2018-11-25 05:30:23
Loading...
2018-11-25 05:31:06




In [29]:
def message_mike(text):
    TLGRM_SECRET = '747320373:AAFGP2XI3OtGJ-CqE7z41RRnrR_gq00jeMM'
    TLGRM_MIKE = '248923795'
    TLGRM_URL = 'http://api.telegram.org/bot' + TLGRM_SECRET + '/sendmessage?chat_id=' + TLGRM_MIKE + '&text=' 

    import urllib.request
    from urllib.parse import quote
    contents = urllib.request.urlopen(TLGRM_URL + quote(text, safe='')).read()
    print(text)

In [30]:
# Matching tokens with vectors
max_sent_length = 35

def pad_vec_data(corpus):
    gc.collect()
    input_matrix = np.zeros((len(corpus), max_sent_length, vector_size), dtype=K.floatx())
    for i in range(len(corpus)):
        for t, token in enumerate(corpus[i]):
            if t >= max_sent_length:
                break
            if token not in vecs_x:
                continue
            input_matrix[i, t, :] = vecs_x[token]
    return input_matrix

In [31]:
# Keras convolutional model
gc.collect()
batch_size = 32
nb_epochs = 20

model = Sequential()

model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same', input_shape=(max_sent_length, vector_size)))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Dropout(0.25))

model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Dropout(0.25))

model.add(Flatten())

model.add(Dense(256, activation='tanh'))
model.add(Dense(256, activation='tanh'))
model.add(Dropout(0.5))

model.add(Dense(2, activation='softmax'))

def compile_model():
    # Compile the model
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=0.0001, decay=1e-6),
                  metrics=['accuracy'])

In [32]:
def test_sentences():
    mess = [
        "hello there, my name is mike",
        "what the fuck haha",
        "i don't fucking like you man",
        "considering how the usual process for this is tedious, having an app like this solves a lot of problems",
        "thank you for this",
        "this is amazing",
        "this is nothing short of amazing, I cannot believe it",
        "how are you",
        "i really like you",
        "i don't actually like this product!",
        "it's useless if you can't use it!",
        "i can't believe i never heard of this",
        "this is awesome, didn't know I needed this",
        "so what am i supposed to use this for?",
        "what do I need this for?",
        "this is good",
        "this is not good",
        "although the movie was great, it lacked impact",
        "the movie wasnt that nice",
        "the movie was nice",
        "this is not acceptable, I lost everything using your app",
        "that was kinda stupid",
        "the instructions were unlear and is not friendly for non-techy people",
        "this is really useful i would definitely tell everyone about it",
        "i need to try this!"
    ]
    pred = model.predict(pad_vec_data(mess))
    output = ''

    for i ,m in enumerate(mess):
        output += ('{} {} {}\n'.format('POSITIVE:' if pred[i][0] > 0.5 else 'NEGATIVE:', m, pred[i]))
        
    message_mike(output)

In [33]:
def test_accuracy_all():
    gc.collect()
    scores = []
    train_batch_size = 1024
    total_batches = ceil(len(corpus) / train_batch_size)

    for index in range(0, total_batches):
            
        print_time()
        start = index * train_batch_size
        end = min(len(corpus), start + train_batch_size - 1)

        corpus_batch = corpus[start:end]
        label_batch = labels[start:end]
        
        corpus_pad_vec_data = pad_vec_data(corpus_batch)
        pred = model.predict(corpus_pad_vec_data)
        
        for i, text in enumerate(pred):
            label = labels[i]
            scores.append(1 if round(pred[0][0]) == label[0] else 0)
            clear_output()
            print(np.mean(scores), i + start + 1, len(corpus))
            print('Batch {} of {}'.format((index + 1), total_batches))

In [34]:
model_file = "keras-model.h5"

# https://gist.github.com/Hironsan/e041d6606164bc14c50aa56b989c5fc0

def batch_iter(data, labels, batch_size, shuffle=False):
    num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1

    def data_generator():
        data_size = len(data)
        while True:
            # Shuffle the data at each epoch
            if shuffle:
                shuffle_indices = np.random.permutation(np.arange(data_size))
                shuffled_data = data[shuffle_indices]
                shuffled_labels = labels[shuffle_indices]
            else:
                shuffled_data = data
                shuffled_labels = labels

            for batch_num in range(num_batches_per_epoch):
                start_index = batch_num * batch_size
                end_index = min((batch_num + 1) * batch_size, data_size)
                X, y = pad_vec_data(shuffled_data[start_index: end_index]), shuffled_labels[start_index: end_index]
                yield X, y

    return num_batches_per_epoch, data_generator()
    
def batch_train_model():
    batch_size = 100
    train_x, test_x, train_y, test_y = train_test_split(corpus, labels)
    train_x = np.array(train_x)
    test_x = np.array(test_x)
    train_y = np.array(train_y)
    test_y = np.array(test_y)
    gc.collect()
    
    train_steps, train_batches = batch_iter(train_x, train_y, batch_size)
    valid_steps, valid_batches = batch_iter(test_x, test_y, batch_size)
    
    model.fit_generator(
        train_batches, 
        train_steps, 
        epochs=1, 
        validation_data=valid_batches, 
        validation_steps=valid_steps)

# Run multiple batches
def train_model():
    train_batch_size = 10000
    total_batches = ceil(len(corpus) / train_batch_size)

    for index in range(0, total_batches):
        clear_output()
        print_time()
        message_mike('Batch {} of {}'.format((index + 1), total_batches))
        start = index * train_batch_size
        end = min(len(corpus), start + train_batch_size - 1)

        corpus_batch = corpus[start:end]
        label_batch = labels[start:end]

        corpus_pad_vec_data = pad_vec_data(corpus_batch)
        gc.collect()

        train_x, test_x, train_y, test_y = train_test_split(corpus_pad_vec_data, label_batch)

        train_x = np.array(train_x)
        test_x = np.array(test_x)
        train_y = np.array(train_y)
        test_y = np.array(test_y)
        gc.collect()

        history = model.fit(train_x, train_y,
                  batch_size=batch_size,
                  shuffle=True,
                  epochs=nb_epochs,
                  validation_data=(test_x, test_y),
#                   verbose=0,
                  callbacks=[
#                       EarlyStopping(min_delta=0.000025, patience=10),
                  ])    

        message_mike('{}'.format(history.history['acc']))
        test_sentences()
        gc.collect()
    
    # Save model
    print("Saving model")
    model.save_weights(model_file)

In [None]:
force_train = True

if os.path.isfile(model_file) and not force_train:
    print("Loading model...")
    model.load_weights(model_file)
    compile_model()
else:
    print("Training model...")
    compile_model()
    train_model()
#     batch_train_model()

2018-11-25 06:18:54
Batch 2 of 21
Train on 7499 samples, validate on 2500 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

In [None]:
test_sentences()

In [None]:
test_accuracy_all()

In [None]:
gc.collect()