In [None]:
import keras.backend as K
import multiprocessing
import tensorflow as tf
import pandas as pd
import numpy as np

from math import ceil

from gensim.models.word2vec import Word2Vec

from sklearn.model_selection import train_test_split

from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv1D
from keras.optimizers import Adam

from keras.preprocessing.text import text_to_word_sequence

import os.path
import gc

from time import gmtime, strftime

from IPython.display import clear_output

def print_time():
    """Print the current UTC time formatted as ``YYYY-MM-DD HH:MM:SS``."""
    timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    print(timestamp)

# Run a garbage-collection pass, then print a timestamp marking the end of the setup cell.
gc.collect()
print_time()

In [None]:
print_time()
# Select whether using Keras with or without GPU support
# See: https://stackoverflow.com/questions/40690598/can-keras-with-tensorflow-backend-be-forced-to-use-cpu-or-gpu-at-will
use_gpu = True

# Use one thread per CPU core for both intra-op and inter-op parallelism.
n_threads = multiprocessing.cpu_count()
# Expose a single CPU device, plus a single GPU device only when use_gpu is set.
device_count = {'CPU': 1, 'GPU': int(bool(use_gpu))}

config = tf.ConfigProto(
    intra_op_parallelism_threads=n_threads,
    inter_op_parallelism_threads=n_threads,
    allow_soft_placement=True,
    device_count=device_count,
)

# Register a session built from this config as the one the Keras backend uses.
session = tf.Session(config=config)
K.set_session(session)
print_time()

In [None]:
print_time()
# Load the preprocessed dataset from JSON into a DataFrame.
data = pd.read_json('data/processed.json')
# Window used only by the optional subsetting line below (disabled by default).
offset = 500000
length = 1000
# Uncomment to work on a `length`-row slice starting at row `offset`:
# data = data[offset:offset+length]
print_time()
# Preview the first rows as the cell output.
data.head()

In [None]:
print_time()
# Column 0 holds the raw text: split every entry into a list of word tokens.
corpus = [text_to_word_sequence(text) for text in data[0].values]
# Columns 1 and 2 together give one two-element target array per sample.
labels = [np.array(row) for row in data[[1, 2]].values]

print('Corpus size: {}'.format(len(corpus)))
print_time()

In [None]:
print_time()
# Hyper-parameters of the Gensim Word2Vec embedding
vector_size = 300
window_size = 10

word2vec_name = 'w2v.bin'
word2vec = None

# Train and cache the embedding only when no saved model exists; otherwise reuse the file
if not os.path.isfile(word2vec_name):
    print("Computing...")
    word2vec = Word2Vec(
        sentences=corpus,
        size=vector_size,
        window=window_size,
        negative=20,
        iter=50,
        seed=1000,
        workers=multiprocessing.cpu_count(),
    )
    word2vec.save(word2vec_name)
else:
    print("Loading...")
    word2vec = Word2Vec.load(word2vec_name)

# Keep only the word-vector table (`.wv`) and discard the rest of the model object
vecs_x = word2vec.wv
del word2vec

gc.collect()
print_time()

In [None]:
# Turning token lists into fixed-size matrices of word vectors
max_sent_length = 35

def pad_vec_data(corpus):
    """Convert tokenised sentences into a zero-padded tensor of word vectors.

    Args:
        corpus: Sequence of sentences, each an iterable of tokens.

    Returns:
        Array of shape ``(len(corpus), max_sent_length, vector_size)`` with dtype
        ``K.floatx()``. Position ``t`` of sentence ``i`` holds ``vecs_x[token]``;
        positions past the end of a sentence, and positions whose token is not
        in ``vecs_x``, stay zero. Tokens beyond ``max_sent_length`` are ignored.
    """
    gc.collect()
    shape = (len(corpus), max_sent_length, vector_size)
    input_matrix = np.zeros(shape, dtype=K.floatx())
    for row, sentence in enumerate(corpus):
        # zip() against the range stops after max_sent_length tokens (truncation).
        for col, token in zip(range(max_sent_length), sentence):
            if token in vecs_x:
                input_matrix[row, col, :] = vecs_x[token]
    return input_matrix

In [None]:
# Keras convolutional model
batch_size = 32
nb_epochs = 100

# Layers are listed (and instantiated) in the exact order they are stacked.
layer_stack = (
    # Four width-3 convolutions; the first one declares the (timesteps, features) input shape.
    [Conv1D(32, kernel_size=3, activation='elu', padding='same',
            input_shape=(max_sent_length, vector_size))]
    + [Conv1D(32, kernel_size=3, activation='elu', padding='same') for _ in range(3)]
    + [Dropout(0.25)]
    # Four width-2 convolutions.
    + [Conv1D(32, kernel_size=2, activation='elu', padding='same') for _ in range(4)]
    + [Dropout(0.25)]
    + [Flatten()]
    # Two fully connected layers followed by the two-way softmax output.
    + [Dense(256, activation='tanh') for _ in range(2)]
    + [Dropout(0.5)]
    + [Dense(2, activation='softmax')]
)

model = Sequential(layer_stack)

def compile_model():
    """Compile the global ``model`` with categorical cross-entropy, Adam and accuracy."""
    optimizer = Adam(lr=0.0001, decay=1e-6)
    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

In [None]:
model_file = "keras-model.h5"

# Run multiple batches
def train_model(train_batch_size=10000):
    """Fit the global ``model`` on ``corpus``/``labels`` chunk by chunk, then save its weights.

    The corpus is walked in consecutive chunks of ``train_batch_size`` sentences,
    so only one chunk is vectorised (via ``pad_vec_data``) at a time. Each chunk
    is split into training and validation parts with ``train_test_split`` and
    fitted for ``nb_epochs`` epochs. After the last chunk the weights are
    written to ``model_file``.

    Args:
        train_batch_size: Number of sentences vectorised and fitted per chunk.

    Raises:
        ValueError: If ``train_batch_size`` is smaller than 1.
    """
    if train_batch_size < 1:
        raise ValueError("train_batch_size must be at least 1")

    total_batches = ceil(len(corpus) / train_batch_size)

    for index in range(total_batches):
        clear_output()
        print_time()
        print('Batch {} of {}'.format((index + 1), total_batches))
        start = index * train_batch_size
        # Slice ends are exclusive, so the chunk is [start, start + train_batch_size).
        # (Subtracting 1 here would silently drop the last sample of every chunk.)
        end = min(len(corpus), start + train_batch_size)

        corpus_batch = corpus[start:end]
        label_batch = labels[start:end]

        corpus_pad_vec_data = pad_vec_data(corpus_batch)
        gc.collect()

        train_x, test_x, train_y, test_y = train_test_split(corpus_pad_vec_data, label_batch)

        # The label parts come back as lists of arrays; make everything a dense array.
        train_x = np.array(train_x)
        test_x = np.array(test_x)
        train_y = np.array(train_y)
        test_y = np.array(test_y)
        gc.collect()

        # Fit the model
        model.fit(train_x, train_y,
                  batch_size=batch_size,
                  shuffle=True,
                  epochs=nb_epochs,
                  validation_data=(test_x, test_y),
#                   verbose=0,
                  callbacks=[
#                       EarlyStopping(min_delta=0.000025, patience=10),
                  ])

        gc.collect()

    # Save model
    print("Saving model")
    model.save_weights(model_file)

In [None]:
# Set to False to reuse previously saved weights when the weights file exists.
force_train = True

if force_train or not os.path.isfile(model_file):
    # No usable saved weights (or retraining requested): compile, then train from scratch.
    print("Training model...")
    compile_model()
    train_model()
else:
    # Restore the saved weights, then compile the model.
    print("Loading model...")
    model.load_weights(model_file)
    compile_model()

In [None]:
# Hand-written sample messages used as a quick sanity check of the trained model.
mess = [
    "hello there, my name is mike",
    "what the fuck haha",
    "i don't fucking like you man",
    "don't do this to me dude",
    "i really like you",
    "i don't actually like this product!",
    "it's useless if you can't use it!",
    "this is good!",
    "this is not good",
]
# pad_vec_data iterates each sentence token by token, so a raw string would be
# looked up character by character. Split the messages into words with the
# same tokenizer that built the training corpus.
mess_tokens = [text_to_word_sequence(m) for m in mess]
pred = model.predict(pad_vec_data(mess_tokens))

# Show each prediction (two softmax outputs) next to its original message.
for i, m in enumerate(mess):
    print(pred[i], m)