In [6]:
# https://gist.github.com/giuseppebonaccorso/061fca8d0dfc6873619efd8f364bfe89

import keras.backend as K
import multiprocessing
import tensorflow as tf
import pandas as pd
import numpy as np

from math import floor

from gensim.models.word2vec import Word2Vec

from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv1D
from keras.optimizers import Adam

from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import RegexpTokenizer

import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# Set random seed (for reproducibility)
np.random.seed(1000)

# Select whether using Keras with or without GPU support
# See: https://stackoverflow.com/questions/40690598/can-keras-with-tensorflow-backend-be-forced-to-use-cpu-or-gpu-at-will
use_gpu = True

config = tf.ConfigProto(intra_op_parallelism_threads=multiprocessing.cpu_count(), 
                        inter_op_parallelism_threads=multiprocessing.cpu_count(), 
                        allow_soft_placement=True, 
                        device_count = {'CPU' : 1, 
                                        'GPU' : 1 if use_gpu else 0})

session = tf.Session(config=config)
K.set_session(session)

# dataset_location = '/twitter/dataset.csv'
# model_location = '/twitter/model/'

In [11]:
data = pd.read_json('data/processed.json')
data = data[200000:200100]
data.head()

Unnamed: 0,0,1,2
200000,easedaman they wiggle independently too much f...,0.0,1.0
200001,easedaman well if u had my life u would see ho...,1.0,0.0
200002,easegill good point in meantime if already lo...,1.0,0.0
200003,easegill you wont be looking for it on kiwi tv...,1.0,0.0
200004,easidream hi,0.0,1.0


In [12]:
corpus = [x[0] for x in data[[0]].values]
labels = [x[[0, 1]] for x in data[[1, 2]].values]
    
print('Corpus size: {}'.format(len(corpus)))

Corpus size: 100


In [13]:
# Tokenize and stem
tkr = RegexpTokenizer('[a-zA-Z0-9]+')
stemmer = LancasterStemmer()

def tokenize_corpus(corpus):
    tokenized_corpus = []
    
    for i, tweet in enumerate(corpus):
        tokens = [stemmer.stem(t) for t in tkr.tokenize(tweet)]
        tokenized_corpus.append(tokens)
    
    return tokenized_corpus

tokenized_corpus = tokenize_corpus(corpus);

(tokenized_corpus)

[['easedam', 'they', 'wiggl', 'independ', 'too', 'much', 'fri', 'chick'],
 ['easedam',
  'wel',
  'if',
  'u',
  'had',
  'my',
  'lif',
  'u',
  'would',
  'see',
  'how',
  'smart',
  'mem',
  'in',
  'miam',
  'is'],
 ['easegil',
  'good',
  'point',
  'in',
  'meantim',
  'if',
  'already',
  'log',
  'into',
  'a',
  'googl',
  'produc'],
 ['easegil',
  'you',
  'wont',
  'be',
  'look',
  'for',
  'it',
  'on',
  'kiw',
  'tv',
  'someth',
  'of',
  'a',
  'cult',
  'follow',
  'her',
  'al',
  'in',
  'fun',
  'as',
  'noon',
  'tak',
  'it',
  'sery',
  'her'],
 ['easidream', 'hi'],
 ['easierp',
  'im',
  'going',
  'out',
  'soon',
  'i',
  'think',
  'i',
  'wil',
  'tak',
  'my',
  'sunglass',
  'and',
  'an',
  'umbrell',
  'brisbaneweath'],
 ['easilyamusedtx', 'i', 'cri', 'again', 'see', 'it'],
 ['easlydstract', 'hes', 'been', 'out'],
 ['easmart',
  'ughhh',
  'i',
  'cant',
  'believ',
  'wer',
  'stay',
  'at',
  'diff',
  'hotel',
  'i',
  'want',
  'to',
  'party',
  '

In [14]:
# Gensim Word2Vec model
vector_size = 300
window_size = 10

# Create Word2Vec
word2vec = Word2Vec(sentences=tokenized_corpus,
                    size=vector_size, 
                    window=window_size, 
                    negative=20,
                    iter=50,
                    seed=1000,
                    workers=multiprocessing.cpu_count())

# Copy word vectors and delete Word2Vec model  and original corpus to save memory
X_vecs = word2vec.wv
del word2vec
del corpus

# Train subset size (0 < size < len(tokenized_corpus))
train_size = floor(len(tokenized_corpus) * .9)

# Test subset size (0 < size < len(tokenized_corpus) - train_size)
test_size = floor(len(tokenized_corpus) * .1)

# Compute average and max tweet length
avg_length = 0.0
max_length = 0

for tweet in tokenized_corpus:
    if len(tweet) > max_length:
        max_length = len(tweet)
    avg_length += float(len(tweet))
    
print('Average tweet length: {}'.format(avg_length / float(len(tokenized_corpus))))
print('Max tweet length: {}'.format(max_length))

Average tweet length: 10.84
Max tweet length: 26


In [15]:
import gc
gc.collect()

0

In [16]:
# Tweet max length (number of tokens)
max_tweet_length = 40

X_train = np.zeros((train_size, max_tweet_length, vector_size), dtype=K.floatx())
Y_train = np.zeros((train_size, 2), dtype=np.int32)
X_test = np.zeros((test_size, max_tweet_length, vector_size), dtype=K.floatx())
Y_test = np.zeros((test_size, 2), dtype=np.int32)

for i in range(train_size + test_size):
    for t, token in enumerate(tokenized_corpus[i]):
        if t >= max_tweet_length:
            break
        
        if token not in X_vecs:
            continue
    
        if i < train_size:
            X_train[i, t, :] = X_vecs[token]
        else:
            X_test[i - train_size, t, :] = X_vecs[token]
            
    if i < train_size:
#         Y_train[i, :] = [1.0, 0.0] if labels[i] == 0 else [0.0, 1.0]
        Y_train[i, :] = labels[i]
    else:
#         Y_test[i - train_size, :] = [1.0, 0.0] if labels[i] == 0 else [0.0, 1.0]
        Y_test[i - train_size, :] = labels[i]

In [20]:
len(X_train[0,0])

300

In [8]:
# Keras convolutional model
batch_size = 32
nb_epochs = 100

model = Sequential()

model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same', input_shape=(max_tweet_length, vector_size)))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Dropout(0.25))

model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Dropout(0.25))

model.add(Flatten())

model.add(Dense(256, activation='tanh'))
model.add(Dense(256, activation='tanh'))
model.add(Dropout(0.5))

model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=0.0001, decay=1e-6),
              metrics=['accuracy'])

# Fit the model
model.fit(X_train, Y_train,
          batch_size=batch_size,
          shuffle=True,
          epochs=nb_epochs,
          validation_data=(X_test, Y_test))
# ,
#           callbacks=[EarlyStopping(min_delta=0.00025, patience=2)]

Train on 180000 samples, validate on 20000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100

KeyboardInterrupt: 

In [None]:
tk_c = tokenize_corpus([
    "hi there my name is mike",
    "what are you trying to do",
    "i love meg so much",
    "i fuck hate this shit man",
    "okay that wasnt that bad right i mean it was okay",
    "it was not nice",
    "it was nice",
    "that was not good",
    "that was good",
])

def predict(tk_c):
    input_matrix = np.zeros((len(tk_c), max_tweet_length, vector_size), dtype=K.floatx())
    for i in range(len(tk_c)):
        for t, token in enumerate(tk_c[i]):
            if t >= max_tweet_length:
                break
            if token not in X_vecs:
                continue
            input_matrix[i, t, :] = X_vecs[token]
    return model.predict(input_matrix)

In [None]:
predict(tk_c)