In [1]:
# https://gist.github.com/giuseppebonaccorso/061fca8d0dfc6873619efd8f364bfe89

import keras.backend as K
import multiprocessing
import tensorflow as tf
import pandas as pd
import numpy as np

from math import floor

from gensim.models.word2vec import Word2Vec

from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv1D
from keras.optimizers import Adam

from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import RegexpTokenizer

import tflearn
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

# Set random seed (for reproducibility)
np.random.seed(1000)

# Select whether using Keras with or without GPU support
# See: https://stackoverflow.com/questions/40690598/can-keras-with-tensorflow-backend-be-forced-to-use-cpu-or-gpu-at-will
use_gpu = True

config = tf.ConfigProto(intra_op_parallelism_threads=multiprocessing.cpu_count(), 
                        inter_op_parallelism_threads=multiprocessing.cpu_count(), 
                        allow_soft_placement=True, 
                        device_count = {'CPU' : 1, 
                                        'GPU' : 1 if use_gpu else 0})

session = tf.Session(config=config)
K.set_session(session)

# dataset_location = '/twitter/dataset.csv'
# model_location = '/twitter/model/'

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


curses is not supported on this machine (please install/reinstall curses for an optimal experience)


In [2]:
data = pd.read_json('data/processed.json')
data = data[200000:200100]
data.head()

Unnamed: 0,0,1,2,3
200000,you are probably asleep right now but its 925 ...,1.0,0.0,sentiment-analysis-dataset
200001,thanks,1.0,0.0,sentiment-analysis-dataset
200002,glad you found something else to do,1.0,0.0,sentiment-analysis-dataset
200003,yea it was cool shame you couldnt come,1.0,0.0,sentiment-analysis-dataset
200004,working way too latehow about you,1.0,0.0,sentiment-analysis-dataset


In [3]:
corpus = [x[0] for x in data[[0]].values]
labels = [x[[0, 1]] for x in data[[1, 2]].values]
    
print('Corpus size: {}'.format(len(corpus)))

Corpus size: 100


In [4]:
# Tokenize and stem
tkr = RegexpTokenizer('[a-zA-Z0-9]+')
stemmer = LancasterStemmer()

def tokenize_corpus(corpus):
    tokenized_corpus = []
    
    for i, tweet in enumerate(corpus):
        tokens = [stemmer.stem(t) for t in tkr.tokenize(tweet)]
        tokenized_corpus.append(tokens)
    
    return tokenized_corpus

tokenized_corpus = tokenize_corpus(corpus);

(tokenized_corpus)

[['you',
  'ar',
  'prob',
  'asleep',
  'right',
  'now',
  'but',
  'it',
  '925',
  'am',
  'and',
  'adam',
  'lambert',
  'is',
  'on',
  'liv',
  'with',
  'reg',
  'and',
  'kel'],
 ['thank'],
 ['glad', 'you', 'found', 'someth', 'els', 'to', 'do'],
 ['ye', 'it', 'was', 'cool', 'sham', 'you', 'couldnt', 'com'],
 ['work', 'way', 'too', 'latehow', 'about', 'you'],
 ['yeah'],
 ['ed', 'that', 'tiny', 'url', 'isnt', 'work', 'boo'],
 ['going',
  'for',
  'com',
  'con',
  'hop',
  'the',
  'swin',
  'flu',
  'scar',
  'would',
  'hav',
  'died',
  'down',
  'by',
  'then'],
 ['get',
  '100',
  'follow',
  'a',
  'day',
  'us',
  'wwwtweeteraddercom',
  'ont',
  'you',
  'ad',
  'everyon',
  'you',
  'ar',
  'on',
  'the',
  'train',
  'or',
  'pay',
  'vip'],
 ['oh',
  'hello',
  'good',
  'morn',
  'yo',
  'the',
  'first',
  'person',
  'who',
  'mad',
  'me',
  'smil',
  'today'],
 ['thank', 'for', 'the', 'feedback', 'guy'],
 ['act', 'ye'],
 ['gott', 'agr', 'on', 'the', 'cap', 'rip'

In [5]:
# Gensim Word2Vec model
vector_size = 300
window_size = 10

# Create Word2Vec
word2vec = Word2Vec(sentences=tokenized_corpus,
                    size=vector_size, 
                    window=window_size, 
                    negative=20,
                    iter=50,
                    seed=1000,
                    workers=multiprocessing.cpu_count())

# Copy word vectors and delete Word2Vec model  and original corpus to save memory
X_vecs = word2vec.wv
del word2vec
del corpus

# Train subset size (0 < size < len(tokenized_corpus))
train_size = floor(len(tokenized_corpus) * .9)

# Test subset size (0 < size < len(tokenized_corpus) - train_size)
test_size = floor(len(tokenized_corpus) * .1)

# Compute average and max tweet length
avg_length = 0.0
max_length = 0

for tweet in tokenized_corpus:
    if len(tweet) > max_length:
        max_length = len(tweet)
    avg_length += float(len(tweet))
    
print('Average tweet length: {}'.format(avg_length / float(len(tokenized_corpus))))
print('Max tweet length: {}'.format(max_length))

Average tweet length: 9.69
Max tweet length: 30


In [6]:
import gc
gc.collect()

0

In [7]:
# Tweet max length (number of tokens)
max_tweet_length = 40

X_train = np.zeros((train_size, max_tweet_length, vector_size), dtype=K.floatx())
Y_train = np.zeros((train_size, 2), dtype=np.int32)
X_test = np.zeros((test_size, max_tweet_length, vector_size), dtype=K.floatx())
Y_test = np.zeros((test_size, 2), dtype=np.int32)

for i in range(train_size + test_size):
    for t, token in enumerate(tokenized_corpus[i]):
        if t >= max_tweet_length:
            break
        
        if token not in X_vecs:
            continue
    
        if i < train_size:
            X_train[i, t, :] = X_vecs[token]
        else:
            X_test[i - train_size, t, :] = X_vecs[token]
            
    if i < train_size:
#         Y_train[i, :] = [1.0, 0.0] if labels[i] == 0 else [0.0, 1.0]
        Y_train[i, :] = labels[i]
    else:
#         Y_test[i - train_size, :] = [1.0, 0.0] if labels[i] == 0 else [0.0, 1.0]
        Y_test[i - train_size, :] = labels[i]

In [8]:
X_test

array([[[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.07627117,  0.06672327,  0.10996481, ..., -0.04921505,
          0.0310328 , -0.00809618],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]],

       [[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [-0.09522618,  0.08282398,  0.13988431, ..., -0.06200517,
          0.03767472, -0.01032154],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  

In [9]:
# Keras convolutional model
batch_size = 32
nb_epochs = 20

model = Sequential()

model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same', input_shape=(max_tweet_length, vector_size)))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Dropout(0.25))

model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Dropout(0.25))

model.add(Flatten())

model.add(Dense(256, activation='tanh'))
model.add(Dense(256, activation='tanh'))
model.add(Dropout(0.5))

model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=0.0001, decay=1e-6),
              metrics=['accuracy'])

# Fit the model
model.fit(X_train, Y_train,
          batch_size=batch_size,
          shuffle=True,
          epochs=nb_epochs,
          validation_data=(X_test, Y_test))
# ,
#           callbacks=[EarlyStopping(min_delta=0.00025, patience=2)]

Train on 90 samples, validate on 10 samples
Epoch 1/20


UnknownError: Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[{{node conv1d_1/convolution/Conv2D}} = Conv2D[T=DT_FLOAT, _class=["loc:@train...propFilter"], data_format="NCHW", dilations=[1, 1, 1, 1], padding="SAME", strides=[1, 1, 1, 1], use_cudnn_on_gpu=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](training/Adam/gradients/conv1d_1/convolution/Conv2D_grad/Conv2DBackpropFilter-0-TransposeNHWCToNCHW-LayoutOptimizer, conv1d_1/convolution/ExpandDims_1)]]
	 [[{{node metrics/acc/Mean/_227}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_1798_metrics/acc/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

In [None]:
tk_c = tokenize_corpus([
    "hi there my name is mike",
    "what are you trying to do",
    "i love meg so much",
    "i fuck hate this shit man",
    "okay that wasnt that bad right i mean it was okay",
    "it was not nice",
    "it was nice",
    "that was not good",
    "that was good",
])

def predict(tk_c):
    input_matrix = np.zeros((len(tk_c), max_tweet_length, vector_size), dtype=K.floatx())
    for i in range(len(tk_c)):
        for t, token in enumerate(tk_c[i]):
            if t >= max_tweet_length:
                break
            if token not in X_vecs:
                continue
            input_matrix[i, t, :] = X_vecs[token]
    return model.predict(input_matrix)

In [None]:
predict(tk_c)