In [1]:
import numpy as np
import tensorflow as tf
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import collections
from scipy import spatial
import json
from copy import deepcopy

### Import data

In [2]:
with open('data/semeval_train_A.txt', 'r') as f:
    file = f.readlines()
    tweets = [row.split('\t')[0].replace('\n', '').lower() for row in file]
    raw_labels = [row.split('\t')[-1].replace('\n', '').lower() for row in file]

In [3]:
tokenizer = TweetTokenizer()
ps = PorterStemmer()
stop_words = set(stopwords.words("english"))
stop_words.update([',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}']) # remove it if you need punctuation
whitelist = ["n't", "not", "hadn", "didn", "did",
             "no", "but", "wasn", "mustn", "was",
             "doesn", "aren", "can", "nor", "hasn",
             "does", "should", "shouldn"]
for white_word in whitelist:
    if white_word in stop_words:
        stop_words.remove(white_word)

In [4]:
def tokenize(tweet):
    try:
        tokens = tokenizer.tokenize(tweet)
        tokens = filter(lambda t: not t.startswith('@'), tokens)
        tokens = filter(lambda t: not t.startswith('#'), tokens)
        tokens = filter(lambda t: not t.startswith('http'), tokens)
        tokens = list(tokens)
        # Delete Stop Word
        tokens = [w for w in tokens if not w in stop_words]

        # Stemming
        tokens = [ps.stem(w) for w in tokens]
        return tokens
    except:
        return 'NC'

In [5]:
tweets = [tokenize(tweet) for tweet in tweets]
tweets = [tweet for tweet in tweets if tweet != 'NC']

### Pre-trained embedding

In [7]:
vocab = []
embd = []
with open('W2V_MODEL/vocab_vector1.json', 'r', encoding='UTF-8') as f:
    vectors = json.loads(f.read())
    for key in vectors:
        vocab.append(key)
        embd.append(vectors[key])

vocab_size = len(vocab)
embedding_dim = len(embd[0])
embedding = np.asarray(embd, dtype=np.float32)
first_zeros = np.zeros_like(embedding[0])
embedding = np.vstack((first_zeros, embedding))

In [8]:
counts = collections.Counter(vocab)
vocab_1 = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: i for i, word, in enumerate(vocab_1, 1)}

vocab_to_int
tweet_ints = []
except_set = set()
success_idxs = []

for i in range(len(tweets)):
    try:
        tweet_ints.append([vocab_to_int[word] for word in tweets[i]])
    except KeyError as e:
        except_set.add(e.args[0])
    else:
        success_idxs.append(i)
        #success_idxs[i] = True

In [9]:
len(except_set)

3411

In [10]:
len(tweet_ints)

7699

### Encoding the labels

In [51]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labels = np.asarray(deepcopy(raw_labels))
enc = LabelEncoder()
labels = enc.fit_transform(labels)
enc = OneHotEncoder(sparse=False)
labels = labels.reshape(-1, 1)
labels = enc.fit_transform(labels)

labels = labels[np.asarray(success_idxs)]

In [52]:
x_len = len(labels)
len(labels)

7699

In [53]:
tweet_lens = collections.Counter([len(x) for x in tweet_ints])
print("Zero-length reviews: {}".format(tweet_lens[0]))
print("Minimum review length: {}".format(min(tweet_lens)))
print("Maximum review length: {}".format(max(tweet_lens)))

Zero-length reviews: 0
Minimum review length: 2
Maximum review length: 30


### Make Input Data

In [54]:
features = np.zeros((len(tweet_ints), max(tweet_lens)), dtype=int)
for i, row in enumerate(tweet_ints):
    features[i, :len(row)] = np.array(row)

features.shape

(7699, 30)

### Training, Validation, Test

In [55]:
split_frac = 0.8
split_index = int(split_frac * len(features))

train_x, val_x = features[:split_index], features[split_index:]
train_y, val_y = labels[:split_index], labels[split_index:]

split_frac = 0.5
split_index = int(split_frac * len(val_x))

val_x, test_x = val_x[:split_index], val_x[split_index:]
val_y, test_y = val_y[:split_index], val_y[split_index:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))
print("label set: \t\t{}".format(train_y.shape), 
      "\nValidation label set: \t{}".format(val_y.shape),
      "\nTest label set: \t{}".format(test_y.shape))

			Feature Shapes:
Train set: 		(6159, 30) 
Validation set: 	(770, 30) 
Test set: 		(770, 30)
label set: 		(6159, 3) 
Validation label set: 	(770, 3) 
Test label set: 	(770, 3)


## Build the graph

In [153]:
lstm_size = 128
lstm_layers = 2
batch_size = 400
learning_rate = 0.01

# sequence_lengths
train_seq = np.count_nonzero(train_x, axis=1)
test_seq = np.count_nonzero(test_x, axis=1)
val_seq = np.count_nonzero(val_x, axis=1)

In [154]:
tf.reset_default_graph()
with tf.name_scope('inputs'):
    inputs_ = tf.placeholder(tf.int32, [batch_size, max_sequence_length], name='inputs')
    labels_ = tf.placeholder(tf.int32, [batch_size, num_classes], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

### Embedding

In [155]:
with tf.name_scope('Embeddings'):
    data = tf.Variable(tf.zeros([batch_size, max_sequence_length, num_classes]), dtype=tf.float32)
    data = tf.nn.embedding_lookup(embedding, inputs_)
    seq_length = tf.placeholder(tf.int32, [batch_size], name='seq_length')

### LSTM Cell

In [156]:
def lstm_cell():
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size, reuse=tf.get_variable_scope().reuse)
    return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)

with tf.name_scope('RNN_layers'):
    cell = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(lstm_layers)])
    initial_state = cell.zero_state(batch_size, tf.float32)

### Forward

In [159]:
with tf.name_scope('RNN_forward'):
    outputs, final_state = tf.nn.dynamic_rnn(cell, data,
                                             sequence_length=seq_length, dtype=tf.float32)

### Output

In [160]:
with tf.name_scope('predictions'):
    predictions = tf.contrib.layers.fully_connected(outputs[:, -1], num_classes, activation_fn=None)
    tf.summary.histogram('predictions', predictions)
with tf.name_scope('cost'):
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=predictions, labels=labels_))
    tf.summary.scalar('cost', cost)
    
with tf.name_scope('train'):
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

### Validation

In [161]:
with tf.name_scope('validation'):
    correct_pred = tf.equal(tf.argmax(predictions, 1), tf.argmax(labels_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    tf.summary.scalar('accuracy', accuracy)
    
merged = tf.summary.merge_all()

### Batching

In [162]:
def get_batches(x, y, seq, batch_size):
    n_batches = len(x)//batch_size
    x, y, seq = x[:n_batches*batch_size], y[:n_batches*batch_size], seq[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size], seq[ii:ii+batch_size]

## Training

In [164]:
epochs = 10

# with graph.as_default():
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    train_writer = tf.summary.FileWriter('./logs/tb/train', sess.graph)
    test_writer = tf.summary.FileWriter('./logs/tb/test', sess.graph)
    iteration = 1
    for e in range(epochs):
        #state = sess.run(initial_state)
        
        for ii, (x, y, seq) in enumerate(get_batches(train_x, train_y, train_seq, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y,
                    keep_prob: 0.5,
                    #initial_state: state,
                    seq_length: seq}
            summary, loss, state, _ = sess.run([merged, cost, final_state, optimizer], feed_dict=feed)
#             loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)

            train_writer.add_summary(summary, iteration)
        
            if iteration%5==0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration%25==0:
                val_acc = []
                #val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y, seq in get_batches(val_x, val_y, val_seq, batch_size):
                    feed = {inputs_: x,
                            labels_: y,
                            keep_prob: 1,
                            #initial_state: val_state,
                            seq_length: seq}
#                     batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    summary, batch_acc, val_state = sess.run([merged, accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                print("Val acc: {:.3f}".format(np.mean(val_acc)))
            iteration +=1
            test_writer.add_summary(summary, iteration)
            saver.save(sess, "checkpoints/sentiment_manish.ckpt")
    saver.save(sess, "checkpoints/sentiment_manish.ckpt")

TypeError: Fetch argument array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.]], dtype=float32) has invalid type <class 'numpy.ndarray'>, must be a string or Tensor. (Can not convert a ndarray into a Tensor or Operation.)