In [1]:
import re
import numpy as np
import pandas as pd
import collections
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from unidecode import unidecode
import tensorflow as tf
import pandas as pd
from tqdm import tqdm
import time



In [2]:
import json

# Load the corpus from disk. The JSON object is expected to expose two
# entries, 'texts' and 'labels', which are indexed in parallel further down.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of depending on the platform's default locale encoding.
with open('tokenization.json', encoding='utf-8') as fopen:
    dataset = json.load(fopen)
texts = dataset['texts']
labels = dataset['labels']
# Drop the wrapper name now that both entries have their own names.
del dataset

In [3]:
# Re-join every entry of `texts` into one space-separated string and keep it,
# together with its label, only when the joined string is longer than
# 5 characters.
x, y = [], []
for idx in tqdm(range(len(texts))):
    sentence = ' '.join(texts[idx])
    if len(sentence) <= 5:
        continue
    x.append(sentence)
    y.append(labels[idx])

100%|██████████| 420516/420516 [00:00<00:00, 1181458.31it/s]


In [4]:
# Bag of character n-grams (3 to 5 characters, 'char_wb' analyzer), capped at
# 300000 features, fitted on the filtered sentences.
bow_chars = CountVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    max_features=300000,
)
bow_chars.fit(x)
# The scikit-learn documentation describes `stop_words_` as introspection-only
# and potentially large; it is removed here, before the vectorizer is pickled
# at the end of the notebook.
del bow_chars.stop_words_

In [5]:
# Number of columns the vectorizer produces, later used as the number of rows
# of the embedding table. The fitted vocabulary maps each n-gram to one output
# column, so its size equals `transform(...).shape[1]` without having to
# vectorize the whole corpus just to read a dimension.
feature_shape = len(bow_chars.vocabulary_)
feature_shape

300000

In [6]:
def convert_sparse_matrix_to_sparse_tensor(X):
    """Turn a SciPy sparse matrix into a pair of ``tf.SparseTensorValue`` feeds.

    Args:
        X: any object with a ``tocoo()`` method returning a COO matrix
            (``row``, ``col``, ``data`` and ``shape`` attributes are read).

    Returns:
        A 2-tuple ``(ids, weights)`` of ``tf.SparseTensorValue``. Both share the
        same ``(nnz, 2)`` array of ``[row, col]`` positions and the same dense
        shape. ``ids`` carries the column index of every stored entry as its
        values; ``weights`` carries the stored entries themselves. In this
        notebook they are fed to ``Model.X`` and ``Model.W`` respectively.
    """
    coo = X.tocoo()
    # Plain (nnz, 2) ndarray of positions. `np.mat` used to be called here, but
    # it is deprecated and no longer part of the NumPy 2.0 main namespace.
    indices = np.column_stack((coo.row, coo.col))
    ids = tf.SparseTensorValue(indices, coo.col, coo.shape)
    weights = tf.SparseTensorValue(indices, coo.data, coo.shape)
    return ids, weights

In [7]:
class Model:
    """Sparse-embedding classifier: pooled embeddings feeding one dense layer.

    Attributes created by the constructor:
        X, W: sparse int32 placeholders holding the embedding ids and their
            weights.
        Y: int32 placeholder with one label per example.
        logits: output of the dense layer.
        cost: mean sparse softmax cross-entropy between ``logits`` and ``Y``.
        optimizer: Adam update op minimizing ``cost``.
        accuracy: fraction of examples whose arg-max logit equals the label.
    """

    def __init__(self, output_size, vocab_size, learning_rate):
        """Build the graph.

        Args:
            output_size: number of units of the final dense layer.
            vocab_size: number of rows of the embedding table.
            learning_rate: learning rate passed to the Adam optimizer.
        """
        # Graph inputs.
        self.X = tf.sparse_placeholder(tf.int32)
        self.W = tf.sparse_placeholder(tf.int32)
        self.Y = tf.placeholder(tf.int32, [None])

        # One 128-dimensional row per vocabulary entry; the rows selected by X
        # are combined per example with the 'mean' combiner, weighted by W.
        embedding_table = tf.Variable(tf.truncated_normal([vocab_size, 128]))
        pooled = tf.nn.embedding_lookup_sparse(
            embedding_table, self.X, self.W, combiner='mean'
        )
        self.logits = tf.layers.dense(pooled, output_size)

        # Training objective and its update op.
        per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels = self.Y, logits = self.logits
        )
        self.cost = tf.reduce_mean(per_example_loss)
        self.optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(self.cost)

        # Batch accuracy.
        predicted = tf.argmax(self.logits, 1, output_type=tf.int32)
        hits = tf.equal(predicted, self.Y)
        self.accuracy = tf.reduce_mean(tf.cast(hits, tf.float32))

In [8]:
# Open an interactive session, build the classifier (6 outputs, one embedding
# row per vectorizer feature, learning rate 1e-4) and initialize its variables.
sess = tf.InteractiveSession()
model = Model(output_size = 6, vocab_size = feature_shape, learning_rate = 1e-4)
init_op = tf.global_variables_initializer()
sess.run(init_op)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [9]:
# The saver only tracks the trainable variables. An initial checkpoint is
# written straight away, before any training step has run.
trainable_vars = tf.trainable_variables()
saver = tf.train.Saver(var_list = trainable_vars)
saver.save(sess, 'fast-text-char/model.ckpt')

'fast-text-char/model.ckpt'

In [10]:
# Vectorize the whole corpus once, then hold out 20% of the rows.
vectors = bow_chars.transform(x)
# A fixed random_state makes the shuffle, and therefore the train/test split
# and every metric computed on it, repeatable from one run to the next.
train_X, test_X, train_Y, test_Y = train_test_split(
    vectors, y, test_size = 0.2, random_state = 42
)

In [11]:
# Mini-batch training with early stopping.
#
# Every epoch runs one pass over the training rows (with parameter updates)
# and one pass over the held-out rows (metrics only). Training stops once the
# held-out accuracy has failed to improve for EARLY_STOPPING epochs in a row.
# NOTE(review): the held-out split used for early stopping here is the same
# one the later cells report final metrics on.
#
# `tqdm` and `time` come from the notebook's first import cell.
batch_size = 60
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, train_X.shape[0], batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        end = min(i + batch_size, train_X.shape[0])
        batch_x = convert_sparse_matrix_to_sparse_tensor(train_X[i : end])
        batch_y = train_Y[i : end]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x[0],
                model.W: batch_x[1],
            },
        )
        assert not np.isnan(cost)
        # `cost` and `acc` are batch means: weight them by the batch size so
        # the (possibly shorter) last batch contributes proportionally.
        train_loss += cost * len(batch_y)
        train_acc += acc * len(batch_y)
        pbar.set_postfix(cost = cost, accuracy = acc)

    pbar = tqdm(range(0, test_X.shape[0], batch_size), desc = 'test minibatch loop')
    for i in pbar:
        end = min(i + batch_size, test_X.shape[0])
        batch_x = convert_sparse_matrix_to_sparse_tensor(test_X[i : end])
        batch_y = test_Y[i : end]
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.Y: batch_y,
                model.X: batch_x[0],
                model.W: batch_x[1],
            },
        )
        test_loss += cost * len(batch_y)
        test_acc += acc * len(batch_y)
        pbar.set_postfix(cost = cost, accuracy = acc)

    # Per-sample averages: the sums above are weighted by batch size, so the
    # divisor is the number of rows rather than the fractional
    # `rows / batch_size` that was used before.
    train_loss /= train_X.shape[0]
    train_acc /= train_X.shape[0]
    test_loss /= test_X.shape[0]
    test_acc /= test_X.shape[0]

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 5604/5604 [03:53<00:00, 24.00it/s, accuracy=0.694, cost=0.995]
test minibatch loop: 100%|██████████| 1401/1401 [00:04<00:00, 327.97it/s, accuracy=0.873, cost=0.72] 
train minibatch loop:   0%|          | 3/5604 [00:00<03:53, 23.97it/s, accuracy=0.8, cost=0.963]  

epoch: 0, pass acc: 0.000000, current acc: 0.734738
time taken: 237.73985719680786
epoch: 0, training loss: 1.322986, training acc: 0.597243, valid loss: 0.929151, valid acc: 0.734738



train minibatch loop: 100%|██████████| 5604/5604 [03:53<00:00, 24.01it/s, accuracy=0.722, cost=0.741]
test minibatch loop: 100%|██████████| 1401/1401 [00:04<00:00, 327.22it/s, accuracy=0.891, cost=0.481]
train minibatch loop:   0%|          | 3/5604 [00:00<03:53, 23.95it/s, accuracy=0.817, cost=0.629]

epoch: 1, pass acc: 0.734738, current acc: 0.789286
time taken: 237.64500761032104
epoch: 1, training loss: 0.761487, training acc: 0.772427, valid loss: 0.667851, valid acc: 0.789286



train minibatch loop: 100%|██████████| 5604/5604 [03:53<00:00, 24.03it/s, accuracy=0.722, cost=0.635]
test minibatch loop: 100%|██████████| 1401/1401 [00:04<00:00, 326.99it/s, accuracy=0.891, cost=0.445]
train minibatch loop:   0%|          | 3/5604 [00:00<03:53, 24.02it/s, accuracy=0.85, cost=0.527]

epoch: 2, pass acc: 0.789286, current acc: 0.806228
time taken: 237.62980842590332
epoch: 2, training loss: 0.616760, training acc: 0.803621, valid loss: 0.595524, valid acc: 0.806228



train minibatch loop: 100%|██████████| 5604/5604 [03:53<00:00, 23.97it/s, accuracy=0.722, cost=0.571]
test minibatch loop: 100%|██████████| 1401/1401 [00:04<00:00, 327.52it/s, accuracy=0.891, cost=0.441]
train minibatch loop:   0%|          | 3/5604 [00:00<03:52, 24.14it/s, accuracy=0.833, cost=0.488]

epoch: 3, pass acc: 0.806228, current acc: 0.813711
time taken: 237.60289216041565
epoch: 3, training loss: 0.563545, training acc: 0.816310, valid loss: 0.562587, valid acc: 0.813711



train minibatch loop: 100%|██████████| 5604/5604 [03:53<00:00, 24.04it/s, accuracy=0.75, cost=0.532] 
test minibatch loop: 100%|██████████| 1401/1401 [00:04<00:00, 325.83it/s, accuracy=0.873, cost=0.444]
train minibatch loop:   0%|          | 3/5604 [00:00<03:53, 24.03it/s, accuracy=0.85, cost=0.474] 

epoch: 4, pass acc: 0.813711, current acc: 0.816565
time taken: 237.62503623962402
epoch: 4, training loss: 0.534294, training acc: 0.823462, valid loss: 0.543958, valid acc: 0.816565



train minibatch loop: 100%|██████████| 5604/5604 [03:53<00:00, 24.03it/s, accuracy=0.75, cost=0.506] 
test minibatch loop: 100%|██████████| 1401/1401 [00:04<00:00, 329.01it/s, accuracy=0.873, cost=0.447]
train minibatch loop:   0%|          | 3/5604 [00:00<03:54, 23.84it/s, accuracy=0.85, cost=0.47]

epoch: 5, pass acc: 0.816565, current acc: 0.818671
time taken: 237.59084272384644
epoch: 5, training loss: 0.515161, training acc: 0.828170, valid loss: 0.532342, valid acc: 0.818671



train minibatch loop: 100%|██████████| 5604/5604 [03:53<00:00, 24.07it/s, accuracy=0.75, cost=0.488] 
test minibatch loop: 100%|██████████| 1401/1401 [00:04<00:00, 324.22it/s, accuracy=0.873, cost=0.451]
train minibatch loop:   0%|          | 3/5604 [00:00<03:54, 23.88it/s, accuracy=0.85, cost=0.471] 

epoch: 6, pass acc: 0.818671, current acc: 0.819837
time taken: 237.73154592514038
epoch: 6, training loss: 0.501388, training acc: 0.831641, valid loss: 0.524694, valid acc: 0.819837



train minibatch loop: 100%|██████████| 5604/5604 [03:53<00:00, 24.03it/s, accuracy=0.75, cost=0.473] 
test minibatch loop: 100%|██████████| 1401/1401 [00:04<00:00, 327.92it/s, accuracy=0.855, cost=0.454]
train minibatch loop:   0%|          | 3/5604 [00:00<03:54, 23.92it/s, accuracy=0.85, cost=0.473] 

time taken: 237.7102997303009
epoch: 7, training loss: 0.490820, training acc: 0.834229, valid loss: 0.519509, valid acc: 0.819788



train minibatch loop: 100%|██████████| 5604/5604 [03:53<00:00, 24.06it/s, accuracy=0.75, cost=0.462] 
test minibatch loop: 100%|██████████| 1401/1401 [00:04<00:00, 327.81it/s, accuracy=0.855, cost=0.457]
train minibatch loop:   0%|          | 3/5604 [00:00<03:53, 24.01it/s, accuracy=0.85, cost=0.475] 

time taken: 237.7209813594818
epoch: 8, training loss: 0.482330, training acc: 0.836144, valid loss: 0.515958, valid acc: 0.819728



train minibatch loop: 100%|██████████| 5604/5604 [03:53<00:00, 24.07it/s, accuracy=0.75, cost=0.452] 
test minibatch loop: 100%|██████████| 1401/1401 [00:04<00:00, 328.67it/s, accuracy=0.855, cost=0.459]

time taken: 237.59545421600342
epoch: 9, training loss: 0.475276, training acc: 0.837860, valid loss: 0.513551, valid acc: 0.819514

break epoch:10






In [13]:
# Walk the held-out rows in mini-batches and collect, side by side, the true
# labels and the arg-max class of the model's logits.
real_Y, predict_Y = [], []

n_test = test_X.shape[0]
pbar = tqdm(range(0, n_test, batch_size), desc = 'validation minibatch loop')
for start in pbar:
    stop = min(start + batch_size, n_test)
    batch_x = convert_sparse_matrix_to_sparse_tensor(test_X[start : stop])
    batch_y = test_Y[start : stop]
    feed = {model.X: batch_x[0], model.W: batch_x[1], model.Y: batch_y}
    batch_logits = sess.run(model.logits, feed_dict = feed)
    predict_Y.extend(np.argmax(batch_logits, 1).tolist())
    real_Y.extend(batch_y)


validation minibatch loop:   0%|          | 0/1401 [00:00<?, ?it/s][A
validation minibatch loop:   2%|▏         | 34/1401 [00:00<00:04, 334.49it/s][A
validation minibatch loop:   6%|▌         | 87/1401 [00:00<00:03, 375.88it/s][A
validation minibatch loop:  10%|█         | 141/1401 [00:00<00:03, 411.93it/s][A
validation minibatch loop:  14%|█▍        | 195/1401 [00:00<00:02, 442.18it/s][A
validation minibatch loop:  18%|█▊        | 248/1401 [00:00<00:02, 464.12it/s][A
validation minibatch loop:  22%|██▏       | 302/1401 [00:00<00:02, 482.69it/s][A
validation minibatch loop:  25%|██▌       | 355/1401 [00:00<00:02, 495.09it/s][A
validation minibatch loop:  29%|██▉       | 408/1401 [00:00<00:01, 504.48it/s][A
validation minibatch loop:  33%|███▎      | 461/1401 [00:00<00:01, 510.29it/s][A
validation minibatch loop:  37%|███▋      | 515/1401 [00:01<00:01, 516.11it/s][A
validation minibatch loop:  41%|████      | 568/1401 [00:01<00:01, 518.58it/s][A
validation minibatch loop:  

In [14]:
# Per-class precision / recall / F1 on the collected predictions; the names
# are listed in the order handed to `target_names`.
emotion_names = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']
report = metrics.classification_report(
    real_Y, predict_Y, target_names = emotion_names
)
print(report)

             precision    recall  f1-score   support

      anger       0.89      0.90      0.90     15061
       fear       0.83      0.83      0.83      7552
        joy       0.87      0.89      0.88     16575
       love       0.89      0.90      0.90     15635
    sadness       0.72      0.78      0.75     19640
   surprise       0.67      0.53      0.59      9592

avg / total       0.82      0.82      0.82     84055



In [15]:
# Write the variables tracked by `saver` as they currently stand in `sess`;
# this reuses, and therefore overwrites, the checkpoint path written right
# after initialization.
saver.save(sess, save_path = 'fast-text-char/model.ckpt')

'fast-text-char/model.ckpt'

In [16]:
import pickle

# Persist the fitted vectorizer so the same n-gram -> column mapping can be
# reloaded later alongside the checkpoint.
with open('vectorizer-sparse-emotion.pkl', 'wb') as pkl_file:
    pickle.dump(bow_chars, pkl_file)