In [1]:
import re
import numpy as np
import pandas as pd
import collections
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from sklearn.model_selection import train_test_split
from unidecode import unidecode
from nltk.util import ngrams
from tqdm import tqdm
import time

In [2]:
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = re.findall(r'^(.*?)(%s)$' % ('|'.join(hujung)), word)
    word = hujung_result[0][0] if len(hujung_result) else word
    permulaan_result = re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan[::-1])), word)
    permulaan_result.extend(re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan)), word))
    mula = permulaan_result if len(permulaan_result) else ''
    if len(mula):
        mula = mula[1][1] if len(mula[1][1]) > len(mula[0][1]) else mula[0][1]
    return word.replace(mula, '')

In [3]:
def build_dataset(words, n_words):
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 3)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def classification_textcleaning(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = ' '.join(
        [i for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) if len(i)]
    )
    string = string.lower().split()
    string = [(naive_stemmer(word), word) for word in string]
    return (
        ' '.join([word[0] for word in string if len(word[0]) > 1]),
        ' '.join([word[1] for word in string if len(word[0]) > 1]),
    )


def str_idx(corpus, dic, UNK = 3):
    X = []
    for sentence in corpus:
        X.append([dic.get(w,UNK) for w in sentence.split()[:maxlen]])
    return X

def create_ngram_set(input_list, ngram_value):
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def build_ngram(x_train):
    global max_features
    ngram_set = set()
    for input_list in tqdm(x_train, total = len(x_train), ncols = 70):
        for i in range(2, 3):
            set_of_ngram = create_ngram_set(input_list, ngram_value = i)
            ngram_set.update(set_of_ngram)
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    max_features = np.max(list(indice_token.keys())) + 1
    return token_indice


def add_ngram(sequences, token_indice):
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i : i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences

In [4]:
with open('subjectivity-negative-translated.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('subjectivity-positive-translated.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts

assert len(labels) == len(texts)

In [5]:
for i in range(len(texts)):
    texts[i] = classification_textcleaning(texts[i])[0]

In [6]:
concat = ' '.join(texts).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 13222
Most common words [('yang', 11804), ('untuk', 3879), ('tidak', 2898), ('deng', 2827), ('ada', 2294), ('dalam', 2193)]
Sample data [10, 68, 13, 27, 55, 54, 11, 392, 34, 182] ['filem', 'mula', 'pada', 'masa', 'lalu', 'mana', 'orang', 'budak', 'lelaki', 'nama']


In [7]:
ngram_range = 2
max_features = len(dictionary)
maxlen = 80
batch_size = 32
embedded_size = 256

In [8]:
idx_trainset = str_idx(texts, dictionary, maxlen)

In [9]:
token_indice = build_ngram(idx_trainset)
X = add_ngram(idx_trainset, token_indice)
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen)

100%|█████████████████████████| 9962/9962 [00:00<00:00, 200836.62it/s]


In [10]:
train_X, test_X, train_Y, test_Y = train_test_split(X, 
                                                    labels,
                                                    test_size = 0.2)

In [11]:
class Model:
    def __init__(
        self, embedded_size, dict_size, dimension_output, learning_rate
    ):

        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        encoder_embeddings = tf.Variable(
            tf.random_uniform([dict_size, embedded_size], -1, 1)
        )
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        self.logits = tf.identity(
            tf.layers.dense(
                tf.reduce_mean(encoder_embedded, 1), dimension_output
            ),
            name = 'logits',
        )
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(
            self.cost
        )
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [12]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(embedded_size, max_features, 2, 5e-4)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'fast-text/model.ckpt')

'fast-text/model.ckpt'

In [13]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
    ]
)

In [14]:
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Variable',
 'dense/kernel',
 'dense/bias',
 'logits']

In [15]:
tf.trainable_variables()

[<tf.Variable 'Variable:0' shape=(109348, 256) dtype=float32_ref>,
 <tf.Variable 'dense/kernel:0' shape=(256, 2) dtype=float32_ref>,
 <tf.Variable 'dense/bias:0' shape=(2,) dtype=float32_ref>]

In [16]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        batch_x_expand = np.expand_dims(batch_x,axis = 1)
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1
saver.save(sess, "fast-text/model.ckpt")

train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.18it/s, accuracy=0, cost=0.786]    
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 658.73it/s, accuracy=0.778, cost=0.643]
train minibatch loop:   2%|▏         | 4/250 [00:00<00:07, 33.91it/s, accuracy=0.875, cost=0.606]

epoch: 0, pass acc: 0.000000, current acc: 0.748564
time taken: 7.632977724075317
epoch: 0, training loss: 0.680961, training acc: 0.622412, valid loss: 0.653781, valid acc: 0.748564



train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.67it/s, accuracy=0, cost=0.756]    
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 696.41it/s, accuracy=0.778, cost=0.6]
train minibatch loop:   2%|▏         | 4/250 [00:00<00:07, 33.77it/s, accuracy=0.938, cost=0.508]

epoch: 1, pass acc: 0.748564, current acc: 0.832358
time taken: 7.517005920410156
epoch: 1, training loss: 0.603597, training acc: 0.802610, valid loss: 0.583401, valid acc: 0.832358



train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.65it/s, accuracy=0, cost=0.698]    
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 684.95it/s, accuracy=0.778, cost=0.555]
train minibatch loop:   2%|▏         | 4/250 [00:00<00:07, 33.88it/s, accuracy=0.938, cost=0.394]

epoch: 2, pass acc: 0.832358, current acc: 0.853933
time taken: 7.524259567260742
epoch: 2, training loss: 0.503871, training acc: 0.878529, valid loss: 0.501211, valid acc: 0.853933



train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.70it/s, accuracy=1, cost=0.611]    
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 692.77it/s, accuracy=0.889, cost=0.528]
train minibatch loop:   2%|▏         | 4/250 [00:00<00:07, 33.58it/s, accuracy=0.938, cost=0.297]

epoch: 3, pass acc: 0.853933, current acc: 0.868763
time taken: 7.511051893234253
epoch: 3, training loss: 0.401316, training acc: 0.914167, valid loss: 0.432543, valid acc: 0.868763



train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.67it/s, accuracy=1, cost=0.513]    
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 687.21it/s, accuracy=0.889, cost=0.519]
train minibatch loop:   2%|▏         | 4/250 [00:00<00:07, 33.74it/s, accuracy=0.938, cost=0.226]

epoch: 4, pass acc: 0.868763, current acc: 0.875787
time taken: 7.518678188323975
epoch: 4, training loss: 0.318071, training acc: 0.934120, valid loss: 0.384561, valid acc: 0.875787



train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.69it/s, accuracy=1, cost=0.417]    
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 685.18it/s, accuracy=0.778, cost=0.519]
train minibatch loop:   2%|▏         | 4/250 [00:00<00:07, 33.70it/s, accuracy=0.969, cost=0.175]

epoch: 5, pass acc: 0.875787, current acc: 0.880526
time taken: 7.51415228843689
epoch: 5, training loss: 0.255521, training acc: 0.950809, valid loss: 0.352085, valid acc: 0.880526



train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.72it/s, accuracy=1, cost=0.332]    
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 694.75it/s, accuracy=0.778, cost=0.525]
train minibatch loop:   2%|▏         | 4/250 [00:00<00:07, 33.74it/s, accuracy=0.969, cost=0.137]

epoch: 6, pass acc: 0.880526, current acc: 0.884039
time taken: 7.50698447227478
epoch: 6, training loss: 0.208035, training acc: 0.962856, valid loss: 0.329528, valid acc: 0.884039



train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.64it/s, accuracy=1, cost=0.26]     
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 687.90it/s, accuracy=0.778, cost=0.536]
train minibatch loop:   2%|▏         | 4/250 [00:00<00:07, 33.55it/s, accuracy=1, cost=0.108]    

epoch: 7, pass acc: 0.884039, current acc: 0.888053
time taken: 7.525402784347534
epoch: 7, training loss: 0.170951, training acc: 0.972895, valid loss: 0.313284, valid acc: 0.888053



train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.68it/s, accuracy=1, cost=0.201]     
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 699.15it/s, accuracy=0.778, cost=0.549]
train minibatch loop:   2%|▏         | 4/250 [00:00<00:07, 33.55it/s, accuracy=1, cost=0.0852]   

epoch: 8, pass acc: 0.888053, current acc: 0.891565
time taken: 7.515627861022949
epoch: 8, training loss: 0.141317, training acc: 0.980801, valid loss: 0.301207, valid acc: 0.891565



train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.73it/s, accuracy=1, cost=0.156]     
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 690.38it/s, accuracy=0.778, cost=0.565]
train minibatch loop:   2%|▏         | 4/250 [00:00<00:07, 33.51it/s, accuracy=1, cost=0.0677]   

epoch: 9, pass acc: 0.891565, current acc: 0.894575
time taken: 7.505678415298462
epoch: 9, training loss: 0.117280, training acc: 0.986698, valid loss: 0.292013, valid acc: 0.894575



train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.73it/s, accuracy=1, cost=0.12]      
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 695.93it/s, accuracy=0.778, cost=0.584]
train minibatch loop:   2%|▏         | 4/250 [00:00<00:07, 33.57it/s, accuracy=1, cost=0.054]    

epoch: 10, pass acc: 0.894575, current acc: 0.898088
time taken: 7.503463268280029
epoch: 10, training loss: 0.097605, training acc: 0.990965, valid loss: 0.284905, valid acc: 0.898088



train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.71it/s, accuracy=1, cost=0.0932]    
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 681.82it/s, accuracy=0.778, cost=0.603]
train minibatch loop:   2%|▏         | 4/250 [00:00<00:07, 33.69it/s, accuracy=1, cost=0.0432]

epoch: 11, pass acc: 0.898088, current acc: 0.899091
time taken: 7.510956048965454
epoch: 11, training loss: 0.081416, training acc: 0.993726, valid loss: 0.279370, valid acc: 0.899091



train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.68it/s, accuracy=1, cost=0.0726]    
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 676.67it/s, accuracy=0.778, cost=0.625]
train minibatch loop:   2%|▏         | 4/250 [00:00<00:07, 33.90it/s, accuracy=1, cost=0.0347]

time taken: 7.517028331756592
epoch: 12, training loss: 0.068056, training acc: 0.996235, valid loss: 0.275068, valid acc: 0.898590



train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.73it/s, accuracy=1, cost=0.0569]    
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 634.37it/s, accuracy=0.778, cost=0.648]
train minibatch loop:   2%|▏         | 4/250 [00:00<00:07, 33.89it/s, accuracy=1, cost=0.0279]

epoch: 13, pass acc: 0.899091, current acc: 0.900597
time taken: 7.513111114501953
epoch: 13, training loss: 0.057012, training acc: 0.997992, valid loss: 0.271764, valid acc: 0.900597



train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.74it/s, accuracy=1, cost=0.045]     
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 676.52it/s, accuracy=0.778, cost=0.672]
train minibatch loop:   2%|▏         | 4/250 [00:00<00:07, 33.89it/s, accuracy=1, cost=0.0226]

time taken: 7.504401206970215
epoch: 14, training loss: 0.047870, training acc: 0.999498, valid loss: 0.269292, valid acc: 0.899091



train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.72it/s, accuracy=1, cost=0.0358]    
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 640.80it/s, accuracy=0.778, cost=0.697]
train minibatch loop:   2%|▏         | 4/250 [00:00<00:07, 33.91it/s, accuracy=1, cost=0.0184]

time taken: 7.51427149772644
epoch: 15, training loss: 0.040289, training acc: 1.000376, valid loss: 0.267529, valid acc: 0.898088



train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.68it/s, accuracy=1, cost=0.0288]    
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 666.64it/s, accuracy=0.778, cost=0.722]
train minibatch loop:   2%|▏         | 4/250 [00:00<00:07, 33.62it/s, accuracy=1, cost=0.015] 

time taken: 7.519939184188843
epoch: 16, training loss: 0.033989, training acc: 1.001757, valid loss: 0.266378, valid acc: 0.898590



train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.73it/s, accuracy=1, cost=0.0233]    
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 641.40it/s, accuracy=0.778, cost=0.748]
train minibatch loop:   2%|▏         | 4/250 [00:00<00:07, 33.80it/s, accuracy=1, cost=0.0123]

time taken: 7.512571334838867
epoch: 17, training loss: 0.028741, training acc: 1.002008, valid loss: 0.265757, valid acc: 0.897586



train minibatch loop: 100%|██████████| 250/250 [00:07<00:00, 33.69it/s, accuracy=1, cost=0.019]     
test minibatch loop: 100%|██████████| 63/63 [00:00<00:00, 698.68it/s, accuracy=0.778, cost=0.774]


time taken: 7.5115885734558105
epoch: 18, training loss: 0.024360, training acc: 1.002259, valid loss: 0.265591, valid acc: 0.897586

break epoch:19



'fast-text/model.ckpt'

In [17]:
logits = sess.run(model.logits, feed_dict = {model.X: test_X})
print(
    metrics.classification_report(
        test_Y, np.argmax(logits, 1), target_names = ['negative', 'positive']
    )
)

              precision    recall  f1-score   support

    negative       0.88      0.91      0.89      1003
    positive       0.90      0.87      0.89       990

   micro avg       0.89      0.89      0.89      1993
   macro avg       0.89      0.89      0.89      1993
weighted avg       0.89      0.89      0.89      1993



In [18]:
import json
with open('fast-text-subjective.json','w') as fopen:
    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))

In [19]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [20]:
freeze_graph('fast-text', strings)

INFO:tensorflow:Restoring parameters from fast-text/model.ckpt
INFO:tensorflow:Froze 3 variables.
INFO:tensorflow:Converted 3 variables to const ops.
16 ops in the final graph.


In [21]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [24]:
g = load_graph('fast-text/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)
text = 'kerajaan sebenarnya sangat sayangkan rakyatnya, tetapi sebenarnya benci'
new_vector = add_ngram(str_idx([classification_textcleaning(text)[0]],dictionary), token_indice)
test_sess.run(tf.nn.softmax(logits), feed_dict = {x: new_vector})



array([[0.00218716, 0.99781287]], dtype=float32)

In [25]:
import pickle
with open('token-indice.pkl','wb') as fopen:
    pickle.dump(token_indice, fopen)