In [1]:
import re
import numpy as np
import pandas as pd
import collections
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from sklearn.cross_validation import train_test_split
from unidecode import unidecode
from nltk.util import ngrams
from tqdm import tqdm
import time



In [2]:
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = re.findall(r'^(.*?)(%s)$' % ('|'.join(hujung)), word)
    word = hujung_result[0][0] if len(hujung_result) else word
    permulaan_result = re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan[::-1])), word)
    permulaan_result.extend(re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan)), word))
    mula = permulaan_result if len(permulaan_result) else ''
    if len(mula):
        mula = mula[1][1] if len(mula[1][1]) > len(mula[0][1]) else mula[0][1]
    return word.replace(mula, '')

In [3]:
def build_dataset(words, n_words):
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 3)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def classification_textcleaning(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string).strip()
    string = ' '.join(
        [i for i in re.findall('[\\w\']+|[;:\-\(\)&.,!?"]', string) if len(i)]
    )
    string = string.lower().split()
    string = [naive_stemmer(word) for word in string]
    return ' '.join([word for word in string if len(word) > 1])


def str_idx(corpus, dic, UNK = 3):
    X = []
    for sentence in corpus:
        X.append([dic[w] if w in dic else UNK for w in sentence.split()[:maxlen]])
    return X

In [4]:
def create_ngram_set(input_list, ngram_value):
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def build_ngram(x_train):
    global max_features
    ngram_set = set()
    for input_list in tqdm(x_train, total = len(x_train), ncols = 70):
        for i in range(2, 3):
            set_of_ngram = create_ngram_set(input_list, ngram_value = i)
            ngram_set.update(set_of_ngram)
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    max_features = np.max(list(indice_token.keys())) + 1
    return token_indice


def add_ngram(sequences, token_indice):
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i : i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)
    return new_sequences

In [5]:
df = pd.read_csv('dataset/sentiment-data-v2.csv')
Y = LabelEncoder().fit_transform(df.label)
df.head()

Unnamed: 0,label,text
0,Negative,Lebih-lebih lagi dengan kemudahan internet da...
1,Positive,boleh memberi teguran kepada parti tetapi perl...
2,Negative,Adalah membingungkan mengapa masyarakat Cina b...
3,Positive,Kami menurunkan defisit daripada 6.7 peratus p...
4,Negative,"Ini masalahnya. Bukan rakyat, tetapi sistem"


In [6]:
with open('dataset/polarity-negative-translated.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('dataset/polarity-positive-translated.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts
texts += df.iloc[:,1].tolist()
labels += Y.tolist()

assert len(labels) == len(texts)

In [7]:
for i in range(len(texts)):
    texts[i] = classification_textcleaning(texts[i])

In [8]:
concat = ' '.join(texts).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 13325
Most common words [('yang', 14899), ('tidak', 4588), ('untuk', 4038), ('filem', 3698), ('deng', 3350), ('ada', 3190)]
Sample data [1343, 196, 178, 98, 98, 126, 351, 4, 90, 210] ['ringkas', 'bodoh', 'bosan', 'kanak', 'kanak', 'lelaki', 'remaja', 'yang', 'begitu', 'muda']


In [9]:
ngram_range = 2
max_features = len(dictionary)
maxlen = 80
batch_size = 32
embedded_size = 256

In [10]:
idx_trainset = str_idx(texts, dictionary, maxlen)

In [11]:
token_indice = build_ngram(idx_trainset)
X = add_ngram(idx_trainset, token_indice)
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen)

100%|███████████████████████| 14279/14279 [00:00<00:00, 196651.04it/s]


In [12]:
train_X, test_X, train_Y, test_Y = train_test_split(X, 
                                                    labels,
                                                    test_size = 0.2)

In [13]:
class Model:
    def __init__(
        self, embedded_size, dict_size, dimension_output, learning_rate
    ):

        self.X = tf.placeholder(tf.int32, [None, None])
        self.Y = tf.placeholder(tf.int32, [None])
        encoder_embeddings = tf.Variable(
            tf.random_uniform([dict_size, embedded_size], -1, 1)
        )
        encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, self.X)
        self.logits = tf.identity(
            tf.layers.dense(
                tf.reduce_mean(encoder_embedded, 1), dimension_output
            ),
            name = 'logits',
        )
        self.cost = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits = self.logits, labels = self.Y
            )
        )
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(
            self.cost
        )
        correct_pred = tf.equal(
            tf.argmax(self.logits, 1, output_type = tf.int32), self.Y
        )
        self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))


In [14]:
tf.reset_default_graph()
sess = tf.InteractiveSession()
model = Model(embedded_size, max_features, 2, 5e-4)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'fast-text/model.ckpt')

'fast-text/model.ckpt'

In [26]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
    ]
)

In [27]:
strings.split(',')

['Placeholder',
 'Placeholder_1',
 'Variable',
 'dense/kernel',
 'dense/bias',
 'logits']

In [17]:
tf.trainable_variables()

[<tf.Variable 'Variable:0' shape=(129075, 256) dtype=float32_ref>,
 <tf.Variable 'dense/kernel:0' shape=(256, 2) dtype=float32_ref>,
 <tf.Variable 'dense/bias:0' shape=(2,) dtype=float32_ref>]

In [18]:
EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 5, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = train_X[i : min(i + batch_size, train_X.shape[0])]
        batch_y = train_Y[i : min(i + batch_size, train_X.shape[0])]
        acc, cost, _ = sess.run(
            [model.accuracy, model.cost, model.optimizer],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        batch_x = test_X[i : min(i + batch_size, test_X.shape[0])]
        batch_y = test_Y[i : min(i + batch_size, test_X.shape[0])]
        batch_x_expand = np.expand_dims(batch_x,axis = 1)
        acc, cost = sess.run(
            [model.accuracy, model.cost],
            feed_dict = {
                model.X: batch_x,
                model.Y: batch_y
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

saver.save(sess, "fast-text/model.ckpt")

train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 27.75it/s, accuracy=0.613, cost=0.679]
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 473.03it/s, accuracy=0.5, cost=0.661]  
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.37it/s, accuracy=0.594, cost=0.678]

epoch: 0, pass acc: 0.000000, current acc: 0.618697
time taken: 13.058606386184692
epoch: 0, training loss: 0.684708, training acc: 0.559626, valid loss: 0.685112, valid acc: 0.618697



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 27.97it/s, accuracy=0.71, cost=0.638] 
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 480.48it/s, accuracy=0.625, cost=0.638]
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.19it/s, accuracy=0.625, cost=0.647]

epoch: 1, pass acc: 0.618697, current acc: 0.671569
time taken: 12.930449724197388
epoch: 1, training loss: 0.650780, training acc: 0.665474, valid loss: 0.665154, valid acc: 0.671569



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 27.97it/s, accuracy=0.839, cost=0.576]
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 508.81it/s, accuracy=0.75, cost=0.605] 
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.02it/s, accuracy=0.688, cost=0.595]

epoch: 2, pass acc: 0.671569, current acc: 0.697479
time taken: 12.931367635726929
epoch: 2, training loss: 0.599222, training acc: 0.759419, valid loss: 0.638151, valid acc: 0.697479



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 28.09it/s, accuracy=0.871, cost=0.496]
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 459.92it/s, accuracy=0.75, cost=0.566] 
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.34it/s, accuracy=0.875, cost=0.524]

epoch: 3, pass acc: 0.697479, current acc: 0.706583
time taken: 12.9458749294281
epoch: 3, training loss: 0.528589, training acc: 0.828142, valid loss: 0.610143, valid acc: 0.706583



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 27.84it/s, accuracy=0.935, cost=0.41] 
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 486.81it/s, accuracy=0.75, cost=0.525] 
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.28it/s, accuracy=0.938, cost=0.443]

epoch: 4, pass acc: 0.706583, current acc: 0.721639
time taken: 12.915259838104248
epoch: 4, training loss: 0.449895, training acc: 0.876997, valid loss: 0.587408, valid acc: 0.721639



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 28.01it/s, accuracy=1, cost=0.329]    
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 451.63it/s, accuracy=0.75, cost=0.486] 
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.21it/s, accuracy=0.969, cost=0.364]

epoch: 5, pass acc: 0.721639, current acc: 0.727241
time taken: 12.944233417510986
epoch: 5, training loss: 0.374340, training acc: 0.911319, valid loss: 0.571401, valid acc: 0.727241



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 28.08it/s, accuracy=1, cost=0.26]     
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 480.98it/s, accuracy=0.75, cost=0.452] 
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.08it/s, accuracy=1, cost=0.292]    

epoch: 6, pass acc: 0.727241, current acc: 0.730392
time taken: 12.933130741119385
epoch: 6, training loss: 0.307651, training acc: 0.934956, valid loss: 0.561384, valid acc: 0.730392



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 28.23it/s, accuracy=1, cost=0.203]    
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 423.80it/s, accuracy=0.875, cost=0.423]
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.25it/s, accuracy=1, cost=0.233]    

epoch: 7, pass acc: 0.730392, current acc: 0.732493
time taken: 12.946501016616821
epoch: 7, training loss: 0.251594, training acc: 0.955003, valid loss: 0.556222, valid acc: 0.732493



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 28.05it/s, accuracy=1, cost=0.159]    
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 408.76it/s, accuracy=0.875, cost=0.401]
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.06it/s, accuracy=1, cost=0.186]    

epoch: 8, pass acc: 0.732493, current acc: 0.734244
time taken: 12.982598066329956
epoch: 8, training loss: 0.205762, training acc: 0.967959, valid loss: 0.554813, valid acc: 0.734244



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 28.02it/s, accuracy=1, cost=0.125]    
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 416.58it/s, accuracy=0.75, cost=0.384] 
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.16it/s, accuracy=1, cost=0.149]   

time taken: 12.974910020828247
epoch: 9, training loss: 0.168799, training acc: 0.976976, valid loss: 0.556227, valid acc: 0.732493



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 27.96it/s, accuracy=1, cost=0.0985]    
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 462.13it/s, accuracy=0.75, cost=0.372] 
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.19it/s, accuracy=1, cost=0.121]   

time taken: 12.919354677200317
epoch: 10, training loss: 0.139141, training acc: 0.982491, valid loss: 0.559732, valid acc: 0.734244



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 28.19it/s, accuracy=1, cost=0.0787]    
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 480.16it/s, accuracy=0.75, cost=0.362] 
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.15it/s, accuracy=1, cost=0.0984]   

time taken: 12.946474313735962
epoch: 11, training loss: 0.115342, training acc: 0.986431, valid loss: 0.564780, valid acc: 0.733543



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 28.08it/s, accuracy=1, cost=0.0635]    
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 509.44it/s, accuracy=0.75, cost=0.355] 
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 27.85it/s, accuracy=1, cost=0.0811]

epoch: 12, pass acc: 0.734244, current acc: 0.734594
time taken: 12.918270587921143
epoch: 12, training loss: 0.096186, training acc: 0.989407, valid loss: 0.570980, valid acc: 0.734594



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 28.14it/s, accuracy=1, cost=0.0517]    
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 471.92it/s, accuracy=0.75, cost=0.35]  
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.45it/s, accuracy=1, cost=0.0675]

time taken: 12.941154479980469
epoch: 13, training loss: 0.080692, training acc: 0.991946, valid loss: 0.578069, valid acc: 0.733894



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 28.07it/s, accuracy=1, cost=0.0424]    
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 422.17it/s, accuracy=0.875, cost=0.346]
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.39it/s, accuracy=1, cost=0.0567]

epoch: 14, pass acc: 0.734594, current acc: 0.736345
time taken: 12.950959205627441
epoch: 14, training loss: 0.068087, training acc: 0.993609, valid loss: 0.585878, valid acc: 0.736345



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 28.15it/s, accuracy=1, cost=0.0351]    
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 488.96it/s, accuracy=0.875, cost=0.344]
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.05it/s, accuracy=1, cost=0.048] 

time taken: 12.926128149032593
epoch: 15, training loss: 0.057771, training acc: 0.995098, valid loss: 0.594289, valid acc: 0.736345



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 28.06it/s, accuracy=1, cost=0.0291]    
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 490.58it/s, accuracy=0.875, cost=0.342]
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 27.70it/s, accuracy=1, cost=0.0409]

epoch: 16, pass acc: 0.736345, current acc: 0.738445
time taken: 12.923779964447021
epoch: 16, training loss: 0.049276, training acc: 0.996148, valid loss: 0.603219, valid acc: 0.738445



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 28.08it/s, accuracy=1, cost=0.0243]    
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 462.14it/s, accuracy=0.875, cost=0.342]
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 27.78it/s, accuracy=1, cost=0.0351]

time taken: 12.94113302230835
epoch: 17, training loss: 0.042238, training acc: 0.997024, valid loss: 0.612601, valid acc: 0.737745



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 27.69it/s, accuracy=1, cost=0.0204]    
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 494.72it/s, accuracy=0.875, cost=0.343]
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.07it/s, accuracy=1, cost=0.0303]

epoch: 18, pass acc: 0.738445, current acc: 0.739846
time taken: 12.922403812408447
epoch: 18, training loss: 0.036372, training acc: 0.997286, valid loss: 0.622383, valid acc: 0.739846



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 27.73it/s, accuracy=1, cost=0.0172]    
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 499.13it/s, accuracy=0.875, cost=0.346]
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.05it/s, accuracy=1, cost=0.0262]

time taken: 12.930727005004883
epoch: 19, training loss: 0.031456, training acc: 0.997549, valid loss: 0.632524, valid acc: 0.739496



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 27.85it/s, accuracy=1, cost=0.0145]    
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 485.74it/s, accuracy=0.875, cost=0.348]
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.31it/s, accuracy=1, cost=0.0228]

time taken: 12.936310291290283
epoch: 20, training loss: 0.027314, training acc: 0.997899, valid loss: 0.642995, valid acc: 0.737045



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 28.12it/s, accuracy=1, cost=0.0123]    
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 471.60it/s, accuracy=0.875, cost=0.352]
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 27.84it/s, accuracy=1, cost=0.0199]

time taken: 12.928076267242432
epoch: 21, training loss: 0.023805, training acc: 0.998249, valid loss: 0.653774, valid acc: 0.735994



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 28.17it/s, accuracy=1, cost=0.0105]    
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 482.40it/s, accuracy=0.875, cost=0.357]
train minibatch loop:   1%|          | 3/357 [00:00<00:12, 28.22it/s, accuracy=1, cost=0.0174]

time taken: 12.905269861221313
epoch: 22, training loss: 0.020820, training acc: 0.998424, valid loss: 0.664847, valid acc: 0.737045



train minibatch loop: 100%|██████████| 357/357 [00:12<00:00, 28.17it/s, accuracy=1, cost=0.00891]   
test minibatch loop: 100%|██████████| 90/90 [00:00<00:00, 479.03it/s, accuracy=0.875, cost=0.362]


time taken: 12.916682958602905
epoch: 23, training loss: 0.018268, training acc: 0.998687, valid loss: 0.676207, valid acc: 0.736695

break epoch:24



'fast-text/model.ckpt'

In [19]:
logits = sess.run(model.logits, feed_dict = {model.X: test_X})
print(
    metrics.classification_report(
        test_Y, np.argmax(logits, 1), target_names = ['negative', 'positive']
    )
)

             precision    recall  f1-score   support

   negative       0.69      0.71      0.70      1261
   positive       0.77      0.74      0.75      1595

avg / total       0.73      0.73      0.73      2856



In [20]:
text = 'kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya'
new_vector = add_ngram(str_idx([classification_textcleaning(text)], dictionary), token_indice)
sess.run(tf.nn.softmax(model.logits), feed_dict={model.X:new_vector})

array([[0.99185514, 0.00814484]], dtype=float32)

In [21]:
text = 'kerajaan sebenarnya sangat sayangkan rakyatnya'
new_vector = add_ngram(str_idx([classification_textcleaning(text)], dictionary), token_indice)
sess.run(tf.nn.softmax(model.logits), feed_dict={model.X:new_vector})

array([[4.792639e-07, 9.999995e-01]], dtype=float32)

In [22]:
text = 'kerajaan sebenarnya sangat sayangkan rakyatnya, tetapi sebenarnya benci'
new_vector = add_ngram(str_idx([classification_textcleaning(text)],dictionary), token_indice)
sess.run(tf.nn.softmax(model.logits), feed_dict={model.X:new_vector})

array([[0.97588885, 0.02411108]], dtype=float32)

In [23]:
import json
with open('fast-text-sentiment.json','w') as fopen:
    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))

In [24]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [28]:
freeze_graph('fast-text', strings)

INFO:tensorflow:Restoring parameters from fast-text/model.ckpt
INFO:tensorflow:Froze 3 variables.
INFO:tensorflow:Converted 3 variables to const ops.
16 ops in the final graph.


In [29]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [31]:
g = load_graph('fast-text/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')
logits = g.get_tensor_by_name('import/logits:0')
test_sess = tf.InteractiveSession(graph = g)
test_sess.run(tf.nn.softmax(logits), feed_dict = {x: new_vector})



array([[0.97588885, 0.02411108]], dtype=float32)

In [33]:
import pickle
with open('token-indice.pkl','wb') as fopen:
    pickle.dump(token_indice, fopen)