In [1]:
import bert_model as modeling
import re
import numpy as np
import pandas as pd
import collections
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from sklearn.model_selection import train_test_split
from unidecode import unidecode
from tqdm import tqdm
import time

In [2]:
rules_normalizer = {
    'experience': 'pengalaman',
    'bagasi': 'bagasi',
    'kg': 'kampung',
    'kilo': 'kilogram',
    'g': 'gram',
    'grm': 'gram',
    'k': 'okay',
    'abgkat': 'abang dekat',
    'abis': 'habis',
    'ade': 'ada',
    'adoi': 'aduh',
    'adoii': 'aduhh',
    'aerodarat': 'kapal darat',
    'agkt': 'angkat',
    'ahh': 'ah',
    'ailior': 'air liur',
    'airasia': 'air asia x',
    'airasiax': 'penerbangan',
    'airline': 'penerbangan',
    'airlines': 'penerbangan',
    'airport': 'lapangan terbang',
    'airpot': 'lapangan terbang',
    'aje': 'sahaja',
    'ajelah': 'sahajalah',
    'ajer': 'sahaja',
    'ak': 'aku',
    'aq': 'aku',
    'all': 'semua',
    'ambik': 'ambil',
    'amek': 'ambil',
    'amer': 'amir',
    'amik': 'ambil',
    'ana': 'saya',
    'angkt': 'angkat',
    'anual': 'tahunan',
    'apapun': 'apa pun',
    'ape': 'apa',
    'arab': 'arab',
    'area': 'kawasan',
    'aritu': 'hari itu',
    'ask': 'tanya',
    'astro': 'astro',
    'at': 'pada',
    'attitude': 'sikap',
    'babi': 'khinzir',
    'back': 'belakang',
    'bag': 'beg',
    'bang': 'abang',
    'bangla': 'bangladesh',
    'banyk': 'banyak',
    'bard': 'pujangga',
    'bargasi': 'bagasi',
    'bawak': 'bawa',
    'bawanges': 'bawang',
    'be': 'jadi',
    'behave': 'berkelakuan baik',
    'belagak': 'berlagak',
    'berdisiplin': 'berdisplin',
    'berenti': 'berhenti',
    'beskal': 'basikal',
    'bff': 'rakan karib',
    'bg': 'bagi',
    'bgi': 'bagi',
    'biase': 'biasa',
    'big': 'besar',
    'bike': 'basikal',
    'bile': 'bila',
    'binawe': 'binatang',
    'bini': 'isteri',
    'bkn': 'bukan',
    'bla': 'bila',
    'blom': 'belum',
    'bnyak': 'banyak',
    'body': 'tubuh',
    'bole': 'boleh',
    'boss': 'bos',
    'bowling': 'boling',
    'bpe': 'berapa',
    'brand': 'jenama',
    'brg': 'barang',
    'briefing': 'taklimat',
    'brng': 'barang',
    'bro': 'abang',
    'bru': 'baru',
    'bruntung': 'beruntung',
    'bsikal': 'basikal',
    'btnggjwb': 'bertanggungjawab',
    'btul': 'betul',
    'buatlh': 'buatlah',
    'buh': 'letak',
    'buka': 'buka',
    'but': 'tetapi',
    'bwk': 'bawa',
    'by': 'dengan',
    'byr': 'bayar',
    'bz': 'sibuk',
    'camera': 'kamera',
    'camni': 'macam ini',
    'cane': 'macam mana',
    'cant': 'tak boleh',
    'carakerja': 'cara kerja',
    'care': 'jaga',
    'cargo': 'kargo',
    'cctv': 'kamera litar tertutup',
    'celako': 'celaka',
    'cer': 'cerita',
    'cheap': 'murah',
    'check': 'semak',
    'ciput': 'sedikit',
    'cite': 'cerita',
    'citer': 'cerita',
    'ckit': 'sikit',
    'ckp': 'cakap',
    'class': 'kelas',
    'cm': 'macam',
    'cmni': 'macam ini',
    'cmpak': 'campak',
    'committed': 'komited',
    'company': 'syarikat',
    'complain': 'aduan',
    'corn': 'jagung',
    'couldnt': 'tak boleh',
    'cr': 'cari',
    'crew': 'krew',
    'cube': 'cuba',
    'cuma': 'cuma',
    'curinyaa': 'curinya',
    'cust': 'pelanggan',
    'customer': 'pelanggan',
    'd': 'di',
    'da': 'dah',
    'dn': 'dan',
    'dahh': 'dah',
    'damaged': 'rosak',
    'dapek': 'dapat',
    'day': 'hari',
    'dazrin': 'dazrin',
    'dbalingnya': 'dibalingnya',
    'de': 'ada',
    'deep': 'dalam',
    'deliberately': 'sengaja',
    'depa': 'mereka',
    'dessa': 'desa',
    'dgn': 'dengan',
    'dh': 'dah',
    'didunia': 'di dunia',
    'diorang': 'mereka',
    'diorng': 'mereka',
    'direct': 'secara terus',
    'diving': 'junam',
    'dkt': 'dekat',
    'dlempar': 'dilempar',
    'dlm': 'dalam',
    'dlt': 'padam',
    'dlu': 'dulu',
    'done': 'siap',
    'dont': 'jangan',
    'dorg': 'mereka',
    'dpermudhkn': 'dipermudahkan',
    'dpt': 'dapat',
    'dr': 'dari',
    'dri': 'dari',
    'dsb': 'dan sebagainya',
    'dy': 'dia',
    'educate': 'mendidik',
    'ensure': 'memastikan',
    'everything': 'semua',
    'ewahh': 'wah',
    'expect': 'sangka',
    'fb': 'facebook',
    'fired': 'pecat',
    'first': 'pertama',
    'fkr': 'fikir',
    'flight': 'kapal terbang',
    'for': 'untuk',
    'free': 'percuma',
    'friend': 'kawan',
    'fyi': 'untuk pengetahuan anda',
    'gantila': 'gantilah',
    'gantirugi': 'ganti rugi',
    'gentlemen': 'lelaki budiman',
    'gerenti': 'jaminan',
    'gile': 'gila',
    'gk': 'juga',
    'gnti': 'ganti',
    'go': 'pergi',
    'gomen': 'kerajaan',
    'goment': 'kerajaan',
    'good': 'baik',
    'ground': 'tanah',
    'guarno': 'macam mana',
    'hampa': 'mereka',
    'hampeh': 'teruk',
    'hanat': 'jahanam',
    'handle': 'kawal',
    'handling': 'kawalan',
    'hanta': 'hantar',
    'haritu': 'hari itu',
    'hate': 'benci',
    'have': 'ada',
    'hawau': 'celaka',
    'henpon': 'telefon',
    'heran': 'hairan',
    'him': 'dia',
    'his': 'dia',
    'hmpa': 'mereka',
    'hntr': 'hantar',
    'hotak': 'otak',
    'hr': 'hari',
    'i': 'saya',
    'hrga': 'harga',
    'hrp': 'harap',
    'hu': 'sedih',
    'humble': 'merendah diri',
    'ibon': 'ikon',
    'ichi': 'inci',
    'idung': 'hidung',
    'if': 'jika',
    'ig': 'instagram',
    'iklas': 'ikhlas',
    'improve': 'menambah baik',
    'in': 'masuk',
    'isn t': 'tidak',
    'isyaallah': 'insyallah',
    'ja': 'sahaja',
    'japan': 'jepun',
    'jd': 'jadi',
    'je': 'saja',
    'jee': 'saja',
    'jek': 'saja',
    'jepun': 'jepun',
    'jer': 'saja',
    'jerr': 'saja',
    'jez': 'saja',
    'jg': 'juga',
    'jgk': 'juga',
    'jgn': 'jangan',
    'jgnla': 'janganlah',
    'jibake': 'celaka',
    'jjur': 'jujur',
    'job': 'kerja',
    'jobscope': 'skop kerja',
    'jogja': 'jogjakarta',
    'jpam': 'jpam',
    'jth': 'jatuh',
    'jugak': 'juga',
    'ka': 'ke',
    'kalo': 'kalau',
    'kalu': 'kalau',
    'kang': 'nanti',
    'kantoi': 'temberang',
    'kasi': 'beri',
    'kat': 'dekat',
    'kbye': 'ok bye',
    'kearah': 'ke arah',
    'kecik': 'kecil',
    'keja': 'kerja',
    'keje': 'kerja',
    'kejo': 'kerja',
    'keksongan': 'kekosongan',
    'kemana': 'ke mana',
    'kene': 'kena',
    'kenekan': 'kenakan',
    'kesah': 'kisah',
    'ketempat': 'ke tempat',
    'kije': 'kerja',
    'kijo': 'kerja',
    'kiss': 'cium',
    'kite': 'kita',
    'kito': 'kita',
    'kje': 'kerja',
    'kjr': 'kerja',
    'kk': 'okay',
    'kmi': 'kami',
    'kt': 'kat',
    'tlg': 'tolong',
    'kl': 'kuala lumpur',
    'klai': 'kalau',
    'klau': 'kalau',
    'klia': 'klia',
    'klo': 'kalau',
    'klu': 'kalau',
    'kn': 'kan',
    'knapa': 'kenapa',
    'kne': 'kena',
    'ko': 'kau',
    'kompom': 'sah',
    'korang': 'kamu semua',
    'korea': 'korea',
    'korg': 'kamu semua',
    'kot': 'mungkin',
    'krja': 'kerja',
    'ksalahan': 'kesalahan',
    'kta': 'kita',
    'kuar': 'keluar',
    'kut': 'mungkin',
    'la': 'lah',
    'laa': 'lah',
    'lahabau': 'celaka',
    'lahanat': 'celaka',
    'lainda': 'lain dah',
    'lak': 'pula',
    'last': 'akhir',
    'le': 'lah',
    'leader': 'ketua',
    'leave': 'pergi',
    'ler': 'lah',
    'less': 'kurang',
    'letter': 'surat',
    'lg': 'lagi',
    'lgi': 'lagi',
    'lngsong': 'langsung',
    'lol': 'hehe',
    'lorr': 'lah',
    'low': 'rendah',
    'lps': 'lepas',
    'luggage': 'bagasi',
    'lumbe': 'lumba',
    'lyak': 'layak',
    'maap': 'maaf',
    'maapkan': 'maafkan',
    'mahai': 'mahal',
    'mampos': 'mampus',
    'mart': 'kedai',
    'mau': 'mahu',
    'mcm': 'macam',
    'mcmtu': 'macam itu',
    'memerlukn': 'memerlukan',
    'mengembirakan': 'menggembirakan',
    'mengmbilnyer': 'mengambilnya',
    'mengtasi': 'mengatasi',
    'mg': 'memang',
    'mihak': 'memihak',
    'min': 'admin',
    'mingu': 'minggu',
    'mintak': 'minta',
    'mjtuhkn': 'menjatuhkan',
    'mkyong': 'mak yong',
    'mlibatkn': 'melibatkan',
    'mmg': 'memang',
    'mmnjang': 'memanjang',
    'mmpos': 'mampus',
    'mn': 'mana',
    'mna': 'mana',
    'mntak': 'minta',
    'mntk': 'minta',
    'mnyusun': 'menyusun',
    'mood': 'suasana',
    'most': 'paling',
    'mr': 'tuan',
    'msa': 'masa',
    'msia': 'malaysia',
    'mst': 'mesti',
    'mu': 'awak',
    'much': 'banyak',
    'muko': 'muka',
    'mum': 'emak',
    'n': 'dan',
    'nah': 'nah',
    'nanny': 'nenek',
    'napo': 'kenapa',
    'nati': 'nanti',
    'ngan': 'dengan',
    'ngn': 'dengan',
    'ni': 'ini',
    'nie': 'ini',
    'nii': 'ini',
    'nk': 'nak',
    'nmpk': 'nampak',
    'nye': 'nya',
    'ofis': 'pejabat',
    'ohh': 'oh',
    'oii': 'hoi',
    'one': 'satu',
    'online': 'dalam talian',
    'or': 'atau',
    'org': 'orang',
    'orng': 'orang',
    'otek': 'otak',
    'p': 'pergi',
    'paid': 'dah bayar',
    'palabana': 'kepala otak',
    'pasni': 'lepas ini',
    'passengers': 'penumpang',
    'passengger': 'penumpang',
    'pastu': 'lepas itu',
    'pd': 'pada',
    'pegi': 'pergi',
    'pekerje': 'pekerja',
    'pekrja': 'pekerja',
    'perabih': 'perabis',
    'perkerja': 'pekerja',
    'pg': 'pergi',
    'phuii': 'puih',
    'pikir': 'fikir',
    'pilot': 'juruterbang',
    'pk': 'fikir',
    'pkerja': 'pekerja',
    'pkerjaan': 'pekerjaan',
    'pki': 'pakai',
    'please': 'tolong',
    'pls': 'tolong',
    'pn': 'pun',
    'pnh': 'pernah',
    'pnt': 'penat',
    'pnya': 'punya',
    'pon': 'pun',
    'priority': 'keutamaan',
    'properties': 'harta benda',
    'ptugas': 'petugas',
    'pub': 'kelab malam',
    'pulak': 'pula',
    'puye': 'punya',
    'pwrcuma': 'percuma',
    'pyahnya': 'payahnya',
    'quality': 'kualiti',
    'quit': 'keluar',
    'ramly': 'ramly',
    'rege': 'harga',
    'reger': 'harga',
    'report': 'laporan',
    'resigned': 'meletakkan jawatan',
    'respect': 'hormat',
    'rizal': 'rizal',
    'rosak': 'rosak',
    'rosok': 'rosak',
    'rse': 'rasa',
    'sacked': 'buang',
    'sado': 'tegap',
    'salute': 'sanjung',
    'sam': 'sama',
    'same': 'sama',
    'samp': 'sampah',
    'sbb': 'sebab',
    'sbgai': 'sebagai',
    'sblm': 'sebelum',
    'sblum': 'sebelum',
    'sbnarnya': 'sebenarnya',
    'sbum': 'sebelum',
    'sdg': 'sedang',
    'sebb': 'sebab',
    'sebijik': 'sebiji',
    'see': 'lihat',
    'seen': 'dilihat',
    'selangor': 'selangor',
    'selfie': 'swafoto',
    'sempoi': 'cantik',
    'senaraihitam': 'senarai hitam',
    'seorg': 'seorang',
    'service': 'perkhidmatan',
    'sgt': 'sangat',
    'shared': 'kongsi',
    'shirt': 'kemeja',
    'shut': 'tutup',
    'sib': 'nasib',
    'skali': 'sekali',
    'sket': 'sikit',
    'sma': 'sama',
    'smoga': 'semoga',
    'smpoi': 'cantik',
    'sndiri': 'sendiri',
    'sndr': 'sendiri',
    'sndri': 'sendiri',
    'sne': 'sana',
    'so': 'jadi',
    'sop': 'tatacara pengendalian piawai',
    'sorang': 'seorang',
    'spoting': 'pembintikan',
    'sronok': 'seronok',
    'ssh': 'susah',
    'staff': 'staf',
    'standing': 'berdiri',
    'start': 'mula',
    'steady': 'mantap',
    'stiap': 'setiap',
    'stress': 'stres',
    'student': 'pelajar',
    'study': 'belajar',
    'studycase': 'kajian kes',
    'sure': 'pasti',
    'sykt': 'syarikat',
    'tah': 'entah',
    'taik': 'tahi',
    'takan': 'tak akan',
    'takat': 'setakat',
    'takde': 'tak ada',
    'takkan': 'tak akan',
    'taknak': 'tak nak',
    'tang': 'tentang',
    'tanggungjawab': 'bertanggungjawab',
    'taraa': 'sementara',
    'tau': 'tahu',
    'tbabit': 'terbabit',
    'team': 'pasukan',
    'terbaekk': 'terbaik',
    'teruknye': 'teruknya',
    'tgk': 'tengok',
    'that': 'itu',
    'thinking': 'fikir',
    'those': 'itu',
    'time': 'masa',
    'tk': 'tak',
    'tnggongjwb': 'tanggungjawab',
    'tngok': 'tengok',
    'tngu': 'tunggu',
    'to': 'kepada',
    'tosak': 'rosak',
    'tp': 'tapi',
    'tpi': 'tapi',
    'tpon': 'telefon',
    'transfer': 'pindah',
    'trgelak': 'tergelak',
    'ts': 'tan sri',
    'tstony': 'tan sri tony',
    'tu': 'itu',
    'tuh': 'itu',
    'tula': 'itulah',
    'umeno': 'umno',
    'unfortunately': 'malangnya',
    'unhappy': 'tidak gembira',
    'up': 'naik',
    'upkan': 'naikkan',
    'ur': 'awak',
    'utk': 'untuk',
    'very': 'sangat',
    'viral': 'tular',
    'vote': 'undi',
    'warning': 'amaran',
    'warranty': 'waranti',
    'wassap': 'whatsapp',
    'wat': 'apa',
    'weii': 'wei',
    'well': 'maklumlah',
    'win': 'menang',
    'with': 'dengan',
    'wt': 'buat',
    'x': 'tak',
    'tw': 'tahu',
    'ye': 'ya',
    'yee': 'ya',
    'yg': 'yang',
    'yng': 'yang',
    'you': 'awak',
    'your': 'awak',
    'sakai': 'selekeh',
    'rmb': 'billion ringgit',
    'rmj': 'juta ringgit',
    'rmk': 'ribu ringgit',
    'rm': 'ringgit',
}

In [3]:
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = [e for e in hujung if word.endswith(e)]
    if len(hujung_result):
        hujung_result = max(hujung_result, key = len)
        if len(hujung_result):
            word = word[: -len(hujung_result)]
    permulaan_result = [e for e in permulaan if word.startswith(e)]
    if len(permulaan_result):
        permulaan_result = max(permulaan_result, key = len)
        if len(permulaan_result):
            word = word[len(permulaan_result) :]
    return word

def build_dataset(words, n_words):
    count = [['GO', 0], ['PAD', 1], ['EOS', 2], ['UNK', 3]]
    counter = collections.Counter(words).most_common(n_words)
    count.extend(counter)
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    data = list()
    unk_count = 0
    for word in words:
        index = dictionary.get(word, 3)
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reversed_dictionary


def classification_textcleaning(string):
    string = re.sub(
        'http\S+|www.\S+',
        '',
        ' '.join(
            [i for i in string.split() if i.find('#') < 0 and i.find('@') < 0]
        ),
    )
    string = unidecode(string).replace('.', ' . ').replace(',', ' , ')
    string = re.sub('[^A-Za-z ]+', ' ', string)
    string = re.sub(r'[ ]+', ' ', string.lower()).strip()
    string = [rules_normalizer.get(w, w) for w in string.split()]
    string = [naive_stemmer(word) for word in string]
    return ' '.join([word for word in string if len(word) > 1])


def str_idx(corpus, dic, maxlen, UNK = 3):
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            X[i, -1 - no] = dic.get(k, UNK)
    return X

In [4]:
classification_textcleaning('kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya')

'raja benar sangat benci rakyat minyak naik gala'

In [5]:
df = pd.read_csv('sentiment-data-v2.csv')
Y = LabelEncoder().fit_transform(df.label)
with open('polarity-negative-translated.txt','r') as fopen:
    texts = fopen.read().split('\n')
labels = [0] * len(texts)

with open('polarity-positive-translated.txt','r') as fopen:
    positive_texts = fopen.read().split('\n')
labels += [1] * len(positive_texts)
texts += positive_texts
texts += df.iloc[:,1].tolist()
labels += Y.tolist()

assert len(labels) == len(texts)

In [6]:
import json
with open('bm-amazon.json') as fopen:
    amazon = json.load(fopen)
    
with open('bm-imdb.json') as fopen:
    imdb = json.load(fopen)
    
with open('bm-yelp.json') as fopen:
    yelp = json.load(fopen)
    
texts += amazon['negative']
labels += [0] * len(amazon['negative'])
texts += amazon['positive']
labels += [1] * len(amazon['positive'])

texts += imdb['negative']
labels += [0] * len(imdb['negative'])
texts += imdb['positive']
labels += [1] * len(imdb['positive'])

texts += yelp['negative']
labels += [0] * len(yelp['negative'])
texts += yelp['positive']
labels += [1] * len(yelp['positive'])

In [7]:
import os
for i in [i for i in os.listdir('negative') if 'Store' not in i]:
    with open('negative/'+i) as fopen:
        a = json.load(fopen)
        texts += a
        labels += [0] * len(a)

In [8]:
import os
for i in [i for i in os.listdir('positive') if 'Store' not in i]:
    with open('positive/'+i) as fopen:
        a = json.load(fopen)
        texts += a
        labels += [1] * len(a)

In [9]:
for i in range(len(texts)):
    texts[i] = classification_textcleaning(texts[i])

In [10]:
concat = ' '.join(texts).split()
vocabulary_size = len(list(set(concat)))
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)
print('vocab from size: %d'%(vocabulary_size))
print('Most common words', count[4:10])
print('Sample data', data[:10], [rev_dictionary[i] for i in data[:10]])

vocab from size: 120097
Most common words [('saya', 533028), ('yang', 204446), ('tidak', 164296), ('untuk', 129707), ('anda', 126091), ('hari', 88975)]
Sample data [2667, 229, 363, 235, 235, 94, 1357, 5, 78, 678] ['ringkas', 'bodoh', 'bosan', 'kanak', 'kanak', 'lelaki', 'remaja', 'yang', 'begitu', 'muda']


In [11]:
size_layer = 256
num_layers = 2
embedded_size = 256
dimension_output = len(np.unique(labels))
learning_rate = 5e-4
maxlen = 80
batch_size = 32

In [12]:
tf.reset_default_graph()
sess = tf.InteractiveSession()

bert_config = modeling.BertConfig(
    vocab_size = len(dictionary),
    hidden_size = size_layer,
    num_hidden_layers = num_layers,
    num_attention_heads = size_layer // 4,
    intermediate_size = size_layer * 2,
)

input_ids = tf.placeholder(tf.int32, [None, maxlen], name = 'Placeholder_input_ids')
input_mask = tf.placeholder(tf.int32, [None, maxlen], name = 'Placeholder_input_mask')
segment_ids = tf.placeholder(tf.int32, [None, maxlen], name = 'Placeholder_segment_ids')
label_ids = tf.placeholder(tf.int32, [None], name = 'Placeholder_label_ids')
is_training = tf.placeholder(tf.bool, name = 'Placeholder_is_training')

In [13]:
def create_model(
    bert_config,
    is_training,
    input_ids,
    input_mask,
    segment_ids,
    labels,
    num_labels,
    use_one_hot_embeddings,
    reuse_flag = False,
):
    model = modeling.BertModel(
        config = bert_config,
        is_training = is_training,
        input_ids = input_ids,
        input_mask = input_mask,
        token_type_ids = segment_ids,
        use_one_hot_embeddings = use_one_hot_embeddings,
    )

    output_layer = model.get_pooled_output()
    hidden_size = output_layer.shape[-1].value
    with tf.variable_scope('weights', reuse = reuse_flag):
        output_weights = tf.get_variable(
            'output_weights',
            [num_labels, hidden_size],
            initializer = tf.truncated_normal_initializer(stddev = 0.02),
        )
        output_bias = tf.get_variable(
            'output_bias', [num_labels], initializer = tf.zeros_initializer()
        )

    with tf.variable_scope('loss'):
        def apply_dropout_last_layer(output_layer):
            output_layer = tf.nn.dropout(output_layer, keep_prob = 0.9)
            return output_layer

        def not_apply_dropout(output_layer):
            return output_layer

        output_layer = tf.cond(
            is_training,
            lambda: apply_dropout_last_layer(output_layer),
            lambda: not_apply_dropout(output_layer),
        )
        logits = tf.matmul(output_layer, output_weights, transpose_b = True)
        print(
            'output_layer:',
            output_layer.shape,
            ', output_weights:',
            output_weights.shape,
            ', logits:',
            logits.shape,
        )

        logits = tf.nn.bias_add(logits, output_bias, name = 'logits')
        probabilities = tf.nn.softmax(logits)
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels = labels, logits = logits
        )
        loss = tf.reduce_mean(loss)
        correct_pred = tf.equal(tf.argmax(logits, 1, output_type = tf.int32), labels)
        accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

        return loss, logits, probabilities, model, accuracy

In [14]:
use_one_hot_embeddings = False
loss, logits, probabilities, model, accuracy = create_model(
    bert_config,
    is_training,
    input_ids,
    input_mask,
    segment_ids,
    label_ids,
    dimension_output,
    use_one_hot_embeddings,
)
global_step = tf.Variable(0, trainable = False, name = 'Global_Step')
optimizer = tf.contrib.layers.optimize_loss(
    loss,
    global_step = global_step,
    learning_rate = learning_rate,
    optimizer = 'Adam'
)

output_layer: (?, 256) , output_weights: (2, 256) , logits: (?, 2)


In [15]:
sess.run(tf.global_variables_initializer())

In [16]:
strings = ','.join(
    [
        n.name
        for n in tf.get_default_graph().as_graph_def().node
        if ('Variable' in n.op
        or 'Placeholder' in n.name
        or 'logits' in n.name
        or 'alphas' in n.name)
        and 'Adam' not in n.name
        and 'beta' not in n.name
        and 'OptimizeLoss' not in n.name
        and 'Global_Step' not in n.name
    ]
)

In [17]:
strings.split(',')

['Placeholder_input_ids',
 'Placeholder_input_mask',
 'Placeholder_segment_ids',
 'Placeholder_label_ids',
 'Placeholder_is_training',
 'bert/embeddings/word_embeddings',
 'bert/embeddings/token_type_embeddings',
 'bert/embeddings/position_embeddings',
 'bert/embeddings/LayerNorm/gamma',
 'bert/encoder/layer_0/attention/self/query/kernel',
 'bert/encoder/layer_0/attention/self/query/bias',
 'bert/encoder/layer_0/attention/self/key/kernel',
 'bert/encoder/layer_0/attention/self/key/bias',
 'bert/encoder/layer_0/attention/self/value/kernel',
 'bert/encoder/layer_0/attention/self/value/bias',
 'bert/encoder/layer_0/attention/output/dense/kernel',
 'bert/encoder/layer_0/attention/output/dense/bias',
 'bert/encoder/layer_0/attention/output/LayerNorm/gamma',
 'bert/encoder/layer_0/intermediate/dense/kernel',
 'bert/encoder/layer_0/intermediate/dense/bias',
 'bert/encoder/layer_0/output/dense/kernel',
 'bert/encoder/layer_0/output/dense/bias',
 'bert/encoder/layer_0/output/LayerNorm/gamma',
 

In [18]:
saver = tf.train.Saver(tf.trainable_variables())
saver.save(sess, 'bert/model.ckpt')

'bert/model.ckpt'

In [19]:
vectors = str_idx(texts, dictionary, maxlen)
train_X, test_X, train_Y, test_Y = train_test_split(
    texts, labels, test_size = 0.2
)

In [20]:
from tqdm import tqdm
import time

EARLY_STOPPING, CURRENT_CHECKPOINT, CURRENT_ACC, EPOCH = 3, 0, 0, 0

while True:
    lasttime = time.time()
    if CURRENT_CHECKPOINT == EARLY_STOPPING:
        print('break epoch:%d\n' % (EPOCH))
        break

    train_acc, train_loss, test_acc, test_loss = 0, 0, 0, 0
    pbar = tqdm(
        range(0, len(train_X), batch_size), desc = 'train minibatch loop'
    )
    for i in pbar:
        batch_x = str_idx(train_X[i : min(i + batch_size, len(train_X))], dictionary, maxlen)
        batch_y = train_Y[i : min(i + batch_size, len(train_X))]
        np_mask = np.ones((len(batch_x), maxlen), dtype = np.int32)
        np_segment = np.ones((len(batch_x), maxlen), dtype = np.int32)
        acc, cost, _ = sess.run(
            [accuracy, loss, optimizer],
            feed_dict = {
                input_ids: batch_x,
                label_ids: batch_y,
                input_mask: np_mask,
                segment_ids: np_segment,
                is_training: True
            },
        )
        assert not np.isnan(cost)
        train_loss += cost
        train_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)
        
    pbar = tqdm(range(0, len(test_X), batch_size), desc = 'test minibatch loop')
    for i in pbar:
        batch_x = str_idx(test_X[i : min(i + batch_size, len(test_X))], dictionary, maxlen)
        batch_y = test_Y[i : min(i + batch_size, len(test_X))]
        np_mask = np.ones((len(batch_x), maxlen), dtype = np.int32)
        np_segment = np.ones((len(batch_x), maxlen), dtype = np.int32)
        acc, cost = sess.run(
            [accuracy, loss],
            feed_dict = {
                input_ids: batch_x,
                label_ids: batch_y,
                input_mask: np_mask,
                segment_ids: np_segment,
                is_training: False
            },
        )
        test_loss += cost
        test_acc += acc
        pbar.set_postfix(cost = cost, accuracy = acc)

    train_loss /= len(train_X) / batch_size
    train_acc /= len(train_X) / batch_size
    test_loss /= len(test_X) / batch_size
    test_acc /= len(test_X) / batch_size

    if test_acc > CURRENT_ACC:
        print(
            'epoch: %d, pass acc: %f, current acc: %f'
            % (EPOCH, CURRENT_ACC, test_acc)
        )
        CURRENT_ACC = test_acc
        CURRENT_CHECKPOINT = 0
    else:
        CURRENT_CHECKPOINT += 1

    print('time taken:', time.time() - lasttime)
    print(
        'epoch: %d, training loss: %f, training acc: %f, valid loss: %f, valid acc: %f\n'
        % (EPOCH, train_loss, train_acc, test_loss, test_acc)
    )
    EPOCH += 1

train minibatch loop: 100%|██████████| 16876/16876 [1:36:14<00:00,  3.26it/s, accuracy=0.833, cost=0.398]
test minibatch loop: 100%|██████████| 4219/4219 [08:02<00:00,  9.04it/s, accuracy=0.862, cost=0.35] 
train minibatch loop:   0%|          | 0/16876 [00:00<?, ?it/s]

epoch: 0, pass acc: 0.000000, current acc: 0.784242
time taken: 6256.754328489304
epoch: 0, training loss: 0.478161, training acc: 0.770285, valid loss: 0.454835, valid acc: 0.784242



train minibatch loop: 100%|██████████| 16876/16876 [1:35:42<00:00,  3.32it/s, accuracy=0.722, cost=0.427]
test minibatch loop: 100%|██████████| 4219/4219 [07:54<00:00,  9.21it/s, accuracy=0.828, cost=0.421]
train minibatch loop:   0%|          | 0/16876 [00:00<?, ?it/s]

time taken: 6216.978980302811
epoch: 1, training loss: 0.424994, training acc: 0.804825, valid loss: 0.466114, valid acc: 0.778064



train minibatch loop: 100%|██████████| 16876/16876 [1:35:28<00:00,  3.34it/s, accuracy=0.944, cost=0.31] 
test minibatch loop: 100%|██████████| 4219/4219 [07:56<00:00,  8.86it/s, accuracy=0.828, cost=0.429]
train minibatch loop:   0%|          | 0/16876 [00:00<?, ?it/s]

time taken: 6204.905177354813
epoch: 2, training loss: 0.396685, training acc: 0.821129, valid loss: 0.481994, valid acc: 0.774056



train minibatch loop: 100%|██████████| 16876/16876 [1:35:19<00:00,  3.37it/s, accuracy=0.778, cost=0.328] 
test minibatch loop: 100%|██████████| 4219/4219 [07:59<00:00,  9.08it/s, accuracy=0.828, cost=0.363]

time taken: 6198.127304553986
epoch: 3, training loss: 0.378270, training acc: 0.830598, valid loss: 0.501795, valid acc: 0.770249

break epoch:4






In [21]:
real_Y, predict_Y = [], []

pbar = tqdm(
    range(0, len(test_X), batch_size), desc = 'validation minibatch loop'
)
for i in pbar:
    batch_x = str_idx(test_X[i : min(i + batch_size, len(test_X))], dictionary, maxlen)
    batch_y = test_Y[i : min(i + batch_size, len(test_X))]
    np_mask = np.ones((len(batch_x), maxlen), dtype = np.int32)
    np_segment = np.ones((len(batch_x), maxlen), dtype = np.int32)
    predict_Y += np.argmax(
        sess.run(
            logits,
            feed_dict = {
                input_ids: batch_x,
                label_ids: batch_y,
                input_mask: np_mask,
                segment_ids: np_segment,
                is_training: False,
            },
        ),
        1,
    ).tolist()
    real_Y += batch_y

validation minibatch loop: 100%|██████████| 4219/4219 [07:55<00:00,  9.23it/s]


In [22]:
from sklearn import metrics
print(metrics.classification_report(real_Y, predict_Y, target_names = ['negative','positive']))

              precision    recall  f1-score   support

    negative       0.75      0.83      0.79     70558
    positive       0.79      0.70      0.75     64447

   micro avg       0.77      0.77      0.77    135005
   macro avg       0.77      0.77      0.77    135005
weighted avg       0.77      0.77      0.77    135005



In [23]:
np_mask = np.ones((1, maxlen), dtype = np.int32)
np_segment = np.ones((1, maxlen), dtype = np.int32)
text = classification_textcleaning(
    'kerajaan sebenarnya sangat bencikan rakyatnya, minyak naik dan segalanya'
)
new_vector = str_idx([text[0]], dictionary, maxlen)
sess.run(
    tf.nn.softmax(logits),
    feed_dict = {
        input_ids: new_vector,
        input_mask: np_mask,
        segment_ids: np_segment,
        is_training: False,
    },
)


array([[0.9040442 , 0.09595579]], dtype=float32)

In [24]:
np_mask = np.ones((1, maxlen), dtype = np.int32)
np_segment = np.ones((1, maxlen), dtype = np.int32)
text = classification_textcleaning(
    'kerajaan sebenarnya sangat sayangkan rakyatnya, tetapi sebenarnya benci'
)
new_vector = str_idx([text[0]], dictionary, maxlen)
sess.run(
    tf.nn.softmax(logits),
    feed_dict = {
        input_ids: new_vector,
        input_mask: np_mask,
        segment_ids: np_segment,
        is_training: False,
    },
)


array([[0.9040442 , 0.09595579]], dtype=float32)

In [25]:
saver.save(sess, 'bert/model.ckpt')

'bert/model.ckpt'

In [26]:
import json
with open('bert-sentiment.json','w') as fopen:
    fopen.write(json.dumps({'dictionary':dictionary,'reverse_dictionary':rev_dictionary}))

In [27]:
def freeze_graph(model_dir, output_node_names):

    if not tf.gfile.Exists(model_dir):
        raise AssertionError(
            "Export directory doesn't exists. Please specify an export "
            'directory: %s' % model_dir
        )

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path

    absolute_model_dir = '/'.join(input_checkpoint.split('/')[:-1])
    output_graph = absolute_model_dir + '/frozen_model.pb'
    clear_devices = True
    with tf.Session(graph = tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(
            input_checkpoint + '.meta', clear_devices = clear_devices
        )
        saver.restore(sess, input_checkpoint)
        output_graph_def = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names.split(','),
        )
        with tf.gfile.GFile(output_graph, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print('%d ops in the final graph.' % len(output_graph_def.node))

In [28]:
freeze_graph('bert', strings)

INFO:tensorflow:Restoring parameters from bert/model.ckpt
INFO:tensorflow:Froze 41 variables.
INFO:tensorflow:Converted 41 variables to const ops.
375 ops in the final graph.


In [29]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [30]:
g = load_graph('bert/frozen_model.pb')

In [31]:
[n.name for n in g.as_graph_def().node]

['import/Placeholder_input_ids',
 'import/Placeholder_input_mask',
 'import/Placeholder_segment_ids',
 'import/Placeholder_label_ids',
 'import/Placeholder_is_training',
 'import/bert/embeddings/ExpandDims/dim',
 'import/bert/embeddings/ExpandDims',
 'import/bert/embeddings/word_embeddings',
 'import/bert/embeddings/word_embeddings/read',
 'import/bert/embeddings/embedding_lookup/axis',
 'import/bert/embeddings/embedding_lookup',
 'import/bert/embeddings/embedding_lookup/Identity',
 'import/bert/embeddings/Shape',
 'import/bert/embeddings/strided_slice/stack',
 'import/bert/embeddings/strided_slice/stack_1',
 'import/bert/embeddings/strided_slice/stack_2',
 'import/bert/embeddings/strided_slice',
 'import/bert/embeddings/Reshape/shape/1',
 'import/bert/embeddings/Reshape/shape/2',
 'import/bert/embeddings/Reshape/shape',
 'import/bert/embeddings/Reshape',
 'import/bert/embeddings/Shape_1',
 'import/bert/embeddings/strided_slice_1/stack',
 'import/bert/embeddings/strided_slice_1/stack_1

In [32]:
placeholder_input_ids = g.get_tensor_by_name('import/Placeholder_input_ids:0')
placeholder_input_mask = g.get_tensor_by_name('import/Placeholder_input_mask:0')
placeholder_segment_ids = g.get_tensor_by_name('import/Placeholder_segment_ids:0')
placeholder_is_training = g.get_tensor_by_name('import/Placeholder_is_training:0')
loss_logits = g.get_tensor_by_name('import/loss/logits:0')
test_sess = tf.InteractiveSession(graph = g)
test_sess.run(
    tf.nn.softmax(loss_logits),
    feed_dict = {
        placeholder_input_ids: new_vector,
        placeholder_input_mask: np_mask,
        placeholder_segment_ids: np_segment,
        placeholder_is_training: False,
    },
)



array([[0.9040442 , 0.09595579]], dtype=float32)