In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import json
import re
import sentencepiece as spm
import tensorflow as tf
import numpy as np

In [3]:
from prepro_utils import preprocess_text, encode_ids, encode_pieces

sp_model = spm.SentencePieceProcessor()
sp_model.Load('sp10m.cased.bert.model')

with open('sp10m.cased.bert.vocab') as fopen:
    v = fopen.read().split('\n')[:-1]
v = [i.split('\t') for i in v]
v = {i[0]: i[1] for i in v}

class Tokenizer:
    def __init__(self, v):
        self.vocab = v
        pass
    
    def tokenize(self, string):
        return encode_pieces(sp_model, string, return_unicode=False, sample=False)
    
    def convert_tokens_to_ids(self, tokens):
        return [sp_model.PieceToId(piece) for piece in tokens]
    
    def convert_ids_to_tokens(self, ids):
        return [sp_model.IdToPiece(i) for i in ids]
    
tokenizer = Tokenizer(v)

In [4]:
sp_model = spm.SentencePieceProcessor()
sp_model.Load('sp10m.cased.v9.model')

def tokenize_fn(text):
    text = preprocess_text(text, lower= False)
    return encode_ids(sp_model, text)

SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

special_symbols = {
    "<unk>"  : 0,
    "<s>"    : 1,
    "</s>"   : 2,
    "<cls>"  : 3,
    "<sep>"  : 4,
    "<pad>"  : 5,
    "<mask>" : 6,
    "<eod>"  : 7,
    "<eop>"  : 8,
}

VOCAB_SIZE = 32000
UNK_ID = special_symbols["<unk>"]
CLS_ID = special_symbols["<cls>"]
SEP_ID = special_symbols["<sep>"]
MASK_ID = special_symbols["<mask>"]
EOD_ID = special_symbols["<eod>"]

In [5]:
from malaya.text.rules import normalized_chars
from unidecode import unidecode
import random

laughing = {
    'huhu',
    'haha',
    'gagaga',
    'hihi',
    'wkawka',
    'wkwk',
    'kiki',
    'keke',
    'huehue',
    'hshs',
    'hoho',
    'hewhew',
    'uwu',
    'sksk',
    'ksks',
    'gituu',
    'gitu',
    'mmeeooww',
    'meow',
    'alhamdulillah',
    'muah',
    'mmuahh',
    'hehe',
    'salamramadhan',
    'happywomensday',
    'jahagaha',
    'ahakss',
    'ahksk'
}

def make_cleaning(s, c_dict):
    s = s.translate(c_dict)
    return s

def cleaning(string):
    """
    use by any transformer model before tokenization
    """
    string = unidecode(string)
    
    string = ' '.join(
        [make_cleaning(w, normalized_chars) for w in string.split()]
    )
    string = re.sub('\(dot\)', '.', string)
    string = (
        re.sub(re.findall(r'\<a(.*?)\>', string)[0], '', string)
        if (len(re.findall(r'\<a (.*?)\>', string)) > 0)
        and ('href' in re.findall(r'\<a (.*?)\>', string)[0])
        else string
    )
    string = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', string
    )
    
    chars = '.,/'
    for c in chars:
        string = string.replace(c, f' {c} ')
        
    string = re.sub(r'[ ]+', ' ', string).strip().split()
    string = [w for w in string if w[0] != '@']
    x = []
    for word in string:
        if any([laugh in word for laugh in laughing]):
            if random.random() >= 0.5:
                x.append(word)
        else:
            x.append(word)
    string = [w.title() if w[0].isupper() else w for w in x]
    return ' '.join(string)

In [6]:
cleaning('hallo')

'hallo'

In [7]:
def load_graph(frozen_graph_filename):
    with tf.gfile.GFile(frozen_graph_filename, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
    return graph

In [8]:
g = load_graph('bert-base-sentiment/frozen_model.pb')
x = g.get_tensor_by_name('import/Placeholder:0')
mask = g.get_tensor_by_name('import/Placeholder_1:0')
logits = tf.nn.softmax(g.get_tensor_by_name('import/logits:0'))
test_sess = tf.InteractiveSession(graph = g)

In [9]:
g_xlnet = load_graph('xlnet-base-sentiment/frozen_model.pb')
x_xlnet = g_xlnet.get_tensor_by_name('import/Placeholder:0')
seg_xlnet = g_xlnet.get_tensor_by_name('import/Placeholder_1:0')
m_xlnet = g_xlnet.get_tensor_by_name('import/Placeholder_2:0')
logits_xlnet = tf.nn.softmax(g_xlnet.get_tensor_by_name('import/logits:0'))
test_sess_xlnet = tf.InteractiveSession(graph = g_xlnet)

In [10]:
def str_to_ids(text):
    tokens_a = tokenizer.tokenize(text)
    tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
    input_id = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_id)
    return input_id, input_mask

input_id, input_mask = str_to_ids('hello')

In [11]:
def XY(left_train):
    X, segments, masks = [], [], []
    for i in range(len(left_train)):
        tokens_a = tokenize_fn(left_train[i])
        segment_id = [SEG_ID_A] * len(tokens_a)
        tokens_a.append(SEP_ID)
        tokens_a.append(CLS_ID)
        segment_id.append(SEG_ID_A)
        segment_id.append(SEG_ID_CLS)
        input_mask = [0] * len(tokens_a)
        X.append(tokens_a)
        segments.append(segment_id)
        masks.append(input_mask)
    return X, segments, masks



In [12]:
labels = ['Negative', 'Neutral', 'Positive']

In [13]:
result = test_sess.run(logits, feed_dict = {x: [input_id], mask: [input_mask]})
result

array([[0.18384206, 0.81245416, 0.0037037 ]], dtype=float32)

In [14]:
input_id, input_segment, input_mask = XY(['hello'])

In [15]:
result = test_sess_xlnet.run(logits_xlnet, feed_dict = {x_xlnet: input_id, m_xlnet: input_mask,
                                                       seg_xlnet: input_segment})
result

array([[2.1995971e-02, 9.7793204e-01, 7.2078059e-05]], dtype=float32)

In [28]:
from glob import glob

twitter = []
for file in glob('*twitter-filtered.txt'):
    with open(file) as fopen:
        twitter.extend(fopen.read().split('\n'))

In [29]:
len(twitter)

400014

In [18]:
with open('politics.txt') as fopen:
    politics = fopen.read().split('\n')
    
len(politics)

30707

In [23]:
from tqdm import tqdm
import pandas as pd

batch_size = 10
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

In [30]:

results = []
for i in tqdm(range(0, len(twitter), batch_size)):
    train_ids, input_masks, accepted, cleaned = [], [], [], []
    for t in twitter[i: i + batch_size]:
        t_ = cleaning(t)
        if len(t_):
            input_id, input_mask = str_to_ids(t_)
            train_ids.append(input_id)
            input_masks.append(input_mask)
            accepted.append(t)
            cleaned.append(t_)
    train_ids = pad_sequences(train_ids, padding='post')
    input_masks = pad_sequences(input_masks, padding='post')
    result = test_sess.run(logits, feed_dict = {x: train_ids, mask: input_masks})
    
    input_id, input_segment, input_mask = XY(cleaned)
    input_id = pad_sequences(input_id, padding='post')
    input_mask = pad_sequences(input_mask, padding='post', value = 1)
    input_segment = pad_sequences(input_segment, padding='post', value = 4)
    result_xlnet = test_sess_xlnet.run(logits_xlnet, feed_dict = {x_xlnet: input_id, m_xlnet: input_mask,
                                                       seg_xlnet: input_segment})
    result = (result + result_xlnet) / 2
    for no, row in enumerate(result):
        if len(row[row > 0.85]):
            a = np.argmax(row)
            results.append({'text': accepted[no], 'label': labels[a], 'prob': row[a]})

100%|██████████| 40002/40002 [38:01<00:00, 17.54it/s]


In [31]:
len(results)

185787

In [33]:
results[-10:]

[{'text': '@JakeSimRawr kapit jake shhsgsddhasdgahga',
  'label': 'Neutral',
  'prob': 0.999179},
 {'text': 'Pelanjutan Perintah Kawalan Pergerakan Diperketatkan (PKPD) Kawasan Terlibat: -Taman Pelangi, Semporna -Kampung T https://t.co/AmLVEYGFIA',
  'label': 'Neutral',
  'prob': 0.99893045},
 {'text': 'PERHATIAN: STATUS ARAS AIR SEMASA di SARAWAK , Miri, 18/07/2021 01:15, Marudi, Aras air sungai adalah 3.39m iait https://t.co/EPsvW4rZAw',
  'label': 'Neutral',
  'prob': 0.9993696},
 {'text': 'End cap flasing. Menghalang air hujan drip masuk kedalam bumbung. Kesannnya kalau endcap tak ada, kayu manis jadi r https://t.co/u32eJAH7UW',
  'label': 'Negative',
  'prob': 0.99535036},
 {'text': 'So lepas ni nak cari pakwe kena cari yang ada rupa. Takde rupa ni pon sama keparat je ada betina lain belakang kita https://t.co/SXvyFb7Ooq',
  'label': 'Negative',
  'prob': 0.9984772},
 {'text': 'Tahukah anda? Varian baharu #DELTA merupakan Variant Of Concern (VOC) paling dominan di seluruh dunia da

In [35]:
df = pd.DataFrame(results)

In [36]:
len(df)

185787

In [37]:
df.to_csv('semisupervised-bert-xlnet.csv', index = False)

In [19]:

results = []
for i in tqdm(range(0, len(politics), batch_size)):
    train_ids, input_masks, accepted, cleaned = [], [], [], []
    for t in politics[i: i + batch_size]:
        t_ = cleaning(t)
        if len(t_):
            input_id, input_mask = str_to_ids(t_)
            train_ids.append(input_id)
            input_masks.append(input_mask)
            accepted.append(t)
            cleaned.append(t_)
    train_ids = pad_sequences(train_ids, padding='post')
    input_masks = pad_sequences(input_masks, padding='post')
    result = test_sess.run(logits, feed_dict = {x: train_ids, mask: input_masks})
    
    input_id, input_segment, input_mask = XY(cleaned)
    input_id = pad_sequences(input_id, padding='post')
    input_mask = pad_sequences(input_mask, padding='post', value = 1)
    input_segment = pad_sequences(input_segment, padding='post', value = 4)
    result_xlnet = test_sess_xlnet.run(logits_xlnet, feed_dict = {x_xlnet: input_id, m_xlnet: input_mask,
                                                       seg_xlnet: input_segment})
    result = (result + result_xlnet) / 2
    for no, row in enumerate(result):
        if len(row[row > 0.85]):
            a = np.argmax(row)
            results.append({'text': accepted[no], 'label': labels[a], 'prob': row[a]})

100%|██████████| 3071/3071 [03:11<00:00, 16.02it/s]


In [20]:
len(results)

23029

In [21]:
results[:10]

[{'text': 'Menggelabah masing-masing nak beraya kan. Lepastu heboh kerajaan gagal. Idiot',
  'label': 'Negative',
  'prob': 0.9993708},
 {'text': 'Bendera putih tu bukannya nak harap bantuan kerajaan pon pada asalnya, hanya untuk rakyat jaga sesama rakyat. Ahli https://t.co/P8lqGaRpMj',
  'label': 'Negative',
  'prob': 0.9994317},
 {'text': 'Nape nak kena tampal gambar Dia pulak...Guna duit sendiri ke atau Guna duit kerajaan ...',
  'label': 'Negative',
  'prob': 0.9994585},
 {'text': 'Sudah sudah la kak mas woi. Kamu tahu tak semua ahli umno bahagian kuale tak sokong kamu masa pru14. Kami bukan banggang mcm kamu. https://t.co/uTrGsm62gh',
  'label': 'Negative',
  'prob': 0.9993581},
 {'text': 'Kerajaan kita bukan serba boleh, tapi serba bodoh bahalol.',
  'label': 'Negative',
  'prob': 0.99945045},
 {'text': '@magmalaya Maybe azmin rase selangor n kl dh keluar malaysia.kalo gini,do we get all those ahli politik..tak kan..',
  'label': 'Negative',
  'prob': 0.9994402},
 {'text': 'Penat

In [24]:
df = pd.DataFrame(results)

In [25]:
df.to_csv('semisupervised-politics-bert-xlnet.csv', index = False)