In [8]:
import pandas as pd
import nltk
import re
import ast
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemover, ArrayDictionary
import csv

In [9]:
# Start on the default device, then check `nvidia-smi` to find the GPU that is least used.
import torch
print(torch.cuda.current_device())

0


In [10]:
# Make GPU index 7 the current CUDA device and print it to confirm the switch.
# NOTE(review): the index is hard-coded — presumably chosen from the `nvidia-smi`
# listing on this machine; adjust it when running elsewhere.
torch.cuda.set_device(7)
print(torch.cuda.current_device())

7


In [64]:
!nvidia-smi

Thu Apr 28 00:34:50 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.126.02   Driver Version: 418.126.02   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   39C    P0    64W / 300W |  20135MiB / 32480MiB |      4%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   42C    P0    72W / 300W |   8878MiB / 32480MiB |     29%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla V100-SXM2...  On   | 00000000:0A:00.0 Off |                    0 |
| N/A   

In [12]:
import os, sys
# Add the parent directory to the import path and make it the working directory.
# NOTE(review): both paths are relative, so re-running this cell moves one more
# directory up each time; run it exactly once per kernel.
sys.path.append('../')
os.chdir('../')

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
from tqdm import tqdm

from transformers import BertConfig, BertTokenizer
from nltk.tokenize import word_tokenize

from modules.word_classification import BertForWordClassification
from utils.forward_fn import forward_word_classification
from utils.metrics import pos_tag_metrics_fn
from utils.data_utils import PosTagIdnDataset, PosTagDataLoader

2022-04-27 17:56:35.873320: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


### Pre-processing

In [13]:
def load_kamus(path='/workspace/BISINDO/Translation/kamus.csv'):
    """Load the Indonesian -> BISINDO substitution dictionary from a CSV file.

    Args:
        path: Location of a ``;``-delimited CSV that has the columns
            ``'Kata Bahasa Indonesia'`` and ``'Kata Bisindo'``. Defaults to the
            path this notebook has always read from.

    Returns:
        dict mapping each value of ``'Kata Bahasa Indonesia'`` to the value of
        ``'Kata Bisindo'`` on the same row. On duplicate keys the last row wins.
    """
    kamus = pd.read_csv(path, delimiter=';')
    return dict(zip(kamus['Kata Bahasa Indonesia'], kamus['Kata Bisindo']))

In [14]:
print(load_kamus())

{'berjalan': 'jalan-kaki', 'milik': 'punya', 'anda': 'kamu', 'ceramah': 'nasehat', 'andaikan': 'kalau', 'diperkenankan': 'boleh'}


In [15]:
# Normalise a sentence:
#   * strip punctuation (question marks are kept),
#   * drop extra whitespace,
#   * replace words with their dictionary equivalents.
def normalisasi(kal, kamus=None):
    """Return ``kal`` with punctuation removed and dictionary words substituted.

    Args:
        kal: Raw input sentence.
        kamus: Optional ``{word: replacement}`` mapping keyed by lower-cased
            words. When omitted it is loaded once via ``load_kamus()``.

    Returns:
        The tokens joined by single spaces. A ``?`` becomes its own token.
        Words found in ``kamus`` (case-insensitively) are replaced by their
        mapped value; all other words keep their original casing.
    """
    # Every listed punctuation character becomes a space; '?' is deliberately
    # absent from the class. Note that ` -.` inside the class is a range
    # (space through '.'), which only covers characters that are listed anyway.
    clean_str = re.sub(r'[!"#$%&\'()*+, -./:;<=>@\[\]\\^_`{|}~]', " ", kal)
    # Split into words, keeping each question mark as a separate token.
    list_kata = re.findall(r"[\w']+|[?]", clean_str)
    if not list_kata:
        return ''

    # Load the dictionary once per call instead of once per word.
    if kamus is None:
        kamus = load_kamus()

    return ' '.join(kamus.get(kata.lower(), kata) for kata in list_kata)

In [16]:
normalisasi('apakah lawan kata "berat", "ringan"?')

'apakah lawan kata berat ringan ?'

In [139]:
# prepare testing data

def change(source_file):
    """Write a pre-processed copy of ``source_file`` named ``[test]-<source_file>``.

    Each input line goes through ``normalisasi``, whitespace splitting,
    ``filter_stop_words``, ``filter_preposisi`` and ``stem``. The resulting
    tokens are written as one space-joined line to the output file.

    Args:
        source_file: File name, relative to the hard-coded Huda directory.
    """
    root_dir = '/workspace/BISINDO/Translation/Huda/'
    source_path = f'{root_dir}/{source_file}'
    target_path = f'{root_dir}/[test]-{source_file}'

    # `with` closes both handles even if a processing step raises.
    with open(source_path, 'r') as f, open(target_path, 'w') as output_txt:
        for kal in f:
            word_seq = normalisasi(kal).split()
            word_seq = filter_stop_words(word_seq)
            word_seq = filter_preposisi(word_seq)
            word_seq = stem(word_seq)
            output_txt.write(' '.join(word_seq) + '\n')

In [150]:
# Convert the Indonesian sentences into an intermediate form (without affixes and prepositions).

files_to_be_changed = [
    '[bindo2]-bindo368.txt'
]

for el in files_to_be_changed:
    print(f'Processing {el}...')
    change(el)
    print('=======')

Processing [bindo2]-bindo368.txt...


### POS TAGGING PER TOKEN KATA

In [17]:
# The POS-tagging model is a previously trained model loaded from disk.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a trusted source.
model = torch.load('/workspace/BISINDO/POS-MODEL/model_v1/model.pth')



In [18]:
# Load Tokenizer and Config
# `tokenizer` is the global used by prediksi_pos_tag.
# NOTE(review): `config` is not passed to anything in the cells visible here
# (the model comes from torch.load) — confirm whether it is still needed.
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-large-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-large-p1')
config.num_labels = PosTagIdnDataset.NUM_LABELS

In [19]:
# Label <-> index lookup tables from the dataset class; prediksi_pos_tag uses
# `i2w` to turn predicted indices back into label strings.
w2i, i2w = PosTagIdnDataset.LABEL2INDEX, PosTagIdnDataset.INDEX2LABEL

In [20]:
def word_subword_tokenize(sentence, tokenizer):
    """Encode a list of words into subword ids plus a subword -> word index map.

    Args:
        sentence: Iterable of word strings.
        tokenizer: Object exposing ``cls_token_id``, ``sep_token_id`` and
            ``encode(word, add_special_tokens=False)``.

    Returns:
        ``(subwords, subword_to_word_indices)``: two lists of equal length.
        The first holds the CLS id, every word's subword ids in order, then the
        SEP id. The second holds, for each position, the index of the word it
        came from, with ``-1`` for the CLS and SEP positions.
    """
    token_ids = [tokenizer.cls_token_id]
    word_positions = [-1]  # CLS belongs to no word

    for position, token in enumerate(sentence):
        pieces = tokenizer.encode(token, add_special_tokens=False)
        word_positions.extend([position] * len(pieces))
        token_ids.extend(pieces)

    token_ids.append(tokenizer.sep_token_id)
    word_positions.append(-1)  # SEP belongs to no word

    return token_ids, word_positions

In [21]:
def prediksi_pos_tag(kalimat):
    """Predict one POS tag per token of ``kalimat``.

    Uses the notebook-level ``model``, ``tokenizer`` and ``i2w`` globals.

    Args:
        kalimat: Sentence string; it is tokenised with ``word_tokenize``.

    Returns:
        List of tag strings: for each predicted label, the part after the
        first ``-`` (the code takes ``label.split('-')[1]``).
    """
    text = word_tokenize(kalimat)
    subwords, subword_to_word_indices = word_subword_tokenize(text, tokenizer)

    subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
    subword_to_word_indices = torch.LongTensor(subword_to_word_indices).view(1, -1).to(model.device)

    # Inference only: skip autograd bookkeeping.
    # NOTE(review): the model's train/eval mode is whatever the checkpoint was
    # saved with — confirm it is in eval mode so dropout is disabled.
    with torch.no_grad():
        logits = model(subwords, subword_to_word_indices)[0]

    # Flatten explicitly. A bare .squeeze() also removes the sequence axis when
    # the sentence has a single token, leaving a 0-d array that has no len().
    preds = torch.topk(logits, k=1, dim=-1)[1].reshape(-1).cpu().numpy()
    return [i2w[label_idx].split('-')[1] for label_idx in preds]
    
#     #menambahkan penghubung + antar pos tag
#     string_lb = ''
#     for lb in labels:
#         string_lb += lb + "+"
    
#     #memberi indeks 2,3 dst untuk pos tag yang munculnya berulang
#     x2 = []
#     d = {}
#     for x in labels:
#         d[x] = d.get(x, 0) + 1
#         if d[x] > 1:
#             x = x + str(d[x])
#         x2.append(x)
#     str2 = '+'.join(x2)
#     string_lb = string_lb[:-1]
#     return str2

In [22]:
prediksi_pos_tag('memberi indeks 2 3 dst untuk pos tag yang munculnya berulang')

['VB', 'NN', 'CD', 'CD', 'NN', 'IN', 'NN', 'NN', 'SC', 'NN', 'VB']

In [88]:
prediksi_pos_tag('kilang minyak balongan terdapat di lepas pantai')

['NN', 'NN', 'NNP', 'VB', 'IN', 'JJ', 'JJ']

In [89]:
prediksi_pos_tag('setiap lewat tengah malam, suara itu pasti terdengar')
## no preposisi in test
# lewat

['CD', 'VB', 'NN', 'NN', 'Z', 'NN', 'PR', 'RB', 'VB']

In [90]:
prediksi_pos_tag('harga per kilogramnya rp 17.500')
## no preposisi in test
# per

['NN', 'IN', 'NND', 'SYM', 'CD']

In [91]:
prediksi_pos_tag('lemari dan meja serta kursi ini saya beli ketika harganya belum melambung') #serta

['NN', 'CC', 'NN', 'CC', 'NN', 'PR', 'PRP', 'VB', 'SC', 'NN', 'NEG', 'VB']

In [92]:
prediksi_pos_tag('lelaki itu berbicara tanpa basa - basi') #tanpa

['NN', 'PR', 'VB', 'IN', 'NN', 'NN', 'NN']

In [93]:
prediksi_pos_tag('ayah beserta ibu sedang pergi ke makassar') #beserta

['NN', 'VB', 'NN', 'MD', 'VB', 'IN', 'NNP']

In [94]:
prediksi_pos_tag('dia selalu pergi menjelang malam apabila naik kereta ke surabaya') #menjelang

['PRP', 'RB', 'VB', 'IN', 'NN', 'SC', 'VB', 'NN', 'IN', 'NNP']

In [95]:
prediksi_pos_tag('dua wanita itu pergi menuju ke jembatan penyebrangan') #menuju

['CD', 'NN', 'PR', 'VB', 'VB', 'IN', 'NN', 'NN']

In [96]:
prediksi_pos_tag('sumi cantik bagaikan bidadari') #bagaikan

['NNP', 'JJ', 'IN', 'NN']

In [156]:
prediksi_pos_tag('tari pendet berasal bali')

['NN', 'NNP', 'VB', 'NNP']

### Utilities (Ronaldi)

In [26]:
def match(pos_seq, rule):
    """Return the first index at which ``rule`` matches a window of ``pos_seq``.

    Matching behaviour:
        * A rule containing ``'WH'`` can only match when the last element of
          ``pos_seq`` is ``'Z'``; otherwise (including an empty ``pos_seq``)
          the result is ``-1``.
        * The rule ``['S', 'CC', 'S']`` matches a window whose first and third
          elements are one of ``PRP``/``NN``/``NNP`` and whose middle is ``CC``.
        * Any other rule containing ``'S'`` matches when the window's first
          element is one of ``PRP``/``NN``/``NNP`` and the rest of the window
          equals ``rule[1:]``.
        * All remaining rules must equal the window exactly.

    Args:
        pos_seq: List to search (POS tags, or ``(tag, word)`` pairs).
        rule: List describing the pattern, same element type as ``pos_seq``.

    Returns:
        Start index of the first matching window, or ``-1`` if there is none.
    """
    subject_tags = ('PRP', 'NN', 'NNP')
    width = len(rule)

    # Guard the empty sequence: indexing pos_seq[-1] would raise IndexError.
    if 'WH' in rule and (len(pos_seq) == 0 or pos_seq[-1] != 'Z'):
        return -1

    for start in range(len(pos_seq) - width + 1):
        window = pos_seq[start:start + width]

        if rule == ['S', 'CC', 'S']:
            if (window[0] in subject_tags
                    and window[1] == 'CC'
                    and window[2] in subject_tags):
                return start
            continue

        if 'S' in rule:
            # Only the first slot is treated as the subject placeholder.
            if window[0] in subject_tags and window[1:] == rule[1:]:
                return start
            continue

        if window == rule:
            return start

    return -1

def remove_matched_seq(seq, matched_index, removed_size):
    """Pop ``removed_size`` items from ``seq`` at ``matched_index``, in place.

    Returns:
        ``(removed, remaining)``: the popped items in pop order, and the same
        (now shortened) ``seq`` object.
    """
    taken = []
    for _ in range(removed_size):
        taken.append(seq.pop(matched_index))
    return taken, seq

def insert_transformed_seq(seq, insertion, start_index, rule):
    """Splice ``insertion`` into ``seq`` in place and return ``seq``.

    An empty ``insertion`` leaves ``seq`` untouched. For a rule containing
    ``'WH'`` (other than ``['WH', 'SC']``) the items go just before the last
    element; otherwise they go at ``start_index``.
    """
    if not insertion:
        return seq

    before_last = 'WH' in rule and rule != ['WH', 'SC']
    position = -1 if before_last else start_index
    seq[position:position] = insertion
    return seq

def process_seq(seq, matched_index, rule, fn):
    """Replace the ``len(rule)`` items of ``seq`` at ``matched_index`` with ``fn(items)``.

    The matched items are removed with ``remove_matched_seq``, transformed by
    ``fn``, and put back with ``insert_transformed_seq``. ``seq`` is modified
    in place and also returned.
    """
    span = len(rule)
    taken, rest = remove_matched_seq(seq, matched_index, span)
    replacement = fn(taken)
    return insert_transformed_seq(rest, replacement, matched_index, rule)

### Utilities (Huda)

In [27]:
from nltk.util import ngrams

def ngram(kalimat, k):
    """Return the list of ``k``-grams of ``kalimat``.

    The sentence is split on single spaces and handed to ``nltk.util.ngrams``;
    the resulting iterator is materialised into a list.
    """
    tokens = kalimat.split(' ')
    return list(ngrams(tokens, n=k))

### Extract Word Tag

Modul ini digunakan untuk memisahkan POS tag, kata, dan pasangan (POS tag, kata).

In [28]:
def extract_word_tag(kalimat):
    """Return ``(pos_seq, word_seq)`` for ``kalimat``.

    ``pos_seq`` comes from ``prediksi_pos_tag`` and ``word_seq`` from
    ``kalimat.split()``. If the pair sequence contains ``('RB', 'tidak')``
    followed by ``('RB', 'boleh')``, those two tags are corrected to
    ``NEG`` and ``MD``; the words themselves are left unchanged.

    NOTE(review): tags are produced from ``word_tokenize`` tokens while words
    come from ``str.split``; the two can differ in length for input with
    attached punctuation — confirm inputs are normalised first.
    """
    pos_seq = prediksi_pos_tag(kalimat)
    word_seq = kalimat.split()
    pos_word_pair = list(zip(pos_seq, word_seq))

    # "tidak boleh" is mis-tagged as RB RB, so look for it and repair the tags.
    negation_pattern = [('RB', 'tidak'), ('RB', 'boleh')]
    matched_index = match(pos_word_pair, negation_pattern)
    if matched_index == -1:
        return pos_seq, word_seq

    tag_rule = ['RB', 'RB']
    pos_seq = process_seq(pos_seq, matched_index, tag_rule, lambda x: ['NEG', 'MD'])
    word_seq = process_seq(word_seq, matched_index, tag_rule, lambda x: x)
    return pos_seq, word_seq

In [29]:
extract_word_tag('kamu tidak boleh melakukan hal itu')

(['PRP', 'NEG', 'MD', 'VB', 'NN', 'PR'],
 ['kamu', 'tidak', 'boleh', 'melakukan', 'hal', 'itu'])

### stemming

In [30]:
def stem(word_seq, stemmer=None, kamus=None):
    """Stem every word of ``word_seq`` and apply dictionary substitutions.

    Args:
        word_seq: Sequence of word strings. ``'?'`` tokens pass through as-is.
        stemmer: Optional object with a ``stem(word)`` method. When omitted, a
            Sastrawi stemmer is created once for the whole call.
        kamus: Optional ``{stem: replacement}`` mapping. When omitted it is
            loaded once via ``load_kamus()``.

    Returns:
        New list with one output token per input token.
    """
    tokens = list(word_seq)

    # Nothing to stem: avoid building the stemmer or reading the dictionary.
    if all(token == '?' for token in tokens):
        return tokens

    # Build the expensive helpers once per call instead of once per word.
    if stemmer is None:
        stemmer = StemmerFactory().create_stemmer()
    if kamus is None:
        kamus = load_kamus()

    result = []
    for word in tokens:
        if word == '?':
            result.append(word)
            continue
        stemmed_word = stemmer.stem(word)
        result.append(kamus.get(stemmed_word, stemmed_word))
    return result

In [31]:
stem(['kamu', 'tidak', 'boleh', 'melakukan', 'hal', 'itu', '?'])

['kamu', 'tidak', 'boleh', 'laku', 'hal', 'itu', '?']

### filtering stop words (modifikasi)

In [33]:
# Stop-word list, excluding prepositions (per the original author's note).
# filter_stop_words removes tokens that appear in this list.

stopwords = [
             'kepada',
             'sedang',
             'amat', 'anu',
             'dong',
             'ialah', 'adalah',
             'kah', 'kan',
             'lah',
             'merupakan',
             'nah', 'para',
             'pihak', 'pun',
             'saja', 'sangat',
             'sangatlah', 'sebagai',
             'secara', 'sedangkan',
             'sehingga', 'selaku',
             'terdiri', 'terlalu',
             'wong', 'yaitu',
             'yakni', 'yang'
]

In [34]:
# Sastrawi stop-word remover built from the custom `stopwords` list.
# NOTE(review): neither `dictionary` nor `sw_remover` is referenced by the
# cells visible here — confirm whether this setup is still needed.
dictionary = ArrayDictionary(stopwords)
sw_remover = StopWordRemover(dictionary)

def filter_stop_words(word_seq, stop_words=None):
    """Remove stop words from ``word_seq`` in place and return it.

    Args:
        word_seq: List of word strings; it is modified in place.
        stop_words: Optional collection of words to drop. Defaults to the
            notebook-level ``stopwords`` list.

    Returns:
        The same list object, without any stop words.
    """
    if stop_words is None:
        stop_words = stopwords
    # Check every position, including index 0. The earlier backwards loop
    # stopped at index 1, so a leading stop word was never removed.
    word_seq[:] = [word for word in word_seq if word not in stop_words]
    return word_seq

In [35]:
filter_stop_words(
    ['kamu', 'tidak', 'boleh', 'melakukan', 'hal', 'itu', 'di', 'sedang'])

['kamu', 'tidak', 'boleh', 'melakukan', 'hal', 'itu', 'di']

### filtering preposisi (huda)

In [36]:
# Prepositions that filter_preposisi removes from a token list.
preposisi = [
             'dari', 'sejak',
             'di', 'dalam',
             'pada', 'bagi',
             'dengan', 'atas',
             'terhadap', 'tentang',
             'untuk', 'oleh',
             'ke', 'seperti',
             'antar',
             'buat', 'guna',
             'ibarat', 'sambil',
             'bahkan', 'akan',
             'antara', 'demi',
             'hingga', 'kecuali',
             'lepas', 'lewat',
             'per', 'sampai',
             'serta', 'tanpa',
]

In [148]:
# NOTE(review): this repeats the stop-word remover setup from the stop-word
# section (it is built from `stopwords`, not `preposisi`) and filter_preposisi
# below does not use it — confirm whether it can be dropped.
dictionary = ArrayDictionary(stopwords)
sw_remover = StopWordRemover(dictionary)

def filter_preposisi(word_seq):
    """Remove every token found in ``preposisi`` from ``word_seq`` in place.

    Returns:
        The same list object, without the removed tokens.
    """
    word_seq[:] = [kata for kata in word_seq if kata not in preposisi]
    return word_seq

In [149]:
filter_preposisi(['di', 'kamu', 'tidak', 'boleh', 'melakukan', 'hal', 'itu', 'di', 'sedang'])

['kamu', 'tidak', 'boleh', 'melakukan', 'hal', 'itu', 'sedang']


### Modul untuk menghitung BLEU SCORE

In [39]:
import datasets
bleu = datasets.load_metric('bleu')

In [40]:
def hitung_bleu_score(predictions, references, max_order=1):
    """Compute BLEU with the notebook-level ``bleu`` metric.

    Expected input format::

        predictions = [
            ["hello", "there", "general", "kenobi"],  # tokenized prediction, sample 1
            ["foo", "bar", "foobar"],                 # tokenized prediction, sample 2
        ]
        references = [
            [["hello", "there", "general", "kenobi"], ["hello", "there", "!"]],  # sample 1: 2 references
            [["foo", "bar", "foobar"]],                                          # sample 2: 1 reference
        ]

    Args:
        predictions: List of tokenized predictions, one per sample.
        references: For each sample, a list of tokenized references.
        max_order: Largest n-gram order to score. Defaults to 1, the value this
            notebook has always used. Per the original author's note, the
            metric's own default is 4, and a larger order tends to lower the
            score because longer n-grams have to match.

    Returns:
        Whatever ``bleu.compute`` returns (callers read its ``'bleu'`` entry).
    """
    return bleu.compute(predictions=predictions, references=references, max_order=max_order)

### Modul untuk menghitung Word Error Rate

In [41]:
from jiwer import wer

def hitung_wer(reference, prediksi):
    """Return the word error rate of ``prediksi`` against ``reference``.

    Thin wrapper that forwards both arguments, in this order, to ``jiwer.wer``.
    """
    return wer(reference, prediksi)

### pencarian rules dan transformasi

In [97]:
# Preposition-insertion rules consumed by `modify`.
# Each entry is (POS trigram, offset, word): when a trigram of predicted POS
# tags equals the first element, `modify` adds the offset to its running
# insertion cursor and inserts the word at that position.
# All rules are checked for every trigram, so a trigram that is listed twice
# (e.g. ['VB','NN','NN'] in #11 and #23) triggers both insertions.
rules = [
            (['NNP','VB','NNP'], 2, 'dari'),                 #1
            (['CC','NN','NN'], 2, 'sejak'),                  #2
            (['NN','NN','NN'], 1, 'sejak'),                  #3
            (['NN','VB','NN'], 2, 'di'),                     #4
#             (['NN','VB','NN'], 2, 'dalam'),                #5
            (['IN','NN','NN'], 2, 'pada'),                   #6
            (['PRP','NEG','JJ'], 0, 'bagi'),                 #7
#             (['MD','VB','NN'], 2, 'dengan'),               #8
            (['PR','VB','NN'], 2, 'atas'),                   #9
            (['NN','NN','RB'], 1, 'terhadap'),               #10
            (['VB','NN','NN'], 2, 'tentang'),                #11
            (['VB','NN','PRP'], 2, 'untuk'),                 #12
            (['NN','JJ','NN'], 2, 'oleh'),                   #13
            (['PRP','VB','NNP'], 2, 'ke'),                   #14
            (['PR','NN','NN'], 2, 'seperti'),                #15
            (['MD','VB','NN'], 2, 'akan'),                   #16
            (['NN','NNP','CC'], 1, 'antara'),                #17
            (['VB','JJ','NN'], 2, 'demi'),                   #18
            (['NN','JJ','OD'], 1, 'hingga'),                 #19
            (['VB','Z','NN'], 2, 'kecuali'),                 #20
            (['NN','CD','NN'], 2, 'sampai'),                 #21
            (['NN','NND','SYM'], 1, 'per'),                  #22
            (['VB','NN','NN'], 1, 'tanpa'),                  #23
            (['VB','NN','SC'], 1, 'menjelang'),              #24
            (['NNP','JJ','NN'], 2, 'bagaikan'),              #25
]

In [49]:
## manipulasi tuple
def modify(pos_seq, word_seq, rules, word_seq_join):
    """Insert rule words into a sentence wherever a POS trigram matches a rule.

    Args:
        pos_seq: Sequence of POS-tag trigrams (each convertible with ``list``).
        word_seq: Word trigrams. Accepted for interface compatibility; the
            result does not depend on it.
        rules: Sequence of ``(pattern, offset, word)`` entries. ``pattern`` is
            a list of three POS tags.
        word_seq_join: The sentence as a space-separated string.

    Returns:
        List of tokens of ``word_seq_join`` with the rule words inserted.
    """
    tokens = word_seq_join.split()
    cursor = 0
    for pos_trigram in pos_seq:
        tags = list(pos_trigram)
        for rule in rules:
            pattern, offset, kata = rule[0], rule[1], rule[2]
            if tags == pattern:
                # The offset is added to the running cursor and never taken
                # back, so later insertions are placed relative to the shifted
                # cursor. NOTE(review): confirm this accumulation is intended;
                # positions past the end simply append.
                cursor += offset
                tokens.insert(cursor, kata)
        cursor += 1
    return tokens
    

## Testing/Prediksi Kalimat Akhir

In [143]:
# Module-level accumulators that clean() appends to:
# tokenized predictions / references (for BLEU) and sentence strings (for WER).
# Re-run this cell before calling clean() again, otherwise results pile up.
list_all_pred = []
list_all_ref = []
list_pred_kal = []
list_ref_kal = []

In [146]:
# prepare testing data
def clean(source_file, max_lines=367):
    """Re-insert prepositions into pre-processed sentences and score the result.

    Reads ``source_file`` and the reference file ``[bindo2]-bindo368.txt``
    line by line from the hard-coded Huda directory. For each pair it builds a
    prediction via ``extract_word_tag`` -> ``ngram`` -> ``modify`` and prints
    the prediction, the reference, the POS sequence and the per-sentence WER.
    After the loop it prints the overall BLEU score and WER.

    Side effects:
        * Appends to the module-level lists ``list_all_pred``,
          ``list_all_ref``, ``list_pred_kal`` and ``list_ref_kal``. They are
          not reset here, so the overall scores cover everything accumulated.
        * Creates/truncates ``[processed]-<source_file>``; nothing is written
          to it at the moment.

    Args:
        source_file: File name, relative to the Huda directory.
        max_lines: Maximum number of line pairs to process. Defaults to 367,
            the value that used to be hard-coded.
    """
    root_dir = '/workspace/BISINDO/Translation/Huda/'

    # `with` closes all three handles, including the reference file, even if a
    # processing step raises.
    with open(f'{root_dir}/{source_file}', 'r') as f, \
            open(f'{root_dir}/[bindo2]-bindo368.txt', 'r') as k, \
            open(f'{root_dir}/[processed]-{source_file}', 'w'):
        for _ in range(max_lines):
            kal_pred = f.readline()
            kal_ref = k.readline()
            # readline() returns '' only at end of file: stop instead of
            # scoring empty sentences.
            if not kal_pred or not kal_ref:
                break

            pos_seq, word_seq = extract_word_tag(kal_pred)
            pos_seq_join = (' ').join(pos_seq)
            word_seq_join = (' ').join(word_seq)

            trigram_word_seq = ngram(word_seq_join, 3)
            trigram_pos_seq = ngram(pos_seq_join, 3)

            prediksi = modify(trigram_pos_seq, trigram_word_seq, rules, word_seq_join)
            kal_prediksi = (' ').join(prediksi)

            print("kalimat prediksi: ", kal_prediksi)
            print("kalimat referensi: ", kal_ref)
            print(pos_seq_join)

            list_all_pred.append(prediksi)
            list_all_ref.append([kal_ref.split()])
            list_pred_kal.append(kal_prediksi)
            list_ref_kal.append(kal_ref)

            wer_individual = hitung_wer([kal_ref], [kal_prediksi])
            print("WER individual: ", wer_individual)
            print("----------------------------------")

    bleu_score = hitung_bleu_score(list_all_pred, list_all_ref)
    wer_ = hitung_wer(list_ref_kal, list_pred_kal)
    print("BLEU Score Keseluruhan: ", bleu_score['bleu']*100)
    print("WER: ", wer_)

In [151]:
# Add prepositions to the intermediate Indonesian sentences.
files_to_be_translated = [
    '[test]-[bindo2]-bindo368.txt'
]

for el in files_to_be_translated:
    print(f'Processing {el}...')
    clean(el)
    print('=======')

Processing [test]-[bindo2]-bindo368.txt...
kalimat prediksi:  nanti saya nonton bioskop
kalimat referensi:  nanti saya akan nonton ke bioskop

NN PRP VB NN
WER individual:  0.3333333333333333
----------------------------------
kalimat prediksi:  besok pagi saya belanja pasar
kalimat referensi:  besok pagi saya akan belanja ke pasar

NN NN PRP NN NN
WER individual:  0.2857142857142857
----------------------------------
kalimat prediksi:  tidak boleh bicara akan sekarang
kalimat referensi:  serta tidak boleh bicara sekarang

NEG MD VB NN
WER individual:  0.4
----------------------------------
kalimat prediksi:  warga tidak boleh mudik akan akhir tahun tentang tanpa
kalimat referensi:  warga tidak boleh mudik akhir tahun

NN NEG MD VB NN NN
WER individual:  0.5
----------------------------------
kalimat prediksi:  tiap hari ayah olahraga lama 1 jam
kalimat referensi:  tiap hari ayah olahraga lama 1 jam

CD NN NN VB JJ CD NN
WER individual:  0.0
----------------------------------
kalimat p

### Save Prediction Ke CSV

In [50]:
# Save the predictions to CSV (without the index column).
# NOTE(review): `df` is not defined in any cell visible here — confirm it is
# created before this cell runs, otherwise a fresh Run All fails with NameError.
df.to_csv('/workspace/BISINDO/Translation/dataset-ronaldi/Hasil_Prediksi.csv', index=False)


In [None]:
torch.cuda.empty_cache()