# 1. Importing libraries and datasets

## 1.1 Importing torch

In [None]:
import torch
# Select the torch device for the rest of the notebook: CUDA when torch
# reports it as available, otherwise the CPU. The choice is printed.
if not torch.cuda.is_available():
    # CPU fallback
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # GPU path: bind the device, then report the GPU count and the name of GPU 0
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('GPU is:', torch.cuda.get_device_name(0))

## 1.2 Importing language detection

- For Malay, we use dictionaries from IPA-Dict and Dewan Bahasa.
- For English, we use the NLTK `words` corpus.

In [85]:
import json
import nltk

# my_raw1 = json.load(open('../dictionary/200k-english-malay.json'))
# my_raw2 = open('../dictionary/en-ms.txt', encoding="utf8")
# my_raw3 = open('../dictionary/malay-ipa-dict.txt', encoding="utf8")

# my_raw1 = [x[1] for x in my_raw1]
# my_raw2 = [x.split('\t')[1].strip() for x in my_raw2.readlines()]
# my_raw3 = [x.split('\t')[0] for x in my_raw3.readlines()]

# with open('../dictionary/combined-malay-dict.txt', 'w', encoding="utf8") as fp:
#     for item in sorted(list(dict.fromkeys(my_raw1 + my_raw2 + my_raw3))):
#         if item:
#             fp.write("%s\n" % item)

# Read the combined Malay word list; each line of the file becomes one
# entry with its surrounding whitespace (including the newline) removed.
with open('../dictionary/combined-malay-dict.txt', encoding="utf8") as fp:
    malay_dict = list(map(str.strip, fp))
    
# Hash-based view of `malay_dict`, rebuilt whenever `malay_dict` is rebound
# to a different object (e.g. the loading cell is re-run).
# "source" remembers which list object the "words" set was built from.
_malay_lookup = {"source": None, "words": frozenset()}


def detect_malay(text):
    """Return True if `text` is an exact entry of the Malay word list.

    The match is exact and case-sensitive. Lookups go through a frozenset
    built from `malay_dict` instead of scanning the list on every call.

    Note: in-place edits to `malay_dict` made after the first lookup are
    not picked up; rebind `malay_dict` (re-run the loading cell) instead.
    """
    if _malay_lookup["source"] is not malay_dict:
        # First call, or the word list was reloaded: rebuild the set once.
        _malay_lookup["source"] = malay_dict
        _malay_lookup["words"] = frozenset(malay_dict)
    return text in _malay_lookup["words"]
# Cache for the NLTK `words` corpus; filled on the first detect_english() call.
_english_words = None


def detect_english(text):
    """Return True if `text` is an exact entry of the NLTK `words` corpus.

    The match is exact and case-sensitive. The corpus is read once, on the
    first call, and kept as a frozenset so later calls are a hash lookup
    rather than a fresh `nltk.corpus.words.words()` call plus a list scan.
    Any error raised by NLTK while reading the corpus (for example when
    the corpus has not been downloaded) propagates from that first call.
    """
    global _english_words
    if _english_words is None:
        # Build the lookup set lazily so defining this function stays cheap.
        _english_words = frozenset(nltk.corpus.words.words())
    return text in _english_words

## 1.3 Importing and combining datasets

In [11]:
# # install pandas
# !pip install pandas

import pandas as pd

# importing datasets
# Each CSV gets its own name so that loading one no longer discards the
# previous one. NOTE: the frames are only loaded here, not merged.

# local-news: text + label
local_news_df = pd.read_csv(r'../data/malaya/local-news.csv')

# semisupervised-bert-xlnet: text + label + prob
semisupervised_df = pd.read_csv(r'../data/malaya/semisupervised-bert-xlnet.csv')

# semisupervised-politics-bert-xlnet: text + label + prob
semisupervised_politics_df = pd.read_csv(r'../data/malaya/semisupervised-politics-bert-xlnet.csv')

# `df` stays bound to the last dataset loaded, exactly as before, so cells
# that read `df` keep working. Only a cell's final expression is displayed,
# so the frame is shown once here.
df = semisupervised_politics_df
df

Unnamed: 0,text,label,prob
0,Menggelabah masing-masing nak beraya kan. Lepa...,Negative,0.999371
1,Bendera putih tu bukannya nak harap bantuan ke...,Negative,0.999432
2,Nape nak kena tampal gambar Dia pulak...Guna d...,Negative,0.999458
3,Sudah sudah la kak mas woi. Kamu tahu tak semu...,Negative,0.999358
4,"Kerajaan kita bukan serba boleh, tapi serba bo...",Negative,0.999450
...,...,...,...
23024,@hannahyeoh Terbaik MP Segambut kan gitu... Ka...,Negative,0.999410
23025,Time ph pon byk aset terjual yb.. time tue yb ...,Negative,0.999405
23026,bila jadi pembangkang auto savage hshshshsh ta...,Negative,0.999365
23027,PAN ni pun bangang... Inisiatif bendera putih ...,Negative,0.999460


# 2. Data preprocessing

## 2.1 Data Cleaning

In [3]:
from unidecode import unidecode
import string
import re

# Characters deleted by clean_text(). Same character set as before; the
# stray backslash in front of '(' (an invalid escape sequence) was dropped
# because the backslash is already listed once as '\\'.
# NOTE(review): clean_text() runs unidecode first, so the non-ASCII entries
# here (curly quotes, bullet, ellipsis) have presumably already been
# transliterated to ASCII when this set is applied, and ASCII quotes are not
# in the set -- confirm whether quote characters are meant to be stripped.
punctuation = '‘’“”!$%&()*+,./:;<=>?[\\]^_`{|}~•@…'


def clean_text(text):
    """Normalise one piece of raw text for downstream processing.

    Steps, in order:
      1. transliterate to ASCII with `unidecode`;
      2. drop every whitespace-delimited word that starts with '@', '#',
         'http' or 'https' (mentions, hashtags, links);
      3. delete every character listed in `punctuation`;
      4. lowercase;
      5. strip leading and trailing whitespace.

    Whitespace left behind by removed words is not collapsed, so the
    result may contain runs of spaces.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # convert characters to ascii
    text = unidecode(text)

    # remove words that are hashtags, mentions or links, anywhere in the
    # text. (?<!\S) requires start-of-string or preceding whitespace, so a
    # '@' inside a word (e.g. an e-mail address) does not start a match.
    text = re.sub(r'(?<!\S)(?:[@#]|https?)\S*', '', text)

    # remove punctuation
    text = text.translate(str.maketrans('', '', punctuation))

    # lowercase, then strip surrounding whitespace
    return text.lower().strip()
    
# Try the cleaner on the 'text' value of the row labelled 0.
clean_text(df.loc[0, 'text'])

'lebih-lebih lagi dengan  kemudahan internet dan laman sosial taktik ini semakin mudah dikembangkan'

In [4]:
import torch
import modeling_bert

from modeling_bert import BertModel

In [None]:
# Language label -> integer id.
lang_id2num = dict(special_token=0, english=1, malay=2, other=3)
# Reverse mapping: integer id -> language label.
lang_num2id = dict(zip(lang_id2num.values(), lang_id2num.keys()))

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
text = "She sells"
# if we tokenize it, this becomes:
encoding = tokenizer(text, return_tensors="pt")
# Attach one language id per token position, all set to 0. The tensor is
# derived from `input_ids` (same shape and dtype) rather than hard-coded
# as a 1x4 tensor, so it stays aligned if `text` is changed.
encoding['language_ids'] = torch.zeros_like(encoding['input_ids'])

print(encoding['input_ids'])

# Each item yielded by iterating `input_ids` is decoded back to a string.
for input_id in encoding['input_ids']:
    print(tokenizer.decode(input_id))


In [None]:
# Load the "bert-base-uncased" checkpoint into the BertModel class imported
# from the local `modeling_bert` module.
model = BertModel.from_pretrained("bert-base-uncased")
# Pass every entry of `encoding` (including the added 'language_ids') to the
# model as keyword arguments.
# NOTE(review): presumably this BertModel's forward accepts `language_ids` --
# confirm against modeling_bert.
outputs = model(**encoding)
# Dump the raw model output for inspection.
print(outputs)