# 1. Importing libraries and datasets

## 1.1 Importing torch

In [None]:
import torch
# Pick the compute device: fall back to the CPU unless PyTorch reports CUDA.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # CUDA is usable, so run on the GPU and report what was found
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('GPU is:', torch.cuda.get_device_name(0))

## 1.2 Importing language detection

- For Malay, we use dictionaries from IPA-Dict and Dewan Bahasa.
- For English, we use the NLTK `words` corpus.

In [85]:
import json
import nltk

# my_raw1 = json.load(open('../dictionary/200k-english-malay.json'))
# my_raw2 = open('../dictionary/en-ms.txt', encoding="utf8")
# my_raw3 = open('../dictionary/malay-ipa-dict.txt', encoding="utf8")

# my_raw1 = [x[1] for x in my_raw1]
# my_raw2 = [x.split('\t')[1].strip() for x in my_raw2.readlines()]
# my_raw3 = [x.split('\t')[0] for x in my_raw3.readlines()]

# with open('../dictionary/combined-malay-dict.txt', 'w', encoding="utf8") as fp:
#     for item in sorted(list(dict.fromkeys(my_raw1 + my_raw2 + my_raw3))):
#         if item:
#             fp.write("%s\n" % item)

# Load the combined Malay word list, one entry per line.
with open('../dictionary/combined-malay-dict.txt', encoding="utf8") as fp:
    malay_dict = [x.strip() for x in fp]

# Hashed copy of the word list so each membership test is a constant-time
# lookup instead of a linear scan. `malay_dict` itself stays a list; if it is
# modified later, rebuild this set as well.
_malay_words = frozenset(malay_dict)

# English vocabulary, filled on the first call to detect_english().
_english_words = None


def detect_malay(text):
    """Return True if `text` is an entry of the combined Malay dictionary.

    The comparison is exact (case-sensitive, no normalisation).
    """
    return text in _malay_words


def detect_english(text):
    """Return True if `text` is an entry of the NLTK `words` corpus.

    The corpus is read once, on the first call, and kept as a set; the
    original re-read the corpus and scanned it linearly on every call.
    The comparison is exact (case-sensitive, no normalisation).
    """
    global _english_words
    if _english_words is None:
        _english_words = frozenset(nltk.corpus.words.words())
    return text in _english_words

## 1.3 Importing and combining datasets

In [203]:
# # install pandas
# !pip install pandas

import pandas as pd
# Every source dataset, described as
# (path, field separator, name of the column that holds the sentiment label).
dataset_sources = [
    # local-news: text + label*
    (r'../data/malaya/local-news.csv', ',', 'label'),
    # semisupervised-bert-xlnet: text + label*
    (r'../data/malaya/semisupervised-bert-xlnet.csv', ',', 'label'),
    # semisupervised-politics-bert-xlnet: text + label*
    (r'../data/malaya/semisupervised-politics-bert-xlnet.csv', ',', 'label'),
    # supervised-data: text + sentiment* (tab-separated)
    (r'../data/malaya/supervised-data.csv', '\t', 'sentiment'),
    # supervised-data-politics: text + sentiment* (tab-separated)
    (r'../data/malaya/supervised-data-politics.csv', '\t', 'sentiment'),
]

# Read each file, keep only the text and label columns, and give the label
# column the common name 'sentiment'.
frames = []
for path, sep, label_column in dataset_sources:
    df = pd.read_csv(path, sep=sep)
    frames.append(df[['text', label_column]].rename(columns={label_column: 'sentiment'}))

# Concatenate once instead of growing the frame file by file (which also
# avoids concatenating with an empty DataFrame); ignore_index renumbers the
# rows 0..n-1.
combined_df = pd.concat(frames, ignore_index=True)[['text', 'sentiment']]
combined_df

Unnamed: 0,text,sentiment
0,Lebih-lebih lagi dengan kemudahan internet da...,Negative
1,boleh memberi teguran kepada parti tetapi perl...,Positive
2,Adalah membingungkan mengapa masyarakat Cina b...,Negative
3,Kami menurunkan defisit daripada 6.7 peratus p...,Positive
4,"Ini masalahnya. Bukan rakyat, tetapi sistem",Negative
...,...,...
215513,"Kerajaan kita bukan serba boleh, tapi serba bo...",Negative
215514,Sudah sudah la kak mas woi. Kamu tahu tak semu...,Negative
215515,Nape nak kena tampal gambar Dia pulak...Guna d...,Negative
215516,Bendera putih tu bukannya nak harap bantuan ke...,Negative


# 2. Data preprocessing

## 2.1 Data Cleaning

In [207]:
from unidecode import unidecode
import string
import re

# Characters deleted outright by clean_text(). The hyphen, '#' and the ASCII
# quote characters are not listed, so they survive cleaning.
# NOTE(review): the curly quotes listed here are presumably already converted
# by unidecode() before this table is applied, so they may never match -- confirm
# whether ASCII quotes were meant to be removed too.
punctuation = '‘’“”!$%&\\()*+,./:;<=>?[\\]^_`{|}~•@…'

# A whitespace-delimited token that starts with '@', '#', 'http' or 'https'.
# The lookbehind lets it match at the start of ANY token; the previous '^'
# anchor only ever matched the first token of the text.
_TAG_OR_LINK_RE = re.compile(r'(?<!\S)(?:[@#]|https?)\S*')

# A run of word characters that contains at least one digit.
_WORD_WITH_DIGIT_RE = re.compile(r'\w*\d\w*')


def clean_text(text: str) -> str:
    """Return a cleaned, lowercased copy of `text`.

    Steps, in order:
      1. convert characters to ASCII with `unidecode`;
      2. drop every token that starts with '@', '#', 'http' or 'https'
         (mentions, hashtags and links);
      3. delete the characters listed in `punctuation`;
      4. lowercase;
      5. drop every word that contains a digit;
      6. strip leading and trailing whitespace.

    Removed tokens leave their surrounding spaces behind, so the result may
    contain runs of more than one space inside the text.
    """
    # convert characters to ascii
    text = unidecode(text)

    # remove words that are hashtags, mentions and links
    text = _TAG_OR_LINK_RE.sub('', text)

    # remove punctuation
    text = text.translate(str.maketrans('', '', punctuation))

    # lowercasing text
    text = text.lower()

    # remove words containing numbers
    text = _WORD_WITH_DIGIT_RE.sub('', text)

    # strip last, so whitespace exposed by the removals above is trimmed too
    return text.strip()
    
# Clean every row of the text column in place, then display the frame.
combined_df['text'] = combined_df['text'].apply(clean_text)
combined_df

Unnamed: 0,text,sentiment
0,lebih-lebih lagi dengan kemudahan internet da...,Negative
1,boleh memberi teguran kepada parti tetapi perl...,Positive
2,adalah membingungkan mengapa masyarakat cina b...,Negative
3,kami menurunkan defisit daripada peratus pada...,Positive
4,ini masalahnya bukan rakyat tetapi sistem,Negative
...,...,...
215513,kerajaan kita bukan serba boleh tapi serba bod...,Negative
215514,sudah sudah la kak mas woi kamu tahu tak semua...,Negative
215515,nape nak kena tampal gambar dia pulakguna duit...,Negative
215516,bendera putih tu bukannya nak harap bantuan ke...,Negative


## 2.2 Normalise short-form words

In [220]:
# Short-form tables; the cilisos file is read as ISO-8859-1.
malaya_sf = pd.read_csv(r'../normalise/malaya.csv')
cilisos_sf = pd.read_csv(r'../normalise/cilisos.csv', encoding='ISO-8859-1')

# Merge both tables into one lookup keyed by the first column, with the second
# column as the value. malaya rows come first, so a cilisos row with the same
# key overrides them.
combined_sf = {
    row[0]: row[1]
    for table in (malaya_sf, cilisos_sf)
    for row in table.values.tolist()
}

def normalise_text(text):
    """Replace short-form words in `text` using the `combined_sf` lookup.

    The text is split on whitespace. A token that is a key of `combined_sf`
    is swapped for its mapped value; any other token is kept as is. The
    tokens are joined back together with single spaces.
    """
    return ' '.join(combined_sf.get(token, token) for token in text.split())

# Expand short-form words in every row of the text column, then display the frame.
combined_df['text'] = combined_df['text'].apply(normalise_text)
combined_df

Unnamed: 0,text,sentiment
0,lebih-lebih lagi dengan kemudahan internet dan...,Negative
1,boleh memberi teguran kepada parti tetapi perl...,Positive
2,adalah membingungkan mengapa masyarakat cina b...,Negative
3,kami menurunkan defisit daripada peratus pada ...,Positive
4,ini masalahnya bukan rakyat tetapi sistem,Negative
...,...,...
215513,kerajaan kita bukan serba boleh tetapi serba b...,Negative
215514,sudah sudah lah kak mas woi kamu tahu tidak se...,Negative
215515,kenapa hendak kena tampal gambar dia pulakguna...,Negative
215516,bendera putih itu bukannya hendak harap bantua...,Negative


In [4]:
import torch
import modeling_bert

from modeling_bert import BertModel

In [None]:
# Language label -> integer id.
lang_id2num = dict(special_token=0, english=1, malay=2, other=3)
# Reverse lookup: integer id -> language label.
lang_num2id = dict(zip(lang_id2num.values(), lang_id2num.keys()))

In [None]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
text = "She sells"
# if we tokenize it, this becomes:
encoding = tokenizer(text, return_tensors="pt")
# One language id per token, all 0 ('special_token' in lang_id2num). Sized
# from input_ids instead of a hard-coded length of 4, so the two tensors stay
# aligned if `text` is changed.
encoding['language_ids'] = torch.zeros_like(encoding['input_ids'])

print(encoding['input_ids'])

# input_ids has one row per input text; decode each row back to a string.
for sequence_ids in encoding['input_ids']:
    print(tokenizer.decode(sequence_ids))


In [None]:
# Load the "bert-base-uncased" weights into the BertModel imported from modeling_bert.
model = BertModel.from_pretrained("bert-base-uncased")
# Run the model once, passing every entry of `encoding` (including the
# language_ids added above) as a keyword argument.
outputs = model(**encoding)
print(outputs)