# 1. Importing libraries and datasets

## 1.1 Importing torch

In [1]:
import torch
# check if we have cuda installed
if torch.cuda.is_available():
    # to use GPU
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('GPU is:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


## 1.2 Importing language detection

- For Malay, we use dictionaries from IPA-Dict and Dewan Bahasa.
- For English, we use NLTK corpus.

In [33]:
import json
import nltk

# my_raw1 = json.load(open('../dictionary/200k-english-malay.json'))
# my_raw2 = open('../dictionary/en-ms.txt', encoding="utf8")
# my_raw3 = open('../dictionary/malay-ipa-dict.txt', encoding="utf8")

# my_raw1 = [x[1] for x in my_raw1]
# my_raw2 = [x.split('\t')[1].strip() for x in my_raw2.readlines()]
# my_raw3 = [x.split('\t')[0] for x in my_raw3.readlines()]

# with open('../dictionary/combined-malay-dict.txt', 'w', encoding="utf8") as fp:
#     for item in sorted(list(dict.fromkeys(my_raw1 + my_raw2 + my_raw3))):
#         if item:
#             fp.write("%s\n" % item)

with open('../dictionary/combined-malay-dict.txt', encoding="utf8") as fp:
    malay_dict = set([x.strip() for x in fp.readlines()])
    
eng_dict = set(nltk.corpus.words.words())
    
def detect_malay(text): return text in malay_dict
def detect_english(text): return text in eng_dict

## 1.3 Importing and combining datasets

In [17]:
# # install pandas
# !pip install pandas

import pandas as pd
combined_df = pd.DataFrame()

# local-new: text + label*
df = pd.read_csv(r'../data/malaya/local-news.csv')
combined_df = pd.concat([combined_df, df[['text', 'label']].rename(columns={'label': 'sentiment'})])

# semisupervised-bert-xlnet: text + label*
df = pd.read_csv(r'../data/malaya/semisupervised-bert-xlnet.csv')
combined_df = pd.concat([combined_df, df[['text', 'label']].rename(columns={'label': 'sentiment'})])

# semisupervised-bert-xlnet: text + label*
df = pd.read_csv(r'../data/malaya/semisupervised-politics-bert-xlnet.csv')
combined_df = pd.concat([combined_df, df[['text', 'label']].rename(columns={'label': 'sentiment'})])

# supervised-data: text + sentiment* 
df = pd.read_csv(r'../data/malaya/supervised-data.csv', sep='\t')
combined_df = pd.concat([combined_df, df[['text', 'sentiment']]])

# supervised-data-politics: text + sentiment* 
df = pd.read_csv(r'../data/malaya/supervised-data-politics.csv', sep='\t')
combined_df = pd.concat([combined_df, df[['text', 'sentiment']]])

# supervised-data-politics: text + sentiment* 
df = pd.read_csv(r'../data/malaya/manglish.csv', sep='\t')
df['sentiment'] = df['sentiment'].apply(lambda x: x.capitalize())
combined_df = pd.concat([combined_df, df[['text', 'sentiment']]])

# twitter: text + sentiment* 
df = pd.read_csv(r'../data/scrapping/twitter.csv', sep='\t')
df['sentiment'] = df['sentiment'].apply(lambda x: x.capitalize())
combined_df = pd.concat([combined_df, df[['text', 'sentiment']]])

combined_df = combined_df.reset_index()[['text', 'sentiment']]
combined_df

Unnamed: 0,text,sentiment
0,Lebih-lebih lagi dengan kemudahan internet da...,Negative
1,boleh memberi teguran kepada parti tetapi perl...,Positive
2,Adalah membingungkan mengapa masyarakat Cina b...,Negative
3,Kami menurunkan defisit daripada 6.7 peratus p...,Positive
4,"Ini masalahnya. Bukan rakyat, tetapi sistem",Negative
...,...,...
543662,Boxi in Alliance : 🤡\nBoxi in Liquid : 🗿,Positive
543663,Liquid tolong pause sat 🤣,Neutral
543664,The last time dine-in here was on the day @Nig...,Positive
543665,Meriah lower bracket liquid pun jatuh huhu,Negative


# 2. Data preprocessing

## 2.1 Data Cleaning

In [18]:
from unidecode import unidecode
import string
import re

punctuation = '‘’“”!$%&\()*+,./:;<=>?[\\]^_`{|}~•@…'

def clean_text(text):
    # convert characters to ascii
    text = unidecode(text)
    
    # remove words that is hashtags, mentions and links
    text = re.sub(r'^([@#]|http|https)[^\s]*', '', text)
    
    # remove punctuation
    text = text.translate(text.maketrans('', '', punctuation))
    
    # remove next line     
    text = re.sub('\n', '', text)
    
    # lowercasing text
    text = text.lower()
    
    # stripping text
    text = text.strip()
    
    # remove words containing numbers
    text = re.sub('\w*\d\w*', '', text)
    
    return text
    
combined_df['text'] = combined_df['text'].apply(lambda x: clean_text(x))
combined_df

Unnamed: 0,text,sentiment
0,lebih-lebih lagi dengan kemudahan internet da...,Negative
1,boleh memberi teguran kepada parti tetapi perl...,Positive
2,adalah membingungkan mengapa masyarakat cina b...,Negative
3,kami menurunkan defisit daripada peratus pada...,Positive
4,ini masalahnya bukan rakyat tetapi sistem,Negative
...,...,...
543662,boxi in alliance boxi in liquid,Positive
543663,liquid tolong pause sat,Neutral
543664,the last time dine-in here was on the day nigm...,Positive
543665,meriah lower bracket liquid pun jatuh huhu,Negative


## 2.2 Normalise short-form words

In [19]:
malaya_sf = pd.read_csv(r'../normalise/malaya.csv')
cilisos_sf = pd.read_csv(r'../normalise/cilisos.csv', encoding='ISO-8859-1')

combined_sf = {x[0]: x[1] for x in malaya_sf.values.tolist() + cilisos_sf.values.tolist()}

def normalise_text(text):
    return ' '.join([combined_sf[x] if x in combined_sf.keys() else x for x in text.split()])

combined_df['text'] = combined_df['text'].apply(lambda x: normalise_text(x))
combined_df

Unnamed: 0,text,sentiment
0,lebih-lebih lagi dengan kemudahan internet dan...,Negative
1,boleh memberi teguran kepada parti tetapi perl...,Positive
2,adalah membingungkan mengapa masyarakat cina b...,Negative
3,kami menurunkan defisit daripada peratus pada ...,Positive
4,ini masalahnya bukan rakyat tetapi sistem,Negative
...,...,...
543662,boxi in alliance boxi in liquid,Positive
543663,liquid tolong pause sabtu,Neutral
543664,the akhir masa dine-in here was on the hari ni...,Positive
543665,meriah lower bracket liquid pun jatuh huhu,Negative


## 2.3 Randomly sample datasets

In [None]:
combined_df_all = combined_df

In [25]:
combined_df = combined_df_all.sample(n=50000).reset_index()[['text', 'sentiment']]
combined_df

Unnamed: 0,text,sentiment
0,looks like scum kepada me,Positive
1,perhatian status aras air semasa di sarawak mi...,Neutral
2,change the heart of china kemudian,Negative
3,can rebate further dengan gc and shopback,Positive
4,aku baru habis tengok padman,Neutral
...,...,...
49995,haha yeah that's the plan,Negative
49996,and only available untuk porting in,Positive
49997,macam ini sudah pun tetapi ahli politik still ...,Negative
49998,dengan kemudian ish they laugh pada us,Negative


# 3. Get feature and targets

## 3.1 Load tokenizer

In [26]:
model_name = "bert-base-multilingual-cased"

In [27]:
import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained(model_name)

## 3.2 Testing datasets and modeling

In [None]:
# text = "She sells"
# # if we tokenize it, this becomes:
# encoding = tokenizer(text, return_tensors="pt")
# encoding['language_ids'] = torch.tensor([[0, 1, 1, 0]])

# print(encoding['input_ids'])

# for input_id in encoding['input_ids']:
#     print(tokenizer.decode(input_id))


In [None]:
# # tokenize the text feature 
# tokenized_feature_raw = tokenizer.batch_encode_plus(
#     # Sentences to encode
#     combined_df.text.values.tolist(), 
#     # Add '[CLS]' and '[SEP]'
#     add_special_tokens = True
# )

# # collect tokenized sentence length
# token_sentence_length = [len(x) for x in tokenized_feature_raw['input_ids']]
# print('max: ', max(token_sentence_length))
# print('min: ', min(token_sentence_length))

# # plot the distribution
# import matplotlib.pyplot as plt
# plt.figure(figsize=(20, 8))
# plt.hist(token_sentence_length, rwidth = 0.9)
# plt.xlabel('Sequence Length', fontsize = 18)
# plt.ylabel('# of Samples', fontsize = 18)
# plt.xticks(fontsize = 14)
# plt.yticks(fontsize = 14)

## 3.3 Prepare features and target

In [28]:
# identify features and target
features = combined_df.text.values.tolist()
target = combined_df.sentiment.values.tolist()

print(features[:5])

['looks like scum kepada me', 'perhatian status aras air semasa di sarawak miri marudi aras air sungai adalah iait', 'change the heart of china kemudian', 'can rebate further dengan gc and shopback', 'aku baru habis tengok padman']


In [29]:
# tokenize features 
MAX_LEN = 128
tokenized_feature = tokenizer.batch_encode_plus(
    # Sentences to encode
    features, 
    # Add '[CLS]' and '[SEP]'
    add_special_tokens = True,
    # Add empty tokens if len(text)<MAX_LEN
    padding = 'max_length',
    # Truncate all sentences to max length
    truncation=True,
    # Set the maximum length
    max_length = MAX_LEN, 
    # Return attention mask
    return_attention_mask = True,
    # Return pytorch tensors
    return_tensors = 'pt'       
)

## 3.4 Add language embeddings

In [30]:
special_token_list = set(['[CLS]', '[SEP]', '[PAD]', '[UNK]'])

lang_id2num = { 'special_token': 0, 'english': 1, 'malay': 2, 'other': 3 }
lang_num2id = {v:k for k,v in lang_id2num.items()}

def detect_lang(text):
    if text in special_token_list:
        return 'special_token'
    elif detect_malay(text):
        return 'malay'
    elif detect_english(text):
        return 'english'
    else:
        return 'other'

In [34]:
# detect language function is on section 1

def get_lang_tokens(input_tokens):
    decoded_tokens = [x.replace(' ', '') for x in reversed(tokenizer.batch_decode(input_tokens))]
    language_ids = []
    
    full_sentence = ''
    token_count = 0
    for token in decoded_tokens:
        if '##' in token:
            full_sentence = token[2:] + full_sentence
            token_count += 1
            continue

        full_sentence = token + full_sentence
        token_count += 1
        lang_token = lang_id2num[detect_lang(full_sentence)]
        for _ in range(token_count):
            language_ids.append(lang_token)
        
        full_sentence = ''
        token_count = 0
        
    return language_ids

In [35]:
# { 'special_token': 0, 'english': 1, 'malay': 2, 'other': 3 }

language_ids = []
for input_id in tokenized_feature['input_ids']:
    input_tokens = input_id.tolist()
    
    language_ids.append(get_lang_tokens(input_tokens))

language_ids = torch.tensor(language_ids)

tokenized_feature['language_ids'] = language_ids

tokenized_feature['language_ids']

tensor([[0, 0, 0,  ..., 2, 2, 0],
        [0, 0, 0,  ..., 2, 2, 0],
        [0, 0, 0,  ..., 2, 1, 0],
        ...,
        [0, 0, 0,  ..., 2, 2, 0],
        [0, 0, 0,  ..., 2, 2, 0],
        [0, 0, 0,  ..., 2, 2, 0]])

## 3.5 Numeric target

In [38]:
# # install sklearn 
# !pip install sklearn

# convert label into numeric 
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(target)
target_num = le.transform(target)

print(target_num[:20])

[2 1 0 2 1 1 0 0 2 1 0 0 2 1 1 1 1 2 1 1]


## 3.6 Pickle files

In [40]:
import pickle
pickle_path = '../pickle/bert+li_feature_target.pickle'

feature_target = [tokenized_feature, target, target_num]

with open(pickle_path, 'wb') as fp:
    pickle.dump(feature_target, fp)

# 4 Prepare train and validation set

In [41]:
import pickle
with open(pickle_path, 'rb') as fp:
    tokenized_feature, target, target_num = pickle.load(fp)

In [42]:
# Use 80% for training and 20% for validation
from sklearn.model_selection import train_test_split

train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks, train_langs, validation_langs = train_test_split(tokenized_feature['input_ids'], 
                                                                                                                                                  target_num,
                                                                                                                                                  tokenized_feature['attention_mask'],
                                                                                                                                                  tokenized_feature['language_ids'],
                                                                                                                                                  random_state=2018,
                                                                                                                                                  test_size=0.2,
                                                                                                                                                  stratify=target)

In [43]:
from torch.utils.data import TensorDataset, RandomSampler, DataLoader, SequentialSampler

# define batch_size
batch_size = 16
# Create the DataLoader for our training set
train_data = TensorDataset(train_inputs, train_masks, train_langs, torch.tensor(train_labels))
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# Create the DataLoader for our test set
validation_data = TensorDataset(validation_inputs, validation_masks, validation_langs, torch.tensor(validation_labels))
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# 5 Settings up BERT Model

In [44]:
# BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from modeling_bert import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    model_name, 
    # Specify number of classes
    num_labels = len(set(target)), 
    # Whether the model returns attentions weights
    output_attentions = False,
    # Whether the model returns all hidden-states 
    output_hidden_states = False
)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [45]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(model_name)

# Because we add two words [‘covid’, ‘coronavirus’] into the vocabulary
# we will need to resize the token to make sure the model pick it up as whole words.

# Receive the full size of the new word
model.resize_token_embeddings(len(tokenizer))

# Optimizer & Learning Rate Scheduler
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, 
                  eps = 1e-8 
                )



In [46]:
# Number of training epochs
epochs = 4
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [47]:
# use cuda if existing
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (language_embeddings): Embedding(119547, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
          

In [48]:
# Training
import time
# Store the average loss after each epoch 
loss_values = []
# number of total steps for each epoch
print('total steps per epoch: ',  len(train_dataloader) / batch_size)
# looping over epochs
for epoch_i in range(0, epochs):
    
    print('training on epoch: ', epoch_i)
    # set start time 
    t0 = time.time()
    # reset total loss
    total_loss = 0
    # model in training 
    model.train()
    # loop through batch 
    for step, batch in enumerate(train_dataloader):
        # Progress update every 50 step 
        if step % 50 == 0 and not step == 0:
            print('training on step: ', step)
            print('total time used is: {0:.2f} s'.format(time.time() - t0))
        # load data from dataloader 
        b_input_ids = batch[0].to(device).long()
        b_input_mask = batch[1].to(device).long()
        b_input_langs = batch[2].to(device).long()
        b_labels = batch[3].to(device).long()
        # clear any previously calculated gradients 
        model.zero_grad()
        # get outputs
        outputs = model(input_ids=b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        language_ids=b_input_langs,
                        labels=b_labels)
        # get loss
        loss = outputs[0]
        # total loss
        total_loss += loss.item()
        # clip the norm of the gradients to 1.0.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # update optimizer
        optimizer.step()
        # update learning rate 
        scheduler.step()
    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    print("average training loss: {0:.2f}".format(avg_train_loss))

total steps per epoch:  156.25
training on epoch:  0
training on step:  50
total time used is: 152.83 s
training on step:  100
total time used is: 312.08 s


KeyboardInterrupt: 

In [None]:
# Test
import numpy as np
t0 = time.time()
# model in validation mode
model.eval()
# save prediction
predictions,true_labels =[],[]
# evaluate data for one epoch
for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_input_langs, b_labels = batch
    # validation
    with torch.no_grad():
        outputs = model(input_ids=b_input_ids.long(),
                        token_type_ids=None,
                        attention_mask=b_input_mask.long(),
                        language_ids=b_input_langs.long(),
                        labels=b_labels.long())
    # get output
    logits = outputs[0]
    # move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    final_prediction = np.argmax(logits, axis=-1).flatten()
    predictions.append(final_prediction)
    true_labels.append(label_ids)
    
print('total time used is: {0:.2f} s'.format(time.time() - t0))

In [None]:
predictions

In [None]:
# convert numeric label to string
final_prediction_list = le.inverse_transform(np.concatenate(predictions))
final_truelabel_list = le.inverse_transform(np.concatenate(true_labels))

In [None]:
final_prediction_list

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
cr = classification_report(final_truelabel_list, 
                           final_prediction_list, 
                           output_dict=False)
print(cr)

# 6 Confusion Matrix

In [None]:
# # install seaborn
# !pip install seaborn

# print cm heatmap
import seaborn as sns
# return class label
class_label = np.unique(final_truelabel_list)
# get confusion matrix 
cm = confusion_matrix(final_truelabel_list,
                      final_prediction_list)
# covert it to dataframe for plotting
cm_df = pd.DataFrame(cm, 
                     index = class_label, 
                     columns = class_label)
# plot it 
plt.figure(figsize = (10,8))
g = sns.heatmap(cm_df, cmap = 'hot_r', annot=True, fmt='g')
g.xaxis.set_ticks_position("top")
g.tick_params(axis='x', rotation=90)
g.set_xlabel("True Sentiment Label")
g.set_ylabel("Predicted Sentiment Label")