<a href="https://colab.research.google.com/github/megumihoshino/Machine-Learning-Terapan-/blob/main/sentiment_analysis_with_deep_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch



In [None]:
!pip install torch torchvision
!pip install transformers



In [None]:
!git clone https://github.com/indobenchmark/indonlu

fatal: destination path 'indonlu' already exists and is not an empty directory.


In [None]:
#import libraries

import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

from indonlu.utils.forward_fn import forward_sequence_classification
from indonlu.utils.metrics import document_sentiment_metrics_fn
from indonlu.utils.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader



- set_seed: mengatur dan menetapkan random seed
- count_param: menghitung jumlah parameter dalam model
- get_lr: mengambil nilai learning rate saat ini dari optimizer
- metrics_to_string: mengonversi metrik ke dalam bentuk string

In [None]:
#common functions

def set_seed(seed):
  """Seed every random number generator used in this notebook.

  Args:
    seed: integer passed, in this order, to Python's `random`, NumPy,
      PyTorch and PyTorch's CUDA generator.
  """
  seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
  for apply_seed in seeders:
    apply_seed(seed)

def count_param(module, trainable = False):
  """Return the number of scalar parameters held by `module`.

  Args:
    module: object exposing `parameters()` (e.g. a torch.nn.Module).
    trainable: when true, count only parameters whose `requires_grad` is set;
      otherwise count all of them.

  Returns:
    The summed element count as an int.
  """
  params = module.parameters()
  if trainable:
    # keep only the parameters that take part in gradient updates
    params = (p for p in params if p.requires_grad)
  return sum(p.numel() for p in params)

def get_lr(optimizer):
  """Return the learning rate of the optimizer's first parameter group.

  Args:
    optimizer: object exposing `param_groups`, an iterable of dicts with an 'lr' key.

  Returns:
    The 'lr' value of the first group, or None when there are no groups.
  """
  return next((group['lr'] for group in optimizer.param_groups), None)

def metrics_to_string(metric_dict):
  """Render a metrics mapping as one space-separated 'name:value' line.

  Args:
    metric_dict: mapping from metric name to a number.

  Returns:
    A string such as 'ACC:0.93 F1:0.90'; values are formatted with two
    decimals and keep the mapping's iteration order.
  """
  return ' '.join(
    '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
  )

In [None]:
# Seed Python, NumPy and PyTorch before the model and the data loaders are created.
set_seed(1112024)

**KONFIGURASI DAN PRE-TRAINED MODEL**

LOAD MODEL DAN KONFIGURASI
- Di tahap ini digunakan pre-trained model IndoBERT-base-p1 yang mempunyai sekitar 124,5 juta parameter.
- Model IndoBERT dibangun berdasarkan arsitektur general-purpose BERT (Bidirectional Encoder Representations from Transformers), yang didesain untuk memahami arti bahasa yang ambigu dalam teks.

In [None]:
# Load the tokenizer and the configuration of the pre-trained checkpoint.
# The checkpoint identifier is kept in one place so all three loads agree.
pretrained_name = 'indobenchmark/indobert-base-p1'

tokenizer = BertTokenizer.from_pretrained(pretrained_name)
config = BertConfig.from_pretrained(pretrained_name)
# Number of output classes is taken from the dataset class
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Instantiate the sequence classifier from the same checkpoint with the adjusted config

model = BertForSequenceClassification.from_pretrained(pretrained_name, config = config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Display the module tree of the loaded classifier.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [None]:
# Total number of parameters in the model (trainable or not).
count_param(model)

124443651

In [None]:
# Folder inside the cloned indonlu checkout that holds the three split files.
smsa_dir = '/content/indonlu/dataset/smsa_doc-sentiment-prosa'

train_dataset_path = f'{smsa_dir}/train_preprocess.tsv'
valid_dataset_path = f'{smsa_dir}/valid_preprocess.tsv'
test_dataset_path = f'{smsa_dir}/test_preprocess_masked_label.tsv'

IMPLEMENTASI DATASET DAN DATALOADER DI PYTORCH

In [None]:
from torch.utils.data import Dataset


class DocumentSentimentDataset(Dataset):
  """Dataset over a header-less, tab-separated file of (text, sentiment) rows.

  Indexing returns the tokenizer's encoding of the text, the label index and
  the raw text.

  NOTE(review): a class with this name is also imported from
  indonlu.utils.data_utils earlier in the notebook; from this cell onward
  this local definition is the one in scope.
  """

  # Class-level lookup tables shared by every instance
  LABEL2INDEX = {'positive':0, 'neutral': 1, 'negative':2}
  INDEX2LABEL = {idx: name for name, idx in LABEL2INDEX.items()}
  NUM_LABELS = len(LABEL2INDEX) # number of sentiment classes

  def __init__(self, dataset_path, tokenizer, *args, **kwargs):
    """Read the file at `dataset_path` and remember the tokenizer.

    Extra positional and keyword arguments are accepted and ignored.
    """
    self.tokenizer = tokenizer
    self.data = self.load_dataset(dataset_path)

  def load_dataset(self, path):
    """Return a DataFrame with a 'text' column and an integer 'sentiment' column.

    Raises:
      KeyError: if a label in the file is not a key of LABEL2INDEX.
    """
    frame = pd.read_csv(path, sep = '\t', header = None)
    frame.columns = ['text', 'sentiment'] # name the two columns
    # turn each label string into its class index
    frame['sentiment'] = frame['sentiment'].apply(self.LABEL2INDEX.__getitem__)
    return frame

  def __len__(self):
    """Number of rows in the loaded file."""
    return len(self.data)

  def __getitem__(self, index):
    """Return (subword id array, label array, raw text) for row `index`."""
    row = self.data.loc[index,:]
    raw_text = row['text']
    token_ids = self.tokenizer.encode(raw_text)

    return np.array(token_ids), np.array(row['sentiment']), raw_text




SENTIMENT DATA LOADER

In [None]:
from torch.utils.data import DataLoader

class DocumentSentimentDataLoader(DataLoader):
  """DataLoader that pads each batch of (subwords, label, raw text) samples.

  Batches come back as three NumPy arrays: padded subword ids, an attention
  mask and the labels.
  """

  def __init__(self, max_seq_len = 512, *args, **kwargs):
    """Forward all other arguments to DataLoader and install the padding collate.

    Args:
      max_seq_len: upper bound on the number of subwords kept per sample.
    """
    super().__init__(*args, **kwargs)
    # replace whatever collate function DataLoader set up with our own
    self.collate_fn = self._collate_fn
    self.max_seq_len = max_seq_len

  def _collate_fn(self, batch):
    """Pad/truncate the samples of `batch` to a common width.

    Returns:
      (subword_batch, mask_batch, sentiment_batch): int64 ids of shape
      (batch, width), float32 mask of the same shape with 1 on real tokens,
      and int64 labels of shape (batch, 1). `width` is the longest sample in
      the batch, capped at `self.max_seq_len`.
    """
    n_rows = len(batch)
    longest = max(len(sample[0]) for sample in batch)
    width = min(self.max_seq_len, longest)

    # zero-filled buffers: 0 doubles as padding id and as "masked out"
    subword_batch = np.zeros((n_rows, width), dtype = np.int64)
    mask_batch = np.zeros((n_rows, width), dtype = np.float32)
    sentiment_batch = np.zeros((n_rows, 1), dtype = np.int64)

    for row, (token_ids, label, _raw_text) in enumerate(batch):
      clipped = token_ids[:width]
      n_tokens = len(clipped)
      subword_batch[row, :n_tokens] = clipped
      mask_batch[row, :n_tokens] = 1
      sentiment_batch[row, 0] = label

    return subword_batch, mask_batch, sentiment_batch



In [None]:
# Build one dataset per split; all three share the tokenizer.
# (lowercase=True is accepted by the dataset class defined in this notebook but not used by it.)
train_dataset, valid_dataset, test_dataset = (
  DocumentSentimentDataset(split_path, tokenizer, lowercase = True)
  for split_path in (train_dataset_path, valid_dataset_path, test_dataset_path)
)

# Loader settings common to every split; only the training split is shuffled.
loader_kwargs = dict(max_seq_len = 512, batch_size = 32, num_workers = 16)

train_loader = DocumentSentimentDataLoader(dataset = train_dataset, shuffle = True, **loader_kwargs)
valid_loader = DocumentSentimentDataLoader(dataset = valid_dataset, shuffle = False, **loader_kwargs)
test_loader = DocumentSentimentDataLoader(dataset = test_dataset, shuffle = False, **loader_kwargs)




In [None]:
# Peek at the first training sample: (subword ids, label index, raw text).
print(train_dataset[0])

(array([    2,  6540,    92,  2970,   213,  4259,  3553,   899,    34,
         259,  5590,   262,  2558,   386,   899,  1687,    26,  1574,
       30470,   899,  3310, 30468, 22130, 30360,  6123,  6368, 30468,
       22130, 30360,  2652,  1746, 30468,  8869,  6540,    34,  6315,
        1622,  1256,  8949,   899, 30468,  4222,  1622,   752,   245,
         295,  2083, 30470,  2346,  7107,   300, 30470,   405,   724,
        5189, 30470,   843, 17464,   899,   540, 10989,  3331,  1107,
       30468,   119,  3221,    79,    34,  2170,    98,  9167, 30457,
           3]), array(0), 'warung ini dimiliki oleh pengusaha pabrik tahu yang sudah puluhan tahun terkenal membuat tahu putih di bandung . tahu berkualitas , dipadu keahlian memasak , dipadu kretivitas , jadilah warung yang menyajikan menu utama berbahan tahu , ditambah menu umum lain seperti ayam . semuanya selera indonesia . harga cukup terjangkau . jangan lewatkan tahu bletoka nya , tidak kalah dengan yang asli dari tegal !')


In [None]:
# Label <-> index lookup tables, taken from the dataset class

w2i = DocumentSentimentDataset.LABEL2INDEX  # label string -> class index
i2w = DocumentSentimentDataset.INDEX2LABEL  # class index -> label string
print(w2i)
print(i2w)

{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


MENGUJI MODEL DENGAN CONTOH KALIMAT

In [None]:
# Sanity-check the (not yet fine-tuned) model on one hand-written sentence.
text = 'Senang mempelajari machine learning karena menambah wawasan baru'
subwords = tokenizer.encode(text)
# shape the ids as a batch of one and put them on the model's device
subwords = torch.LongTensor(subwords).view(1,-1).to(model.device)

# Inference only: make sure dropout is off and skip autograd bookkeeping.
model.eval()
with torch.no_grad():
  logits = model(subwords)[0]
# index of the highest-scoring class
label = torch.topk(logits, k=1, dim = -1)[1].squeeze().item()

print(f'Text:{text} | Label: {i2w[label]} ({F.softmax(logits, dim =-1).squeeze()[label]* 100:.3f}%)')

Text:Senang mempelajari machine learning karena menambah wawasan baru | Label: positive (36.283%)


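Pada tahap ini prediksi model belum dapat diandalkan: lapisan classifier masih diinisialisasi secara acak, sehingga tingkat keyakinannya (sekitar 36%) mendekati tebakan acak untuk tiga kelas. Oleh karena itu, akan dilakukan proses fine-tuning dan evaluasi.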

FINE TUNING DAN EVALUASI


In [None]:
# Move the model to the GPU first and only then build the optimizer over its
# parameters, so the optimizer is guaranteed to reference the on-device tensors.
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr = 3e-6)

In [None]:
#TRAIN
# Fine-tune for n_epochs; every epoch is one pass over train_loader followed
# by an evaluation pass over valid_loader.

n_epochs = 5

# Autograd must be on for training, even if an earlier cell switched it off globally.
torch.set_grad_enabled(True)

for epoch in range(n_epochs):
  # ---- training pass ----
  model.train()

  total_train_loss = 0
  list_hyp, list_label = [], []

  train_pbar = tqdm(train_loader, leave = True, total = len(train_loader))
  for i, batch_data in enumerate(train_pbar):
    # forward pass; the helper hands back the loss plus two per-batch lists
    # (presumably predicted and gold labels -- see indonlu.utils.forward_fn)
    loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:3], i2w = i2w, device = 'cuda')

    # update model
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    tr_loss = loss.item()
    total_train_loss = total_train_loss + tr_loss

    # collect outputs for the epoch-level metrics
    list_hyp += batch_hyp
    list_label += batch_label

    # running mean of the loss over the batches seen so far
    train_pbar.set_description("(Epoch{}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),total_train_loss/(i+1), get_lr(optimizer)))

  # epoch-level training metrics
  metrics = document_sentiment_metrics_fn(list_hyp, list_label)
  print("(Epoch {}) TRAIN LOSS:{:.4f} {} lr: {:.8F}".format((epoch+1), total_train_loss/len(train_loader), metrics_to_string(metrics), get_lr(optimizer)))

  # ---- validation pass ----
  model.eval()

  total_loss = 0
  list_hyp, list_label = [], []

  pbar = tqdm(valid_loader, leave = True, total = len(valid_loader))
  # no_grad is scoped to the validation loop, so the next epoch trains with
  # autograd enabled again without touching global state
  with torch.no_grad():
    for i, batch_data in enumerate(pbar):
      loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:3], i2w = i2w, device = 'cuda')

      # accumulate the validation loss
      valid_loss = loss.item()
      total_loss = total_loss + valid_loss

      # metrics over everything evaluated so far, shown live in the progress bar
      list_hyp += batch_hyp
      list_label += batch_label
      metrics = document_sentiment_metrics_fn(list_hyp, list_label)

      pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))

  metrics = document_sentiment_metrics_fn(list_hyp, list_label)
  # a space now separates the loss from the metrics (was printed as "0.1743ACC:0.93")
  print("(Epoch{}) VALID LOSS: {:.4f} {}".format((epoch+1), total_loss/len(valid_loader), metrics_to_string(metrics)))

(Epoch1) TRAIN LOSS:0.1616 LR:0.00000300: 100%|██████████| 344/344 [02:44<00:00,  2.10it/s]


(Epoch 1) TRAIN LOSS:0.1616 ACC:0.94 F1:0.93 REC:0.92 PRE:0.93 lr: 0.00000300


VALID LOSS:0.1743 ACC:0.93 F1:0.90 REC:0.90 PRE:0.92: 100%|██████████| 40/40 [00:07<00:00,  5.13it/s]


(Epoch1) VALID LOSS: 0.1743ACC:0.93 F1:0.90 REC:0.90 PRE:0.92


(Epoch2) TRAIN LOSS:0.1184 LR:0.00000300: 100%|██████████| 344/344 [02:42<00:00,  2.12it/s]


(Epoch 2) TRAIN LOSS:0.1184 ACC:0.96 F1:0.95 REC:0.95 PRE:0.95 lr: 0.00000300


VALID LOSS:0.1704 ACC:0.94 F1:0.91 REC:0.90 PRE:0.93: 100%|██████████| 40/40 [00:07<00:00,  5.19it/s]


(Epoch2) VALID LOSS: 0.1704ACC:0.94 F1:0.91 REC:0.90 PRE:0.93


(Epoch3) TRAIN LOSS:0.0904 LR:0.00000300: 100%|██████████| 344/344 [02:42<00:00,  2.12it/s]


(Epoch 3) TRAIN LOSS:0.0904 ACC:0.97 F1:0.96 REC:0.96 PRE:0.97 lr: 0.00000300


VALID LOSS:0.1764 ACC:0.94 F1:0.92 REC:0.90 PRE:0.93: 100%|██████████| 40/40 [00:09<00:00,  4.26it/s]


(Epoch3) VALID LOSS: 0.1764ACC:0.94 F1:0.92 REC:0.90 PRE:0.93


(Epoch4) TRAIN LOSS:0.0679 LR:0.00000300: 100%|██████████| 344/344 [02:42<00:00,  2.12it/s]


(Epoch 4) TRAIN LOSS:0.0679 ACC:0.98 F1:0.97 REC:0.97 PRE:0.97 lr: 0.00000300


VALID LOSS:0.1807 ACC:0.94 F1:0.92 REC:0.91 PRE:0.92: 100%|██████████| 40/40 [00:07<00:00,  5.11it/s]


(Epoch4) VALID LOSS: 0.1807ACC:0.94 F1:0.92 REC:0.91 PRE:0.92


(Epoch5) TRAIN LOSS:0.0485 LR:0.00000300: 100%|██████████| 344/344 [02:42<00:00,  2.12it/s]


(Epoch 5) TRAIN LOSS:0.0485 ACC:0.99 F1:0.98 REC:0.98 PRE:0.99 lr: 0.00000300


VALID LOSS:0.2098 ACC:0.93 F1:0.91 REC:0.91 PRE:0.92: 100%|██████████| 40/40 [00:08<00:00,  4.92it/s]

(Epoch5) VALID LOSS: 0.2098ACC:0.93 F1:0.91 REC:0.91 PRE:0.92





EVALUASI

In [None]:
#EVALUATE ON TEST
import torch
import pandas as pd
from tqdm import tqdm
from torch.utils.data import DataLoader

# Predict a label for every sample of the test split and store the result.
model.eval()

list_hyp = []

pbar = tqdm(test_loader, leave = True, total = len(test_loader))
# inference only: disable autograd for this loop without changing the global setting
with torch.no_grad():
  for batch_data in pbar:
    # only the middle element of the helper's result is used here; it is
    # collected as the predicted labels
    _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:3], i2w = i2w, device = 'cuda')
    list_hyp += batch_hyp

# save predictions: one row per test sample, with its position as 'index'

df = pd.DataFrame({'label': list_hyp}).reset_index()
df.to_csv('pred.txt', index = False)

print(df)




100%|██████████| 16/16 [00:02<00:00,  5.79it/s]

     index     label
0        0  negative
1        1  negative
2        2  negative
3        3  negative
4        4  negative
..     ...       ...
495    495   neutral
496    496   neutral
497    497   neutral
498    498  positive
499    499  positive

[500 rows x 2 columns]





In [None]:
# Predict the sentiment of a new sentence with the fine-tuned model

text = 'Merasa kagum dengan toko ini tapi berubah menjadi kecewa setelah transaksi'
subwords = tokenizer.encode(text)
# shape the ids as a batch of one and put them on the model's device
subwords = torch.LongTensor(subwords).view(1,-1).to(model.device)

# Self-contained inference: do not rely on an earlier cell having disabled
# autograd or switched the model to eval mode.
model.eval()
with torch.no_grad():
  logits = model(subwords)[0]
# index of the highest-scoring class
label = torch.topk(logits, k=1, dim = -1)[1].squeeze().item()

print(f'Text:{text} | Label: {i2w[label]} ({F.softmax(logits, dim =-1).squeeze()[label]* 100:.3f}%)')

Text:Merasa kagum dengan toko ini tapi berubah menjadi kecewa setelah transaksi | Label: negative (99.836%)
