# ANALISIS SENTIMEN POSITIF NEGATIF PADA BAHASA INDONESIA

## Persiapan

In [2]:
pip install torch torchvision



In [3]:
pip install transformers

Collecting transformers
  Downloading transformers-4.11.2-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 2.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 48.2 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 19.9 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 51.9 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.18-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.5 MB/s 
Collecting ruamel.yaml==0.17.16
  Downloading ruamel.yaml-0.17.16-py3-none-any.whl (109 kB)
[K     |████████████████████████████████| 109 kB 49.1 MB/s 
[?25hCollec

In [4]:
!git clone https://github.com/indobenchmark/indonlu

Cloning into 'indonlu'...
remote: Enumerating objects: 427, done.[K
remote: Counting objects: 100% (33/33), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 427 (delta 14), reused 30 (delta 14), pack-reused 394[K
Receiving objects: 100% (427/427), 4.83 MiB | 10.91 MiB/s, done.
Resolving deltas: 100% (209/209), done.


## Import Library and Set Up

In [5]:
# Import library
import random
import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F
from torch import optim

from tqdm import tqdm
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

from indonlu.utils.forward_fn import forward_sequence_classification
from indonlu.utils.metrics import document_sentiment_metrics_fn
from indonlu.utils.data_utils import DocumentSentimentDataLoader, DocumentSentimentDataset

In [6]:
# Set up 

# Fungsi untuk mengatur dan menetapkan random seed
def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)

# Fungsi untuk menghitung jumlah parameter dalam model
def count_param(module, trainable = False):
  if trainable:
    return sum(p.numel() for p in module.parameters() if p.requires_grad)
  else:
    return sum(p.numel() for p in module.parameters())

# Fungsi untuk mengatur learning rate
def get_lr(optimizer):
  for param_group in optimizer.param_groups:
    return param_group['lr']

# Fungsi untuk mengkonversi metriks ke string
def metrics_to_string(metric_dict):
  string_list = []
  for key, value in metric_dict.items():
    string_list.append('{}:{:.2f}'.format(key, value))
  return ' '.join(string_list)

In [7]:
# Set random seed
set_seed(5102021)

## Load Model and Configuration

In [8]:
# Load Tokenizer and Config
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Instantiate model
model = BertForSequenceClassification.from_pretrained('indobenchmark/indobert-base-p1', config = config)

Downloading:   0%|          | 0.00/224k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/475M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [10]:
# Menghitung jumlah parameter pada model
count_param(model)

124443651

## Load Dataset

In [11]:
train_dataset_path = '/content/indonlu/dataset/smsa_doc-sentiment-prosa/train_preprocess.tsv'
valid_dataset_path = '/content/indonlu/dataset/smsa_doc-sentiment-prosa/valid_preprocess.tsv'
test_dataset_path = '/content/indonlu/dataset/smsa_doc-sentiment-prosa/test_preprocess_masked_label.tsv'

In [12]:
train_dataset = DocumentSentimentDataset(train_dataset_path, tokenizer, lowercase=True)
valid_dataset = DocumentSentimentDataset(valid_dataset_path, tokenizer, lowercase=True)
test_dataset = DocumentSentimentDataset(test_dataset_path, tokenizer, lowercase=True)

In [13]:
train_loader = DocumentSentimentDataLoader(dataset = train_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=True)
valid_loader = DocumentSentimentDataLoader(dataset = valid_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=True)
test_loader = DocumentSentimentDataLoader(dataset = test_dataset, max_seq_len=512, batch_size=32, num_workers=16, shuffle=True)

  cpuset_checked))


In [14]:
print(train_dataset[0])

(array([    2,  6540,    92,  2970,   213,  4259,  3553,   899,    34,
         259,  5590,   262,  2558,   386,   899,  1687,    26,  1574,
       30470,   899,  3310, 30468, 22130, 30360,  6123,  6368, 30468,
       22130, 30360,  2652,  1746, 30468,  8869,  6540,    34,  6315,
        1622,  1256,  8949,   899, 30468,  4222,  1622,   752,   245,
         295,  2083, 30470,  2346,  7107,   300, 30470,   405,   724,
        5189, 30470,   843, 17464,   899,   540, 10989,  3331,  1107,
       30468,   119,  3221,    79,    34,  2170,    98,  9167, 30457,
           3]), array(0), 'warung ini dimiliki oleh pengusaha pabrik tahu yang sudah puluhan tahun terkenal membuat tahu putih di bandung . tahu berkualitas , dipadu keahlian memasak , dipadu kretivitas , jadilah warung yang menyajikan menu utama berbahan tahu , ditambah menu umum lain seperti ayam . semuanya selera indonesia . harga cukup terjangkau . jangan lewatkan tahu bletoka nya , tidak kalah dengan yang asli dari tegal !')


In [15]:
w2i, i2w = DocumentSentimentDataset.LABEL2INDEX, DocumentSentimentDataset.INDEX2LABEL
print(w2i)
print(i2w)

{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


## Menguji Model

In [16]:
text = 'Bahagia hatiku melihat perhikahan putri sulungku yang cantik jelita'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

logits = model(subwords)[0]
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Bahagia hatiku melihat perhikahan putri sulungku yang cantik jelita | Label : negative (45.618%)


## Tuning Model

In [17]:
optimizer = optim.Adam(model.parameters(), lr=3e-6)
model = model.cuda()

In [18]:
# Train
n_epochs = 5
for epoch in range(n_epochs):
    model.train()
    torch.set_grad_enabled(True)
 
    total_train_loss = 0
    list_hyp, list_label = [], []
 
    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward model
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
 
        # Update model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
 
        tr_loss = loss.item()
        total_train_loss = total_train_loss + tr_loss
 
        # Calculate metrics
        list_hyp += batch_hyp
        list_label += batch_label
 
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))
 
    # Calculate train metric
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))
 
    # Evaluate on validation
    model.eval()
    torch.set_grad_enabled(False)
    
    total_loss, total_correct, total_labels = 0, 0, 0
    list_hyp, list_label = [], []
 
    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    for i, batch_data in enumerate(pbar):
        batch_seq = batch_data[-1]        
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
        
        # Calculate total loss
        valid_loss = loss.item()
        total_loss = total_loss + valid_loss
 
        # Calculate evaluation metrics
        list_hyp += batch_hyp
        list_label += batch_label
        metrics = document_sentiment_metrics_fn(list_hyp, list_label)
 
        pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))
        
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/(i+1), metrics_to_string(metrics)))

  cpuset_checked))
(Epoch 1) TRAIN LOSS:0.3281 LR:0.00000300: 100%|██████████| 344/344 [05:02<00:00,  1.14it/s]


(Epoch 1) TRAIN LOSS:0.3281 ACC:0.88 F1:0.83 REC:0.81 PRE:0.87 LR:0.00000300


VALID LOSS:0.1923 ACC:0.93 F1:0.90 REC:0.89 PRE:0.91: 100%|██████████| 40/40 [00:14<00:00,  2.83it/s]


(Epoch 1) VALID LOSS:0.1923 ACC:0.93 F1:0.90 REC:0.89 PRE:0.91


(Epoch 2) TRAIN LOSS:0.1555 LR:0.00000300: 100%|██████████| 344/344 [05:03<00:00,  1.13it/s]


(Epoch 2) TRAIN LOSS:0.1555 ACC:0.95 F1:0.93 REC:0.93 PRE:0.94 LR:0.00000300


VALID LOSS:0.1675 ACC:0.94 F1:0.91 REC:0.91 PRE:0.92: 100%|██████████| 40/40 [00:14<00:00,  2.78it/s]


(Epoch 2) VALID LOSS:0.1675 ACC:0.94 F1:0.91 REC:0.91 PRE:0.92


(Epoch 3) TRAIN LOSS:0.1142 LR:0.00000300: 100%|██████████| 344/344 [05:04<00:00,  1.13it/s]


(Epoch 3) TRAIN LOSS:0.1142 ACC:0.96 F1:0.95 REC:0.95 PRE:0.96 LR:0.00000300


VALID LOSS:0.1736 ACC:0.94 F1:0.91 REC:0.90 PRE:0.92: 100%|██████████| 40/40 [00:14<00:00,  2.82it/s]


(Epoch 3) VALID LOSS:0.1736 ACC:0.94 F1:0.91 REC:0.90 PRE:0.92


(Epoch 4) TRAIN LOSS:0.0854 LR:0.00000300: 100%|██████████| 344/344 [05:03<00:00,  1.13it/s]


(Epoch 4) TRAIN LOSS:0.0854 ACC:0.97 F1:0.97 REC:0.96 PRE:0.97 LR:0.00000300


VALID LOSS:0.1933 ACC:0.94 F1:0.91 REC:0.91 PRE:0.92: 100%|██████████| 40/40 [00:14<00:00,  2.79it/s]


(Epoch 4) VALID LOSS:0.1933 ACC:0.94 F1:0.91 REC:0.91 PRE:0.92


(Epoch 5) TRAIN LOSS:0.0669 LR:0.00000300: 100%|██████████| 344/344 [05:04<00:00,  1.13it/s]


(Epoch 5) TRAIN LOSS:0.0669 ACC:0.98 F1:0.97 REC:0.97 PRE:0.98 LR:0.00000300


VALID LOSS:0.1930 ACC:0.94 F1:0.91 REC:0.91 PRE:0.92: 100%|██████████| 40/40 [00:14<00:00,  2.79it/s]

(Epoch 5) VALID LOSS:0.1930 ACC:0.94 F1:0.91 REC:0.91 PRE:0.92





In [19]:
model.eval()
torch.set_grad_enabled(False)
 
total_loss, total_correct, total_labels = 0, 0, 0
list_hyp, list_label = [], []
 
pbar = tqdm(test_loader, leave=True, total=len(test_loader))
for i, batch_data in enumerate(pbar):
    _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
    list_hyp += batch_hyp
 
# Save prediction
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('pred.txt', index=False)
 
print(df)

  cpuset_checked))
100%|██████████| 16/16 [00:05<00:00,  3.10it/s]

     index     label
0        0  negative
1        1  positive
2        2  negative
3        3  negative
4        4  positive
..     ...       ...
495    495  negative
496    496  positive
497    497  positive
498    498  negative
499    499   neutral

[500 rows x 2 columns]





## Uji Hasil Tuning

In [20]:
text = ['Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita',
        'Ronaldo pergi ke Mall Grand Indonesia membeli cilok',
        'Sayang, aku marah',
        'Merasa kagum dengan toko ini tapi berubah kecewa setelah transaksi']
for t in text:
  subwords = tokenizer.encode(t)
  subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
 
  logits = model(subwords)[0]
  label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()
 
  print(f'Text: {t} | Label : {i2w[label]} ({F.softmax(logits, dim=-1).squeeze()[label] * 100:.3f}%)')

Text: Bahagia hatiku melihat pernikahan putri sulungku yang cantik jelita | Label : positive (99.520%)
Text: Ronaldo pergi ke Mall Grand Indonesia membeli cilok | Label : neutral (99.421%)
Text: Sayang, aku marah | Label : negative (99.561%)
Text: Merasa kagum dengan toko ini tapi berubah kecewa setelah transaksi | Label : negative (99.613%)
