# Sentiment Analysis with Deep Learning

### Import

In [2]:
pip install torch torchvision



In [3]:
pip install transformers



In [4]:
# Clone the IndoNLU GitHub repository so its dataset is stored in the Google Colab session storage

!git clone https://github.com/indobenchmark/indonlu

Cloning into 'indonlu'...
remote: Enumerating objects: 500, done.[K
remote: Counting objects: 100% (184/184), done.[K
remote: Compressing objects: 100% (74/74), done.[K
remote: Total 500 (delta 115), reused 139 (delta 110), pack-reused 316 (from 1)[K
Receiving objects: 100% (500/500), 9.45 MiB | 5.15 MiB/s, done.
Resolving deltas: 100% (235/235), done.


In [5]:
import random
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer
from nltk.tokenize import TweetTokenizer

from indonlu.utils.forward_fn import forward_sequence_classification
from indonlu.utils.metrics import document_sentiment_metrics_fn
from indonlu.utils.data_utils import DocumentSentimentDataset, DocumentSentimentDataLoader

__Common Functions__

In [6]:
# Seed every random number generator used in this notebook

def set_seed(seed):
    """Seed Python's `random`, NumPy, and PyTorch (CPU and CUDA) generators with `seed`."""
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for apply_seed in seeders:
        apply_seed(seed)

In [7]:
# Count the number of parameters in a model

def count_param(module, trainable=False):
    """Return the total element count over `module.parameters()`.

    When `trainable` is truthy, only parameters whose `requires_grad`
    is set are counted; otherwise every parameter is counted.
    """
    return sum(
        param.numel()
        for param in module.parameters()
        if not trainable or param.requires_grad
    )

In [8]:
# Read the current learning rate from an optimizer

def get_lr(optimizer):
    """Return the `'lr'` entry of the optimizer's first parameter group (None if there are no groups)."""
    return next((group['lr'] for group in optimizer.param_groups), None)

In [9]:
# Render a metrics dictionary as a single string

def metrics_to_string(metric_dict):
    """Format every item as `key:value` (value with two decimals) and join the pieces with spaces."""
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

In [10]:
# Seed the random number generators once, using a single named constant

SEED = 1012005
set_seed(SEED)

### Load and Config Pre-Trained Model

Pada tahap ini, digunakan pre-trained model Indobert-base-p1 yang memiliki 124.5 juta parameter.

Model IndoBERT dibangun berdasarkan arsitektur general-purpose BERT (Bidirectional Encoder Representations from Transformers). BERT didesain untuk membantu komputer memahami makna bahasa yang ambigu dalam teks, yaitu dengan menggunakan teks di sekitarnya untuk membangun konteks.

In [11]:
# Checkpoint shared by the tokenizer, the config, and the model weights
PRETRAINED_MODEL_NAME = 'indobenchmark/indobert-base-p1'

# Load tokenizer and config
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
config = BertConfig.from_pretrained(PRETRAINED_MODEL_NAME)
# Size the classification head to the number of labels declared by the dataset class
config.num_labels = DocumentSentimentDataset.NUM_LABELS

# Instantiate the sequence-classification model from the same checkpoint
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, config=config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/229k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.53k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Display the model's module hierarchy
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [13]:
# Count all parameters of the model (trainable=False by default, so nothing is filtered out)
count_param(model)

124443651

### Data Preparation

In [14]:
# All three splits live in the same directory of the cloned indonlu repository
DATASET_DIR = 'indonlu/dataset/smsa_doc-sentiment-prosa'

train_dataset_path = f'{DATASET_DIR}/train_preprocess.tsv'
valid_dataset_path = f'{DATASET_DIR}/valid_preprocess.tsv'
test_dataset_path = f'{DATASET_DIR}/test_preprocess_masked_label.tsv'

Lokasi dataset telah ditentukan, selanjutnya siapkan data dengan PyTorch. PyTorch menyediakan cara terstandarisasi untuk menyiapkan data sebelum melakukan pemodelan, beserta banyak fitur canggih untuk memproses data.

Di sini, digunakan 2 kelas yang disediakan di PyTorch dalam modul `torch.utils.data` yaitu `Dataset` dan `DataLoader`. Kelas Dataset adalah sebuah abstract class yang perlu dilakukan extend di PyTorch. Sedangkan, DataLoader adalah inti dari perangkat pemrosesan data di PyTorch. DataLoader menyediakan banyak fungsionalitas untuk mempersiapkan data termasuk berbagai metode sampling, komputasi paralel, dan pemrosesan terdistribusi. Pindahkan objek dari kelas Dataset ke dalam objek dari kelas DataLoader untuk pemrosesan batch data lebih lanjut.

Untuk menunjukkan bagaimana cara mengimplementasikan Dataset dan DataLoader di PyTorch, dapat dilihat lebih dalam pada kelas `DocumentSentimentDataset` dan `DocumentSentimentDataLoader` yang disediakan oleh IndoNLU.

Selanjutnya, implementasikan kelas `DocumentSentimentDataset` untuk data loading. Untuk membuat kelas `DocumentSentimentDataset` yang fungsional, implementasikan 3 fungsi berikut: `__init__(self, ...)`, `__getitem__(self, index)`, dan `__len__(self)`.

In [15]:
# Build the dataset objects and their batch loaders

# Every split uses the same tokenizer and lowercasing
train_dataset, valid_dataset, test_dataset = (
    DocumentSentimentDataset(split_path, tokenizer, lowercase=True)
    for split_path in (train_dataset_path, valid_dataset_path, test_dataset_path)
)

# Loader settings shared by all splits; only the training loader shuffles
# NOTE(review): 16 workers may exceed the CPU count of the runtime; confirm
shared_loader_args = dict(max_seq_len=512, batch_size=32, num_workers=16)
train_loader = DocumentSentimentDataLoader(dataset=train_dataset, shuffle=True, **shared_loader_args)
valid_loader = DocumentSentimentDataLoader(dataset=valid_dataset, shuffle=False, **shared_loader_args)
test_loader = DocumentSentimentDataLoader(dataset=test_dataset, shuffle=False, **shared_loader_args)

# Peek at the first training example
print(train_dataset[0])

(array([    2,  6540,    92,  2970,   213,  4259,  3553,   899,    34,
         259,  5590,   262,  2558,   386,   899,  1687,    26,  1574,
       30470,   899,  3310, 30468, 22130, 30360,  6123,  6368, 30468,
       22130, 30360,  2652,  1746, 30468,  8869,  6540,    34,  6315,
        1622,  1256,  8949,   899, 30468,  4222,  1622,   752,   245,
         295,  2083, 30470,  2346,  7107,   300, 30470,   405,   724,
        5189, 30470,   843, 17464,   899,   540, 10989,  3331,  1107,
       30468,   119,  3221,    79,    34,  2170,    98,  9167, 30457,
           3]), array(0), 'warung ini dimiliki oleh pengusaha pabrik tahu yang sudah puluhan tahun terkenal membuat tahu putih di bandung . tahu berkualitas , dipadu keahlian memasak , dipadu kretivitas , jadilah warung yang menyajikan menu utama berbahan tahu , ditambah menu umum lain seperti ayam . semuanya selera indonesia . harga cukup terjangkau . jangan lewatkan tahu bletoka nya , tidak kalah dengan yang asli dari tegal !')




In [16]:
# Lookup tables between label names and label indices

w2i = DocumentSentimentDataset.LABEL2INDEX
i2w = DocumentSentimentDataset.INDEX2LABEL
for mapping in (w2i, i2w):
    print(mapping)

{'positive': 0, 'neutral': 1, 'negative': 2}
{0: 'positive', 1: 'neutral', 2: 'negative'}


### Pre-Testing Model

In [17]:
# Predict one sentence with the model before any fine-tuning
text = 'Raut bahagia terpancar dari wajahku ketika melihat dirimu'

# Encode the sentence and shape the ids as a batch of one on the model's device
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Inference only: evaluation mode, and no autograd graph is built
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
    probs = F.softmax(logits, dim=-1).squeeze()
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({probs[label] * 100:.3f}%)')

Text: Raut bahagia terpancar dari wajahku ketika melihat dirimu | Label : positive (40.081%)


### Fine Tuning and Evaluation

In [18]:
# Move the model to the GPU first, then build the optimizer over its parameters
# (PyTorch recommends moving a model with .cuda() before constructing its optimizer)
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=3e-6)

In [19]:
# Fine-tune the model, evaluating on the validation split after every epoch
n_epochs = 5
for epoch in range(n_epochs):
    # ---- Training pass: train mode, gradients enabled ----
    model.train()
    torch.set_grad_enabled(True)

    total_train_loss = 0
    list_hyp, list_label = [], []

    train_pbar = tqdm(train_loader, leave=True, total=len(train_loader))
    for i, batch_data in enumerate(train_pbar):
        # Forward pass; the last element of the batch is not passed to the forward function
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        # Accumulate predictions and labels for the epoch-level metrics
        list_hyp += batch_hyp
        list_label += batch_label

        # Progress bar shows the mean loss over the batches seen so far
        train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), get_lr(optimizer)))

    # Epoch-level training metrics; average over the loader length rather than the leaked loop index
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
        total_train_loss/len(train_loader), metrics_to_string(metrics), get_lr(optimizer)))

    # ---- Validation pass: eval mode, gradients disabled ----
    model.eval()
    torch.set_grad_enabled(False)

    total_loss = 0
    list_hyp, list_label = [], []

    pbar = tqdm(valid_loader, leave=True, total=len(valid_loader))
    for i, batch_data in enumerate(pbar):
        loss, batch_hyp, batch_label = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')

        total_loss += loss.item()

        # Running metrics over everything seen so far, shown on the progress bar
        list_hyp += batch_hyp
        list_label += batch_label
        metrics = document_sentiment_metrics_fn(list_hyp, list_label)

        pbar.set_description("VALID LOSS:{:.4f} {}".format(total_loss/(i+1), metrics_to_string(metrics)))

    # Epoch-level validation metrics
    metrics = document_sentiment_metrics_fn(list_hyp, list_label)
    print("(Epoch {}) VALID LOSS:{:.4f} {}".format((epoch+1),
        total_loss/len(valid_loader), metrics_to_string(metrics)))

# Note: gradient tracking is left disabled (set_grad_enabled(False)) when this loop finishes

(Epoch 1) TRAIN LOSS:0.3188 LR:0.00000300: 100%|██████████| 344/344 [02:33<00:00,  2.24it/s]


(Epoch 1) TRAIN LOSS:0.3188 ACC:0.89 F1:0.84 REC:0.82 PRE:0.88 LR:0.00000300


VALID LOSS:0.1901 ACC:0.93 F1:0.90 REC:0.90 PRE:0.90: 100%|██████████| 40/40 [00:07<00:00,  5.44it/s]


(Epoch 1) VALID LOSS:0.1901 ACC:0.93 F1:0.90 REC:0.90 PRE:0.90


(Epoch 2) TRAIN LOSS:0.1576 LR:0.00000300: 100%|██████████| 344/344 [02:33<00:00,  2.25it/s]


(Epoch 2) TRAIN LOSS:0.1576 ACC:0.95 F1:0.93 REC:0.92 PRE:0.93 LR:0.00000300


VALID LOSS:0.1735 ACC:0.93 F1:0.91 REC:0.90 PRE:0.91: 100%|██████████| 40/40 [00:08<00:00,  4.73it/s]


(Epoch 2) VALID LOSS:0.1735 ACC:0.93 F1:0.91 REC:0.90 PRE:0.91


(Epoch 3) TRAIN LOSS:0.1172 LR:0.00000300: 100%|██████████| 344/344 [02:35<00:00,  2.21it/s]


(Epoch 3) TRAIN LOSS:0.1172 ACC:0.96 F1:0.95 REC:0.95 PRE:0.96 LR:0.00000300


VALID LOSS:0.1743 ACC:0.94 F1:0.91 REC:0.89 PRE:0.93: 100%|██████████| 40/40 [00:09<00:00,  4.11it/s]


(Epoch 3) VALID LOSS:0.1743 ACC:0.94 F1:0.91 REC:0.89 PRE:0.93


(Epoch 4) TRAIN LOSS:0.0933 LR:0.00000300: 100%|██████████| 344/344 [02:36<00:00,  2.20it/s]


(Epoch 4) TRAIN LOSS:0.0933 ACC:0.97 F1:0.96 REC:0.96 PRE:0.97 LR:0.00000300


VALID LOSS:0.1761 ACC:0.94 F1:0.92 REC:0.91 PRE:0.93: 100%|██████████| 40/40 [00:07<00:00,  5.54it/s]


(Epoch 4) VALID LOSS:0.1761 ACC:0.94 F1:0.92 REC:0.91 PRE:0.93


(Epoch 5) TRAIN LOSS:0.0685 LR:0.00000300: 100%|██████████| 344/344 [02:32<00:00,  2.26it/s]


(Epoch 5) TRAIN LOSS:0.0685 ACC:0.98 F1:0.97 REC:0.97 PRE:0.98 LR:0.00000300


VALID LOSS:0.2175 ACC:0.93 F1:0.91 REC:0.90 PRE:0.92: 100%|██████████| 40/40 [00:07<00:00,  5.55it/s]

(Epoch 5) VALID LOSS:0.2175 ACC:0.93 F1:0.91 REC:0.90 PRE:0.92





In [20]:
# Predict labels for the test split and save them to disk.
# The test file name says its labels are masked, so only predictions are collected here.
model.eval()
torch.set_grad_enabled(False)

list_hyp = []

pbar = tqdm(test_loader, leave=True, total=len(test_loader))
for batch_data in pbar:
    # Only the predicted labels are kept; the loss and label outputs are discarded
    _, batch_hyp, _ = forward_sequence_classification(model, batch_data[:-1], i2w=i2w, device='cuda')
    list_hyp += batch_hyp

# Save prediction: one row per test example, with its position as the `index` column
df = pd.DataFrame({'label':list_hyp}).reset_index()
df.to_csv('pred.txt', index=False)

print(df)

100%|██████████| 16/16 [00:02<00:00,  6.67it/s]


     index     label
0        0  negative
1        1  negative
2        2  negative
3        3  negative
4        4  negative
..     ...       ...
495    495   neutral
496    496   neutral
497    497   neutral
498    498  positive
499    499  positive

[500 rows x 2 columns]


100%|██████████| 16/16 [00:03<00:00,  5.19it/s]

     index     label
0        0  negative
1        1  negative
2        2  negative
3        3  negative
4        4  negative
..     ...       ...
495    495   neutral
496    496   neutral
497    497   neutral
498    498  positive
499    499  positive

[500 rows x 2 columns]





### Predict Sentiment

In [21]:
# Re-check the prediction for the same sentence after training the model

text = 'Raut bahagia terpancar dari wajahku ketika melihat dirimu'
subwords = tokenizer.encode(text)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

# Set eval mode and disable gradients here so this cell does not depend on
# the state left behind by the test-evaluation cell
model.eval()
with torch.no_grad():
    logits = model(subwords)[0]
    probs = F.softmax(logits, dim=-1).squeeze()
label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

print(f'Text: {text} | Label : {i2w[label]} ({probs[label] * 100:.3f}%)')

Text: Raut bahagia terpancar dari wajahku ketika melihat dirimu | Label : positive (90.849%)


Selanjutnya, model diuji dengan tiga kalimat berikut:

1. “El Gasing pergi ke Mall Grand Indonesia membeli cilok”
2. “Sayang, aku sedih”
3. “Merasa kagum dengan toko ini tapi berubah menjadi kecewa setelah transaksi”

Kalimat nomor dua sengaja dibuat seperti itu dengan maksud untuk mengecoh mesin. Kata “sayang” biasanya berkonotasi positif. Tapi dalam kalimat tersebut diikuti oleh kata “sedih”.

Demikian juga dengan kalimat nomor 3, “kagum” dan “kecewa” adalah dua kata yang masing-masing berkonotasi positif dan negatif. Jika dilihat dari konteks kalimatnya, jelas kedua kalimat ini bersentimen negatif. Manusia bisa langsung mengenalinya sebagai sentimen negatif.

In [23]:
# Sentences chosen to probe the fine-tuned model (neutral statement, mixed-polarity wording)
sentences = ["El Gasing pergi ke Mall Grand Indonesia membeli cilok",
             "Sayang, aku sedih",
             "Merasa kagum dengan toko ini tapi berubah menjadi kecewa setelah transaksi"]

# Set eval mode and disable gradients here so this cell does not depend on
# state left behind by earlier cells
model.eval()
with torch.no_grad():
    for text in sentences:
        # Encode the sentence as a batch of one on the model's device
        subwords = tokenizer.encode(text)
        subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)

        logits = model(subwords)[0]
        probs = F.softmax(logits, dim=-1).squeeze()
        label = torch.topk(logits, k=1, dim=-1)[1].squeeze().item()

        print(f'Text: {text} | Label : {i2w[label]} ({probs[label] * 100:.3f}%)')

Text: El Gasing pergi ke Mall Grand Indonesia membeli cilok | Label : neutral (99.599%)
Text: Sayang, aku sedih | Label : negative (99.715%)
Text: Merasa kagum dengan toko ini tapi berubah menjadi kecewa setelah transaksi | Label : negative (99.751%)
