In [1]:
!nvidia-smi

Mon May 16 10:47:22 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P0    44W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install --upgrade -q matplotlib
!pip install -q transformers

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

#transformers
from transformers import BertTokenizer
from transformers import BertModel,BertForSequenceClassification

import torch
from torch.utils.data import Dataset, DataLoader

#metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
seed=42
sns.set_style("whitegrid")
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)

In [5]:
df = pd.read_csv('/content/drive/MyDrive/NLP/Setiment_Analysis/en/dataset/covid-19/clean_train.csv', header=0)
df.head()

Unnamed: 0,text_clean,Sentiment
0,it s a confusing odd time for the shopping pub...,Negative
1,in 2019 d2c ecommerce sales reached 1428 billi...,Positive
2,chinese residents are paying exorbitant prices...,Negative
3,list of supermarkets grocery shops and vegetab...,Neutral
4,its there any wonder tesco and other supermark...,Positive


In [6]:
df_test = pd.read_csv('/content/drive/MyDrive/NLP/Setiment_Analysis/en/dataset/covid-19/clean_test.csv', header=0)
df_test.head()

Unnamed: 0,text_clean,Sentiment
0,we may not have any toilet paper in our house ...,Positive
1,really whats the downside of coronavirus for a...,Positive
2,hello everyone we made amp sell high quality m...,Extremely Positive
3,happy to report that i jumped on the panic sho...,Positive
4,just been to the supermarket why do all women ...,Neutral


# Sentiment column ananlysis

In [7]:
df['Sentiment'].value_counts()

Positive              11381
Negative               9889
Neutral                7560
Extremely Positive     6618
Extremely Negative     5475
Name: Sentiment, dtype: int64

In [8]:
df['Sentiment'] = df['Sentiment'].map({'Extremely Negative': 0, 'Negative': 0, 'Neutral': 1,
                                       'Positive': 2, 'Extremely Positive': 2})

In [9]:
df_test['Sentiment'] = df_test['Sentiment'].map({'Extremely Negative': 0, 'Negative': 0, 'Neutral': 1,
                                                 'Positive': 2, 'Extremely Positive': 2})

In [10]:
df['Sentiment'].value_counts()

2    17999
0    15364
1     7560
Name: Sentiment, dtype: int64

## Class Balanceing by RandomOverSampler

In [11]:
ros = RandomOverSampler()
train_x, train_y = ros.fit_resample(np.array(df['text_clean']).reshape(-1, 1), np.array(df['Sentiment']).reshape(-1, 1))
train_os = pd.DataFrame(list(zip([x[0] for x in train_x], train_y)), columns=['text_clean', 'Sentiment'])

In [12]:
train_os['Sentiment'].value_counts()

0    17999
2    17999
1    17999
Name: Sentiment, dtype: int64

## Train - Validation - Test split

In [13]:
X = train_os['text_clean'].values
y = train_os['Sentiment'].values

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, stratify=y, random_state=seed)

In [15]:
X_test = df_test['text_clean'].values
y_test = df_test['Sentiment'].values

## One hot encoding

In [16]:
print(f"TRAINING DATA: {X_train.shape[0]}\nVALIDATION DATA: {X_val.shape[0]}\nTESTING DATA: {X_test.shape[0]}" )

TRAINING DATA: 48597
VALIDATION DATA: 5400
TESTING DATA: 3787


# BERT Sentiment analysis

In [17]:
MAX_LEN=128

In [18]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [19]:
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, X, y, max_len):
        self.tokenizer = tokenizer
        self.inputs = [self.tokenizer(text, padding='max_length', max_length = max_len,
                                      truncation=True, return_tensors='pt') for text in X]
        self.labels = y
        self.max_len = max_len
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        input = self.inputs[idx]
        label = self.labels[idx]
        return input, label

In [20]:
train_dataset = TweetDataset(tokenizer, X_train, y_train, MAX_LEN)
val_dataset = TweetDataset(tokenizer, X_val, y_val, MAX_LEN)
test_dataset = TweetDataset(tokenizer, X_test, y_test, MAX_LEN)

In [21]:
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=128, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=128, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=128, shuffle=True)

# BERT Modeling

In [22]:
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
bert_model

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [23]:
from torch.optim import Adam
from tqdm.notebook import tqdm

def train(model, train_dataloader, val_dataloader, learning_rate, epochs, device='cuda'):
    optimizer = Adam(model.parameters(), lr=learning_rate)

    model = model.to(device)
    
    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            input_ids = train_input['input_ids'].squeeze().to(device)
            attention_mask = train_input['attention_mask'].squeeze().to(device)

            output = model(input_ids, attention_mask, labels=train_label)

            loss = output.loss
            logits = output.logits
            total_loss_train += loss.item()

            acc = (logits.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            loss.backward()
            optimizer.step()
        
        total_acc_val = 0
        total_loss_val = 0


        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                input_ids = val_input['input_ids'].squeeze().to(device)
                attention_mask = val_input['attention_mask'].squeeze().to(device)

                output = model(input_ids, attention_mask, labels=val_label)

                loss = output.loss
                logits = output.logits
                total_loss_val += loss.item()

                acc = (logits.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / (128*len(train_dataloader)): .3f} \
            | Train Accuracy: {total_acc_train / (128*len(train_dataloader)): .3f} \
            | Val Loss: {total_loss_val / (128*len(val_dataloader)): .3f} \
            | Val Accuracy: {total_acc_val / (128*len(val_dataloader)): .3f}')


EPOCHS = 19
LR = 1e-6
              
train(bert_model, train_dataloader, val_dataloader, LR, EPOCHS)

  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 1 | Train Loss:  0.008             | Train Accuracy:  0.515             | Val Loss:  0.007             | Val Accuracy:  0.592


  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 2 | Train Loss:  0.006             | Train Accuracy:  0.657             | Val Loss:  0.006             | Val Accuracy:  0.678


  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 3 | Train Loss:  0.005             | Train Accuracy:  0.729             | Val Loss:  0.005             | Val Accuracy:  0.725


  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 4 | Train Loss:  0.005             | Train Accuracy:  0.775             | Val Loss:  0.005             | Val Accuracy:  0.760


  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 5 | Train Loss:  0.004             | Train Accuracy:  0.807             | Val Loss:  0.004             | Val Accuracy:  0.780


  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 6 | Train Loss:  0.004             | Train Accuracy:  0.830             | Val Loss:  0.004             | Val Accuracy:  0.793


  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 7 | Train Loss:  0.003             | Train Accuracy:  0.849             | Val Loss:  0.004             | Val Accuracy:  0.805


  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 8 | Train Loss:  0.003             | Train Accuracy:  0.864             | Val Loss:  0.004             | Val Accuracy:  0.815


  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 9 | Train Loss:  0.003             | Train Accuracy:  0.878             | Val Loss:  0.003             | Val Accuracy:  0.821


  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 10 | Train Loss:  0.003             | Train Accuracy:  0.892             | Val Loss:  0.003             | Val Accuracy:  0.830


  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 11 | Train Loss:  0.002             | Train Accuracy:  0.906             | Val Loss:  0.003             | Val Accuracy:  0.840


  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 12 | Train Loss:  0.002             | Train Accuracy:  0.918             | Val Loss:  0.003             | Val Accuracy:  0.848


  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 13 | Train Loss:  0.002             | Train Accuracy:  0.929             | Val Loss:  0.003             | Val Accuracy:  0.851


  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 14 | Train Loss:  0.002             | Train Accuracy:  0.940             | Val Loss:  0.003             | Val Accuracy:  0.858


  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 15 | Train Loss:  0.001             | Train Accuracy:  0.950             | Val Loss:  0.003             | Val Accuracy:  0.861


  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 16 | Train Loss:  0.001             | Train Accuracy:  0.958             | Val Loss:  0.003             | Val Accuracy:  0.864


  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 17 | Train Loss:  0.001             | Train Accuracy:  0.965             | Val Loss:  0.003             | Val Accuracy:  0.870


  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 18 | Train Loss:  0.001             | Train Accuracy:  0.971             | Val Loss:  0.003             | Val Accuracy:  0.875


  0%|          | 0/380 [00:00<?, ?it/s]

Epochs: 19 | Train Loss:  0.001             | Train Accuracy:  0.976             | Val Loss:  0.003             | Val Accuracy:  0.877


# BERT result

In [24]:
def evaluate(model, test_dataloader):
    device = 'cuda'
    model = model.to(device)
    total_acc_test = 0

    with torch.no_grad():
        for test_input, test_label in test_dataloader:
            test_label = test_label.to(device)
            input_ids = test_input['input_ids'].squeeze().to(device)
            attention_mask = test_input['attention_mask'].squeeze().to(device)

            output = model(input_ids, attention_mask, labels=test_label)
            logits = output.logits

            acc = (logits.argmax(dim=1) == test_label).sum().item()
            print(acc)
            total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test/ (len(test_dataloader)*128): .3f}')

evaluate(bert_model, test_dataloader)

106
111
102
102
104
110
107
115
96
113
107
102
100
107
99
105
97
100
108
111
106
108
102
108
111
106
107
105
95
57
30
Test Accuracy:  0.809
