In [None]:
!pip install torchtext==0.6

Collecting torchtext==0.6
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from torchtext==0.6)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.15.2
    Uninstalling torchtext-0.15.2:
      Successfully uninstalled torchtext-0.15.2
Successfully installed sentencepiece-0.1.99 torchtext-0.6.0


In [None]:
# Mount Google Drive: the dataset splits, evaluation.json and the saved model
# used further down are all read from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import time
import json
import torch
import torch.optim as optim

from torchtext.data.field import LabelField
from torchtext import data
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Seed PyTorch's RNG and ask cuDNN for deterministic behaviour, for reproducibility.
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Review text, tokenized with spaCy's `en_core_web_sm` pipeline.
# `include_lengths=True` is why `batch.text` is unpacked as a (tokens, lengths)
# pair in the train / evaluate functions.
TEXT = data.Field(
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
    include_lengths=True,
)

# Sentiment label, stored as a float tensor (the loss used later is BCEWithLogitsLoss).
LABEL = data.LabelField(dtype=torch.float)

In [None]:
# (attribute name, Field) pairs applied to the CSV columns
# (presumably in column order: text first, then label; verify against the files).
fields = [('text', TEXT), ('label', LABEL)]

# NOTE(review): with `skip_header=False` the first row of every file is parsed as
# a data example; confirm that the CSV files really have no header row.
train_data, valid_data, test_data = data.TabularDataset.splits(
    path='/content/drive/MyDrive/Sentiment-Analysis/Data/wo. Neutral Splits',
    train='TrainFile.csv',
    validation='DevFile.csv',
    test='long-review-test.csv',
    format='csv',
    fields=fields,
    skip_header=False,
)

In [None]:
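# Currently disabled. Running this line removes the first example from `test_data`;
# the `Example` repr stored as this cell's output is what `pop` returned in an earlier run.
# NOTE(review): decide whether this step is required and either re-enable or delete it,
# otherwise the notebook does not reproduce from a fresh kernel.
# test_data.examples.pop(0)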

<torchtext.data.example.Example at 0x7f595c93c640>

In [None]:
MAX_VOCAB_SIZE = 25_000

# Build the text vocabulary from the training split only, capped at MAX_VOCAB_SIZE
# entries and paired with 100-d GloVe vectors. `unk_init` is handed
# `torch.Tensor.normal_` (presumably used for words without a pretrained vector).
TEXT.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)

# Label vocabulary, also derived from the training split.
LABEL.build_vocab(train_data)


In [None]:
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Sorting is switched off both across and within batches; the model packs
# sequences with `enforce_sorted=False`, so it does not need sorted batches.
# NOTE(review): `test_evaluate` assigns predictions to evaluation.json entries by
# position, so it depends on the order in which `test_iterator` yields examples.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=False,
    sort=False,
    device=device,
)

In [None]:
import torch.nn as nn

class GRUModel(nn.Module):
    """Multi-layer (optionally bidirectional) GRU classifier over token indices.

    Pipeline: embedding -> dropout -> GRU on packed sequences -> dropout on the
    final hidden state of the top layer -> linear layer with `output_dim` outputs.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: GRU hidden size (per direction).
        output_dim: number of values produced by the final linear layer.
        n_layers: number of stacked GRU layers.
        bidirectional: whether the GRU runs in both directions.
        dropout: dropout probability for the embeddings, between GRU layers
            and on the final hidden state.
        pad_idx: vocabulary index of the padding token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):

        super().__init__()

        # Remembered so `forward` can pick the right final hidden state(s).
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        # nn.GRU applies dropout only between stacked layers; with a single layer a
        # non-zero value has no effect and only triggers a warning, so pass 0 there.
        self.rnn = nn.GRU(embedding_dim,
                          hidden_dim,
                          num_layers=n_layers,
                          bidirectional=bidirectional,
                          dropout=dropout if n_layers > 1 else 0)

        # The classifier receives one final hidden state per direction.
        # (Previously hard-coded to `hidden_dim * 2`, which is only right when
        # bidirectional=True; that configuration is unchanged.)
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return one row of `output_dim` raw scores (logits) per sequence.

        Args:
            text: token indices shaped [seq_len, batch] (the GRU is not batch_first).
            text_lengths: 1-D tensor with the unpadded length of each sequence.

        Returns:
            Tensor shaped [batch, output_dim].
        """
        embedded = self.dropout(self.embedding(text))

        # Lengths must live on the CPU for packing; batches need not be sorted.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, text_lengths.to('cpu'), enforce_sorted=False)

        # Only the final hidden states are used, so the per-step outputs are
        # discarded without being unpacked.
        _, hidden = self.rnn(packed_embedded)

        if self.bidirectional:
            # Last two entries: forward and backward state of the top layer.
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            # Last entry: state of the top layer.
            final_hidden = hidden[-1, :, :]

        return self.fc(self.dropout(final_hidden))

In [None]:
# Hyperparameters of the GRU classifier.
INPUT_DIM = len(TEXT.vocab)                       # vocabulary size
EMBEDDING_DIM = 100                               # matches the 100-d GloVe vectors
HIDDEN_DIM = 256
OUTPUT_DIM = 1                                    # single logit per example
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]         # index of the padding token

model = GRUModel(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    bidirectional=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [None]:
def count_parameters(model):
    """Return the number of scalar values in `model`'s parameters that require gradients."""
    trainable = (param for param in model.parameters() if param.requires_grad)
    return sum(param.numel() for param in trainable)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,233,321 trainable parameters


In [None]:
# Vectors attached to the vocabulary by `TEXT.build_vocab(..., vectors=...)`.
pretrained_embeddings = TEXT.vocab.vectors

print(pretrained_embeddings.shape)

# Overwrite the embedding table in place with the pretrained vectors
# (as the last expression, the resulting tensor is displayed as cell output).
model.embedding.weight.data.copy_(pretrained_embeddings)

torch.Size([25002, 100])


tensor([[ 1.6506,  0.0865,  1.3287,  ...,  0.5458,  0.8515,  0.1580],
        [-0.8772,  2.1614,  0.7010,  ...,  0.1617, -0.4898,  0.3106],
        [-0.0308,  0.1199,  0.5391,  ..., -0.5288,  0.1758,  1.0650],
        ...,
        [-0.9083,  0.1065, -0.5070,  ...,  0.1728, -1.0311,  1.2032],
        [ 1.8723,  0.1652, -0.3859,  ..., -0.8991, -0.3518,  0.8838],
        [-1.6545, -0.3179, -1.1832,  ..., -0.0497, -0.8147, -2.2842]])

In [None]:
# Index of the unknown-word token in the vocabulary.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Replace the embedding rows of the unknown and padding tokens with zeros.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# Sanity check: the affected rows should now print as zeros.
print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0308,  0.1199,  0.5391,  ..., -0.5288,  0.1758,  1.0650],
        ...,
        [-0.9083,  0.1065, -0.5070,  ...,  0.1728, -1.0311,  1.2032],
        [ 1.8723,  0.1652, -0.3859,  ..., -0.8991, -0.3518,  0.8838],
        [-1.6545, -0.3179, -1.1832,  ..., -0.0497, -0.8147, -2.2842]])


In [None]:
# Adam over all model parameters, with the optimizer's default hyperparameters.
optimizer = optim.Adam(model.parameters())

In [None]:
# Binary cross-entropy on raw logits: the model's output is passed to the loss
# directly, and `accuracy` applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
# Move the model and the loss to the selected device (GPU if available).
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def accuracy(preds, y):
    """Fraction of correct binary predictions in a batch.

    `preds` holds raw logits; they go through a sigmoid and are rounded to 0/1
    before being compared with the targets `y`. The result is a tensor holding
    a ratio (e.g. 0.8 for 8 correct out of 10), not a count.
    """
    predicted_labels = torch.sigmoid(preds).round()
    # Cast the boolean matches to float so they can be summed and divided.
    hits = predicted_labels.eq(y).float()
    return hits.sum() / len(hits)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over `iterator` with a progress bar.

    For every batch: reset gradients, compute logits, loss and accuracy,
    back-propagate and take an optimizer step.

    Returns:
        (mean loss, mean accuracy), each a plain average of the per-batch
        values (not weighted by batch size).
    """
    model.train()

    total_loss, total_acc = 0, 0
    for batch in tqdm(iterator):
        # `batch.text` is a (tokens, lengths) pair.
        tokens, lengths = batch.text
        targets = batch.label

        optimizer.zero_grad()

        logits = model(tokens, lengths).squeeze(1)
        # The loss needs float targets.
        loss = criterion(logits, targets.float())
        batch_acc = accuracy(logits, targets)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_acc += batch_acc.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Compute loss and accuracy of `model` over `iterator` without updating it.

    The model is put in eval mode and gradients are disabled.

    Returns:
        (mean loss, mean accuracy), each a plain average of the per-batch
        values (not weighted by batch size).
    """
    model.eval()

    loss_sum, acc_sum = 0, 0
    with torch.no_grad():
        for batch in iterator:
            # `batch.text` is a (tokens, lengths) pair.
            tokens, lengths = batch.text
            logits = model(tokens, lengths).squeeze(1)

            loss_sum += criterion(logits, batch.label).item()
            acc_sum += accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

In [None]:
def epoch_time(start_time, end_time):
    """Split the time elapsed between two timestamps (in seconds) into (minutes, seconds).

    Both parts are truncated to integers.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    seconds = int(duration - 60 * minutes)
    return minutes, seconds

In [None]:
N_EPOCHS = 10

# Where the best checkpoint (lowest validation loss so far) is written.
# NOTE(review): the test cell loads
# '/content/drive/MyDrive/Sentiment-Analysis/models/GRU-model.pt', not this file,
# so the model evaluated there is not necessarily the one trained here. Confirm
# how that checkpoint is produced, or point both cells at the same path.
CHECKPOINT_PATH = 'tut2-model.pt'

best_valid_loss = float('inf')

# `tqdm` is already imported in the imports cell at the top of the notebook.
for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights of the epoch with the best validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), CHECKPOINT_PATH)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

100%|██████████| 2431/2431 [00:41<00:00, 58.92it/s]


Epoch: 01 | Epoch Time: 0m 42s
	Train Loss: 0.083 | Train Acc: 97.04%
	 Val. Loss: 0.050 |  Val. Acc: 98.13%


100%|██████████| 2431/2431 [00:40<00:00, 60.58it/s]


Epoch: 02 | Epoch Time: 0m 41s
	Train Loss: 0.054 | Train Acc: 98.01%
	 Val. Loss: 0.055 |  Val. Acc: 98.02%


100%|██████████| 2431/2431 [00:40<00:00, 60.37it/s]


Epoch: 03 | Epoch Time: 0m 41s
	Train Loss: 0.046 | Train Acc: 98.27%
	 Val. Loss: 0.048 |  Val. Acc: 98.21%


100%|██████████| 2431/2431 [00:40<00:00, 60.66it/s]


Epoch: 04 | Epoch Time: 0m 41s
	Train Loss: 0.041 | Train Acc: 98.45%
	 Val. Loss: 0.049 |  Val. Acc: 98.27%


100%|██████████| 2431/2431 [00:40<00:00, 60.65it/s]


Epoch: 05 | Epoch Time: 0m 41s
	Train Loss: 0.038 | Train Acc: 98.58%
	 Val. Loss: 0.044 |  Val. Acc: 98.47%


100%|██████████| 2431/2431 [00:40<00:00, 60.73it/s]


Epoch: 06 | Epoch Time: 0m 41s
	Train Loss: 0.034 | Train Acc: 98.70%
	 Val. Loss: 0.046 |  Val. Acc: 98.50%


100%|██████████| 2431/2431 [00:40<00:00, 60.65it/s]


Epoch: 07 | Epoch Time: 0m 41s
	Train Loss: 0.032 | Train Acc: 98.78%
	 Val. Loss: 0.046 |  Val. Acc: 98.33%


100%|██████████| 2431/2431 [00:40<00:00, 59.93it/s]


Epoch: 08 | Epoch Time: 0m 41s
	Train Loss: 0.030 | Train Acc: 98.86%
	 Val. Loss: 0.046 |  Val. Acc: 98.44%


100%|██████████| 2431/2431 [00:40<00:00, 60.25it/s]


Epoch: 09 | Epoch Time: 0m 41s
	Train Loss: 0.028 | Train Acc: 98.95%
	 Val. Loss: 0.047 |  Val. Acc: 98.47%


100%|██████████| 2431/2431 [00:40<00:00, 60.14it/s]


Epoch: 10 | Epoch Time: 0m 42s
	Train Loss: 0.027 | Train Acc: 99.01%
	 Val. Loss: 0.049 |  Val. Acc: 98.40%


In [None]:
def precision(preds, labels):
    """Weighted-average precision of thresholded logits against `labels`.

    `preds` and `labels` are converted with `torch.tensor`; the logits go
    through a sigmoid and are rounded to 0/1 before scoring.
    """
    predicted_labels = torch.sigmoid(torch.tensor(preds)).round()
    return precision_score(y_true=torch.tensor(labels),
                           y_pred=predicted_labels,
                           average='weighted')


def recall(preds, labels):
    """Weighted-average recall of thresholded logits against `labels`.

    `preds` and `labels` are converted with `torch.tensor`; the logits go
    through a sigmoid and are rounded to 0/1 before scoring.
    """
    predicted_labels = torch.sigmoid(torch.tensor(preds)).round()
    return recall_score(y_true=torch.tensor(labels),
                        y_pred=predicted_labels,
                        average='weighted')


def f1(preds, labels):
    """Weighted-average F1 score of thresholded logits against `labels`.

    `preds` and `labels` are converted with `torch.tensor`; the logits go
    through a sigmoid and are rounded to 0/1 before scoring.
    """
    predicted_labels = torch.sigmoid(torch.tensor(preds)).round()
    return f1_score(y_true=torch.tensor(labels),
                    y_pred=predicted_labels,
                    average='weighted')

def test_evaluate(model, iterator, criterion):
    """Evaluate `model` on `iterator`, store its predictions and compute test metrics.

    Side effect: reads evaluation.json from Drive, stores the thresholded (0/1)
    prediction of the i-th evaluated example under the 'GRU' key of the i-th JSON
    entry, and writes the file back. The mapping is purely positional, so
    `iterator` must yield examples in the same order as the JSON entries.
    (The JSON is assumed to be an object whose values are objects; verify.)

    Args:
        model: network called as `model(text, text_lengths)`.
        iterator: batch iterator whose batches expose `.text` (a
            (tokens, lengths) pair) and `.label`.
        criterion: loss function applied to (logits, labels).

    Returns:
        Tuple (mean batch loss, mean batch accuracy, precision, recall, f1).
        Loss and accuracy are plain averages of per-batch values; the other
        three are computed over all examples at once.

    Raises:
        IndexError: if evaluation.json has more entries than there are predictions.
    """
    import warnings  # local import: only needed for the alignment check below

    evaluation_path = "/content/drive/MyDrive/Sentiment-Analysis/Data/evaluation.json"

    epoch_loss = 0
    epoch_acc = 0
    all_predictions = []
    all_labels = []

    model.eval()

    with torch.no_grad():

        for batch in iterator:

            text, text_lengths = batch.text

            predictions = model(text, text_lengths).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = accuracy(predictions, batch.label)

            # `.tolist()` converts the whole batch to Python floats in one go,
            # instead of collecting one 0-d (possibly CUDA) tensor per example.
            all_predictions.extend(predictions.tolist())
            all_labels.extend(batch.label.tolist())

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    # Logits -> probabilities -> hard 0/1 decisions.
    rounded_preds = torch.round(torch.sigmoid(torch.tensor(all_predictions)))

    with open(evaluation_path, 'r') as json_file:
        df = json.load(json_file)

    # Predictions are matched to JSON entries by position only, so a count
    # mismatch means at least some entries would get the wrong prediction.
    if len(df) != len(rounded_preds):
        warnings.warn(
            f"evaluation.json has {len(df)} entries but {len(rounded_preds)} "
            "predictions were produced; predictions are assigned by position "
            "and may be misaligned."
        )

    for i, key in enumerate(df.keys()):
        df[key]['GRU'] = int(rounded_preds[i])

    # Serialise before opening the file for writing, so a serialisation error
    # cannot leave a truncated evaluation.json behind.
    serialized = json.dumps(df)
    with open(evaluation_path, "w") as json_file:
        json_file.write(serialized)

    total_precision = precision(all_predictions, all_labels)
    total_recall = recall(all_predictions, all_labels)
    total_f1 = f1(all_predictions, all_labels)

    return (epoch_loss / len(iterator),
            epoch_acc / len(iterator),
            total_precision,
            total_recall,
            total_f1)

In [None]:
# Load the saved GRU weights and evaluate them on the test split.
# NOTE(review): the training loop saves its best checkpoint to 'tut2-model.pt',
# while this cell loads GRU-model.pt from Drive; confirm this is the intended model.
# `map_location=device` lets a checkpoint saved on a GPU load on a CPU-only runtime.
model.load_state_dict(
    torch.load('/content/drive/MyDrive/Sentiment-Analysis/models/GRU-model.pt',
               map_location=device))

# Use distinct result names: unpacking into `recall` / `f1` would overwrite the
# helper functions of the same name and make `test_evaluate` fail on a re-run.
test_loss, test_acc, test_prec, test_recall, test_f1 = test_evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%', test_prec, test_recall, test_f1)

Test Loss: 0.251 | Test Acc: 91.08% 0.9072645328851882 0.9056603773584906 0.9064086096596791
