In [None]:
# Mount Google Drive (Colab only); the dataset paths used later live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torchtext==0.6

Collecting torchtext==0.6
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from torchtext==0.6)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.15.2
    Uninstalling torchtext-0.15.2:
      Successfully uninstalled torchtext-0.15.2
Successfully installed sentencepiece-0.1.99 torchtext-0.6.0


In [None]:
import random
import time

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import precision_score, recall_score, f1_score
from torchtext import data
from torchtext.data.field import LabelField
from tqdm import tqdm


In [None]:
SEED = 1234

# Seed every RNG the run depends on so Restart & Run All repeats the same experiment.
random.seed(SEED)        # Python RNG — presumably what torchtext 0.6 uses to shuffle batches; TODO confirm
torch.manual_seed(SEED)  # PyTorch RNG (weight init, dropout)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False  # the cuDNN autotuner may otherwise select different kernels per run

# Text field: spaCy tokenisation; include_lengths makes each batch carry
# (token ids, sequence lengths), which the model needs to pack padded sequences.
TEXT = data.Field(tokenize = 'spacy',
                  tokenizer_language = 'en_core_web_sm',
                  include_lengths = True)

# Labels are produced as floats because the loss used later is BCEWithLogitsLoss.
LABEL = data.LabelField(dtype = torch.float)

In [None]:
# Column order of the CSV files: first column is the text, second the label.
fields = [('text', TEXT), ('label', LABEL)]

# Load the train / dev / test CSV splits from Google Drive into torchtext datasets.
# NOTE(review): skip_header = False means the first row of every file is parsed as an
# example — confirm the CSV files really have no header row.
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = '/content/drive/MyDrive/Sentiment-Analysis/Data/wo. Neutral Splits',
                                        train = 'TrainFile.csv',
                                        validation = 'DevFile.csv',
                                        test = 'TestFile.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = False
)


In [None]:
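# Disabled workaround: removes the first test example. Presumably meant for a header row
# that was read as data (the splits are loaded with skip_header = False) — confirm before re-enabling.
# test_data.examples.pop(0)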

In [None]:
# Cap on the number of distinct tokens kept in the vocabulary; the resulting
# embedding matrix has 25,002 rows (see the shape printed further down).
MAX_VOCAB_SIZE = 25_000

# Build the vocabulary from the training split only and attach 100-d GloVe vectors;
# unk_init is the initialiser passed for tokens that have no pretrained vector.
TEXT.build_vocab(train_data,
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)

# Label vocabulary (string label -> index), also derived from the training split.
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [02:44, 5.24MB/s]                           
100%|█████████▉| 399999/400000 [00:28<00:00, 13981.96it/s]


In [None]:
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Batch iterators for the three splits. Sorting is disabled both across and within
# batches; the model packs sequences with enforce_sorted=False, so unsorted batches are accepted.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    sort_within_batch = False,
    sort=False,
    device = device,
    )

In [None]:
class LSTM(nn.Module):
    """Multi-layer (optionally bidirectional) LSTM sentence classifier.

    Pipeline: embedding -> dropout -> packed LSTM -> dropout on the final hidden
    state of the top layer -> linear layer producing raw logits.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of output logits per example.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM reads the sequence in both directions.
        dropout: dropout probability (embeddings, between LSTM layers, final state).
        pad_idx: vocabulary index of the padding token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx):

        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)

        # nn.LSTM applies dropout only between stacked layers; a non-zero value
        # with a single layer has no effect and only triggers a warning.
        self.lstm = nn.LSTM(embedding_dim,
                            hidden_dim,
                            num_layers=n_layers,
                            bidirectional=bidirectional,
                            dropout=dropout if n_layers > 1 else 0.0)

        # The classifier consumes one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return logits of shape [batch size, output dim].

        Args:
            text: token ids, [sent len, batch size].
            text_lengths: unpadded length of every sequence in the batch.
        """
        #text = [sent len, batch size]

        embedded = self.dropout(self.embedding(text))

        #embedded = [sent len, batch size, emb dim]

        # Pack so the LSTM skips padding positions; lengths need to be on CPU!
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'),
                                                            enforce_sorted=False)

        # Only the final hidden state is used; per-step outputs and cell state are discarded.
        _, (hidden, _) = self.lstm(packed_embedded)

        #hidden = [num layers * num directions, batch size, hid dim]

        if self.lstm.bidirectional:
            # Last two entries are the top layer's forward and backward final states.
            final_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            # Unidirectional: the last entry is the top layer's final state.
            final_hidden = hidden[-1,:,:]

        #final_hidden = [batch size, hid dim * num directions]

        return self.fc(self.dropout(final_hidden))

In [None]:
# Model hyperparameters.
INPUT_DIM = len(TEXT.vocab)   # vocabulary size, including the special tokens
EMBEDDING_DIM = 100           # must equal the width of the glove.6B.100d vectors copied in later
HIDDEN_DIM = 256
OUTPUT_DIM = 1                # a single logit per example (binary classification)
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.5
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]  # index of the padding token in the vocabulary

# Instantiate the classifier; it is moved to the target device in a later cell.
model = LSTM(INPUT_DIM,
            EMBEDDING_DIM,
            HIDDEN_DIM,
            OUTPUT_DIM,
            N_LAYERS,
            BIDIRECTIONAL,
            DROPOUT,
            PAD_IDX)

In [None]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    trainable = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad == False) are not counted.
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Report the number of trainable parameters, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 4,810,857 trainable parameters


In [None]:
# Vectors attached to the vocabulary by TEXT.build_vocab(..., vectors="glove.6B.100d").
pretrained_embeddings = TEXT.vocab.vectors

# Sanity check: expected to be [vocabulary size, EMBEDDING_DIM].
print(pretrained_embeddings.shape)

torch.Size([25002, 100])


In [None]:
# Overwrite the freshly initialised embedding table in place with the pretrained vectors.
# This replaces every row, including the padding row.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0308,  0.1199,  0.5391,  ..., -0.5288,  0.1758,  1.0650],
        ...,
        [ 0.7454, -0.6348, -1.8526,  ..., -2.3333, -0.1176,  0.2177],
        [-0.5968,  1.1433, -0.3188,  ..., -0.7106, -0.6216,  1.0277],
        [ 1.5919, -0.8183, -0.4116,  ..., -1.9719,  1.9827, -0.1093]])

In [None]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Zero the <unk> and <pad> rows: copying the pretrained matrix overwrote them
# with the vectors stored for the special tokens.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

# Show the table so the two zeroed rows can be checked by eye.
print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0308,  0.1199,  0.5391,  ..., -0.5288,  0.1758,  1.0650],
        ...,
        [ 0.7454, -0.6348, -1.8526,  ..., -2.3333, -0.1176,  0.2177],
        [-0.5968,  1.1433, -0.3188,  ..., -0.7106, -0.6216,  1.0277],
        [ 1.5919, -0.8183, -0.4116,  ..., -1.9719,  1.9827, -0.1093]])


In [None]:
# Adam over all model parameters, using the optimiser's default hyperparameters.
optimizer = optim.Adam(model.parameters())

In [None]:
# Binary cross-entropy on raw logits; the model's forward pass applies no sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
# Move the model and the loss module to the selected device (GPU when available).
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def accuracy(preds, y):
    """Fraction of examples whose rounded sigmoid(logit) equals the target.

    Args:
        preds: tensor of raw logits.
        y: tensor of targets compared element-wise against the rounded predictions.

    Returns:
        A 0-dim tensor: number of matches divided by the number of examples.
    """
    hard_preds = torch.sigmoid(preds).round()
    hits = torch.eq(hard_preds, y).float()
    return hits.sum() / len(hits)

In [None]:
def precision(preds, labels):
    """Macro-averaged precision of rounded sigmoid(logits) against the labels.

    Both arguments are converted with torch.tensor before scoring.
    """
    logits = torch.tensor(preds)
    hard_preds = torch.sigmoid(logits).round()
    targets = torch.tensor(labels)
    return precision_score(targets, hard_preds, average='macro')


def recall(preds, labels):
    """Macro-averaged recall of rounded sigmoid(logits) against the labels.

    Both arguments are converted with torch.tensor before scoring.
    """
    logits = torch.tensor(preds)
    hard_preds = torch.sigmoid(logits).round()
    targets = torch.tensor(labels)
    return recall_score(targets, hard_preds, average='macro')


def f1(preds, labels):
    """Macro-averaged F1 score of rounded sigmoid(logits) against the labels.

    Both arguments are converted with torch.tensor before scoring.
    """
    logits = torch.tensor(preds)
    hard_preds = torch.sigmoid(logits).round()
    targets = torch.tensor(labels)
    return f1_score(targets, hard_preds, average='macro')

def test_evaluate(model, iterator, criterion,
                  eval_json_path="/content/drive/MyDrive/Sentiment-Analysis/Data/evaluation.json"):
    """Evaluate the model on a split, report metrics and export its predictions.

    Args:
        model: classifier called as ``model(text, text_lengths)``.
        iterator: batch iterator whose batches expose ``.text`` (ids, lengths) and ``.label``.
        criterion: loss applied to the logits and labels.
        eval_json_path: JSON file mapping keys to dicts. The i-th key receives the
            i-th prediction under ``'LSTM'`` and the file is rewritten in place.
            Pass ``None`` to skip the export.

    Returns:
        Tuple ``(loss, accuracy, precision, recall, f1)``. Loss and accuracy are means
        of the per-batch values (a smaller last batch is therefore weighted like a full
        one); precision/recall/F1 are macro-averaged over all examples.

    Raises:
        IndexError: if the JSON file has more entries than there are predictions.
    """
    import json  # kept function-local, as in the original cell

    epoch_loss = 0
    epoch_acc = 0
    batch_logits = []
    batch_labels = []

    model.eval()

    with torch.no_grad():

        for batch in iterator:

            text, text_lengths = batch.text

            predictions = model(text, text_lengths).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = accuracy(predictions, batch.label)

            # One device-to-host copy per batch instead of one per example.
            batch_logits.append(predictions.cpu())
            batch_labels.append(batch.label.cpu())

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    # Plain Python lists: the metric helpers convert their inputs with torch.tensor.
    all_predictions = torch.cat(batch_logits).tolist() if batch_logits else []
    all_labels = torch.cat(batch_labels).tolist() if batch_labels else []

    if eval_json_path is not None:
        rounded_preds = torch.round(torch.sigmoid(torch.tensor(all_predictions)))

        with open(eval_json_path, 'r') as json_file:
            df = json.load(json_file)

        # Entries are matched to predictions purely by position, so the iterator
        # must yield examples in the same order as the keys of the JSON file.
        if len(df) > len(rounded_preds):
            raise IndexError(
                f"{eval_json_path} has {len(df)} entries but only "
                f"{len(rounded_preds)} predictions were produced"
            )

        for i, key in enumerate(df.keys()):
            df[key]['LSTM'] = int(rounded_preds[i])

        with open(eval_json_path, "w") as json_file:
            json.dump(df, json_file)

    total_precision = precision(all_predictions, all_labels)
    total_recall = recall(all_predictions, all_labels)
    total_f1 = f1(all_predictions, all_labels)

    return (epoch_loss / len(iterator),
            epoch_acc / len(iterator),
            total_precision,
            total_recall,
            total_f1)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator`` and update the model in place.

    Returns:
        Tuple ``(mean batch loss, mean batch accuracy)`` over the pass.
    """
    model.train()

    running_loss, running_acc = 0, 0

    for batch in tqdm(iterator):
        tokens, lengths = batch.text

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        logits = model(tokens, lengths).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    num_batches = len(iterator)
    return running_loss / num_batches, running_acc / num_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Score the model on ``iterator`` without updating it.

    Returns:
        Tuple ``(mean batch loss, mean batch accuracy)``.
    """
    model.eval()

    loss_total = 0
    acc_total = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            logits = model(tokens, lengths).squeeze(1)

            loss_total += criterion(logits, batch.label).item()
            acc_total += accuracy(logits, batch.label).item()

    num_batches = len(iterator)
    return loss_total / num_batches, acc_total / num_batches

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        Tuple ``(minutes, seconds)``, both truncated to integers.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
N_EPOCHS = 10

# Lowest validation loss seen so far; starts at infinity so the first epoch always checkpoints.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    # One pass over the training split, then a gradient-free pass over the validation split.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when validation loss improves; the file is written to the
    # current working directory.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

100%|██████████| 2431/2431 [00:48<00:00, 50.34it/s]


Epoch: 01 | Epoch Time: 0m 49s
	Train Loss: 0.084 | Train Acc: 97.00%
	 Val. Loss: 0.058 |  Val. Acc: 97.78%


100%|██████████| 2431/2431 [00:45<00:00, 53.09it/s]


Epoch: 02 | Epoch Time: 0m 47s
	Train Loss: 0.053 | Train Acc: 98.01%
	 Val. Loss: 0.054 |  Val. Acc: 98.21%


100%|██████████| 2431/2431 [00:46<00:00, 51.98it/s]


Epoch: 03 | Epoch Time: 0m 48s
	Train Loss: 0.045 | Train Acc: 98.33%
	 Val. Loss: 0.047 |  Val. Acc: 98.20%


100%|██████████| 2431/2431 [00:45<00:00, 53.62it/s]


Epoch: 04 | Epoch Time: 0m 46s
	Train Loss: 0.039 | Train Acc: 98.49%
	 Val. Loss: 0.044 |  Val. Acc: 98.40%


100%|██████████| 2431/2431 [00:46<00:00, 52.45it/s]


Epoch: 05 | Epoch Time: 0m 47s
	Train Loss: 0.035 | Train Acc: 98.68%
	 Val. Loss: 0.042 |  Val. Acc: 98.55%


100%|██████████| 2431/2431 [00:45<00:00, 53.48it/s]


Epoch: 06 | Epoch Time: 0m 46s
	Train Loss: 0.031 | Train Acc: 98.81%
	 Val. Loss: 0.045 |  Val. Acc: 98.52%


100%|██████████| 2431/2431 [00:46<00:00, 52.52it/s]


Epoch: 07 | Epoch Time: 0m 47s
	Train Loss: 0.028 | Train Acc: 98.91%
	 Val. Loss: 0.046 |  Val. Acc: 98.61%


100%|██████████| 2431/2431 [00:45<00:00, 53.66it/s]


Epoch: 08 | Epoch Time: 0m 46s
	Train Loss: 0.026 | Train Acc: 99.00%
	 Val. Loss: 0.045 |  Val. Acc: 98.45%


100%|██████████| 2431/2431 [00:45<00:00, 53.01it/s]


Epoch: 09 | Epoch Time: 0m 47s
	Train Loss: 0.025 | Train Acc: 99.08%
	 Val. Loss: 0.048 |  Val. Acc: 98.47%


100%|██████████| 2431/2431 [00:45<00:00, 53.94it/s]


Epoch: 10 | Epoch Time: 0m 46s
	Train Loss: 0.023 | Train Acc: 99.14%
	 Val. Loss: 0.046 |  Val. Acc: 98.60%


In [None]:
# Evaluate the checkpoint the training loop writes ('tut2-model.pt', the weights with the
# lowest validation loss), so the reported numbers come from this run.
# To score a previously exported model instead, point CHECKPOINT_PATH at it, e.g.
# '/content/drive/MyDrive/Sentiment-Analysis/models/LSTM-model.pt'.
CHECKPOINT_PATH = 'tut2-model.pt'

# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
model.load_state_dict(torch.load(CHECKPOINT_PATH, map_location=device))

test_loss, test_acc, test_prec, test_recall, test_f1 = test_evaluate(model, test_iterator, criterion)

# Precision, recall and F1 are fractions in [0, 1], so they are printed without a '%' sign.
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Test Prec: {test_prec:.3f} | Test recall: {test_recall:.3f} | Test f1: {test_f1:.3f}')

Test Loss: 0.062 | Test Acc: 97.62% | Test Prec: 0.9582932866865888%| Test recall: 0.928% |Test f1: 0.943
