In [None]:
# Mount Google Drive (Colab only) so that the '/content/drive/...' paths used
# for the dataset, the evaluation JSON and the saved checkpoint resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torchtext==0.6 --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install pyprind

Collecting pyprind
  Downloading PyPrind-2.11.3-py2.py3-none-any.whl (8.4 kB)
Installing collected packages: pyprind
Successfully installed pyprind-2.11.3


In [None]:
import torch
from torchtext import data
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import precision_score, recall_score, f1_score
import csv
import pandas as pd
import pyprind
import time
import json


In [None]:
# Reproducibility: seed PyTorch's RNG and request deterministic cuDNN kernels.
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Text is tokenized with spaCy's small English model; labels become floats.
TEXT = data.Field(tokenize='spacy', tokenizer_language='en_core_web_sm')
LABEL = data.LabelField(dtype=torch.float)

# Column order of the CSV files: text first, label second.
fields = [('text', TEXT), ('label', LABEL)]

# Directory that holds the three split files.
SPLITS_DIR = '/content/drive/MyDrive/Sentiment-Analysis/Data/wo. Neutral Splits'

# skip_header=False: the first row of every CSV is read as a data row.
train_data, valid_data, test_data = data.TabularDataset.splits(
    path=SPLITS_DIR,
    train='TrainFile.csv',
    validation='DevFile.csv',
    test='TestFile.csv',
    format='csv',
    fields=fields,
    skip_header=False,
)


In [None]:
# test_data.examples.pop(0)


In [None]:
# Upper bound on the number of distinct tokens kept in the text vocabulary.
MAX_VOCAB_SIZE = 25_000

# Both vocabularies are built from the training split only, so nothing from
# the validation/test splits leaks into the token or label mapping.
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

# Sanity check of the vocabulary sizes. The recorded run printed 25002 for
# TEXT, i.e. two more than MAX_VOCAB_SIZE (presumably the field's special
# tokens - TODO confirm).
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")


Unique tokens in TEXT vocabulary: 25002
Unique tokens in LABEL vocabulary: 2


In [None]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 16

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split, all created with the same batch size and device.
# NOTE(review): sort=False is passed for every split; test_evaluate() later
# writes predictions into a JSON file by position, so it presumably relies on
# the test iterator preserving dataset order - confirm before changing this.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device,
    sort=False)

In [None]:
class RNN(nn.Module):
    """Single-layer vanilla RNN classifier over sequences of token indices.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the recurrent hidden state.
        output_dim: Number of units produced by the final linear layer.
        dropout: Dropout probability applied to the embeddings and to the
            final hidden state.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, dropout):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        # nn.RNN's own `dropout` argument only acts between stacked layers.
        # With the default single layer it has no effect and merely triggers a
        # UserWarning, so it is intentionally not forwarded here. Dropout is
        # applied explicitly through self.dropout in forward().
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Compute one logit vector per sequence in the batch.

        Args:
            x: Integer tensor of token indices. nn.RNN is used with its
                default time-first layout, so this is expected to be
                [seq_len, batch].

        Returns:
            Tensor of shape [1, batch, output_dim] (the leading dimension is
            the RNN's layer dimension); callers flatten/squeeze it.
        """
        embedded = self.dropout(self.embedding(x))

        output, hidden = self.rnn(embedded)

        # For a single-layer, unidirectional RNN the last time step of
        # `output` must be identical to the final hidden state.
        assert torch.equal(output[-1, :, :], hidden.squeeze(0))

        hidden = self.dropout(hidden)

        return self.fc(hidden)

In [None]:
# Model hyper-parameters. The embedding table gets one row per vocabulary entry.
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
OUTPUT_DIM = 1  # a single logit per example
DROPOUT = 0.5

model = RNN(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
)




In [None]:
def count_parameters(model):
    """Return the total element count of all parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the number of trainable parameters of the freshly built model.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,529,769 trainable parameters


In [None]:
# Step size for Adam.
LEARNING_RATE = 3e-5

optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Move the model to the selected device and create the loss there as well.
# BCEWithLogitsLoss takes raw logits, so the model output is not passed
# through a sigmoid before the loss.
model = model.to(device)
criterion = nn.BCEWithLogitsLoss().to(device)


In [None]:
def accuracy(preds, y):
    """Fraction of examples whose thresholded prediction equals the target.

    Args:
        preds: Tensor of raw logits.
        y: Tensor of target labels, comparable element-wise with `preds`.

    Returns:
        0-dim tensor: number of matches divided by the length of `preds`.
    """
    # Logits -> probabilities -> hard 0/1 decisions.
    predicted_labels = torch.sigmoid(preds).round()
    # Cast the boolean matches to float so the sum can be divided.
    hits = (predicted_labels == y).float()
    return hits.sum() / len(predicted_labels)

In [None]:
def precision(preds, labels):
    """Macro-averaged precision of thresholded logits against `labels`.

    Args:
        preds: Sequence of raw logits (anything torch.tensor() accepts).
        labels: Sequence of true labels (anything torch.tensor() accepts).

    Returns:
        The value returned by sklearn's precision_score with average='macro'.
    """
    # Logits -> probabilities -> hard 0/1 decisions.
    hard_preds = torch.sigmoid(torch.tensor(preds)).round()
    targets = torch.tensor(labels)
    return precision_score(targets, hard_preds, average='macro')

In [None]:
def recall(preds, labels):
    """Macro-averaged recall of thresholded logits against `labels`.

    Args:
        preds: Sequence of raw logits (anything torch.tensor() accepts).
        labels: Sequence of true labels (anything torch.tensor() accepts).

    Returns:
        The value returned by sklearn's recall_score with average='macro'.
    """
    # Logits -> probabilities -> hard 0/1 decisions.
    hard_preds = torch.sigmoid(torch.tensor(preds)).round()
    targets = torch.tensor(labels)
    return recall_score(targets, hard_preds, average='macro')

In [None]:
def f1(preds, labels):
    """Macro-averaged F1 score of thresholded logits against `labels`.

    Args:
        preds: Sequence of raw logits (anything torch.tensor() accepts).
        labels: Sequence of true labels (anything torch.tensor() accepts).

    Returns:
        The value returned by sklearn's f1_score with average='macro'.
    """
    # Logits -> probabilities -> hard 0/1 decisions.
    hard_preds = torch.sigmoid(torch.tensor(preds)).round()
    targets = torch.tensor(labels)
    return f1_score(targets, hard_preds, average='macro')

In [None]:
def test_evaluate(model, iterator, criterion,
                  results_path="/content/drive/MyDrive/Sentiment-Analysis/Data/evaluation.json"):
    """Evaluate `model` on `iterator` and record its hard predictions on disk.

    Besides computing metrics, this function rewrites the JSON file at
    `results_path`: it loads the file (a dict of dicts), stores the rounded
    0/1 prediction under the 'Rnn' key of each entry, and writes it back.

    Args:
        model: Module mapping `batch.text` to one logit per example.
        iterator: Iterable of batches exposing `.text` and `.label`; it must
            support len().
        criterion: Loss taking (logits, labels).
        results_path: JSON file to update with the predictions.

    Returns:
        Tuple (mean batch loss, mean batch accuracy, macro precision,
        macro recall, macro F1).
    """
    epoch_loss = 0
    epoch_acc = 0
    all_predictions = []
    all_labels = []

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            # flatten() instead of a bare squeeze(): squeeze() would also drop
            # the batch dimension of a final batch holding a single example,
            # leaving a 0-dim tensor that no longer matches batch.label.
            predictions = model(batch.text.to(device)).flatten()
            loss = criterion(predictions, batch.label)
            acc = accuracy(predictions, batch.label)

            # Store plain Python floats so no per-element tensors accumulate.
            all_predictions.extend(predictions.tolist())
            all_labels.extend(batch.label.tolist())

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    rounded_preds = torch.round(torch.sigmoid(torch.tensor(all_predictions)))

    with open(results_path, 'r') as json_file:
        records = json.load(json_file)

    # Predictions are matched to JSON entries purely by position: the i-th
    # key receives the i-th prediction in iteration order.
    for i, key in enumerate(records.keys()):
        records[key]['Rnn'] = int(rounded_preds[i])

    with open(results_path, "w") as json_file:
        json.dump(records, json_file)

    total_precision = precision(all_predictions, all_labels)
    total_recall = recall(all_predictions, all_labels)
    total_f1 = f1(all_predictions, all_labels)

    return (epoch_loss / len(iterator),
            epoch_acc / len(iterator),
            total_precision,
            total_recall,
            total_f1)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over `iterator`, updating `model` in place.

    Args:
        model: Module mapping `batch.text` to one logit per example.
        iterator: Iterable of batches exposing `.text` and `.label`; it must
            support len().
        optimizer: Optimizer over the model's parameters.
        criterion: Loss taking (logits, labels).

    Returns:
        Tuple (mean of the per-batch losses, mean of the per-batch accuracies).
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()
    bar = pyprind.ProgBar(len(iterator), bar_char='█')
    for batch in iterator:

        optimizer.zero_grad()

        # flatten() instead of a bare squeeze(): squeeze() would also drop the
        # batch dimension of a batch holding a single example, producing a
        # 0-dim tensor whose shape no longer matches batch.label.
        predictions = model(batch.text).flatten()

        loss = criterion(predictions, batch.label)

        acc = accuracy(predictions, batch.label)

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
        bar.update()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Compute loss and accuracy of `model` over `iterator` without training.

    Args:
        model: Module mapping `batch.text` to one logit per example.
        iterator: Iterable of batches exposing `.text` and `.label`; it must
            support len().
        criterion: Loss taking (logits, labels).

    Returns:
        Tuple (mean of the per-batch losses, mean of the per-batch accuracies).
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        bar = pyprind.ProgBar(len(iterator), bar_char='█')
        for batch in iterator:

            # flatten() instead of a bare squeeze(): squeeze() would also drop
            # the batch dimension of a batch holding a single example,
            # producing a 0-dim tensor that no longer matches batch.label.
            predictions = model(batch.text).flatten()

            loss = criterion(predictions, batch.label)

            acc = accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
            bar.update()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole
    minutes and whole remaining seconds.

    Returns:
        Tuple (minutes, seconds), both truncated to int.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - minutes * 60)
    return minutes, seconds

In [None]:
N_EPOCHS = 10
# File that receives the weights of the best model seen so far.
CHECKPOINT_PATH = 'tut1-model.pt'

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    # Time one full epoch: a training pass followed by a validation pass.
    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Checkpoint only when the validation loss improves on the best so far.
    improved = valid_loss < best_valid_loss
    if improved:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), CHECKPOINT_PATH)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [None]:
# NOTE(review): the training loop in this notebook saves its best checkpoint
# to 'tut1-model.pt', whereas this cell loads a different file from Drive.
# Confirm that RNN-model.pt is the checkpoint that should be evaluated.
# map_location=device lets a checkpoint saved on a GPU load on a CPU-only runtime.
model.load_state_dict(
    torch.load('/content/drive/MyDrive/Sentiment-Analysis/models/RNN-model.pt',
               map_location=device))
test_loss, test_acc, test_prec, test_recall, test_f1 = test_evaluate(model, test_iterator, criterion)

# Precision, recall and F1 are fractions in [0, 1]; print them as such instead
# of appending a misleading '%' sign to an unscaled value.
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}% | Test Prec: {test_prec:.3f} | Test recall: {test_recall:.3f} | Test f1: {test_f1:.3f}')

Test Loss: 0.266 | Test Acc: 88.98% | Test Prec: 0.7491679656001959%| Test recall: 0.843% |Test f1: 0.784


In [None]:
def merger(model, iterator, criterion):
    """Collect the model's hard 0/1 predictions for every example in `iterator`.

    As a side effect the list of true labels is printed.

    Args:
        model: Module mapping `batch.text` to one logit per example; it must
            have at least one parameter (used to find the model's device).
        iterator: Iterable of batches exposing `.text` and `.label`.
        criterion: Unused; kept so existing call sites keep working.

    Returns:
        List of floats (0.0 or 1.0), one per example, in iteration order.
    """
    # The previous version assigned to a local named `accuracy`, which made
    # the earlier call to the module-level accuracy() raise UnboundLocalError,
    # and it shadowed the builtin `all`. Neither the loss nor the accuracy was
    # ever used, so those computations are dropped.
    predicted = []
    actual_labels = []

    model.eval()
    # Run inputs on whatever device the model's weights live on.
    model_device = next(model.parameters()).device

    with torch.no_grad():
        for batch in iterator:
            # flatten() instead of a bare squeeze(): squeeze() would turn the
            # output for a single-example batch into a 0-dim tensor.
            logits = model(batch.text.to(model_device)).flatten()
            predicted.extend(torch.round(torch.sigmoid(logits)).tolist())
            actual_labels.extend(batch.label.tolist())

    print(actual_labels)
    return predicted


# Run merger() five times and keep each returned prediction list separately.
# NOTE(review): all five calls use the same model and the same iterator -
# confirm whether different models were meant to be passed here.
a, b, c, d, e = (merger(model, test_iterator, criterion) for _ in range(5))
