# Using the legacy approach :'(

In [1]:
import torch
import torchtext.legacy
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
import spacy

In [2]:
# Train, val and test are all
# {"text": "...", "label": "..."}
# Use the spacy tokenizer
tokenizer=spacy.load('en_core_web_sm')
spacy_tokenizer = lambda x: [tok.text for tok in tokenizer(x)]
label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.int8)
text_field = Field(sequential=True, use_vocab=True, tokenize="spacy", lower=True, include_lengths=True, batch_first=True, dtype=torch.int64)




In [3]:
data_field = dict([(None, None), ("text", text_field), ("label", label_field)])
data_field

{None: None,
 'text': <torchtext.legacy.data.field.Field at 0x7feb9e64b4d0>,
 'label': <torchtext.legacy.data.field.Field at 0x7feb9f30df90>}

In [None]:
TabularDataset("/home/c_spino/comp_550/comp-550-project/data/rt-polaritydata/augmentation/validation.json", format="json", fields=data_field)

# Using Dataset and Dataloader

In [79]:
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from src.utils.json_utils import read_json_lines
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
from collections import Counter
from torchtext.vocab import Vocab
import torch
from torch import nn
import copy

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [75]:
class JsonDataset(Dataset):
    def __init__(self, json_file_path, transform=None):
        json_lines = read_json_lines(json_file_path)
        df = pd.DataFrame(
            {
                "text": [json_dict["text"] for json_dict in json_lines],
                "label": [json_dict["label"] for json_dict in json_lines]
                }
        )
        self.dataset = df
        self.transform = transform
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        sample = {
            "text": self.dataset.iloc[idx]["text"],
            "label": self.dataset.iloc[idx]["label"]
        }

        if self.transform:
            sample = self.transform(sample)
        
        return sample

def yield_tokens(dataset, tokenizer):
    for sample in dataset:
        yield tokenizer(sample["text"])


def build_vocab(dataset, tokenizer, min_freq=1):
    counter = Counter()
    for sample in dataset:
        counter.update(tokenizer(sample["text"]))
    counter = Counter({word: count for word, count in counter.items() if count >= min_freq})
    return Vocab(counter, specials=['<unk>', '<pad>'])


tokenizer = get_tokenizer('spacy', language='en')
val_dataset = JsonDataset("/home/c_spino/comp_550/comp-550-project/data/rt-polaritydata/augmentation/validation.json")
vocab_ = build_vocab_from_iterator(yield_tokens(val_dataset, tokenizer), min_freq=1, specials=["<unk>", "<pad>"])
vocab_.set_default_index(vocab_["<unk>"])


text_pipeline = lambda x: vocab_(tokenizer(x))
label_pipeline = lambda x: int(x)

def collate_batch(batch):
    label_batch, text_batch = [], []
    # Need to pad the text
    for sample in batch:
        _text = sample["text"]
        _label = sample["label"]
        label_batch.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_batch.append(processed_text)
    label_batch = torch.tensor(label_batch, dtype=torch.int64)
    text_batch = pad_sequence(text_batch, batch_first=True, padding_value=vocab_["<pad>"])
    text_batch = torch.tensor(text_batch, dtype=torch.int64)
    return text_batch.to(device), label_batch.to(device)

class RNNClassifier(torch.nn.Module):
    def __init__(
        self, 
        vocab_size,
        model_type="lstm",
        use_pretrained_embedding=False,
        embedding_dim=300,
        hidden_dim=128,
        num_layers=1,
        output_dim=2,
        bidirectional=True,
        dropout=0.5
        ):
        super(RNNClassifier, self).__init__()
        assert (model_type in ["lstm", "gru"]), "rnn_type can be one of: 'lstm', 'gru'."
        rnn_type = nn.LSTM if model_type == "lstm" else nn.GRU
        if not use_pretrained_embedding:
            self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=vocab_["<pad>"])
        else:
            pass
        self.bidirectional = bidirectional
        self.rnn = rnn_type(
            input_size=embedding_dim, 
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=bidirectional,
            batch_first=True,
            dropout=dropout
        )
        if bidirectional:
            self.fc = nn.Linear(hidden_dim * 2, output_dim)
        else:
            self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, input_batch):
        # Pass the input batch into the embedding layer
        embedded_input = self.embeddings(input_batch)
        # Pass embedded batch into RNN
        _, (hidden, _) = self.rnn(embedded_input)
        # Use the last hidden state of the RNN as the output
        # (might be 2 if use bidirectional)
        if self.bidirectional:
            hidden_cat = torch.cat((hidden[-2,:,:], hidden[-1,:,:]),dim=1)
        else:
            hidden_cat = hidden[-1,:,:]
        # Pass the hidden state into the fully connected layer
        output = self.fc(hidden_cat)
        # NOTE: Do NOT pass the output through the softmax layer
        # because CL loss expected un-normalized values
        return output

def train_one_epoch(epoch, model, optimizer, criterion, dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 200

    for idx, (text_batch, label_batch) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text_batch)
        loss = criterion(predicted_label, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label_batch).sum().item()
        total_count += label_batch.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| training accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0
    return model, loss

def evaluate_one_epoch(epoch, model, dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for text, label in dataloader:
            predicted_label = model(text)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    accu_val = total_acc/total_count
    print('-' * 59)
    print('| end of epoch {:3d} valid accuracy {:8.3f} '.format(epoch, accu_val))
    print('-' * 59)
    return accu_val

# Code from https://towardsdatascience.com/lstm-text-classification-using-pytorch-2c6c657f8fc0
def save_checkpoint(save_path, model, optimizer, valid_loss):
    if save_path == None:
        return

    state_dict = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'valid_loss': valid_loss}

    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_checkpoint(load_path, model, optimizer):
    if load_path==None:
        return

    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')

    model.load_state_dict(state_dict['model_state_dict'])
    optimizer.load_state_dict(state_dict['optimizer_state_dict'])
    
    return state_dict['valid_loss']

def train(model, optimizer, criterion, train_dataloader, valid_dataloader, save_path, num_epochs=10):
    best_valid_acc = 0.0
    best_model = None
    for epoch in range(num_epochs):
        print(f"Training epoch {epoch}")
        model, loss = train_one_epoch(epoch, model, optimizer, criterion, train_dataloader)
        valid_acc = evaluate_one_epoch(epoch, model, valid_dataloader)
        if valid_acc > best_valid_acc:
            best_model = copy.deepcopy(model)
            best_valid_acc = valid_acc
            save_checkpoint(save_path, best_model, optimizer, loss)




In [83]:
next(iter(DataLoader(val_dataset, batch_size=32, shuffle=True, collate_fn=collate_batch)))



(tensor([[4121,  381,    4,  ...,    1,    1,    1],
         [  77,  670,   74,  ...,    1,    1,    1],
         [  45,  458,  322,  ...,    1,    1,    1],
         ...,
         [  20, 1179,  480,  ...,    1,    1,    1],
         [  13,    3,   72,  ...,   12, 4376,    2],
         [  23, 2845, 1759,  ...,    1,    1,    1]]),
 tensor([0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
         0, 1, 0, 0, 0, 0, 1, 0]))

In [None]:
# Hyperparameters
EPOCHS = 10 # epoch
LR = 5  # learning rate
BATCH_SIZE = 64 # batch size for training

In [77]:
model = RNNClassifier(len(vocab_), model_type="lstm")

  "num_layers={}".format(dropout, num_layers))


In [78]:
model.state_dict

<bound method Module.state_dict of RNNClassifier(
  (embeddings): Embedding(5235, 300, padding_idx=1)
  (rnn): LSTM(300, 128, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)>

In [50]:
for i, elt in enumerate(val_iter):
    x_batch, y_batch = elt[0], elt[1]
    print(x_batch)
    print(x_batch.shape)
    print(y_batch)
    print(y_batch.shape)
    break
        

tensor([[  13,  118,    3,  ...,    1,    1,    1],
        [2770,    4,    3,  ...,  299,  183,    2],
        [  48,   10, 3476,  ...,    1,    1,    1],
        ...,
        [   3,   72, 1396,  ...,    1,    1,    1],
        [   5, 1863,  289,  ...,    1,    1,    1],
        [  60,   11,   30,  ...,    1,    1,    1]])
torch.Size([32, 46])
tensor([0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
        0, 1, 1, 0, 0, 1, 1, 0])
torch.Size([32])




In [56]:
emb = nn.Embedding(len(vocab_), 300, padding_idx=vocab_["<pad>"])
x_emb = emb(x_batch)
x_emb.shape

torch.Size([32, 46, 300])

In [57]:
lstm = nn.LSTM(300, 100, num_layers=2, bidirectional=True, batch_first=True)
output, (hidden, cell) = lstm(x_emb)

In [64]:
hidden.shape

torch.Size([4, 32, 100])

In [62]:
hidden_cat = torch.cat((hidden[-2,:,:], hidden[-1,:,:]),dim=1)

In [63]:
hidden_cat.shape

torch.Size([32, 200])

In [65]:
fc = nn.Linear(200, 2)
output = fc(hidden_cat)
output.shape

torch.Size([32, 2])

In [72]:
softmax = nn.LogSoftmax(dim=1)
output_s = softmax(output)
output_s.shape



torch.Size([32, 2])

In [81]:
loss = nn.CrossEntropyLoss()
loss(output_s, y_batch)

tensor(0.6984, grad_fn=<NllLossBackward0>)

In [10]:
val_dataset = JsonDataset("/home/c_spino/comp_550/comp-550-project/data/rt-polaritydata/augmentation/validation.json")

In [13]:
for elt in val_dataset:
    print(elt)
    break

{'text': 'like a skillful fisher , the director uses the last act to reel in the audience since its poignancy hooks us completely . ', 'label': 1}


In [11]:
tokenizer = get_tokenizer("spacy")



In [25]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torch
from torchtext.datasets import AG_NEWS
train_iter = AG_NEWS(split='train')

tokenizer = get_tokenizer('basic_english')
train_iter = AG_NEWS(split='train')

def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

29.5MB [00:02, 10.8MB/s]


In [26]:
train_iter

<torchtext.data.datasets_utils._RawTextIterableDataset at 0x7feb9d9df390>