In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader

torch.manual_seed(1)

import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch import nn
import transformers
from transformers import *
import numpy as np
from datasets import load_dataset
import evaluate
from tqdm import tqdm
import pandas as pd
from torch.utils.data import DataLoader, random_split, Dataset
from torch.utils.data import Subset
import random

In [None]:
# Pick the compute device once: use CUDA when PyTorch reports it as available,
# otherwise fall back to the CPU. Later cells move the model and batches here.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# Train the word-embedding model whose vocabulary and vectors later cells reuse
# (`gensim_model.wv.key_to_index` for token lookup, `gensim_model.wv.vectors`
# for the embedding matrix).
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
# Obtain the 'text8' corpus through gensim's downloader API.
corpus = api.load('text8')
# No hyperparameters are passed, so Word2Vec trains with the library defaults.
# NOTE(review): the LSTM is later built with input_size=100, which presumably
# matches the default vector size — confirm, or pass the size explicitly.
gensim_model = Word2Vec(corpus)

In [None]:
class CustomDataset(Dataset):
    """Pass-through wrapper that exposes any indexable, sized object as a Dataset.

    Items are returned exactly as the wrapped object yields them; no
    transformation or augmentation is applied. (A disabled experiment that
    randomly sub-sampled token positions per item used to sit in
    ``__getitem__``; it never executed and is not part of the behaviour.)
    """

    def __init__(self, data):
        # `data` must support len() and integer indexing.
        self.data = data

    def __len__(self):
        """Number of items in the wrapped object."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the item stored at position `idx`, unchanged."""
        return self.data[idx]

In [None]:
# Load the training CSV with `datasets.load_dataset`, requesting the "train" split.
# NOTE(review): later cells read a "review" text column and presumably a
# "rating" label column — confirm against the CSV header.
review_dataset = load_dataset("csv", data_files="data/train_data.csv", split="train")
# Print the loaded dataset object as a quick sanity check.
print(review_dataset)

In [None]:
import re

# Vocabulary lookup of the trained Word2Vec model: word -> row index in
# `gensim_model.wv.vectors`. This is a plain dict, NOT a callable tokenizer,
# so it cannot be invoked like a Hugging Face tokenizer.
tokenizer = gensim_model.wv.key_to_index

def tokenize_function(examples, max_length=256, pad_id=0):
    """Turn a batch of raw reviews into fixed-length index sequences.

    Intended for ``Dataset.map(..., batched=True)``: `examples` maps column
    names to lists, and the returned dict maps new column names to lists of
    the same length.

    Args:
        examples: batch dict containing a ``"review"`` list of strings
            (``None`` entries are treated as empty text).
        max_length: every sequence is truncated / right-padded to this length.
        pad_id: index written into padded positions. The Word2Vec vocabulary
            has no dedicated padding token, so padded positions carry a real
            word index and MUST be ignored downstream via ``attention_mask``.

    Returns:
        dict with
        ``"input_ids"``: list of `max_length` vocabulary indices per review;
        ``"attention_mask"``: list of 1 (real token) / 0 (padding) per review.
        Each mask contains at least one 1, so the derived length is never 0.
    """
    input_ids = []
    attention_mask = []
    for text in examples["review"]:
        text = "" if text is None else str(text)
        # NOTE(review): assumes the embedding vocabulary consists of lowercase
        # alphabetic words (as a text8-trained model would) — confirm.
        words = re.findall(r"[a-z]+", text.lower())
        # Out-of-vocabulary words are dropped: the embedding matrix has no UNK row.
        ids = [tokenizer[word] for word in words if word in tokenizer][:max_length]
        if not ids:
            # Keep one position so sequence lengths stay >= 1 for the model.
            ids = [pad_id]
        n_pad = max_length - len(ids)
        attention_mask.append([1] * len(ids) + [0] * n_pad)
        input_ids.append(ids + [pad_id] * n_pad)
    return {"input_ids": input_ids, "attention_mask": attention_mask}

In [None]:
# Tokenise all reviews in batches and drop the raw text column, then have the
# dataset hand back torch tensors (set_format is applied to the object in place).
tokenized_datasets = review_dataset.map(tokenize_function, batched=True).remove_columns(["review"])
tokenized_datasets.set_format("torch")

# Seeded 90/10 train/validation split. The split is drawn over row indices;
# the resulting index collections are then used to build Subset views.
torch.manual_seed(73)
n_examples = len(tokenized_datasets)
n_train_examples = int(n_examples * 0.9)
train_indices, validation_indices = random_split(
    range(n_examples), [n_train_examples, n_examples - n_train_examples]
)

train_dataset = CustomDataset(Subset(tokenized_datasets, train_indices))
validation_dataset = Subset(tokenized_datasets, validation_indices)

In [None]:
# Both loaders use the same mini-batch size; only the training loader shuffles.
BATCH_SIZE = 16

review_train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validation_dataloader = DataLoader(validation_dataset, batch_size=BATCH_SIZE)

In [None]:
# Trained Word2Vec vectors wrapped as a float tensor; passed to LSTMRegressor
# below as the pretrained embedding matrix (row i = vector of vocabulary index i).
emb_weights = torch.FloatTensor(gensim_model.wv.vectors)

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
class LSTMRegressor(nn.Module):
    """LSTM over frozen pretrained word embeddings with a linear read-out.

    The read-out is applied to the LSTM output at the last *real* token of
    each sequence (padding is skipped using the supplied lengths).

    Args:
        input_size: feature size fed to the LSTM; must equal the embedding width.
        hidden_size: LSTM hidden-state size per direction.
        num_layers: number of stacked LSTM layers.
        out_size: width of the final linear layer's output.
        emb_weights: 2-D float tensor ``(vocab_size, input_size)`` of
            pretrained embeddings; kept frozen.
        bidirectional: run the LSTM in both directions when true.

    Raises:
        ValueError: if `emb_weights` is not 2-D or its width differs from
            `input_size`.
    """

    def __init__(self, input_size, hidden_size, num_layers, out_size, emb_weights, bidirectional = False):
        super().__init__()
        if emb_weights.dim() != 2 or emb_weights.size(1) != input_size:
            raise ValueError(
                f"emb_weights must have shape (vocab_size, {input_size}), got {tuple(emb_weights.shape)}"
            )
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        # Stored as the number of directions (1 or 2), not as a bool, because
        # it is used as a multiplier for state and feature sizes.
        self.bidirectional = 2 if bidirectional else 1
        # freeze=True keeps the pretrained vectors fixed during training.
        # (Assigning `requires_grad` on the module itself only creates a plain
        # attribute and does not freeze anything.)
        self.embeddings = nn.Embedding.from_pretrained(emb_weights, freeze=True)
        # batch_first=True lets forward() consume (batch, seq) input directly,
        # without transposing to sequence-first and back.
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=bool(bidirectional),
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size * self.bidirectional, out_size)

    def init_hidden(self, batch_size):
        """Return zero-filled ``(hidden, state)`` tensors for `batch_size` sequences.

        Each has shape ``(num_layers * directions, batch_size, hidden_size)``
        and is created on the same device/dtype as the model's parameters.
        """
        shape = (self.num_layers * self.bidirectional, batch_size, self.hidden_size)
        reference = self.fc.weight
        return reference.new_zeros(shape), reference.new_zeros(shape)

    def forward(self, x, len_x, hidden=None):
        """Run the network on a padded batch.

        Args:
            x: integer tensor ``(batch, seq)`` of vocabulary indices.
            len_x: number of real (non-padding) tokens per sequence, as a
                tensor or sequence of ints of length ``batch``.
            hidden: optional ``(hidden, state)`` initial LSTM state; zeros are
                used when omitted.

        Returns:
            ``(out, hidden)`` where `out` has shape ``(batch, out_size)`` and
            `hidden` is the LSTM's final ``(h_n, c_n)`` tuple.
        """
        embedded = self.embeddings(x)
        all_outputs, hidden = self.lstm(embedded, hidden)

        lengths = torch.as_tensor(len_x, dtype=torch.long, device=all_outputs.device)
        # A sequence of length n ends at time step n - 1. Indexing with n would
        # read the first padding step and overflow for full-length sequences.
        # Empty sequences fall back to step 0.
        last_idx = (lengths - 1).clamp(min=0)
        rows = torch.arange(all_outputs.size(0), device=all_outputs.device)

        if self.bidirectional == 2:
            # Output features are [forward | backward]. The forward summary
            # lives at the last real step, while the backward pass finishes
            # at step 0.
            forward_last = all_outputs[rows, last_idx, :self.hidden_size]
            backward_last = all_outputs[:, 0, self.hidden_size:]
            last_seq_items = torch.cat((forward_last, backward_last), dim=1)
        else:
            last_seq_items = all_outputs[rows, last_idx]

        return self.fc(last_seq_items), hidden

# Build the network with 100-d inputs, a single 100-unit unidirectional LSTM
# layer and 2 outputs, then move it to the selected device.
lstm_model = LSTMRegressor(
    input_size=100,
    hidden_size=100,
    num_layers=1,
    out_size=2,
    emb_weights=emb_weights,
)
lstm_model = lstm_model.to(device)
lstm_model

In [None]:
# Training configuration.
N_EPOCHS = 31    # epochs 0..30
LOG_EVERY = 10   # report the mean loss every LOG_EVERY epochs

optimizer = torch.optim.Adam(lstm_model.parameters(), lr=0.001)
loss_fun = nn.CrossEntropyLoss()
lstm_model.train()

# Training loop
for epoch in range(N_EPOCHS):
    losses = 0
    batches = 0
    # The loader wraps a torch-formatted `datasets` object, so each batch is a
    # dict of column tensors. Unpacking it as a tuple would iterate over the
    # column *names*, so the fields are read explicitly by key.
    for batch in tqdm(review_train_loader):
        x = batch["input_ids"].to(device)
        # NOTE(review): CrossEntropyLoss with 2 logits expects integer class
        # ids in {0, 1}; confirm that "rating" is encoded that way.
        targets = batch["rating"].to(device)
        # Number of real (non-padding) tokens per sequence.
        len_x = batch["attention_mask"].sum(dim=1)

        hidden, state = lstm_model.init_hidden(x.size(0))
        hidden, state = hidden.to(device), state.to(device)

        optimizer.zero_grad()
        preds, _ = lstm_model(x, len_x, (hidden, state))
        preds = preds.squeeze(1)
        loss = loss_fun(preds, targets)
        loss.backward()
        optimizer.step()

        losses += loss.item()
        batches += 1
    # Skip reporting when the loader produced no batches (avoids division by zero).
    if epoch % LOG_EVERY == 0 and batches:
        print(f"Epoch: {epoch}, loss: {losses/batches:.3}")