In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader

torch.manual_seed(1)

import torch
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch import nn
import transformers
from transformers import *
import numpy as np
from datasets import load_dataset
import evaluate
from tqdm import tqdm
import pandas as pd
from torch.utils.data import DataLoader, random_split, Dataset
from torch.utils.data import Subset
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
# Load the 'text8' corpus through gensim's downloader, then train a Word2Vec
# model on it. Only the sentences are passed, so every hyper-parameter is the
# library default.
corpus = api.load('text8')
gensim_model = Word2Vec(sentences=corpus)

In [5]:
# Read the training CSV as the single "train" split and print its summary.
train_csv_path = "data/train_data.csv"
review_dataset = load_dataset("csv", data_files=train_csv_path, split="train")
print(review_dataset)

Dataset({
    features: ['review', 'rating'],
    num_rows: 16392
})


In [54]:
# Lookup table taken from the trained Word2Vec vectors. tokenize_function
# indexes it with each lower-cased word of a review and collects the values.
tokenizer = gensim_model.wv.key_to_index

def tokenize_function(examples):
    """Convert one review string into a list of vocabulary indices.

    The text is split on whitespace, each word is lower-cased and looked up in
    the module-level ``tokenizer`` mapping. Words missing from the mapping are
    dropped, so the result can be shorter than the number of words, and is
    empty when no word is known.

    Args:
        examples: A single review as a string (despite the plural name).

    Returns:
        list: The looked-up values, in the order the known words appear.
    """
    review_tokenized = []
    for word in examples.split():
        try:
            review_tokenized.append(tokenizer[word.lower()])
        except KeyError:
            # Only a missing key means "unknown word". Any other failure
            # (e.g. a broken or undefined tokenizer) must surface instead of
            # silently producing an empty review.
            continue
    return review_tokenized

In [55]:
class CustomDataset(Dataset):
    """Dataset wrapper that tokenizes the ``'review'`` field on access.

    ``data`` is any indexable collection whose items are mappings with at
    least a ``'review'`` string. All other fields are passed through unchanged.
    """

    def __init__(self, data):
        """Store the underlying collection; nothing is tokenized up front."""
        self.data = data

    def __len__(self):
        """Return the number of items in the underlying collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return a copy of item ``idx`` with ``'review'`` as an index tensor.

        The fetched record is copied before it is modified, so the underlying
        collection is never altered and the same index can be read repeatedly.
        The dtype is pinned to ``torch.long`` so that a review with no known
        words yields an empty integer tensor rather than an empty float one.
        """
        record = self.data[idx]
        item = dict(record)
        item['review'] = torch.tensor(tokenize_function(record['review']), dtype=torch.long)
        return item

In [89]:
# Seeded 90/10 split over row positions of the review dataset.
torch.manual_seed(73)
n_examples = len(review_dataset)
n_train_examples = int(n_examples * 0.9)
split_sizes = [n_train_examples, n_examples - n_train_examples]
train_indices, validation_indices = random_split(range(n_examples), split_sizes)

# Switch the dataset to the "torch" output format before wrapping it.
review_dataset.set_format("torch")

train_dataset = CustomDataset(Subset(review_dataset, train_indices))
validation_dataset = CustomDataset(Subset(review_dataset, validation_indices))

In [84]:
from torch.nn.utils.rnn import pad_sequence
def pad_collate(batch):
    """Collate variable-length review samples into one padded batch.

    Args:
        batch: Sequence of mappings, each holding a ``'review'`` tensor and a
            ``'rating'`` tensor.

    Returns:
        tuple: ``(reviews, ratings, last_positions)``. ``reviews`` is the
        batch-first stack of the reviews, right-padded with 0. ``ratings`` is
        the stacked ratings. ``last_positions`` is a list with
        ``len(review) - 1`` per sample, i.e. the index of its final element
        rather than its length.
    """
    reviews = [sample['review'] for sample in batch]
    ratings = [sample['rating'] for sample in batch]

    # Last valid position per review; an empty review yields -1.
    last_positions = [len(review) - 1 for review in reviews]

    # NOTE(review): 0 is presumably also a real vocabulary index - confirm
    # that sharing it with the padding value is intended.
    padded_reviews = pad_sequence(reviews, batch_first=True, padding_value=0)
    stacked_ratings = torch.stack(ratings)
    return padded_reviews, stacked_ratings, last_positions

In [103]:
# Both loaders batch through pad_collate. Only the training loader asks for
# shuffling; the validation loader passes no shuffle argument.
review_train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=pad_collate,
)
validation_dataloader = DataLoader(
    validation_dataset,
    batch_size=16,
    collate_fn=pad_collate,
)

In [111]:
# float32 tensor built from the Word2Vec vectors; it seeds the embedding layer.
emb_weights = torch.tensor(gensim_model.wv.vectors, dtype=torch.float32)

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
class LSTMRegressor(nn.Module):
    """LSTM over frozen pretrained embeddings with a linear read-out layer.

    A batch of index sequences is embedded, run through an (optionally
    bidirectional, multi-layer) LSTM, and one feature vector per sequence is
    fed to a linear layer that produces ``out_size`` values.

    Args:
        input_size: Feature size expected by the LSTM; must equal the width of
            ``emb_weights``.
        hidden_size: Hidden size of the LSTM (per direction).
        num_layers: Number of stacked LSTM layers.
        out_size: Number of values produced per sequence.
        emb_weights: 2-D float tensor of pretrained embedding vectors.
        bidirectional: Whether the LSTM also runs right-to-left.
    """

    def __init__(self, input_size, hidden_size, num_layers, out_size, emb_weights, bidirectional = False):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        # Number of directions (1 or 2), kept under the original attribute name.
        self.bidirectional = 2 if bidirectional else 1
        # freeze=True is what actually keeps the pretrained vectors fixed.
        # Assigning `requires_grad` on the module object (as was done before)
        # only created an unused attribute.
        self.embeddings = nn.Embedding.from_pretrained(emb_weights, freeze=True)
        self.lstm = nn.LSTM(input_size = input_size, hidden_size = hidden_size, num_layers = num_layers, bidirectional=bidirectional, batch_first=False)
        self.fc = nn.Linear(hidden_size*self.bidirectional, out_size)

    def init_hidden(self, batch_size):
        """Return zero ``(hidden, cell)`` states for ``batch_size`` sequences.

        Both tensors have shape ``(num_layers * directions, batch_size,
        hidden_size)`` and are created on the model's own device/dtype, so a
        later ``.to(device)`` by the caller is harmless but not required.
        """
        shape = (self.num_layers*self.bidirectional, batch_size, self.hidden_size)
        reference = self.fc.weight
        hidden = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        state = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        return hidden, state

    def forward(self, x, len_x, hidden=None):
        """Run the model on a padded batch.

        Args:
            x: Integer tensor of shape ``(batch, seq)`` with embedding indices.
            len_x: Per-sequence index of the last real token (length - 1).
            hidden: Optional ``(hidden, cell)`` tuple as returned by
                ``init_hidden``; zero states are created when omitted.

        Returns:
            tuple: ``(output, hidden)`` where ``output`` has shape
            ``(batch, out_size)`` and ``hidden`` is the LSTM's final
            ``(hidden, cell)`` tuple.
        """
        if hidden is None:
            hidden = self.init_hidden(x.size(0))
        embedded = self.embeddings(x)
        # The LSTM is sequence-first, so swap to (seq, batch, feat) and back.
        embedded = torch.transpose(embedded, 0, 1)
        all_outputs, hidden = self.lstm(embedded, hidden)
        all_outputs = torch.transpose(all_outputs, 0, 1)
        # Output at the last real token of every sequence.
        last_seq_items = all_outputs[range(all_outputs.shape[0]), len_x]
        if self.bidirectional == 2:
            # The backward direction finishes at position 0, not at the last
            # token, so its summary has to be read from there. Padding is not
            # masked, so the backward pass still runs over trailing pad tokens.
            forward_part = last_seq_items[:, :self.hidden_size]
            backward_part = all_outputs[:, 0, self.hidden_size:]
            last_seq_items = torch.cat((forward_part, backward_part), dim=1)
        out = self.fc(last_seq_items)
        return out, hidden

# Build the network with explicit argument names and move it to the chosen device.
lstm_model = LSTMRegressor(
    input_size=100,
    hidden_size=500,
    num_layers=2,
    out_size=5,
    emb_weights=emb_weights,
)
lstm_model = lstm_model.to(device)
lstm_model

LSTMRegressor(
  (embeddings): Embedding(71290, 100)
  (lstm): LSTM(100, 500, num_layers=2)
  (fc): Linear(in_features=500, out_features=5, bias=True)
)

In [112]:
# Optimisation setup: Adam over the model parameters with a cross-entropy loss.
loss_fun = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=0.001)
lstm_model.train()

# Training loop
for epoch in range(21):
    losses = 0
    batches = 0
    for x, targets, len_x in tqdm(review_train_loader):
        x, targets = x.to(device), targets.to(device)
        # Fresh zero (hidden, cell) state for every batch, on the same device.
        hidden, state = (t.to(device) for t in lstm_model.init_hidden(x.size(0)))
        optimizer.zero_grad()
        preds, _ = lstm_model(x, len_x, (hidden, state))
        preds = preds.squeeze(1)
        loss = loss_fun(preds, targets)
        loss.backward()
        optimizer.step()
        losses += loss.item()
        batches += 1
    # Report the mean batch loss on every 10th epoch only.
    if epoch % 10 == 0:
        print(f"Epoch: {epoch}, loss: {losses/batches:.3}")

100%|██████████| 231/231 [01:00<00:00,  3.80it/s]


Epoch: 0, loss: 1.32


100%|██████████| 231/231 [01:02<00:00,  3.70it/s]
100%|██████████| 231/231 [01:03<00:00,  3.66it/s]
100%|██████████| 231/231 [01:04<00:00,  3.58it/s]
100%|██████████| 231/231 [01:04<00:00,  3.57it/s]
100%|██████████| 231/231 [01:05<00:00,  3.50it/s]
100%|██████████| 231/231 [01:06<00:00,  3.47it/s]
100%|██████████| 231/231 [01:04<00:00,  3.56it/s]
100%|██████████| 231/231 [01:05<00:00,  3.53it/s]
100%|██████████| 231/231 [01:05<00:00,  3.52it/s]
100%|██████████| 231/231 [01:06<00:00,  3.48it/s]


Epoch: 10, loss: 0.565


100%|██████████| 231/231 [01:06<00:00,  3.50it/s]
100%|██████████| 231/231 [01:06<00:00,  3.47it/s]
100%|██████████| 231/231 [01:05<00:00,  3.50it/s]
100%|██████████| 231/231 [01:05<00:00,  3.52it/s]
100%|██████████| 231/231 [01:06<00:00,  3.46it/s]
100%|██████████| 231/231 [01:05<00:00,  3.51it/s]
100%|██████████| 231/231 [01:05<00:00,  3.52it/s]
100%|██████████| 231/231 [01:06<00:00,  3.49it/s]
100%|██████████| 231/231 [01:05<00:00,  3.52it/s]
100%|██████████| 231/231 [01:11<00:00,  3.25it/s]

Epoch: 20, loss: 0.0413





In [113]:
# Accuracy of the trained model on the validation loader.
metric = evaluate.load("accuracy")
lstm_model.eval()
with torch.no_grad():
    # Per-batch predicted classes and reference ratings, kept for later inspection.
    preds_list = []
    targets_list = []
    for x, targets, len_x in validation_dataloader:
        x = x.to(device)
        hidden, state = lstm_model.init_hidden(x.size(0))
        hidden, state = hidden.to(device), state.to(device)
        preds, _ = lstm_model(x, len_x, (hidden, state))
        # squeeze(1) only has an effect if the model emits a single value per sample.
        preds = preds.squeeze(1)
        # Bring the predictions back to the CPU; the targets never need to
        # leave it because no loss is computed here.
        predicted_classes = torch.argmax(preds, dim=-1).cpu()
        preds_list.append(predicted_classes.numpy())
        targets_list.append(targets.numpy())
        # Feed the metric plain Python values instead of device tensors.
        metric.add_batch(predictions=predicted_classes.tolist(), references=targets.tolist())

metric.compute()


{'accuracy': 0.5469512195121952}

lstm_model = LSTMRegressor(100, 100, 1, 5, emb_weights).to(device)
 After 50 epochs: accuracy 54%


LSTMRegressor(100, 500, 2, 5, emb_weights).to(device)
 After 20 epochs: accuracy 54.6%
