## Potrzebne importy

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [5]:
import os
import json
import math
import numpy as np
import pandas as pd

## Imports for plotting
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg', 'pdf') # For export
from matplotlib.colors import to_rgb
import matplotlib
matplotlib.rcParams['lines.linewidth'] = 2.0
import seaborn as sns
sns.reset_orig()
sns.set()


## PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
# Torchvision
import torchvision
from torchvision.utils import save_image
from torchvision.datasets import FashionMNIST
from torchvision.transforms import v2
from torchvision.datasets import ImageFolder
from torch.nn.functional import one_hot

from pytorch_fid import fid_score

  set_matplotlib_formats('svg', 'pdf') # For export


In [6]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

Device: cpu


## Wczytywanie danych

In [7]:
reviews = pd.read_csv("train_data.csv")
reviews.head()

Unnamed: 0,review,rating
0,location not palace excellent hotel booke dthe...,4
1,respite definitely not place stay looking ultr...,3
2,stunning truly memorable spot right beach nusa...,4
3,solid business hotel near embassy stayed hotel...,3
4,nice place make sure lock money warning money ...,3


In [8]:
np.max(reviews['rating'])

4

## Model Rekurencyjny

In [10]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
class LSTMRegressor(nn.Module):

    def __init__(self, input_size, hidden_size, num_layers, out_size, emb_weights, bidirectional = False):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        if bidirectional:
            self.bidirectional = 2
        else:
            self.bidirectional = 1
        self.embeddings = nn.Embedding.from_pretrained(emb_weights)
        self.embeddings.requires_grad = False
        self.lstm = nn.LSTM(input_size = input_size, hidden_size = hidden_size, num_layers = num_layers, bidirectional=bidirectional, batch_first=False)
        self.fc = nn.Linear(hidden_size*self.bidirectional, out_size)

    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.num_layers*self.bidirectional , batch_size, self.hidden_size)
        state = torch.zeros(self.num_layers*self.bidirectional , batch_size, self.hidden_size)
        return hidden, state

    def forward(self, x, len_x, hidden):
        x = self.embeddings(x)
        x = torch.transpose(x,0,1)
        all_outputs, hidden = self.lstm(x, hidden)
        all_outputs = torch.transpose(all_outputs,0,1)
        last_seq_items = all_outputs[range(all_outputs.shape[0]), len_x]
        out = last_seq_items
        x = self.fc(out)
        return x, hidden

## Model używający ręcznie wytworzonych embbedingów z tekstów recenzji 

In [11]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10
vocab = set()

In [12]:
for review in reviews['review']:
    ngrams = [
    (
        [review[i - j - 1] for j in range(CONTEXT_SIZE)] + [review[i+  j + 1] for j in range(CONTEXT_SIZE)],
        review[i]
    )
    for i in range(CONTEXT_SIZE, len(review)-CONTEXT_SIZE)
    ]
    vocab.add(review)

In [None]:

word_to_ix = {word: i for i, word in enumerate(vocab)}

LSTMRegressor(
  (embeddings): Embedding(71290, 100)
  (lstm): LSTM(100, 100)
  (fc): Linear(in_features=100, out_features=5, bias=True)
)

In [None]:
class NGramLanguageModeler(nn.Module):

    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(2* context_size * embedding_dim, 128)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        log_probs = F.log_softmax(out, dim=1)
        return log_probs

## Model używający przetrenowanych embeddingów

In [7]:
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
corpus = api.load('text8')
gensim_model = Word2Vec(corpus)


In [7]:
emb_weights = torch.FloatTensor(gensim_model.wv.vectors)
embedding = nn.Embedding.from_pretrained(emb_weights)
tokenizer = gensim_model.wv.key_to_index

In [8]:
tokenizer["aa"]

5636

In [9]:
clean_train_reviews_tokenized = []
for review in reviews['review']:
    unknows = 0
    all_parsed = 0
    review_tokenized = []
    for word in review.split():
        all_parsed+=1
        try:
            review_tokenized.append(tokenizer[word.lower()])
        except:
            unknows +=1
#     print(unknows/all_parsed)
    clean_train_reviews_tokenized.append(review_tokenized)

In [None]:
clean_train_reviews_tokenized

In [11]:
class ReviewDataset(data.Dataset):
    def __init__(self, data,labels):
        self.data = []
        for d, l in zip(data,labels):
            self.data.append((torch.from_numpy(np.array(d)).long(),torch.tensor(l).long()))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        in_data, target = self.data[idx]
        return in_data, target

In [12]:
train_indices = np.random.rand(len(clean_train_reviews_tokenized))>0.3
train_indices

array([ True,  True,  True, ..., False,  True, False])

In [13]:
train_data = ReviewDataset(np.array(clean_train_reviews_tokenized, dtype=object)[train_indices],reviews["rating"].values[train_indices])
test_data = ReviewDataset(np.array(clean_train_reviews_tokenized, dtype=object)[~train_indices],reviews["rating"].values[~train_indices])

In [14]:
from torch.nn.utils.rnn import pad_sequence
def pad_collate(batch):
    (xx, yy) = zip(*batch)
    x_lens = [len(x)-1 for x in xx]

    xx_pad = pad_sequence(xx, batch_first=True, padding_value=1)
    yy = torch.stack(yy)
    return xx_pad, yy, x_lens

In [25]:
train_loader = data.DataLoader(train_data, batch_size=32, collate_fn=pad_collate, shuffle=True,drop_last=True)
test_loader = data.DataLoader(test_data, batch_size=32, collate_fn=pad_collate, shuffle=False)

In [None]:
next(iter(train_loader))

In [17]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
class LSTMRegressor(nn.Module):

    def __init__(self, input_size, hidden_size, num_layers, out_size, emb_weights, bidirectional = False):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        if bidirectional:
            self.bidirectional = 2
        else:
            self.bidirectional = 1
        self.embeddings = nn.Embedding.from_pretrained(emb_weights)
        self.embeddings.requires_grad = False
        self.lstm = nn.LSTM(input_size = input_size, hidden_size = hidden_size, num_layers = num_layers, bidirectional=bidirectional, batch_first=False)
        self.fc = nn.Linear(hidden_size*self.bidirectional, out_size)

    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.num_layers*self.bidirectional , batch_size, self.hidden_size)
        state = torch.zeros(self.num_layers*self.bidirectional , batch_size, self.hidden_size)
        return hidden, state

    def forward(self, x, len_x, hidden):
        x = self.embeddings(x)
        x = torch.transpose(x,0,1)
        all_outputs, hidden = self.lstm(x, hidden)
        all_outputs = torch.transpose(all_outputs,0,1)
        last_seq_items = all_outputs[range(all_outputs.shape[0]), len_x]
        out = last_seq_items
        x = self.fc(out)
        return x, hidden

lstm_model = LSTMRegressor(100, 100, 1, 5, emb_weights).to(device)
lstm_model

LSTMRegressor(
  (embeddings): Embedding(71290, 100)
  (lstm): LSTM(100, 100)
  (fc): Linear(in_features=100, out_features=5, bias=True)
)

In [18]:
optimizer = torch.optim.Adam(lstm_model.parameters(), lr = 0.001)
loss_fun = nn.CrossEntropyLoss()
lstm_model.train()

# Training loop
for epoch in range(31):
    losses = 0
    batches = 0
    for x, targets, len_x in train_loader:
        x = x.to(device)
        targets = targets.to(device)
        hidden, state = lstm_model.init_hidden(x.size(0))
        hidden, state = hidden.to(device), state.to(device)
        preds, _ = lstm_model(x, len_x, (hidden,state))
        preds = preds.squeeze(1)
        optimizer.zero_grad()
        loss = loss_fun(preds, targets)
        loss.backward()
        optimizer.step()
        losses += loss.item()
        batches +=1
    if epoch % 10 == 0:
        print(f"Epoch: {epoch}, loss: {losses/batches:.3}")

KeyboardInterrupt: 

In [83]:
lstm_model.eval()
with torch.no_grad():
    preds_list = []
    targets_list = []
    for x, targets, len_x in test_loader:
        x = x.to(device)
        targets_list.append(targets.numpy())
        targets = targets.to(device)
        hidden, state = lstm_model.init_hidden(x.size(0))
        hidden, state = hidden.to(device), state.to(device)
        preds, _ = lstm_model(x, len_x, (hidden,state))
        preds = preds.squeeze(1)
        preds_list.append(preds.cpu().numpy())

In [85]:
print(f"Test accuracy: {(np.argmax((np.concatenate(preds_list)),1) == np.concatenate(targets_list)).sum()/len(np.concatenate(targets_list)):.3}")

Test accuracy: 0.539


In [97]:
example_1_text = "I do not like this hotel"
example_2_text = "I like this hotel"
example_1_tokenized = []
for word in example_1_text.split():
    try:
        example_1_tokenized.append(tokenizer[word])
    except:
        continue
example_2_tokenized = []
for word in example_2_text.split():
    try:
        example_2_tokenized.append(tokenizer[word])
    except:
        continue
hidden, state = lstm_model.init_hidden(1)
hidden, state = hidden.to(device), state.to(device)
preds_1,_ = lstm_model(torch.from_numpy(np.array(example_1_tokenized)).unsqueeze(0).to(device),len(example_1_tokenized)-1,(hidden,state))
preds_2,_ = lstm_model(torch.from_numpy(np.array(example_2_tokenized)).unsqueeze(0).to(device),len(example_2_tokenized)-1,(hidden,state))

In [98]:
print(np.argmax(preds_1.detach().numpy()))
print(np.argmax(preds_2.detach().numpy()))

3
3
