## Natural Language Processing with Transformers (book)

### https://www.oreilly.com/library/view/natural-language-processing/9781098136789/

### Tokenization

In [1]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer("basic_english")
tokens = tokenizer("I am reading a book now. I love to read books!")
print(tokens)

['i', 'am', 'reading', 'a', 'book', 'now', '.', 'i', 'love', 'to', 'read', 'books', '!']


### Stopword removal

In [5]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mathias\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import nltk

stop_words = set(stopwords.words('englisch'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
print(filtered_tokens)

### Sentiment analysis cnn

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchmetrics import Accuracy

class SentimentAnalysisCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv = nn.Conv1d(embedding_dim, embedding_dim, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(embedding_dim, 2)

    def forward(self, text):
        embedded = self.embedding(text).permute(0, 2, 1)
        conved = self.relu(self.conv(embedded))
        conved = conved.mean(dim=2)
        return self.linear(conved)    

In [9]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer("basic_english")
tokens = tokenizer("I love this book. I do not like")
print(tokens)



['i', 'love', 'this', 'book', '.', 'i', 'do', 'not', 'like']


In [22]:
import torch

# Sample sentences and labels
book_samples = [
    ("The story was captivating and kept me hooked until the end.".split(), 1),
    ("I found the characters shallow and the plot predictable".split(), 0),
    ("An absolute masterpiece with stunning visuals.".split(), 1),
    ("The movie was too slow and quite boring.".split(), 0),
    ("A beautiful portrayal of a complex character.".split(), 1),
    ("The dialogue was unrealistic and forced.".split(), 0),
    ("An inspiring tale of hope and perseverance.".split(), 1),
    ("The plot twists were very predictable.".split(), 0),
    ("Excellent direction and outstanding performances.".split(), 1),
    ("The film was a waste of time and money.".split(), 0),
    ("I loved the cinematography and the soundtrack.".split(), 1),
    ("The acting was subpar and the script was weak.".split(), 0),
    ("A heartwarming story that brought tears to my eyes.".split(), 1),
    ("The pacing was off and the ending was disappointing.".split(), 0),
    ("A brilliant adaptation of the novel.".split(), 1),
    ("The humor fell flat and the characters were annoying.".split(), 0),
    ("An epic journey with breathtaking scenery.".split(), 1),
    ("The plot was convoluted and hard to follow.".split(), 0),
    ("A moving performance by the lead actor.".split(), 1),
    ("The special effects were overdone and distracting.".split(), 0),
    ("I love this movie very much.".split(), 1),
    ("I did not this movie like it.".split(), 0)
]

# Create vocabulary and word-to-index mapping
tokens = set()
for sentence, _ in book_samples:
    tokens.update(sentence)
tokens = list(tokens)

word_to_idx = {word: i for i, word in enumerate(tokens)}
vocab_size = len(tokens)
embedding_dim = 10

# Convert sentences to indices and create tensors
data = [
    ([word_to_idx.get(w, 0) for w in sentence], label)
    for sentence, label in book_samples
]

# Print the generated data
for sample in data:
    print(sample)

([71, 55, 7, 35, 75, 76, 1, 39, 98, 41, 88], 1)
([46, 20, 41, 94, 54, 75, 41, 99, 5], 0)
([67, 85, 65, 2, 47, 44], 1)
([71, 56, 7, 62, 83, 75, 40, 42], 0)
([96, 4, 9, 29, 13, 72, 28], 1)
([71, 12, 7, 70, 75, 82], 0)
([67, 30, 84, 29, 58, 75, 101], 1)
([71, 99, 25, 26, 59, 49], 0)
([86, 10, 75, 68, 93], 1)
([71, 64, 7, 13, 18, 29, 0, 75, 97], 0)
([46, 6, 41, 15, 75, 41, 79], 1)
([71, 27, 7, 45, 75, 41, 8, 7, 19], 0)
([96, 38, 55, 81, 80, 90, 100, 24, 78], 1)
([71, 60, 7, 32, 75, 41, 17, 7, 53], 0)
([96, 77, 23, 29, 41, 22], 1)
([71, 87, 33, 43, 75, 41, 94, 26, 73], 0)
([67, 31, 16, 2, 36, 14], 1)
([71, 99, 7, 74, 75, 11, 100, 34], 0)
([96, 91, 95, 69, 41, 51, 89], 1)
([71, 57, 50, 26, 66, 75, 63], 0)
([46, 21, 3, 56, 59, 48], 1)
([46, 92, 61, 3, 56, 37, 52], 0)


In [23]:
import torch.optim 

sentimentanalysis_model = SentimentAnalysisCNN(vocab_size, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(sentimentanalysis_model.parameters(), lr=0.1)

epochs = 30

for epoch in range(epochs):
    for sentence, label in data:
        sentimentanalysis_model.zero_grad()
        sentence = torch.LongTensor([word_to_idx.get(w, 0) for w in sentence]).unsqueeze(0)
        outputs = sentimentanalysis_model(sentence)
        label = torch.LongTensor([int(label)])
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()
    print(f"epoch: {epoch}, train loss: {loss}")

epoch: 0, train loss: 0.7039518356323242
epoch: 1, train loss: 0.6948510408401489
epoch: 2, train loss: 0.7034589052200317
epoch: 3, train loss: 0.711990475654602
epoch: 4, train loss: 0.71644127368927
epoch: 5, train loss: 0.7185185551643372
epoch: 6, train loss: 0.7192012071609497
epoch: 7, train loss: 0.7194348573684692
epoch: 8, train loss: 0.7194900512695312
epoch: 9, train loss: 0.7194545865058899
epoch: 10, train loss: 0.7194345593452454
epoch: 11, train loss: 0.7195402979850769
epoch: 12, train loss: 0.7195961475372314
epoch: 13, train loss: 0.7196378707885742
epoch: 14, train loss: 0.7196109294891357
epoch: 15, train loss: 0.7196887731552124
epoch: 16, train loss: 0.7194210290908813
epoch: 17, train loss: 0.7196381092071533
epoch: 18, train loss: 0.719740092754364
epoch: 19, train loss: 0.7198074460029602
epoch: 20, train loss: 0.7197638750076294
epoch: 21, train loss: 0.7195072174072266
epoch: 22, train loss: 0.7195144891738892
epoch: 23, train loss: 0.7198120355606079
epoch:

In [None]:
word_to_idx = {word: i for i, word in enumerate(tokens)}
vocab_size = len(tokens)
embedding_dim = 10
book_samples = [
    ("The story was captivating and kept me hooked until the end.".split(), 1),
    ("I found the characters shallow and the plot predictable".split(), 0)
]

In [27]:
book_reviews = [
    "I love this movie".split(),
    "I do not like this movie".split()
]
for review in book_reviews:
    # Convert the review words into tensor form
    input_tensor = torch.LongTensor([word_to_idx.get(w, 0) for w in sentence]).unsqueeze(0)
    #input_tensor = torch.LongTensor([word_to_idx[w] for w in review], dtype=torch.long).unsqueeze(0) 
    # Get the model's output
    outputs = sentimentanalysis_model(input_tensor)
    # Find the index of the most likely sentiment category
    _, predicted_label = torch.max(outputs.data, 1)
    # Convert the predicted label into a sentiment string
    sentiment = "Positive" if predicted_label.item() == 1 else "Negative"
    print(f"Book Review: {' '.join(review)}")
    print(f"Sentiment: {sentiment}\n")

Book Review: I love this movie
Sentiment: Negative

Book Review: I do not like this movie
Sentiment: Negative

