## Natural Language Processing with Transformers (book)

### https://www.oreilly.com/library/view/natural-language-processing/9781098136789/

### https://campus.datacamp.com/courses/deep-learning-for-text-with-pytorch/advanced-topics-in-deep-learning-for-text-with-pytorch?ex=4

### Tokenization

In [None]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer("basic_english")
tokens = tokenizer("I am reading a book now. I love to read books!")
print(tokens)

['i', 'am', 'reading', 'a', 'book', 'now', '.', 'i', 'love', 'to', 'read', 'books', '!']


### Stopword removal

In [1]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

ModuleNotFoundError: No module named 'nltk'

In [1]:
import nltk

stop_words = set(stopwords.words('englisch'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
print(filtered_tokens)

NameError: name 'stopwords' is not defined

### Sentiment analysis cnn

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchmetrics import Accuracy

class SentimentAnalysisCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv = nn.Conv1d(embedding_dim, embedding_dim, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.linear = nn.Linear(embedding_dim, 2)

    def forward(self, text):
        embedded = self.embedding(text).permute(0, 2, 1)
        conved = self.relu(self.conv(embedded))
        conved = conved.mean(dim=2)
        return self.linear(conved)    

In [5]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer("basic_english")
tokens = tokenizer("I love this book. I do not like")
print(tokens)

['i', 'love', 'this', 'book', '.', 'i', 'do', 'not', 'like']




In [7]:
import torch

# Sample sentences and labels
book_samples = [
    ("The story was captivating and kept me hooked until the end.".split(), 1),
    ("I found the characters shallow and the plot predictable".split(), 0),
    ("An absolute masterpiece with stunning visuals.".split(), 1),
    ("The movie was too slow and quite boring.".split(), 0),
    ("A beautiful portrayal of a complex character.".split(), 1),
    ("The dialogue was unrealistic and forced.".split(), 0),
    ("An inspiring tale of hope and perseverance.".split(), 1),
    ("The plot twists were very predictable.".split(), 0),
    ("Excellent direction and outstanding performances.".split(), 1),
    ("The film was a waste of time and money.".split(), 0),
    ("I loved the cinematography and the soundtrack.".split(), 1),
    ("The acting was subpar and the script was weak.".split(), 0),
    ("A heartwarming story that brought tears to my eyes.".split(), 1),
    ("The pacing was off and the ending was disappointing.".split(), 0),
    ("A brilliant adaptation of the novel.".split(), 1),
    ("The humor fell flat and the characters were annoying.".split(), 0),
    ("An epic journey with breathtaking scenery.".split(), 1),
    ("The plot was convoluted and hard to follow.".split(), 0),
    ("A moving performance by the lead actor.".split(), 1),
    ("The special effects were overdone and distracting.".split(), 0),
    ("I love this movie very much.".split(), 1),
    ("I did not this movie like it.".split(), 0)
]

# Create vocabulary and word-to-index mapping
tokens = set()
for sentence, _ in book_samples:
    tokens.update(sentence)
tokens = list(tokens)

word_to_idx = {word: i for i, word in enumerate(tokens)}
vocab_size = len(tokens)
embedding_dim = 10

# Convert sentences to indices and create tensors
data = [
    ([word_to_idx.get(w, 0) for w in sentence], label)
    for sentence, label in book_samples
]

# Print the generated data
for sample in data:
    print(sample)

([86, 99, 96, 49, 62, 78, 39, 93, 101, 65, 88], 1)
([72, 2, 65, 69, 11, 62, 65, 33, 3], 0)
([18, 38, 57, 15, 12, 4], 1)
([86, 74, 96, 67, 45, 62, 58, 50], 0)
([34, 92, 28, 64, 97, 25, 19], 1)
([86, 21, 96, 91, 62, 36], 0)
([18, 1, 82, 64, 35, 62, 95], 1)
([86, 33, 90, 80, 26, 81], 0)
([31, 83, 62, 8, 27], 1)
([86, 14, 96, 97, 46, 64, 10, 62, 73], 0)
([72, 7, 65, 24, 62, 65, 51], 1)
([86, 89, 96, 16, 62, 65, 63, 96, 40], 0)
([34, 43, 99, 77, 41, 23, 37, 22, 87], 1)
([86, 32, 96, 71, 62, 65, 68, 96, 29], 0)
([34, 48, 20, 64, 65, 70], 1)
([86, 66, 79, 44, 62, 65, 69, 80, 94], 0)
([18, 17, 84, 15, 100, 42], 1)
([86, 33, 96, 59, 62, 5, 37, 60], 0)
([34, 61, 85, 55, 65, 9, 54], 1)
([86, 76, 47, 80, 75, 62, 13], 0)
([72, 52, 30, 74, 26, 6], 1)
([72, 0, 56, 30, 74, 53, 98], 0)


In [8]:
import torch.optim 

sentimentanalysis_model = SentimentAnalysisCNN(vocab_size, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(sentimentanalysis_model.parameters(), lr=0.1)

epochs = 30

for epoch in range(epochs):
    for sentence, label in data:
        sentimentanalysis_model.zero_grad()
        sentence = torch.LongTensor([word_to_idx.get(w, 0) for w in sentence]).unsqueeze(0)
        outputs = sentimentanalysis_model(sentence)
        label = torch.LongTensor([int(label)])
        loss = criterion(outputs, label)
        loss.backward()
        optimizer.step()
    print(f"epoch: {epoch}, train loss: {loss}")

epoch: 0, train loss: 0.7575240731239319
epoch: 1, train loss: 0.7340717911720276
epoch: 2, train loss: 0.7250593900680542
epoch: 3, train loss: 0.722136914730072
epoch: 4, train loss: 0.7211416959762573
epoch: 5, train loss: 0.7207779884338379
epoch: 6, train loss: 0.7206364870071411
epoch: 7, train loss: 0.7204015254974365
epoch: 8, train loss: 0.7202353477478027
epoch: 9, train loss: 0.7202185988426208
epoch: 10, train loss: 0.7202701568603516
epoch: 11, train loss: 0.7201457023620605
epoch: 12, train loss: 0.7202916741371155
epoch: 13, train loss: 0.719943106174469
epoch: 14, train loss: 0.7199276089668274
epoch: 15, train loss: 0.7202886939048767
epoch: 16, train loss: 0.7204899787902832
epoch: 17, train loss: 0.7199614644050598
epoch: 18, train loss: 0.7204902768135071
epoch: 19, train loss: 0.7207635045051575
epoch: 20, train loss: 0.7199158668518066
epoch: 21, train loss: 0.7199355363845825
epoch: 22, train loss: 0.7207097411155701
epoch: 23, train loss: 0.721081554889679
epoch

In [11]:
word_to_idx = {word: i for i, word in enumerate(tokens)}
vocab_size = len(tokens)
embedding_dim = 10
book_samples = [
    ("The story was captivating and kept me hooked until the end.".split(), 1),
    ("I found the characters shallow and the plot predictable".split(), 0)
]

In [12]:
book_reviews = [
    "I love this movie".split(),
    "I do not like this movie".split()
]
for review in book_reviews:
    # Convert the review words into tensor form
    input_tensor = torch.LongTensor([word_to_idx.get(w, 0) for w in sentence]).unsqueeze(0)
    #input_tensor = torch.LongTensor([word_to_idx[w] for w in review], dtype=torch.long).unsqueeze(0) 
    # Get the model's output
    outputs = sentimentanalysis_model(input_tensor)
    # Find the index of the most likely sentiment category
    _, predicted_label = torch.max(outputs.data, 1)
    # Convert the predicted label into a sentiment string
    sentiment = "Positive" if predicted_label.item() == 1 else "Negative"
    print(f"Book Review: {' '.join(review)}")
    print(f"Sentiment: {sentiment}\n")

Book Review: I love this movie
Sentiment: Negative

Book Review: I do not like this movie
Sentiment: Negative



In [16]:
from transformers import pipeline
import pandas as pd

sentiment_pipeline = pipeline("sentiment-analysis")
sentiment = sentiment_pipeline("I love this movie")
print(f"Sentiment: {sentiment[0]['label']}")

classifier = pipeline("text-classification", model="nlptown/bert-base-multilingual-uncased-sentiment")
outputs = classifier("I love this movie")
pd.DataFrame(outputs)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Unnamed: 0,label,score
0,5 stars,0.882212


In [17]:
name_entity_recognition = pipeline("ner", model="dslim/bert-base-NER")
outputs = name_entity_recognition("I love this movie")
pd.DataFrame(outputs)

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [19]:
reader = pipeline("question-answering")
question = "Where do I work?"
context = "My name is Sylvain and I work at Hugging Face in Brooklyn."
outputs = reader(question=question, context=context)
print(outputs)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'score': 0.638591468334198, 'start': 33, 'end': 45, 'answer': 'Hugging Face'}


In [5]:
from datasets import list_datasets
#print(list_datasets('squad', split='train')[0])
all_datasets = list_datasets()
print(f"Derzeit sind {len(all_datasets)} Datensätze")
print(f"Die ersten 10 sind: {all_datasets[:10]}")

#from datasets import list_datasets
#datasets_list = datasets.list_datasets()
#print(datasets)

Derzeit sind 158297 Datensätze
Die ersten 10 sind: ['amirveyseh/acronym_identification', 'ade-benchmark-corpus/ade_corpus_v2', 'UCLNLP/adversarial_qa', 'Yale-LILY/aeslc', 'nwu-ctext/afrikaans_ner_corpus', 'fancyzhx/ag_news', 'allenai/ai2_arc', 'google/air_dialogue', 'komari6/ajgt_twitter_ar', 'legacy-datasets/allegro_reviews']


In [6]:
from datasets import load_dataset
emotions = load_dataset("emotion")
print(emotions)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/3.97k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.28k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.78k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/592k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})


In [7]:
train_ds = emotions['train']
train_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 16000
})

In [9]:
print(train_ds[:5])
print(train_ds.column_names)

{'text': ['i didnt feel humiliated', 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake', 'im grabbing a minute to post i feel greedy wrong', 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property', 'i am feeling grouchy'], 'label': [0, 0, 3, 2, 3]}
['text', 'label']


In [10]:
import pandas as pd
emotions.set_format(type="pandas")
df = emotions["train"][:]
df.head(10)

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3
5,ive been feeling a little burdened lately wasn...,0
6,ive been taking or milligrams or times recomme...,5
7,i feel as confused about life as a teenager or...,4
8,i have been with petronas for years i feel tha...,1
9,i feel romantic too,2


In [11]:
def label_int2str(row):
    return emotions['train'].features['label'].int2str(row)

In [12]:
df['label_name'] = df['label'].apply(label_int2str)
df.head()

Unnamed: 0,text,label,label_name
0,i didnt feel humiliated,0,sadness
1,i can go from feeling so hopeless to so damned...,0,sadness
2,im grabbing a minute to post i feel greedy wrong,3,anger
3,i am ever feeling nostalgic about the fireplac...,2,love
4,i am feeling grouchy,3,anger
