# NLP Exercise 2: Neural Network
---

## Going from raw text to tokens

### Loading necessary libraries.

In [None]:
%pip install transformers
%pip install datasets
%pip install tokenizers
%pip install tqdm

In [37]:
import datasets

### Create a dataset

The Hugging Face `datasets` library is convenient when we want to build a dataset from our own sentences.

In [31]:
def create_dataset(raw_sentences: list[str], labels: "list[int] | None" = None) -> datasets.Dataset:
    """
    Create a HuggingFace Dataset from raw sentences.

    Parameters:
        raw_sentences: list of sentences; stored in the "text" column.
        labels: optional list of integer labels corresponding to the
            sentences. When given, it is stored in a "label" column.
            When omitted (the default), the dataset only has "text".

    Returns:
        A datasets.Dataset with a string "text" column and, if labels
        were supplied, an int64 "label" column.

    Raises:
        ValueError: if labels is given but its length differs from the
            number of sentences.
    """
    dataset_dict = {"text": raw_sentences}

    # Define schema
    feature_spec = {"text": datasets.Value("string")}

    if labels is not None:
        # Fail early with a readable message instead of a column-length
        # error raised from inside the library.
        if len(labels) != len(raw_sentences):
            raise ValueError(
                f"labels has {len(labels)} entries but raw_sentences has "
                f"{len(raw_sentences)}"
            )
        dataset_dict["label"] = labels
        feature_spec["label"] = datasets.Value("int64")

    dataset_features = datasets.Features(feature_spec)

    # Create the dataset
    dataset = datasets.Dataset.from_dict(dataset_dict, features=dataset_features)
    return dataset

In [32]:
# Small hand-written corpus reused by the tokenizer examples below
raw_sentences = [
    "I am studying NLP",
    "I am living in Finland and I love walking through the forests.",
    "The weather is getting cold. Great!",
]

own_dataset = create_dataset(raw_sentences)

# Convert to a pandas DataFrame to inspect the dataset contents
own_dataset_frame = own_dataset.to_pandas()
print(own_dataset_frame)

                                                text
0                                  I am studying NLP
1  I am living in Finland and I love walking thro...
2                The weather is getting cold. Great!


### Tokenization

Differences between the WordPiece tokenizer and the WordLevel tokenizer.

In [4]:
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers
from tokenizers.processors import TemplateProcessing
from transformers import PreTrainedTokenizerFast

Function to train WordLevel Tokenizer

In [5]:
def train_word_level_tokenizer(
        sentences: list[str],
        unk_token: str = "[UNK]",
        pad_token: str = "[PAD]",
        start_of_seq_token: str = "<s>",
        end_of_seq_token: str = "</s>", 
        vocab_size: int = 100
) -> PreTrainedTokenizerFast:
    """
    Train a WordLevel tokenizer on the given sentences.

    Parameters:
        sentences: training sentences.
        unk_token: token used for out-of-vocabulary words.
        pad_token: token used for padding.
        start_of_seq_token: token placed before every encoded sentence.
        end_of_seq_token: token placed after every encoded sentence.
        vocab_size: vocabulary size passed to the trainer.

    Returns:
        The trained tokenizer wrapped in a PreTrainedTokenizerFast.
    """
    # The position of each special token in this list is used as its id.
    special_tokens = [unk_token, pad_token, start_of_seq_token, end_of_seq_token]
    bos_id = special_tokens.index(start_of_seq_token)
    eos_id = special_tokens.index(end_of_seq_token)
    pad_id = special_tokens.index(pad_token)

    # Whole-word model: NFD-normalise and strip, then split on whitespace
    word_level = Tokenizer(models.WordLevel(unk_token=unk_token))
    word_level.normalizer = normalizers.Sequence(
        [normalizers.NFD(), normalizers.Strip()]
    )
    word_level.pre_tokenizer = pre_tokenizers.Whitespace()

    # Wrap every encoded sentence in the sequence-boundary tokens
    word_level.post_processor = TemplateProcessing(
        single=f"{start_of_seq_token} $A {end_of_seq_token}",
        special_tokens=[
            (start_of_seq_token, bos_id),
            (end_of_seq_token, eos_id),
        ],
    )

    # Learn the vocabulary
    word_level_trainer = trainers.WordLevelTrainer(
        vocab_size=vocab_size,
        special_tokens=special_tokens,
        show_progress=True,
    )
    word_level.train_from_iterator(sentences, trainer=word_level_trainer)

    # Padding is enabled only after training
    word_level.enable_padding(pad_id=pad_id, pad_token=pad_token)

    # Expose the tokenizer through the transformers interface
    return PreTrainedTokenizerFast(
        tokenizer_object=word_level,
        bos_token=start_of_seq_token,
        eos_token=end_of_seq_token,
        unk_token=unk_token,
        pad_token=pad_token,
    )

Function to train WordPiece Tokenizer

In [6]:
def train_wordpiece_tokenizer(
    sentences: list[str],
    unk_token: str = "[UNK]",
    pad_token: str = "[PAD]",
    start_of_seq_token: str = "<s>",
    end_of_seq_token: str = "</s>",
    vocab_size: int = 100  # Set a smaller vocab size to force subword splits
) -> PreTrainedTokenizerFast:
    """
    Train a WordPiece tokenizer on the given sentences.

    Parameters:
        sentences: training sentences.
        unk_token: token used when a word cannot be tokenized.
        pad_token: token used for padding.
        start_of_seq_token: token placed before every encoded sentence.
        end_of_seq_token: token placed after every encoded sentence.
        vocab_size: vocabulary size passed to the trainer.

    Returns:
        The trained tokenizer wrapped in a PreTrainedTokenizerFast.
    """
    # The position of each special token in this list is used as its id.
    special_tokens = [unk_token, pad_token, start_of_seq_token, end_of_seq_token]

    # Subword model with the same normalisation and splitting as the
    # WordLevel variant: NFD + strip, then whitespace pre-tokenization.
    subword_tokenizer = Tokenizer(models.WordPiece(unk_token=unk_token))
    normalization_steps = [normalizers.NFD(), normalizers.Strip()]
    subword_tokenizer.normalizer = normalizers.Sequence(normalization_steps)
    subword_tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

    # Wrap every encoded sentence in the sequence-boundary tokens
    boundary_ids = [
        (start_of_seq_token, special_tokens.index(start_of_seq_token)),
        (end_of_seq_token, special_tokens.index(end_of_seq_token)),
    ]
    subword_tokenizer.post_processor = TemplateProcessing(
        single=f"{start_of_seq_token} $A {end_of_seq_token}",
        special_tokens=boundary_ids,
    )

    # Learn the subword vocabulary
    subword_tokenizer.train_from_iterator(
        sentences,
        trainer=trainers.WordPieceTrainer(
            vocab_size=vocab_size,
            special_tokens=special_tokens,
            show_progress=True,
        ),
    )

    # Padding is enabled only after training
    subword_tokenizer.enable_padding(
        pad_id=special_tokens.index(pad_token),
        pad_token=pad_token,
    )

    # Expose the tokenizer through the transformers interface
    wrapper_kwargs = {
        "bos_token": start_of_seq_token,
        "eos_token": end_of_seq_token,
        "unk_token": unk_token,
        "pad_token": pad_token,
        "tokenizer_object": subword_tokenizer,
    }
    return PreTrainedTokenizerFast(**wrapper_kwargs)


Train both tokenizers

In [7]:
# Train one tokenizer of each kind on the same sentences so that their
# outputs can be compared side by side in the following cells.
word_level_tokenizer = train_word_level_tokenizer(raw_sentences)
wordpiece_tokenizer = train_wordpiece_tokenizer(raw_sentences)

In [8]:
# Show how the WordLevel tokenizer encodes each training sentence:
# ids come from encode(), token strings from convert_ids_to_tokens().
for sentence in raw_sentences:
    input_ids = word_level_tokenizer.encode(sentence)
    tokens = word_level_tokenizer.convert_ids_to_tokens(input_ids)
    report = (
        ("Sentence:", sentence),
        ("Tokens:", tokens),
        ("Token IDs:", input_ids),
    )
    for label, value in report:
        print(label, value)

Sentence: I am studying NLP
Tokens: ['<s>', 'I', 'am', 'studying', 'NLP', '</s>']
Token IDs: [2, 4, 6, 20, 10, 3]
Sentence: I am living in Finland and I love walking through the forests.
Tokens: ['<s>', 'I', 'am', 'living', 'in', 'Finland', 'and', 'I', 'love', 'walking', 'through', 'the', 'forests', '.', '</s>']
Token IDs: [2, 4, 6, 18, 16, 8, 12, 4, 19, 23, 22, 21, 14, 5, 3]
Sentence: The weather is getting cold. Great!
Tokens: ['<s>', 'The', 'weather', 'is', 'getting', 'cold', '.', 'Great', '!', '</s>']
Token IDs: [2, 11, 24, 17, 15, 13, 5, 9, 7, 3]


In [9]:
# Show how the WordPiece tokenizer encodes each training sentence:
# ids come from encode(), token strings from convert_ids_to_tokens().
for sentence in raw_sentences:
    input_ids = wordpiece_tokenizer.encode(sentence)
    tokens = wordpiece_tokenizer.convert_ids_to_tokens(input_ids)
    report = (
        ("Sentence:", sentence),
        ("Tokens:", tokens),
        ("Token IDs:", input_ids),
    )
    for label, value in report:
        print(label, value)

Sentence: I am studying NLP
Tokens: ['<s>', 'I', 'am', 'st', '##ud', '##ying', 'NLP', '</s>']
Token IDs: [2, 8, 55, 71, 78, 80, 94, 3]
Sentence: I am living in Finland and I love walking through the forests.
Tokens: ['<s>', 'I', 'am', 'living', 'in', 'Finland', 'and', 'I', 'love', 'wa', '##lk', '##ing', 'th', '##roug', '##h', 'the', 'fores', '##ts', '.', '</s>']
Token IDs: [2, 8, 55, 98, 67, 92, 63, 8, 99, 74, 89, 53, 72, 87, 40, 73, 96, 77, 5, 3]
Sentence: The weather is getting cold. Great!
Tokens: ['<s>', 'The', 'weat', '##her', 'is', 'gett', '##ing', 'cold', '.', 'Great', '!', '</s>']
Token IDs: [2, 62, 75, 91, 68, 97, 53, 95, 5, 93, 4, 3]


In [10]:
# Compare both tokenizers on a sentence that contains words not seen
# during training.
sample_text = "I want to live and study here."

# Word-Level tokenizer
word_level_output = word_level_tokenizer(sample_text)
word_level_ids = word_level_output['input_ids']
word_level_tokens = word_level_tokenizer.convert_ids_to_tokens(word_level_ids)
print("Word-Level Tokens:", word_level_tokens)
print("Word-Level Token IDs:", word_level_ids)

# WordPiece tokenizer
wordpiece_output = wordpiece_tokenizer(sample_text)
wordpiece_ids = wordpiece_output['input_ids']
wordpiece_tokens = wordpiece_tokenizer.convert_ids_to_tokens(wordpiece_ids)
print("WordPiece Tokens:", wordpiece_tokens)
print("WordPiece Token IDs:", wordpiece_ids)

Word-Level Tokens: ['<s>', 'I', '[UNK]', '[UNK]', '[UNK]', 'and', '[UNK]', '[UNK]', '.', '</s>']
Word-Level Token IDs: [2, 4, 0, 0, 0, 12, 0, 0, 5, 3]
WordPiece Tokens: ['<s>', 'I', 'wa', '##n', '##t', 't', '##o', 'li', '##ve', 'and', 'st', '##ud', '##y', 'h', '##e', '##r', '##e', '.', '</s>']
WordPiece Token IDs: [2, 8, 74, 38, 33, 28, 43, 69, 83, 63, 71, 78, 36, 19, 41, 46, 41, 5, 3]


## LSTM Model

### Import Libraries

In [1]:
import random

import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BertTokenizer

### Define the WikiDataset Class

In [None]:
class WikiDataset(Dataset):
    """
    Masked-word prediction dataset built from raw sentences.

    Each access picks one whitespace-separated word of the sentence at
    random, replaces it with '[MASK]', and returns the tokenized masked
    sentence together with the vocabulary id of the hidden word.

    The tokenizer must be callable (returning 'input_ids' and
    'attention_mask' tensors) and provide tokenize(),
    convert_tokens_to_ids() and unk_token_id.
    """

    def __init__(self, sentences, tokenizer, max_length=512):
        """
        Parameters:
            sentences: sequence of raw sentences (strings).
            tokenizer: tokenizer used for both the input and the target id.
            max_length: length every tokenized sentence is padded or
                truncated to.
        """
        self.tokenizer = tokenizer
        self.sentences = sentences
        self.max_length = max_length
        # Kept only for display. Both lists get one entry per
        # __getitem__ call, in access order.
        # NOTE(review): they grow on every access (every epoch), and with
        # DataLoader worker processes the entries presumably stay in the
        # workers rather than this object - confirm before relying on them.
        self.masked_sentences = []
        self.original_sentences = []

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def _choose_mask_index(self, num_words):
        """Pick the index of the word to mask among num_words words."""
        if num_words == 0:
            raise ValueError("cannot mask a word in an empty sentence")
        if num_words < 3:
            # Too short to spare the first and last word: mask any word
            # instead of failing inside random.randint.
            return random.randrange(num_words)
        # Avoid masking first or last word
        return random.randint(1, num_words - 2)

    def _target_id(self, word):
        """Return the vocabulary id used as the target for a masked word."""
        # Looking the raw word up directly would miss the vocabulary
        # whenever the tokenizer normalises its input (e.g. lowercasing)
        # or splits the word into sub-tokens, so run it through the
        # tokenizer first and use the first resulting piece.
        pieces = self.tokenizer.tokenize(word)
        if not pieces:
            return self.tokenizer.unk_token_id
        return self.tokenizer.convert_tokens_to_ids(pieces[0])

    def __getitem__(self, idx):
        """
        Return (input_ids, attention_mask, target) for sentence idx.

        input_ids and attention_mask are the tokenizer outputs for the
        masked sentence with the batch dimension removed; target is the
        integer id of the first sub-token of the masked word.

        Raises:
            ValueError: if the sentence contains no words.
        """
        original_sentence = self.sentences[idx]
        words = original_sentence.split()
        mask_index = self._choose_mask_index(len(words))
        target = self._target_id(words[mask_index])
        words[mask_index] = '[MASK]'
        masked_sentence = ' '.join(words)

        # Store sentences for display
        self.original_sentences.append(original_sentence)
        self.masked_sentences.append(masked_sentence)

        # Tokenize the masked sentence
        inputs = self.tokenizer(masked_sentence, return_tensors='pt', max_length=self.max_length,
                                padding='max_length', truncation=True)

        input_ids = inputs['input_ids'].squeeze(0)  # Remove the batch dimension
        attention_mask = inputs['attention_mask'].squeeze(0)

        return input_ids, attention_mask, target

    def get_display_sentences(self, index):
        """
        Return (original, masked) for the index-th __getitem__ call.

        The index refers to access order, not to the position of the
        sentence in the dataset.
        """
        return self.original_sentences[index], self.masked_sentences[index]


In [3]:
class MaskedLSTM(nn.Module):
    """
    Bidirectional LSTM that predicts the token hidden behind a mask.

    The input ids are embedded and run through a bidirectional LSTM; the
    LSTM output at the position of the mask token is projected onto the
    vocabulary.
    """

    # Id of '[MASK]' in the 'bert-base-uncased' vocabulary, the tokenizer
    # this notebook loads. Pass mask_token_id explicitly for any other
    # tokenizer.
    DEFAULT_MASK_TOKEN_ID = 103

    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 mask_token_id=DEFAULT_MASK_TOKEN_ID):
        """
        Parameters:
            vocab_size: number of token ids (embedding rows and output logits).
            embedding_dim: size of the token embeddings.
            hidden_dim: hidden size of the LSTM, per direction.
            mask_token_id: id of the mask token to look for in the input.
                None disables the lookup, so the middle position of the
                sequence is always used.
        """
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        # Forward and backward states are concatenated, hence hidden_dim * 2
        self.fc = nn.Linear(hidden_dim * 2, vocab_size)
        self.mask_token_id = mask_token_id

    def forward(self, x):
        """
        Compute vocabulary logits for the masked position of each row.

        Parameters:
            x: integer tensor of token ids, shape (batch, seq_len).

        Returns:
            Tensor of shape (batch, vocab_size).
        """
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)

        batch_size, seq_len = x.size(0), x.size(1)
        # Fallback position: the middle of the sequence.
        positions = torch.full(
            (batch_size,), seq_len // 2, dtype=torch.long, device=x.device
        )
        if self.mask_token_id is not None:
            # Read the hidden state at the actual mask token. A fixed
            # middle index points at padding when a short sentence is
            # padded to a long fixed length.
            is_mask = x == self.mask_token_id
            first_mask = is_mask.long().argmax(dim=1)
            # Rows without any mask token keep the fallback position.
            positions = torch.where(is_mask.any(dim=1), first_mask, positions)

        rows = torch.arange(batch_size, device=x.device)
        mask_out = lstm_out[rows, positions, :]
        logits = self.fc(mask_out)
        return logits


### Load the Wiki Dataset

In [4]:
# Pretrained BERT vocabulary and the raw WikiText-2 corpus
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Keep only training lines with more than five whitespace-separated words
train_sentences = list(
    filter(lambda line: len(line.split()) > 5, dataset['train']['text'])
)

# Masked-word dataset and a shuffled batch loader over it
train_dataset = WikiDataset(train_sentences, tokenizer)
data_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)

# Model hyperparameters; the output layer covers the tokenizer vocabulary
vocab_size = tokenizer.vocab_size
embedding_dim = 128
hidden_dim = 256
model = MaskedLSTM(vocab_size, embedding_dim, hidden_dim)


In [5]:
# Adam with a small weight decay; cross-entropy over the vocabulary logits
optimizer = optim.Adam(
    model.parameters(),
    lr=0.001,
    weight_decay=1e-5,
)
criterion = nn.CrossEntropyLoss()

In [None]:
# Assuming model, optimizer, criterion are already defined
num_epochs = 5
batch_size = 32  # not read in this cell

epoch_progress = tqdm(range(num_epochs), desc="Epochs")
for epoch in epoch_progress:
    # Running sum of the per-batch losses for this epoch
    epoch_loss = 0
    batch_progress = tqdm(data_loader, desc="Training", leave=False)
    for input_ids, attention_mask, targets in batch_progress:
        # Standard step: clear gradients, forward, loss, backward, update.
        # attention_mask is unpacked but not passed to the model.
        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Report the mean loss per batch
    print(f"Epoch {epoch + 1}: Loss {epoch_loss / len(data_loader):.4f}")