In [None]:
from neuralnetlib.preprocessing import NGram, CountVectorizer, Tokenizer

import re
import numpy as np

## N-Gram

In [2]:
# Load the training names, one per line of dinos.txt.
# Blank / whitespace-only lines are skipped so that no empty-string "name"
# ends up in the list handed to the model later on.
with open('dinos.txt', 'r', encoding='utf-8') as f:
    stripped_lines = (line.strip() for line in f)
    names = [name for name in stripped_lines if name]

# Sanity check: show the first 5 names to confirm the file was read correctly.
print(names[:5])

['Aachenosaurus', 'Aardonyx', 'Abdallahsaurus', 'Abelisaurus', 'Abrictosaurus']


In [9]:
# Fit an n=3 model over character tokens on the loaded names.
model = NGram(n=3, token_type="char")
model.fit(names)

print("Generated names using the model:\n")

# NOTE(review): generation looks stochastic and no seed is set here, so the
# names (and the outcome of the check below) may differ between runs — confirm
# how NGram draws its samples before relying on reproducible output.
generated_names = model.generate_sequences(
    n_sequences=10,
    min_length=5,
    max_length=15
)

for name in generated_names:
    print(name)

# "Unique" below means "not copied from the training data": every generated
# name must be absent from `names`. A set gives constant-time membership tests
# instead of scanning the whole list once per generated name.
known_names = set(names)
memorized = [name for name in generated_names if name in known_names]
assert not memorized, (
    f"Generated names already present in the training data: {memorized}"
)
print("\nAll generated names are unique!")

Generated names using the model:

Limicosaurus
Sucaraptosaurus
Kitang
Kitalchecon
Walus
Notitaveirus
Elmaryosaurus
Eocephosaurus
Koptosaurus
Salasaurus

All generated names are unique!


## Count Vectorizer

In [None]:
# Corpus used to fit the vectorizer.
documents = [
    "The quick brown fox jumps over the lazy dog",
    "Never jump over the lazy dog quickly",
    "The quick brown fox is very quick",
]

vectorizer = CountVectorizer(
    lowercase=True,
    max_df=1.0,
    min_df=1,
    max_features=None,
)

# Learn the vocabulary and build the document-term count matrix in one step.
X = vectorizer.fit_transform(documents)

print("Vocabulary:", vectorizer.get_vocabulary(), sep="\n")
print("\nFeature Names:", vectorizer.get_feature_names_out(), sep="\n")
print("\nTransformed Matrix:", X, sep="\n")

# Re-use the already fitted vocabulary on documents that were not part of fit.
new_documents = [
    "The quick brown fox",
    "The lazy dog sleeps",
]
X_new = vectorizer.transform(new_documents)

print("\nTransformed New Documents:", X_new, sep="\n")

Vocabulary:
{'brown': 0, 'dog': 1, 'fox': 2, 'is': 3, 'jump': 4, 'jumps': 5, 'lazy': 6, 'never': 7, 'over': 8, 'quick': 9, 'quickly': 10, 'the': 11, 'very': 12}

Feature Names:
['brown' 'dog' 'fox' 'is' 'jump' 'jumps' 'lazy' 'never' 'over' 'quick'
 'quickly' 'the' 'very']

Transformed Matrix:
[[1 1 1 0 0 1 1 0 1 1 0 2 0]
 [0 1 0 0 1 0 1 1 1 0 1 1 0]
 [1 0 1 1 0 0 0 0 0 2 0 1 1]]

Transformed New Documents:
[[1 0 1 0 0 0 0 0 0 1 0 1 0]
 [0 1 0 0 0 0 1 0 0 0 0 1 0]]


## Tokenizer

In [11]:
texts = [
    "The quick brown fox jumps over the lazy dog.",
    "Never jump over the lazy dog quickly.",
    "The quick brown fox is very quick."
]


def _demo_tokenizer(texts, mode, label=None, **tokenizer_kwargs):
    """Fit a Tokenizer in the given mode and print a round-trip report.

    Prints the encoded sequences, the vocabulary size and the text decoded
    back from those sequences, followed by a blank line.

    Args:
        texts: List of strings used both for fitting and for encoding.
        mode: Value forwarded to ``Tokenizer(mode=...)``.
        label: Name shown in the "Sequences (... mode)" line; defaults to
            ``mode``.
        **tokenizer_kwargs: Extra keyword arguments forwarded to ``Tokenizer``
            (e.g. ``bpe_merges``).

    Returns:
        A ``(tokenizer, sequences)`` tuple: the fitted tokenizer and the
        result of ``texts_to_sequences(texts)``.
    """
    if label is None:
        label = mode

    print(f"Testing '{mode}' mode:")
    tokenizer = Tokenizer(mode=mode, **tokenizer_kwargs)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    print(f"Sequences ({label} mode):", sequences)
    print("Vocabulary size:", tokenizer.get_vocab_size())
    # The trailing "\n" argument leaves a blank line between the reports.
    print("Reconstructed text:", tokenizer.sequences_to_texts(sequences), "\n")
    return tokenizer, sequences


# The same top-level names as before stay available for later cells.
tokenizer_char, char_sequences = _demo_tokenizer(texts, 'char')
tokenizer_word, word_sequences = _demo_tokenizer(texts, 'word')
tokenizer_bpe, bpe_sequences = _demo_tokenizer(
    texts, 'bpe', label='BPE', bpe_merges=10
)

Testing 'char' mode:
Sequences (char mode): [[2, 10, 11, 5, 4, 12, 7, 9, 13, 14, 4, 19, 8, 6, 20, 17, 4, 21, 6, 22, 4, 23, 7, 24, 25, 26, 4, 6, 15, 5, 8, 4, 10, 11, 5, 4, 18, 27, 28, 16, 4, 29, 6, 30, 1, 3], [2, 17, 5, 15, 5, 8, 4, 23, 7, 24, 25, 4, 6, 15, 5, 8, 4, 10, 11, 5, 4, 18, 27, 28, 16, 4, 29, 6, 30, 4, 12, 7, 9, 13, 14, 18, 16, 1, 3], [2, 10, 11, 5, 4, 12, 7, 9, 13, 14, 4, 19, 8, 6, 20, 17, 4, 21, 6, 22, 4, 9, 26, 4, 15, 5, 8, 16, 4, 12, 7, 9, 13, 14, 1, 3]]
Vocabulary size: 31
Reconstructed text: ['<SOS>the quick brown fox jumps over the lazy dog<UNK><EOS>', '<SOS>never jump over the lazy dog quickly<UNK><EOS>', '<SOS>the quick brown fox is very quick<UNK><EOS>'] 

Testing 'word' mode:
Sequences (word mode): [[2, 4, 5, 6, 7, 11, 8, 4, 9, 1, 3], [2, 12, 13, 8, 4, 9, 10, 1, 3], [2, 4, 5, 6, 7, 15, 16, 1, 3]]
Vocabulary size: 17
Reconstructed text: ['<SOS> the quick brown fox jumps over the lazy <UNK> <EOS>', '<SOS> never jump over the lazy dog <UNK> <EOS>', '<SOS> the quick bro