# Question 1: Simple tokenizer

Write the code to 
(i) split text into characters?
(ii) split text into words 

sample_text = "Fulbright University Vietnam is a good school"

One text can be splitted into many token (one to many)

In [None]:
sample_text = "Fulbright University Vietnam is a good school"

# Word-level tokenization
word_tokens = sample_text.split()
print("Word-level tokens:")
print(word_tokens)
print(f"Number of word tokens: {len(word_tokens)}\n")

# Character-level tokenization
character_tokens = list(sample_text)
print("Character-level tokens:")
print(character_tokens)
print(f"Number of character tokens: {len(character_tokens)}")


['Fulbright', 'Univsersity', 'Vietnam', 'is', 'a', 'good', 'school']
['F', 'u', 'l', 'b', 'r', 'i', 'g', 'h', 't', ' ', 'U', 'n', 'i', 'v', 's', 'e', 'r', 's', 'i', 't', 'y', ' ', 'V', 'i', 'e', 't', 'n', 'a', 'm', ' ', 'i', 's', ' ', 'a', ' ', 'g', 'o', 'o', 'd', ' ', 's', 'c', 'h', 'o', 'o', 'l']


# Question 2: How to handle special characters?

Chang the code to tokenize this sentence "Fulbright Univerisity Vietnam is a good school. However, it is too small."

In [5]:
import re

sentences = "Fulbright Univerisity Vietnam is a good school. However, it is too small."

# Sentence-level tokenization
sentence_tokens = re.split(r'[.!?]+', sentences)
sentence_tokens = [s.strip() for s in sentence_tokens if s.strip()]
print("Sentence-level tokens:")
print(sentence_tokens)
print(f"Number of sentences: {len(sentence_tokens)}\n")

# Word-level tokenization
word_tokens = re.findall(r'\b\w+\b', sentences)
print("Word-level tokens:")
print(word_tokens)
print(f"Number of word tokens: {len(word_tokens)}\n")

# Character-level tokenization
character_tokens = list(sentences)
print("Character-level tokens:")
print(character_tokens)
print(f"Number of character tokens: {len(character_tokens)}")


Sentence-level tokens:
['Fulbright Univerisity Vietnam is a good school', 'However, it is too small']
Number of sentences: 2

Word-level tokens:
['Fulbright', 'Univerisity', 'Vietnam', 'is', 'a', 'good', 'school', 'However', 'it', 'is', 'too', 'small']
Number of word tokens: 12

Character-level tokens:
['F', 'u', 'l', 'b', 'r', 'i', 'g', 'h', 't', ' ', 'U', 'n', 'i', 'v', 'e', 'r', 'i', 's', 'i', 't', 'y', ' ', 'V', 'i', 'e', 't', 'n', 'a', 'm', ' ', 'i', 's', ' ', 'a', ' ', 'g', 'o', 'o', 'd', ' ', 's', 'c', 'h', 'o', 'o', 'l', '.', ' ', 'H', 'o', 'w', 'e', 'v', 'e', 'r', ',', ' ', 'i', 't', ' ', 'i', 's', ' ', 't', 'o', 'o', ' ', 's', 'm', 'a', 'l', 'l', '.']
Number of character tokens: 73


# Question 3: How to handle complex patterns?

Chang the code to tokenize this sentence "Fulbright Univerisity Vietnam is a good school. However, it is too small. Other schools (e.g. FTU, FPT, ...) are also good."

In [10]:
import re

text = "Fulbright Univerisity Vietnam is a good school. However, it is too small. Other schools (e.g. FTU, FPT, ...) are also good."

# Sentence-level tokenization (handle abbreviations better)
# Split on period followed by space and capital letter, or other punctuation
sentence_tokens = re.split(r'(?<=[.!?])\s+(?=[A-Z])', text)
print("Sentence-level tokens:")
for i, sent in enumerate(sentence_tokens, 1):
    print(f"  {i}. {sent}")
print(f"Number of sentences: {len(sentence_tokens)}\n")

# Word-level tokenization (keep alphanumeric, handles abbreviations and parentheses)
word_tokens = re.findall(r'\b\w+(?:\.\w+)?\b|\.\.\.|[()]', text)
print("Word-level tokens:")
print(word_tokens)
print(f"Number of word tokens: {len(word_tokens)}\n")

# Word tokens without special characters (but keeping abbreviations like e.g)
word_tokens_clean = re.findall(r'\b\w+(?:\.\w+)*\b', text)
print("Word-level tokens (clean - keeps abbreviations like e.g):")
print(word_tokens_clean)
print(f"Number of clean word tokens: {len(word_tokens_clean)}\n")

# Character-level tokenization
character_tokens = list(text)
print("Character-level tokens:")
print(character_tokens)
print(f"Number of character tokens: {len(character_tokens)}")


Sentence-level tokens:
  1. Fulbright Univerisity Vietnam is a good school.
  2. However, it is too small.
  3. Other schools (e.g.
  4. FTU, FPT, ...) are also good.
Number of sentences: 4

Word-level tokens:
['Fulbright', 'Univerisity', 'Vietnam', 'is', 'a', 'good', 'school', 'However', 'it', 'is', 'too', 'small', 'Other', 'schools', '(', 'e.g', 'FTU', 'FPT', '...', ')', 'are', 'also', 'good']
Number of word tokens: 23

Word-level tokens (clean - keeps abbreviations like e.g):
['Fulbright', 'Univerisity', 'Vietnam', 'is', 'a', 'good', 'school', 'However', 'it', 'is', 'too', 'small', 'Other', 'schools', 'e.g', 'FTU', 'FPT', 'are', 'also', 'good']
Number of clean word tokens: 20

Character-level tokens:
['F', 'u', 'l', 'b', 'r', 'i', 'g', 'h', 't', ' ', 'U', 'n', 'i', 'v', 'e', 'r', 'i', 's', 'i', 't', 'y', ' ', 'V', 'i', 'e', 't', 'n', 'a', 'm', ' ', 'i', 's', ' ', 'a', ' ', 'g', 'o', 'o', 'd', ' ', 's', 'c', 'h', 'o', 'o', 'l', '.', ' ', 'H', 'o', 'w', 'e', 'v', 'e', 'r', ',', ' ', '

# Question 4: GPT-2 tokenizer

In [2]:
from transformers import GPT2Tokenizer

# Load pretrained GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Step 1: Take input string
text = "Fulbright University Vietnam(FUV) is the best"

# Step 2 and 3: Encode text into token IDs
token_ids = tokenizer.encode(text)

# Optional: decode back into tokens for clarity
tokens = [tokenizer.decode([tid]) for tid in token_ids]

print("Text:", text)
print("Tokens:", tokens)
print("Token IDs:", token_ids)

Text: Fulbright University Vietnam(FUV) is the best
Tokens: ['F', 'ul', 'bright', ' University', ' Vietnam', '(', 'F', 'UV', ')', ' is', ' the', ' best']
Token IDs: [37, 377, 29199, 2059, 10836, 7, 37, 31667, 8, 318, 262, 1266]


# Question 4

In [7]:
pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.12.0-cp311-cp311-macosx_10_12_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.12.0-cp311-cp311-macosx_10_12_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m7.9 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.12.0
Note: you may need to restart the kernel to use updated packages.


In [8]:
import tiktoken
# Compare GPT-2 vs GPT-4 tokenization
enc_gpt2 = tiktoken.get_encoding("gpt2")
enc_gpt4 = tiktoken.get_encoding("cl100k_base")

example = "Fulbright University Vietnam is the best!"

tokens_gpt2 = enc_gpt2.encode(example)
tokens_gpt4 = enc_gpt4.encode(example)

print(f"GPT-2 tokens: {len(tokens_gpt2)}")
print(f"GPT-4 tokens: {len(tokens_gpt4)}")

print(tokens_gpt2)
print(tokens_gpt4)

GPT-2 tokens: 9
GPT-4 tokens: 9
[37, 377, 29199, 2059, 10836, 318, 262, 1266, 0]
[37, 360, 73216, 3907, 23315, 374, 279, 1888, 0]
