# **Install Dependencies**

In [None]:
!pip uninstall -y torch torchvision torchtext torchaudio

In [None]:
!pip install torch==2.1.0+cu121 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121
!pip install torchtext==0.16.0

In [None]:
!pip install "numpy<2"

In [None]:
!pip install torchmetrics

In [None]:
!pip install torchdata



---
# **Import Library**

In [1]:
import torch
import torchtext

In [2]:
print("Torch version:", torch.__version__)
print("Torchtext version:", torchtext.__version__)
print("CUDA available:", torch.cuda.is_available())


Torch version: 2.1.0+cu121
Torchtext version: 0.16.0+cpu
CUDA available: False


---
# **GPU**

In [None]:
!nvidia-smi

Thu Aug 14 18:03:08 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.94                 Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   43C    P3             10W /   35W |       0MiB /   6144MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

---
# **Dataset**

In [2]:
from torchtext import datasets

In [None]:
!pip install portalocker

# Restart the runtime (important, otherwise torchtext still sees None)
import os, sys
os.kill(os.getpid(), 9)



In [3]:
train_set, test_set = datasets.AG_NEWS(root='/content/', split=('train', 'test'))

In [7]:
train_set, test_set

(ShardingFilterIterDataPipe, ShardingFilterIterDataPipe)

- **NLP Datasets:** ``` Iterable DataPipes```
- **Computer Vision Datasets:** ```Map-Style DataPipes```

In [4]:
next(iter(train_set))

(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

In [None]:
train_iter = iter(train_set)
print(train_iter)

<generator object ShardingFilterIterDataPipe.__iter__ at 0x7b38092c3740>


In [None]:
next(train_iter)

(3,
 "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums.")

In [None]:
next(train_iter)

(3,
 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.')

---

# **DataLoader**

---
# **Pre-Process**

> ## Tokenizer



In [None]:
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer("basic_english")

In [None]:
test_sentence = "This is a test sentence."
tokens = tokenizer(test_sentence)
print(tokens)
print("Number of tokens:", len(tokens))

['this', 'is', 'a', 'test', 'sentence', '.']
Number of tokens: 6


In [None]:
split = test_sentence.split()
print(split)
print("Number of split tokens:", len(split))

['This', 'is', 'a', 'test', 'sentence.']
Number of split tokens: 5


As the below Result shows, using the tokenizer ```basic_english``` can tokenize the text very well in comparison to command ```split``` that couldn't tokenize somthing like ., !, ?, ...

> ## Vectorization

In [None]:
from torchtext.vocab import GloVe
from torchtext.vocab import build_vocab_from_iterator

### GloVe

In [None]:
vectorization = GloVe(name="6B", dim=100)
vectorization

<torchtext.vocab.vectors.GloVe at 0x7a672874c490>

In [None]:
vectorization.itos[:11]

['the', ',', '.', 'of', 'to', 'and', 'in', 'a', '"', "'s", 'for']

In [None]:
vectorization.stoi["and"]

5

In [None]:
vectorization.stoi

{'the': 0,
 ',': 1,
 '.': 2,
 'of': 3,
 'to': 4,
 'and': 5,
 'in': 6,
 'a': 7,
 '"': 8,
 "'s": 9,
 'for': 10,
 '-': 11,
 'that': 12,
 'on': 13,
 'is': 14,
 'was': 15,
 'said': 16,
 'with': 17,
 'he': 18,
 'as': 19,
 'it': 20,
 'by': 21,
 'at': 22,
 '(': 23,
 ')': 24,
 'from': 25,
 'his': 26,
 "''": 27,
 '``': 28,
 'an': 29,
 'be': 30,
 'has': 31,
 'are': 32,
 'have': 33,
 'but': 34,
 'were': 35,
 'not': 36,
 'this': 37,
 'who': 38,
 'they': 39,
 'had': 40,
 'i': 41,
 'which': 42,
 'will': 43,
 'their': 44,
 ':': 45,
 'or': 46,
 'its': 47,
 'one': 48,
 'after': 49,
 'new': 50,
 'been': 51,
 'also': 52,
 'we': 53,
 'would': 54,
 'two': 55,
 'more': 56,
 "'": 57,
 'first': 58,
 'about': 59,
 'up': 60,
 'when': 61,
 'year': 62,
 'there': 63,
 'all': 64,
 '--': 65,
 'out': 66,
 'she': 67,
 'other': 68,
 'people': 69,
 "n't": 70,
 'her': 71,
 'percent': 72,
 'than': 73,
 'over': 74,
 'into': 75,
 'last': 76,
 'some': 77,
 'government': 78,
 'time': 79,
 '$': 80,
 'you': 81,
 'years': 82,
 'i

### Giving Tokens to GloVe


In [None]:
vectorization.get_vecs_by_tokens(tokens)

tensor([[-5.7058e-01,  4.4183e-01,  7.0102e-01, -4.1713e-01, -3.4058e-01,
          2.3390e-02, -7.1537e-02,  4.8177e-01, -1.3121e-02,  1.6834e-01,
         -1.3389e-01,  4.0626e-02,  1.5827e-01, -4.4342e-01, -1.9403e-02,
         -9.6610e-03, -4.6284e-02,  9.3228e-02, -2.7331e-01,  2.2850e-01,
          3.3089e-01, -3.6474e-01,  7.8741e-02,  3.5850e-01,  4.4757e-01,
         -2.2990e-01,  1.8077e-01, -6.2650e-01,  5.3852e-02, -2.9154e-01,
         -4.2560e-01,  6.2903e-01,  1.4393e-01, -4.6004e-02, -2.1007e-01,
          4.8879e-01, -5.7698e-02,  3.7431e-01, -3.0075e-02, -3.4494e-01,
         -2.9702e-01,  1.5095e-01,  2.8248e-01, -1.6578e-01,  7.6131e-02,
         -9.3016e-02,  7.9365e-01, -6.0489e-01, -1.8874e-01, -1.0173e+00,
          3.1962e-01, -1.6344e-01,  5.4177e-01,  1.1725e+00, -4.7875e-01,
         -3.3842e+00, -8.1301e-02, -3.5280e-01,  1.8372e+00,  4.4516e-01,
         -5.2666e-01,  9.9786e-01, -3.2178e-01,  3.3462e-02,  1.1783e+00,
         -7.2905e-02,  3.9737e-01,  2.

In [None]:
vectorization.get_vecs_by_tokens(tokens).shape

torch.Size([6, 100])

we had a sentence of "This is a test sentence." that the tokenizer, tokenize it to 6 token. Also we choose the "dim=100" for the GloVe.

So the output correctly shows the sape of vectorization [6, 100].


### Check the Vectorization Results of different Words

In [None]:
vec_boy = vectorization.get_vecs_by_tokens('boy')
vec_girl = vectorization.get_vecs_by_tokens('girl')
vec_street = vectorization.get_vecs_by_tokens('street')

In [None]:
import torch.nn.functional as F
print("--- Cosine Similarity ---")
print("boy VS. girl:", F.cosine_similarity(vec_boy, vec_girl, dim=0))
print("boy VS. street:", F.cosine_similarity(vec_boy, vec_street, dim=0))
print("girl VS. street:", F.cosine_similarity(vec_girl, vec_street, dim=0))

--- Cosine Similarity ---
boy VS. girl: tensor(0.9176)
boy VS. street: tensor(0.3859)
girl VS. street: tensor(0.3354)


As the results show, the vectorization outputs are correct because the `boy` and the `girl` are more similar to each other compared to the `street`, since both are human/family-related.

> ## Transform ( Optional - just for Test )

In [None]:
import torchtext.transforms as T

- ### BERT Tokenizer

In [None]:
VOCAB_FILE = "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt"

tokenizer_BERT = T.BERTTokenizer(vocab_path=VOCAB_FILE,
                            do_lower_case=True)

print("\n",tokenizer_BERT('Hello World, How are you!'))
print(tokenizer_BERT(['Hello World, How are you!', 'hi there.']))

100%|██████████| 232k/232k [00:00<00:00, 5.42MB/s]


 ['7592', '2088', '1010', '2129', '2024', '2017', '999']
[['7592', '2088', '1010', '2129', '2024', '2017', '999'], ['7632', '2045', '1012']]





In [None]:
tokenizer_BERT = T.BERTTokenizer(vocab_path=VOCAB_FILE,
                            do_lower_case=True,
                            return_tokens=True)

print(tokenizer_BERT('Hello World, How are you!'))
print(tokenizer_BERT(['Hello World, How are you!', 'hi there.']))

100%|██████████| 232k/232k [00:00<00:00, 4.61MB/s]


['hello', 'world', ',', 'how', 'are', 'you', '!']
[['hello', 'world', ',', 'how', 'are', 'you', '!'], ['hi', 'there', '.']]


- ### ToTensor

In [None]:
sent1 = 'hi there.'
sent2 = 'Hello World, How are you!'

In [None]:
tokens = tokenizer_BERT([sent1, sent2])

token_ids1 = [vectorization.stoi[token] for token in tokens[0]]
token_ids2 = [vectorization.stoi[token] for token in tokens[1]]
token_ids = [token_ids1, token_ids2]
token_ids

[[11083, 63, 2], [13075, 85, 1, 197, 32, 81, 805]]

In [None]:
to_tensor = T.ToTensor(padding_value=0)
to_tensor(token_ids)

tensor([[11083,    63,     2,     0,     0,     0,     0],
        [13075,    85,     1,   197,    32,    81,   805]])

- ### Truncate

In [None]:
print(tokens[1])
T.Truncate(max_seq_len=3)(tokens[1])

['hello', 'world', ',', 'how', 'are', 'you', '!']


['hello', 'world', ',']

- ### Sequential

In [None]:
tr = T.Sequential(T.BERTTokenizer(vocab_path=VOCAB_FILE,
                            do_lower_case=True,
                            return_tokens=True),
                  T.Truncate(max_seq_len=3))
tr

100%|██████████| 232k/232k [00:00<00:00, 5.49MB/s]


Sequential(
  (0): BERTTokenizer()
  (1): Truncate()
)

In [None]:
print(sent2)
tr(sent2)

Hello World, How are you!


['hello', 'world', ',']

> ## Utils - Ngrams

In [None]:
from torchtext.data.utils import ngrams_iterator

In [None]:
sent3 = 'Return an iterator that yields the given tokens and their ngrams.'
tokens = tokenizer_BERT(sent3)
print(tokens)

['return', 'an', 'it', '##era', '##tor', 'that', 'yields', 'the', 'given', 'token', '##s', 'and', 'their', 'ng', '##ram', '##s', '.']


In [None]:
sent3 = 'Return an iterator that yields the given tokens and their ngrams.'
tokens = tokenizer(sent3)
print(tokens)

['return', 'an', 'iterator', 'that', 'yields', 'the', 'given', 'tokens', 'and', 'their', 'ngrams', '.']


In [None]:
ngrams_iterator(tokens, ngrams=3)

<generator object ngrams_iterator at 0x7a671c61f740>

In [None]:
list(ngrams_iterator(tokens, ngrams=3))

['return',
 'an',
 'iterator',
 'that',
 'yields',
 'the',
 'given',
 'tokens',
 'and',
 'their',
 'ngrams',
 '.',
 'return an',
 'an iterator',
 'iterator that',
 'that yields',
 'yields the',
 'the given',
 'given tokens',
 'tokens and',
 'and their',
 'their ngrams',
 'ngrams .',
 'return an iterator',
 'an iterator that',
 'iterator that yields',
 'that yields the',
 'yields the given',
 'the given tokens',
 'given tokens and',
 'tokens and their',
 'and their ngrams',
 'their ngrams .']