In [2]:
import os

# Path to the dataset file
data_file_path = os.path.join("data", "data.txt")

# Read the poem dataset
with open(data_file_path, "r") as file:
    poem_text = file.read()

print("Loaded Poem Dataset:")
print(poem_text)


Loaded Poem Dataset:
Roses are red,
Violets are blue,
The sky is clear,
And so are you.

The moon is bright,
The stars do shine,
The night is calm,
A perfect sign.


In [53]:
import os
data_file_path = os.path.join("data", "data.txt")
with open(data_file_path, "r") as file:
    poem_text = file.read()
words = poem_text.split(" ")
len(words)
set_word = set(words)




In [61]:
import numpy as np
from collections import Counter

def get_pair_frequencies(tokens):
    """Compute frequencies of adjacent token pairs."""
    pairs = [tuple(tokens[i:i+2]) for i in range(len(tokens) - 1)]
    return Counter(pairs)

def merge_pair(tokens, pair, new_token):
    """Merge the most frequent pair in the token list."""
    merged_tokens = []
    i = 0
    while i < len(tokens):
        if i < len(tokens) - 1 and (tokens[i], tokens[i+1]) == pair:
            merged_tokens.append(new_token)
            i += 2  # Skip next token as it's merged
        else:
            merged_tokens.append(tokens[i])
            i += 1
    return merged_tokens

def byte_pair_encoding(text, num_merges):
    """Train BPE and return tokens, vocab, and learned merges."""
    tokens = list(text)  # Initialize as character tokens
    vocab = set(tokens)  # Initial vocabulary
    merges = {}  # Store merge rules

    for _ in range(num_merges):
        pair_freqs = get_pair_frequencies(tokens)
        if not pair_freqs:
            break
        most_frequent_pair = max(pair_freqs, key=pair_freqs.get)

        new_token = "".join(most_frequent_pair)  # Merge pair into a new token
        vocab.add(new_token)  # Add new token to vocabulary
        merges[most_frequent_pair] = new_token  # Store merge step

        tokens = merge_pair(tokens, most_frequent_pair, new_token)

    return tokens, vocab, merges

def tokenize_with_bpe(text, merges):
    """Tokenizes text using learned BPE merges."""
    tokens = list(text)  # Start with character-level tokens

    # Apply stored merges in order
    while True:
        pair_freqs = get_pair_frequencies(tokens)
        if not pair_freqs:
            break
        
        merge_candidates = [(pair, merges[pair]) for pair in pair_freqs if pair in merges]
        if not merge_candidates:
            break

        # Apply the first merge in stored order
        pair_to_merge, new_token = merge_candidates[0]
        tokens = merge_pair(tokens, pair_to_merge, new_token)

    return tokens

# Example usage
text = poem_text
num_merges = 300
tokens, vocab, merges = byte_pair_encoding(text, num_merges)
toks = tokenize_with_bpe(text=poem_text, merges=merges)
print(toks)
tok_idx = [tokens.index(i) for i in toks]
# print("Final Tokens:", tokens)
# print("Final Vocabulary:", vocab)


['I', 's ', 'th', 'i', 's ', 'th', 'e ', 'rea', 'l', ' ', 'life ', 'I', 's ', 'th', 'i', 's ', 'just ', 'f', 'an', 't', 'as', 'y\n', 'C', 'a', 'u', 'ght ', 'in', ' a', ' ', 'l', 'and', 's', 'li', 'd', 'e, ', 'no', ' ', 'es', 'ca', 'p', 'e ', 'from', ' rea', 'li', 't', 'y\n', 'O', 'p', 'en ', 'your ', 'ey', 'es', ', ', 'lo', 'o', 'k ', 'up', ' to', ' ', 'th', 'e ', 's', 'k', 'i', 'es ', 'and', ' s', 'ee', '\n', "I'm just a poor bo", 'y, ', 'I ', 'n', 'ee', 'd ', 'no', ' s', 'y', 'm', 'p', 'a', 'th', 'y\n', 'B', 'e', 'ca', 'us', 'e ', "I'm ", 'easy ', 'come, easy go, ', 'little ', 'hi', 'gh', ', ', 'little ', 'low', '\n', 'Any w', 'a', 'y ', 'th', 'e ', 'w', 'in', 'd ', 'b', 'low', 's do', 'es', "n't ", 'rea', 'll', 'y ', 'm', 'a', 'tt', 'er', ' to', ' me, ', 't', 'o me\n', '\nMama, ', 'just ', 'k', 'ill', 'e', 'd', ' a', ' m', 'an', '\n', 'P', 'ut ', 'a ', 'gun', ' again', 's', 't ', 'his ', 'h', 'ead', ', ', 'p', 'u', 'll', 'e', 'd ', 'm', 'y t', 'ri', 'g', 'g', 'er', ', no', 'w ', 'h'

ValueError: 'gh' is not in list

In [62]:
len(vocab)

352

In [63]:
tokens

['Is this the real life Is this just fantasy\nCaught in',
 ' ',
 'a ',
 'l',
 'and',
 's',
 'li',
 'd',
 'e, ',
 'no',
 ' ',
 'es',
 'ca',
 'p',
 'e ',
 'from',
 ' rea',
 'li',
 't',
 'y\n',
 'O',
 'p',
 'en ',
 'your ',
 'ey',
 'es',
 ', ',
 'lo',
 'o',
 'k ',
 'up',
 ' to',
 ' the ',
 's',
 'k',
 'i',
 'es ',
 'and s',
 'ee\n',
 "I'm just a poor bo",
 'y, ',
 'I ',
 'n',
 'ee',
 'd ',
 'no',
 ' s',
 'y',
 'm',
 'p',
 'a',
 'th',
 'y\n',
 'B',
 'e',
 'ca',
 'us',
 'e ',
 "I'm ",
 'easy ',
 'come, easy go, ',
 'little ',
 'hi',
 'g',
 'h, ',
 'little ',
 'low',
 '\n',
 'Any way the wind blow',
 's do',
 'es',
 "n't ",
 'rea',
 'lly matter',
 ' to',
 ' me, ',
 't',
 'o me\n',
 '\nMama, ',
 'just ',
 'k',
 'ill',
 'e',
 'd ',
 'a m',
 'an',
 '\n',
 'P',
 'ut ',
 'a ',
 'gun',
 ' again',
 's',
 't ',
 'his ',
 'h',
 'ead',
 ', ',
 'p',
 'u',
 'll',
 'e',
 'd',
 ' m',
 'y t',
 'ri',
 'g',
 'g',
 'er',
 ', no',
 'w ',
 'h',
 "e's ",
 'd',
 'ead',
 '\nMama, ',
 'life ',
 'ha',
 'd ',
 'just 

In [60]:
import pandas as pd
df = pd.read_csv("/Users/hamza.mahmood/Developer/deep_learning_course/LSTM-from-Scratch-using-Numpy/data/5/csv/PostMalone.csv")
# combined_string = "\n".join(df["Lyric"].astype(str))
# print(combined_string)
df

Unnamed: 0.1,Unnamed: 0,Artist,Title,Album,Year,Date,Lyric
0,0,Post Malone,​​rockstar,beerbongs & bentleys,2017.0,2017-09-15,post malone hahahahaha tank god ayy ayy post...
1,1,Post Malone,White Iverson,Stoney (Deluxe),2015.0,2015-02-04,double ot i'm a new three saucin' saucin' i'...
2,2,Post Malone,Congratulations,Stoney (Deluxe),2016.0,2016-11-04,post malone mmmmm yeah yeah mmmmm yeah hey p...
3,3,Post Malone,Psycho,beerbongs & bentleys,2018.0,2018-02-23,post malone damn my ap goin' psycho lil' mama ...
4,4,Post Malone,I Fall Apart,Stoney (Deluxe),2016.0,2016-12-09,ooh i fall apart ooh yeah mmm yeah she told ...
...,...,...,...,...,...,...,...
143,143,Post Malone,Lithium,,2020.0,2020-04-24,i'm so happy 'cause today i found my friends t...
144,144,Post Malone,Something in the Way,,2020.0,2020-04-24,verse underneath the tarp has sprung a leak a...
145,145,Post Malone,In Bloom,,2020.0,2020-04-24,sell the kids for food weather changes moods s...
146,146,Post Malone,Territorial Pissings,,2020.0,2020-04-24,when i was an alien cultures weren't opinions ...


In [58]:
with open("/Users/hamza.mahmood/Developer/deep_learning_course/LSTM-from-Scratch-using-Numpy/data/postmalone.txt", "w") as f:
    f.write(combined_string)

In [98]:
from src.basic import BasicTokenizer
tokenizer = BasicTokenizer()
text = poem_text
tokenizer.train(text, 256 + 200) # 256 are the byte tokens, then do 3 merges
tokens = tokenizer.encode(poem_text)
# print(tokenizer.decode(tokens[5:10]))
# print(len(tokens))

for tok in set(tokens):
    print(tok, tokenizer.decode([tok]))
# tokenizer.save("toy")
# writes two files: toy.model (for loading) and toy.vocab (for viewing)

10 

32  
40 (
65 A
66 B
67 C
68 D
69 E
70 F
71 G
72 H
73 I
74 J
76 L
78 N
79 O
80 P
83 S
84 T
87 W
89 Y
91 [
93 ]
97 a
98 b
99 c
100 d
101 e
102 f
103 g
104 h
105 i
107 k
108 l
109 m
110 n
111 o
112 p
114 r
115 s
116 t
117 u
118 v
119 w
121 y
122 z
256 , 
257 t 
258 e 
259  m
260  t
261 ou
262 ll
263 hi
264 go
265 an
266 no
267 er
268 ea
269 li
270 s 
271 you
272 y 
273 et 
274 e

275 or
276 a 
277 d 
278 tt
279 e, 
280 wi
281 you 
282 om
283 h, 
284 am
285 me 
286 o 
287 us
288 ll 
289 on
290 ver
291 go

292  th
293 ust 
294 y

295 in
296 lo
297 or 
298 , no
299 ar
300 hin
301 ) 
303  the 
304 rea
305 just 
306 I'
308 bo
309  ma
310  s
312 a, 
313  a
314 ve 
315 not 
316 ever
317 s t
318 gh
319  rea
320 and 
324 lly matter
325  me

326 Mam
327 ut 
328 hing
329 oh
330 will 
334 Galile
338 not let you go
(
339 go) 
340 , no, no
341 es
342 fr
343 en
344 I'm 
347 a poor 
348 y, 
349 sy 
350 com
351 low
352 do
353 't 
354  me, 
355 Mama, 
356 ill
357 a m
358 ro
361 thing really matter
362

In [97]:
set(tokenizer.encode(poem_text))

{10,
 32,
 40,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 76,
 78,
 79,
 80,
 83,
 84,
 87,
 89,
 91,
 93,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 107,
 108,
 109,
 110,
 111,
 112,
 114,
 115,
 116,
 117,
 118,
 119,
 121,
 122,
 256,
 257,
 258,
 259,
 260,
 261,
 262,
 263,
 264,
 265,
 266,
 267,
 268,
 269,
 270,
 271,
 272,
 273,
 274,
 275,
 276,
 277,
 278,
 279,
 280,
 281,
 282,
 283,
 284,
 285,
 286,
 287,
 288,
 289,
 290,
 291,
 292,
 293,
 294,
 295,
 296,
 297,
 298,
 299,
 300,
 301,
 303,
 304,
 305,
 306,
 308,
 309,
 310,
 312,
 313,
 314,
 315,
 316,
 317,
 318,
 319,
 320,
 324,
 325,
 326,
 327,
 328,
 329,
 330,
 334,
 338,
 339,
 340,
 341,
 342,
 343,
 344,
 347,
 348,
 349,
 350,
 351,
 352,
 353,
 354,
 355,
 356,
 357,
 358,
 361,
 362,
 363,
 364,
 365,
 366,
 367,
 368,
 369,
 371,
 372,
 373,
 375,
 376,
 377,
 378,
 379,
 380,
 381,
 383,
 384,
 385,
 386,
 389,
 399,
 400,
 401,
 402,
 403,
 404,
 405,
 406,
 407,
 408,
 409,
 410,


In [95]:
import numpy as np

vocab_size = 5
probs = np.array([0.1, 0.2, 0.3, 0.1, 0.3])/0.7  # Probabilities for each token

next_idx = np.random.choice(range(vocab_size), p=probs.ravel())
print("Chosen index:", next_idx)


ValueError: probabilities do not sum to 1