<a href="https://colab.research.google.com/github/nexageapps/LLM/blob/main/LLM_%7C_Basic_%7C_L1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Author: Karthik Arjun
# LinkedIn: https://www.linkedin.com/in/karthik-arjun-a5b4a258/
# Comment: Beginner level understanding about LLM
# Book Reference: Build a LLM - By Sebastian
# Chapter - #2: Working with text data

#Step-1: Import the library functions
import os #Helps interact with the operating system — like checking if a file exists.
import urllib.request #Allows you to download files and interact with URLs.
import re #Python’s regular expressions module.

#Step-2: Reference
# Download the sample text only when no local copy exists yet.
file_path = "the-verdict.txt"
if not os.path.exists(file_path): #File exist??
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    urllib.request.urlretrieve(url, file_path) #This downloads the file from the URL and saves it as the-verdict.txt in your local folder

#Step-3: Action : Create Tokens
# Read the file unconditionally. The read must NOT be nested under the `if`
# above: otherwise raw_text is only assigned when the file had to be
# downloaded, and a later run (file already on disk) fails with a NameError.
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

#open(...) opens the file in read mode ("r").
#The with statement makes sure the file is automatically closed after you're done using it
#f is the file object.
#.read() reads the entire contents of the file into a variable named raw_text.

# Tokenize the raw text. The pattern splits on punctuation, "--" and any
# whitespace character; because it is wrapped in a capture group, re.split
# keeps the delimiters in its result. Each piece is then stripped, and pieces
# that end up empty (whitespace delimiters, empty strings) are discarded.
preprocessed = list(
    filter(None, map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)))
)


#Step-4: Execution #1
# Report the character count, then (one per line) the first 99 characters,
# the first 30 tokens, and the total number of tokens.
print("Total number of character:", len(raw_text))
print(raw_text[:99], preprocessed[:30], len(preprocessed), sep="\n")

#Step-6: Action: Create Token ID - Encoder
# set() removes duplicate tokens; sorted() puts the unique tokens in a fixed order.
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

# Map every unique token (word or punctuation mark) to a unique integer ID:
# the token's position in the sorted list.
vocab = dict(zip(all_words, range(vocab_size)))

# Preview the first 51 (token, id) pairs of the vocabulary.
for entry in list(vocab.items())[:51]:
    print(entry)

#Step-7: Encode a new sentence with the vocabulary, then decode it back

# Sample input sentence
sample_text = "I had come"

# Tokenize the sample exactly like the training text: split on punctuation,
# "--" and whitespace, strip each piece, and drop the empty ones.
input_tokens = list(
    filter(None, map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', sample_text)))
)
print("Input Tokens:", input_tokens)

# Encode: look every token up in vocab. Tokens that are not in the vocabulary
# yield None from vocab.get and are skipped.
encoded = [
    token_id
    for token_id in map(vocab.get, input_tokens)
    if token_id is not None
]
print("Encoded IDs:", encoded)

# Reverse mapping: integer ID -> token
id2token = dict(zip(vocab.values(), vocab.keys()))

# Decode: turn each ID back into its token and join the tokens with spaces.
decoded = [id2token[token_id] for token_id in encoded]
print("Decoded Text:", " ".join(decoded))

#Step-Last: Questions

#Q1: The text has 20479 characters in total, so why are there only 4690 tokens?
#A1: Characters ≠ Tokens
    #A character is a single symbol: a letter, digit, space, punctuation mark, etc.
    #A token is usually a meaningful chunk, such as a word or a punctuation mark.
      #The tokenizer splits on whitespace and punctuation and keeps the punctuation marks as tokens.
      #It then trims surrounding whitespace and drops the empty tokens.
#----------------------------------------------------------------------------#


Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']
4690
1130
('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('H