# Probablistic retrival model, Fundamental of RAG

In [11]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [12]:
# Initialize the tokenizer and the model
model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

In [36]:
print(tokenizer.encode("<|endoftext|>",return_tensors="pt"))
print(tokenizer.decode(range(200)))
print(tokenizer.decode([20755]))

tensor([[50256]])
!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~���������������������������������������������������������������������������������������������� 	

 impacted


In [17]:
print(tokenizer)
print(f"model:{model}")


GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)
model:GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwi

In [13]:
# simplified text generation function
prompt = "Dear boss ..."

def simple_text_generation(prompt,model,tokenizer,max_length=100):
    input_ids =  tokenizer.encode(prompt,return_tensors="pt") # pt = pytorch
    # print(f"prompt:{prompt}\n input_ids: {input_ids}")
    outputs = model.generate(input_ids,max_length=100)
    
    # print(f"generated result: {outputs}")
    
    sentence = tokenizer.decode(outputs[0],skip_special_tokens=True)
    # print(f"outputs: {sentence}")
    return sentence
   

In [14]:
prompt = "Dear boss ..."
text_generated = simple_text_generation(prompt,
                                        model,
                                        tokenizer)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [15]:
print(text_generated)

Dear boss ... I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I


In [16]:
print(tokenizer)

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)


## Tokenization

In [46]:
data = sentences = [
    "this is all about tokenization",
    "Tokenization transforms raw text into structured units called tokens, enabling language models to process sentences numerically while preserving linguistic meaning through consistent mapping between text fragments and integer identifiers.",

    "Embedding layers convert token identifiers into dense continuous vectors, allowing neural networks to learn semantic similarity by placing related words closer together in high dimensional vector space during training.",

    "Subword tokenization techniques such as byte pair encoding help models represent rare or unseen words by decomposing them into smaller meaningful units that still receive informative embeddings.",

    "A tokenizer defines vocabulary size and token boundaries, directly influencing memory usage, sequence length, and the quality of embeddings learned by transformer based language models.",

    "Word embeddings are learned parameters that capture semantic relationships, enabling models to infer meaning, analogy, and contextual relevance rather than treating words as isolated symbols.",

    "When text is tokenized, punctuation, whitespace, and special characters are handled explicitly so the resulting token stream remains consistent across different inputs and training environments.",

    "Embedding vectors are typically initialized randomly and gradually optimized through gradient descent so that contextual patterns in language are reflected in their numerical representations.",

    "Tokenization choices affect downstream performance because poorly designed token splits can fragment meaning and make it harder for embedding layers to capture semantic coherence.",

    "In transformer models, each token embedding represents a combination of lexical meaning and learned structure before positional information is added to encode word order.",

    "Context independent embeddings assign one vector per token, while contextual embeddings adjust representations dynamically based on surrounding tokens within a sentence.",

    "Padding tokens are introduced during tokenization to align sequence lengths in a batch, and their embeddings are usually masked to avoid influencing model predictions.",

    "Embedding dimensions control the expressive power of a model, with larger dimensions allowing richer representations at the cost of increased computation and memory usage.",

    "Tokenizers map text to integers deterministically, ensuring reproducibility so the same input sentence always produces identical token sequences across experiments.",

    "Shared embedding spaces enable models to compare tokens mathematically, allowing cosine similarity or dot product operations to reveal semantic closeness between words.",

    "Special tokens such as start of sequence and end of sequence guide models during training by clearly marking sentence boundaries in the tokenized input.",

    "Tokenization errors propagate forward, meaning poorly segmented text can limit the quality of embeddings no matter how powerful the downstream neural architecture is.",

    "Embedding layers act as a lookup table where each row corresponds to a token vector that is continuously refined as the model learns from large text datasets.",

    "Character level tokenization avoids unknown words but increases sequence length dramatically, making embedding learning more computationally expensive for long inputs.",

    "Subword embeddings strike a balance between vocabulary size and semantic granularity, making them effective for multilingual and low resource language modeling tasks.",

    "During inference, tokenized text is passed through frozen embedding layers that transform symbolic input into numerical form suitable for matrix operations.",

    "Embedding similarity allows models to generalize, so words appearing in similar contexts produce related vectors even if they never appear together explicitly.",

    "Tokenization schemes differ across models, meaning embeddings trained with one tokenizer are generally incompatible with models expecting another vocabulary.",

    "Learned embeddings encode both syntactic and semantic information, allowing models to understand grammatical roles as well as conceptual meaning.",

    "Byte level tokenization ensures every possible input can be represented, but often produces longer token sequences requiring careful embedding optimization.",

    "Embedding matrices can be inspected directly in frameworks like PyTorch, revealing how tokens correspond to rows of trainable numerical parameters.",

    "Tokenization converts unstructured text into a discrete representation that neural networks can efficiently batch, embed, and process in parallel.",

    "Pretrained embeddings provide a strong initialization that helps models converge faster by starting from linguistically informed representations.",

    "Token embeddings are shared across all occurrences of a token, allowing consistent meaning to be reinforced across many training examples.",

    "Positional embeddings are added to token embeddings so models can distinguish between identical tokens appearing at different positions in a sequence.",

    "The quality of embeddings depends heavily on data diversity, since richer corpora expose tokens to varied contexts that shape their vector representations.",

    "Tokenization must balance linguistic accuracy with computational efficiency to avoid unnecessary fragmentation of common words.",

    "Embedding vectors live in continuous space, enabling smooth interpolation between meanings rather than rigid categorical distinctions.",

    "Tokenizers handle casing rules differently, meaning lowercasing text can significantly impact embedding reuse and vocabulary size.",

    "In causal language models, token embeddings are optimized to predict the next token, reinforcing contextual relationships through training objectives.",

    "Embedding lookup is one of the first operations in a language model forward pass, transforming integer inputs into floating point tensors.",

    "Subword tokenization helps reduce out of vocabulary issues while allowing embeddings to capture meaningful morphological patterns.",

    "Embedding layers are typically followed by attention mechanisms that refine token representations based on interactions with neighboring tokens.",

    "Tokenization defines how text is segmented, but embeddings determine how those segments are understood numerically by the model.",

    "Training embeddings jointly with the model allows them to adapt to task specific language usage rather than remaining static.",

    "Token embeddings encode prior knowledge learned during pretraining, enabling downstream tasks to benefit from general language understanding.",

    "Whitespace handling during tokenization affects how embeddings represent word boundaries and sentence structure.",

    "Embedding normalization techniques can improve stability by keeping vector magnitudes within reasonable bounds.",

    "Tokenizers must be deterministic so embedding lookup remains consistent across distributed training environments.",

    "Embedding similarity can reveal biases present in training data, as tokens reflecting similar contexts cluster together.",

    "Special tokens receive their own embeddings, allowing models to treat structural markers differently from regular text tokens.",

    "Tokenization errors often appear subtle but can degrade embedding quality in long sequences.",

    "Embedding matrices grow linearly with vocabulary size, making efficient tokenization essential for scaling large models.",

    "Contextual embeddings evolve across transformer layers, refining token meaning as more context is incorporated.",

    "Tokenization is language dependent, so multilingual models rely heavily on shared subword embeddings.",

    "Embedding layers are differentiable components that learn through gradient updates during backpropagation.",

    "Token frequency influences embedding quality, as rare tokens receive fewer updates during training.",

    "Embedding inspection helps researchers understand how models internalize linguistic structure.",

    "Tokenizers define how numbers, symbols, and punctuation are represented before embedding lookup.",

    "Embedding vectors allow models to compute relationships using linear algebra rather than symbolic rules.",

    "Tokenization choices influence sequence length, which directly impacts attention complexity.",

    "Embedding layers compress discrete token identities into dense numerical forms suitable for neural computation.",

    "Poor tokenization can increase sequence length unnecessarily, reducing embedding efficiency.",

    "Embedding sharing between input and output layers reduces parameters and improves generalization.",

    "Tokenization determines the granularity at which meaning is represented in embeddings.",

    "Embedding spaces often capture analogical relationships such as semantic similarity or oppositeness.",

    "Tokenization pipelines must remain consistent between training and inference to preserve embedding alignment.",

    "Embedding learning benefits from large corpora where tokens appear in diverse linguistic contexts.",

    "Tokenizers may split words differently depending on prefixes, suffixes, or frequency statistics.",

    "Embedding vectors are updated incrementally as models learn from prediction errors.",

    "Tokenization is a preprocessing step, but embeddings are learned representations within the model.",

    "Embedding dimensionality reflects a tradeoff between expressiveness and computational cost.",

    "Tokenization errors are difficult to correct after embedding lookup has occurred.",

    "Embedding layers serve as the bridge between symbolic language and numerical computation.",

    "Tokenizers encode language rules implicitly through their vocabulary construction process.",

    "Embedding similarity metrics enable semantic search and clustering applications.",

    "Tokenization defines model input structure, while embeddings define representational meaning.",

    "Embedding matrices can be visualized to analyze semantic clustering of tokens.",

    "Tokenization must handle edge cases like emojis, URLs, and code snippets consistently.",

    "Embedding vectors evolve during training to reflect task specific linguistic patterns.",

    "Tokenization consistency ensures embeddings remain meaningful across different datasets.",

    "Embedding layers translate discrete token indices into continuous feature representations.",

    "Tokenization granularity affects how efficiently embeddings encode meaning.",

    "Embedding quality is tightly coupled with tokenizer design decisions.",

    "Tokenization strategies influence how models generalize to unseen text.",

    "Embedding learning enables neural models to capture language structure without explicit rules.",

    "Tokenization converts language into a format embeddings can transform into meaning.",

    "Embedding layers are foundational components underlying modern natural language processing systems."
]


In [47]:
# all inputs must have the same length
# add a dummy token at the end
# Having the same length => this is called padding

tokenizer.pad_token = tokenizer.eos_token 

tokenized_data = [tokenizer.encode_plus(
    sentence,
    add_special_tokens= True,
    return_tensors="pt",
    padding="max_length",
    max_length=50,
    
) for sentence in data]

In [50]:
tokenized_data[:2]

[{'input_ids': tensor([[ 5661,   318,   477,   546, 11241,  1634, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0]])},
 {'input_ids': tensor([[30642,  1634, 31408,  8246,  2420,   656, 20793,  4991,  1444, 16326,
             11, 15882,  3303,  4981,   284,  1429, 13439,  5470,  1146,   981,
          23934, 29929,  3616,   832,  6414, 16855,  1022,  2420, 21441,   290,
          18253, 42814,    13, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 502