# Probablistic retrival model, Fundamental of RAG

In [26]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [27]:
# Initialize the tokenizer and the model
model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

In [28]:
model.transformer.wte.weight

Parameter containing:
tensor([[-0.1101, -0.0393,  0.0331,  ..., -0.1364,  0.0151,  0.0453],
        [ 0.0403, -0.0486,  0.0462,  ...,  0.0861,  0.0025,  0.0432],
        [-0.1275,  0.0479,  0.1841,  ...,  0.0899, -0.1297, -0.0879],
        ...,
        [-0.0445, -0.0548,  0.0123,  ...,  0.1044,  0.0978, -0.0695],
        [ 0.1860,  0.0167,  0.0461,  ..., -0.0963,  0.0785, -0.0225],
        [ 0.0514, -0.0277,  0.0499,  ...,  0.0070,  0.1552,  0.1207]],
       requires_grad=True)

In [29]:
print(tokenizer.encode("<|endoftext|>",return_tensors="pt"))
print(tokenizer.decode(range(200)))
print(tokenizer.decode([20755]))

tensor([[50256]])
!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~���������������������������������������������������������������������������������������������� 	

 impacted


In [30]:
print(tokenizer)
print(f"model:{model}")


GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)
model:GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwi

In [31]:
# simplified text generation function
prompt = "Dear boss ..."

def simple_text_generation(prompt,model,tokenizer,max_length=100):
    input_ids =  tokenizer.encode(prompt,return_tensors="pt") # pt = pytorch
    # print(f"prompt:{prompt}\n input_ids: {input_ids}")
    outputs = model.generate(input_ids,max_length=100)
    
    # print(f"generated result: {outputs}")
    
    sentence = tokenizer.decode(outputs[0],skip_special_tokens=True)
    # print(f"outputs: {sentence}")
    return sentence
   

In [32]:
prompt = "Dear boss ..."
text_generated = simple_text_generation(prompt,
                                        model,
                                        tokenizer)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [33]:
print(text_generated)

Dear boss ... I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I


In [34]:
print(tokenizer)

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)


## Tokenization

In [35]:
data = sentences = [
    "this is all about tokenization",
    "Tokenization transforms raw text into structured units called tokens, enabling language models to process sentences numerically while preserving linguistic meaning through consistent mapping between text fragments and integer identifiers.",

    "Embedding layers convert token identifiers into dense continuous vectors, allowing neural networks to learn semantic similarity by placing related words closer together in high dimensional vector space during training.",

    "Subword tokenization techniques such as byte pair encoding help models represent rare or unseen words by decomposing them into smaller meaningful units that still receive informative embeddings.",

    "A tokenizer defines vocabulary size and token boundaries, directly influencing memory usage, sequence length, and the quality of embeddings learned by transformer based language models.",

    "Word embeddings are learned parameters that capture semantic relationships, enabling models to infer meaning, analogy, and contextual relevance rather than treating words as isolated symbols.",

    "When text is tokenized, punctuation, whitespace, and special characters are handled explicitly so the resulting token stream remains consistent across different inputs and training environments.",

    "Embedding vectors are typically initialized randomly and gradually optimized through gradient descent so that contextual patterns in language are reflected in their numerical representations.",

    "Tokenization choices affect downstream performance because poorly designed token splits can fragment meaning and make it harder for embedding layers to capture semantic coherence.",

    "In transformer models, each token embedding represents a combination of lexical meaning and learned structure before positional information is added to encode word order.",

    "Context independent embeddings assign one vector per token, while contextual embeddings adjust representations dynamically based on surrounding tokens within a sentence.",

    "Padding tokens are introduced during tokenization to align sequence lengths in a batch, and their embeddings are usually masked to avoid influencing model predictions.",

    "Embedding dimensions control the expressive power of a model, with larger dimensions allowing richer representations at the cost of increased computation and memory usage.",

    "Tokenizers map text to integers deterministically, ensuring reproducibility so the same input sentence always produces identical token sequences across experiments.",

    "Shared embedding spaces enable models to compare tokens mathematically, allowing cosine similarity or dot product operations to reveal semantic closeness between words.",

    "Special tokens such as start of sequence and end of sequence guide models during training by clearly marking sentence boundaries in the tokenized input.",

    "Tokenization errors propagate forward, meaning poorly segmented text can limit the quality of embeddings no matter how powerful the downstream neural architecture is.",

    "Embedding layers act as a lookup table where each row corresponds to a token vector that is continuously refined as the model learns from large text datasets.",

    "Character level tokenization avoids unknown words but increases sequence length dramatically, making embedding learning more computationally expensive for long inputs.",

    "Subword embeddings strike a balance between vocabulary size and semantic granularity, making them effective for multilingual and low resource language modeling tasks.",

    "During inference, tokenized text is passed through frozen embedding layers that transform symbolic input into numerical form suitable for matrix operations.",

    "Embedding similarity allows models to generalize, so words appearing in similar contexts produce related vectors even if they never appear together explicitly.",

    "Tokenization schemes differ across models, meaning embeddings trained with one tokenizer are generally incompatible with models expecting another vocabulary.",

    "Learned embeddings encode both syntactic and semantic information, allowing models to understand grammatical roles as well as conceptual meaning.",

    "Byte level tokenization ensures every possible input can be represented, but often produces longer token sequences requiring careful embedding optimization.",

    "Embedding matrices can be inspected directly in frameworks like PyTorch, revealing how tokens correspond to rows of trainable numerical parameters.",

    "Tokenization converts unstructured text into a discrete representation that neural networks can efficiently batch, embed, and process in parallel.",

    "Pretrained embeddings provide a strong initialization that helps models converge faster by starting from linguistically informed representations.",

    "Token embeddings are shared across all occurrences of a token, allowing consistent meaning to be reinforced across many training examples.",

    "Positional embeddings are added to token embeddings so models can distinguish between identical tokens appearing at different positions in a sequence.",

    "The quality of embeddings depends heavily on data diversity, since richer corpora expose tokens to varied contexts that shape their vector representations.",

    "Tokenization must balance linguistic accuracy with computational efficiency to avoid unnecessary fragmentation of common words.",

    "Embedding vectors live in continuous space, enabling smooth interpolation between meanings rather than rigid categorical distinctions.",

    "Tokenizers handle casing rules differently, meaning lowercasing text can significantly impact embedding reuse and vocabulary size.",

    "In causal language models, token embeddings are optimized to predict the next token, reinforcing contextual relationships through training objectives.",

    "Embedding lookup is one of the first operations in a language model forward pass, transforming integer inputs into floating point tensors.",

    "Subword tokenization helps reduce out of vocabulary issues while allowing embeddings to capture meaningful morphological patterns.",

    "Embedding layers are typically followed by attention mechanisms that refine token representations based on interactions with neighboring tokens.",

    "Tokenization defines how text is segmented, but embeddings determine how those segments are understood numerically by the model.",

    "Training embeddings jointly with the model allows them to adapt to task specific language usage rather than remaining static.",

    "Token embeddings encode prior knowledge learned during pretraining, enabling downstream tasks to benefit from general language understanding.",

    "Whitespace handling during tokenization affects how embeddings represent word boundaries and sentence structure.",

    "Embedding normalization techniques can improve stability by keeping vector magnitudes within reasonable bounds.",

    "Tokenizers must be deterministic so embedding lookup remains consistent across distributed training environments.",

    "Embedding similarity can reveal biases present in training data, as tokens reflecting similar contexts cluster together.",

    "Special tokens receive their own embeddings, allowing models to treat structural markers differently from regular text tokens.",

    "Tokenization errors often appear subtle but can degrade embedding quality in long sequences.",

    "Embedding matrices grow linearly with vocabulary size, making efficient tokenization essential for scaling large models.",

    "Contextual embeddings evolve across transformer layers, refining token meaning as more context is incorporated.",

    "Tokenization is language dependent, so multilingual models rely heavily on shared subword embeddings.",

    "Embedding layers are differentiable components that learn through gradient updates during backpropagation.",

    "Token frequency influences embedding quality, as rare tokens receive fewer updates during training.",

    "Embedding inspection helps researchers understand how models internalize linguistic structure.",

    "Tokenizers define how numbers, symbols, and punctuation are represented before embedding lookup.",

    "Embedding vectors allow models to compute relationships using linear algebra rather than symbolic rules.",

    "Tokenization choices influence sequence length, which directly impacts attention complexity.",

    "Embedding layers compress discrete token identities into dense numerical forms suitable for neural computation.",

    "Poor tokenization can increase sequence length unnecessarily, reducing embedding efficiency.",

    "Embedding sharing between input and output layers reduces parameters and improves generalization.",

    "Tokenization determines the granularity at which meaning is represented in embeddings.",

    "Embedding spaces often capture analogical relationships such as semantic similarity or oppositeness.",

    "Tokenization pipelines must remain consistent between training and inference to preserve embedding alignment.",

    "Embedding learning benefits from large corpora where tokens appear in diverse linguistic contexts.",

    "Tokenizers may split words differently depending on prefixes, suffixes, or frequency statistics.",

    "Embedding vectors are updated incrementally as models learn from prediction errors.",

    "Tokenization is a preprocessing step, but embeddings are learned representations within the model.",

    "Embedding dimensionality reflects a tradeoff between expressiveness and computational cost.",

    "Tokenization errors are difficult to correct after embedding lookup has occurred.",

    "Embedding layers serve as the bridge between symbolic language and numerical computation.",

    "Tokenizers encode language rules implicitly through their vocabulary construction process.",

    "Embedding similarity metrics enable semantic search and clustering applications.",

    "Tokenization defines model input structure, while embeddings define representational meaning.",

    "Embedding matrices can be visualized to analyze semantic clustering of tokens.",

    "Tokenization must handle edge cases like emojis, URLs, and code snippets consistently.",

    "Embedding vectors evolve during training to reflect task specific linguistic patterns.",

    "Tokenization consistency ensures embeddings remain meaningful across different datasets.",

    "Embedding layers translate discrete token indices into continuous feature representations.",

    "Tokenization granularity affects how efficiently embeddings encode meaning.",

    "Embedding quality is tightly coupled with tokenizer design decisions.",

    "Tokenization strategies influence how models generalize to unseen text.",

    "Embedding learning enables neural models to capture language structure without explicit rules.",

    "Tokenization converts language into a format embeddings can transform into meaning.",

    "Embedding layers are foundational components underlying modern natural language processing systems."
]


In [36]:
# all inputs must have the same length
# add a dummy token at the end
# Having the same length => this is called padding

tokenizer.pad_token = tokenizer.eos_token 

tokenized_data = [tokenizer.encode_plus(
    sentence,
    add_special_tokens= True,
    return_tensors="pt",
    padding="max_length",
    max_length=50,
    
) for sentence in data]

In [37]:
tokenized_data[:2]

[{'input_ids': tensor([[ 5661,   318,   477,   546, 11241,  1634, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0]])},
 {'input_ids': tensor([[30642,  1634, 31408,  8246,  2420,   656, 20793,  4991,  1444, 16326,
             11, 15882,  3303,  4981,   284,  1429, 13439,  5470,  1146,   981,
          23934, 29929,  3616,   832,  6414, 16855,  1022,  2420, 21441,   290,
          18253, 42814,    13, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 502

In [38]:
input_ids = [item["input_ids"].squeeze() for item in tokenized_data]
attention_masks = [mask["attention_mask"].squeeze() for mask in tokenized_data]
input_ids[:5],attention_masks[:5]

([tensor([ 5661,   318,   477,   546, 11241,  1634, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]),
  tensor([30642,  1634, 31408,  8246,  2420,   656, 20793,  4991,  1444, 16326,
             11, 15882,  3303,  4981,   284,  1429, 13439,  5470,  1146,   981,
          23934, 29929,  3616,   832,  6414, 16855,  1022,  2420, 21441,   290,
          18253, 42814,    13, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]),
  tensor([31567,  6048,   278, 11685, 10385, 11241, 42814,   656, 15715, 12948,
          30104,    11,  5086, 17019,  7686,   284,  2193, 37865, 26789,   416,
          12560,  3519,  2456,  5699

### Convert the input_ids and attention mask to tensors

In [39]:
inputs_ids_tensor = torch.stack(input_ids)
attention_masks_tensor = torch.stack(attention_masks)
inputs_ids_tensor[:3],attention_masks_tensor[:3]

print(inputs_ids_tensor.shape)
# print(input_ids.shape) # python list doesn't have shape property, that is why we change to torch tensor

torch.Size([83, 50])


In [40]:
padded_input_ids = pad_sequence(inputs_ids_tensor,
             batch_first=True,
             padding_value=tokenizer.eos_token_id)
padded_attention_masks = pad_sequence(attention_masks_tensor,
                                     batch_first=True,
                                     padding_value=0)

In [41]:
padded_input_ids[:2],padded_attention_masks[:2]

(tensor([[ 5661,   318,   477,   546, 11241,  1634, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256],
         [30642,  1634, 31408,  8246,  2420,   656, 20793,  4991,  1444, 16326,
             11, 15882,  3303,  4981,   284,  1429, 13439,  5470,  1146,   981,
          23934, 29929,  3616,   832,  6414, 16855,  1022,  2420, 21441,   290,
          18253, 42814,    13, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]]),
 tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0],
         [1, 1,

In [42]:
class TextDataset(Dataset):
    def __init__(self,input_ids,attention_masks):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = input_ids.clone()
    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self,index):
        return {
            "input_ids":self.input_ids[index],
            "attention_mask":self.attention_masks[index],
            "labels":self.labels[index]
        }


dataset = TextDataset(inputs_ids_tensor,attention_masks_tensor)
    

In [43]:
len(dataset),dataset[2]

(83,
 {'input_ids': tensor([31567,  6048,   278, 11685, 10385, 11241, 42814,   656, 15715, 12948,
          30104,    11,  5086, 17019,  7686,   284,  2193, 37865, 26789,   416,
          12560,  3519,  2456,  5699,  1978,   287,  1029, 38517, 15879,  2272,
           1141,  3047,    13, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0]),
  'labels': tensor([31567,  6048,   278, 11685, 10385, 11241, 42814,   656, 15715, 12948,
          30104,    11,  5086, 17019,  7686,   284,  2193, 37865, 26789,   416,
          12560,  3519,  2456,  5699,  1978,   287,  1029, 38517, 15879,  2272,
           1141,  3047,    13, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256

### Fine tuning the GPT2 model


In [44]:
data_loader = DataLoader(dataset,batch_size=2,shuffle=True)

In [45]:
# data_loader
# for batch in data_loader:
#     print(batch)
#     print("\n"*5)

In [46]:
model.parameters

<bound method Module.parameters of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)>

In [47]:
optimizer = torch.optim.AdamW(model.parameters(),lr=5e-5)

In [48]:
# Set the model to training mode
model.train()

# Training loop
for epoch in range(10):
    for batch in data_loader:
        # Unpacking the input and atttention mask ids
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        # Reset the gradients to zero
        optimizer.zero_grad()
        #forward pass
        outputs = model(input_ids=input_ids,
                       attention_mask=attention_mask,
                       labels=input_ids)
        loss = outputs.loss
        #backward pass
        loss.backward()
        #update the model parameters
        optimizer.step()
    # print the loss for the current epoch to monitor the progress
    print(f"Epoch {epoch+1} -Loss: {loss.item()}")
    
        

        



        

Epoch 1 -Loss: 1.399208903312683
Epoch 2 -Loss: 1.386404275894165
Epoch 3 -Loss: 1.8037998676300049
Epoch 4 -Loss: 1.418811559677124
Epoch 5 -Loss: 0.7961050271987915
Epoch 6 -Loss: 0.4337821900844574
Epoch 7 -Loss: 0.6178490519523621
Epoch 8 -Loss: 0.1530536413192749
Epoch 9 -Loss: 0.21940605342388153
Epoch 10 -Loss: 0.21999230980873108


### Define funciton to generate text

In [49]:
def generate_text(prompt,model,tokenizer,max_length=100):
    inputs = tokenizer.encode_plus(prompt,return_tensors="pt")
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    outputs = model.generate(input_ids,
                             attention_mask=attention_mask,
                             max_length=max_length)
    return tokenizer.decode(outputs[0],skip_special_tokens=True)

prompt = "what is Embedding?"

text_generated = generate_text(prompt,model,tokenizer,max_length=500)
print(f"text_generated: {text_generated}")




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


text_generated: what is Embedding?


## Tokenization and Embeddings

In [50]:
# install the faiss-cpu library
!pip install faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m78.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


In [51]:
import faiss
import numpy as np
import torch
from transformers import AutoTokenizer,AutoModel

In [52]:
# Initialize the tokenizer and model for generating embeddings
model_id = "sentence-transformers/paraphrase-MiniLM-L6-V2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
gen_model = AutoModel.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [54]:
documents = [
    "Ethiopia’s telecom liberalization after 2021 introduced Safaricom Ethiopia alongside Ethio Telecom, reshaping mobile connectivity, digital payments, tower sharing, and SIM registration practices in secondary cities and border towns that previously relied on unreliable single-operator coverage.",

    "The Grand Ethiopian Renaissance Dam progressed through multiple coordinated filling and power-generation stages between 2020 and 2024, influencing internal grid stability, seasonal energy planning, and cross-border electricity exports rather than remaining only a geopolitical dispute symbol.",

    "Addis Ababa’s Sheger River project combined flood mitigation, riverbank rehabilitation, pedestrian corridors, and informal settlement relocation, changing real estate demand and urban mobility patterns in western neighborhoods long excluded from earlier master plan priorities.",

    "Ethiopia’s specialty coffee reform allowed cooperatives in Sidama, Guji, and Yirgacheffe to export directly using digital traceability systems, preserving micro-lot identities and farmer premiums that were previously diluted through centralized auction blending under the ECX framework.",

    "The Homegrown Economic Reform II agenda focused on forex market adjustments, partial subsidy removals, and state-owned enterprise restructuring, creating short-term inflation pressures while aiming to attract foreign investment beyond traditional infrastructure-heavy public spending models.",

    "The expansion of Addis Ababa’s bus rapid transit corridors altered daily commuting patterns for low-income residents, reducing dependence on informal minibuses and reshaping peak-hour congestion dynamics along routes connecting peripheral condominium housing developments.",

    "Ethiopia’s national digital ID initiative, Fayda, began phased rollouts to support financial inclusion, SIM registration, and public service access, introducing biometric verification challenges in rural kebeles with limited connectivity and electricity reliability.",

    "The conflict-driven disruptions in northern Ethiopia significantly affected agricultural cycles, seed distribution, and local markets, producing uneven recovery patterns across zones depending on road access, humanitarian logistics, and regional administrative coordination.",

    "Ethiopia’s leather and footwear sector faced export volatility due to changing global demand, domestic power interruptions, and regulatory shifts that affected foreign-owned factories operating in industrial parks such as Hawassa and Bole Lemi.",

    "Urban condominium housing programs in Addis Ababa reshaped household asset ownership and rental markets, but also introduced challenges around maintenance funding, vertical community governance, and distance from employment centers for lower-income beneficiaries.",

    "Ethiopia’s renewable energy strategy increasingly emphasized wind and geothermal projects alongside hydropower, particularly in Rift Valley sites, to diversify generation sources and reduce vulnerability to seasonal rainfall variability.",

    "The Addis–Djibouti electric railway struggled with operational sustainability, facing power supply inconsistencies, foreign exchange shortages for spare parts, and underutilized freight capacity despite its strategic importance for Ethiopia’s import–export corridor.",

    "Ethiopia’s digital startup ecosystem expanded modestly after 2020, driven by fintech, logistics, and agri-tech ventures, yet constrained by limited venture capital availability, regulatory uncertainty, and difficulties integrating with legacy state-dominated systems.",

    "The reform of Ethiopia’s fuel pricing system introduced gradual cost pass-through mechanisms, affecting transportation fares, food prices, and household budgets, while attempting to reduce fiscal burdens previously absorbed through government subsidies.",

    "Regional state boundary disputes in Ethiopia increasingly intersected with administrative decentralization policies, complicating service delivery, voter registration, and local security coordination during periods of political transition.",

    "The growth of mobile money platforms such as Telebirr accelerated digital transactions among informal traders, public transport operators, and rural merchants, gradually reducing cash dependency despite intermittent network outages.",

    "Ethiopia’s higher education expansion produced a growing number of computer science and engineering graduates, yet many faced skill mismatches due to limited industry collaboration and uneven access to modern computing infrastructure.",

    "The commercialization of agriculture corridors emphasized wheat self-sufficiency initiatives, promoting mechanization and irrigation in lowland areas while creating tensions over land tenure, pastoralist mobility, and water access.",

    "Ethiopia’s urban food distribution relied heavily on informal markets, where price volatility reflected transport costs, fuel prices, and seasonal supply disruptions rather than centralized price-setting mechanisms.",

    "The introduction of electronic tax registers aimed to broaden Ethiopia’s tax base, but implementation challenges emerged among small enterprises unfamiliar with digital accounting systems and facing inconsistent electricity supply.",

    "Addis Ababa’s ride-hailing market evolved through local platforms and global competitors, navigating regulatory ambiguity, driver classification debates, and fare affordability constraints for middle- and low-income users.",

    "Ethiopia’s aviation sector remained regionally dominant through Ethiopian Airlines, whose cargo operations expanded significantly during global supply chain disruptions, offsetting declines in passenger travel demand.",

    "The restructuring of state-owned banks prioritized balance sheet cleanup and digital service expansion, yet credit access for small businesses remained constrained by collateral requirements and risk-averse lending practices.",

    "Climate variability increasingly affected Ethiopia’s pastoralist communities, forcing adaptive migration patterns and straining traditional conflict resolution mechanisms over grazing land and water resources.",

    "The proliferation of satellite television and online media altered political discourse in Ethiopia, creating fragmented information environments that challenged traditional state-controlled broadcasting narratives.",

    "Ethiopia’s industrial park strategy aimed to attract export-oriented manufacturing, but retention of foreign firms depended on consistent utilities, customs efficiency, and predictable labor relations frameworks.",

    "The rapid spread of smartphones among Ethiopian youth accelerated social media usage, influencing language trends, music distribution, and informal online commerce despite periodic internet disruptions.",

    "Ethiopia’s road infrastructure expansion improved interregional connectivity, yet maintenance backlogs and axle-load enforcement issues limited long-term transport efficiency gains.",

    "Public health system reforms emphasized primary care expansion, but uneven staffing and supply chain constraints continued to affect service quality in remote woredas.",

    "Ethiopia’s traditional coffee ceremonies persisted as social institutions, even as urban lifestyles shortened preparation times and adapted rituals to apartment living spaces.",

    "The push for wheat irrigation in dry regions highlighted tensions between national food security goals and local environmental sustainability considerations.",

    "Ethiopia’s music industry increasingly leveraged digital streaming platforms, reducing dependence on physical media while navigating monetization challenges due to limited local payment integration.",

    "Cross-border trade with neighboring countries relied heavily on informal networks, shaped by currency shortages, security conditions, and fluctuating customs enforcement practices.",

    "Ethiopia’s urban waste management initiatives introduced pilot recycling programs, yet struggled with public awareness, informal waste pickers’ integration, and landfill capacity constraints.",

    "The adoption of electric cooking initiatives aimed to reduce biomass dependency, but uptake was slowed by appliance costs and unreliable household power connections.",

    "Ethiopia’s microfinance institutions expanded digital loan disbursement, balancing financial inclusion goals against rising default risks during economic shocks.",

    "The growth of private universities altered higher education access, but raised concerns about instructional quality and graduate employability alignment.",

    "Ethiopia’s textile value chain remained vulnerable to global cotton price fluctuations and shipping delays affecting export timelines.",

    "Urban youth unemployment contributed to the rise of informal gig work, including delivery services and freelance digital tasks.",

    "Ethiopia’s public procurement digitization aimed to reduce corruption risks, but adoption varied across ministries and regional bureaus.",

    "The expansion of solar mini-grids supported rural electrification, though long-term maintenance financing remained uncertain.",

    "Ethiopia’s linguistic diversity continued to shape regional media consumption and education policy implementation.",

    "Addis Ababa’s real estate boom created affordability gaps between formal housing and informal settlements.",

    "The normalization of remote work remained limited by bandwidth costs and power reliability.",

    "Ethiopia’s freight logistics faced container shortages during global trade disruptions.",

    "Agricultural extension services increasingly used mobile messaging for farmer outreach.",

    "The integration of refugees into local economies varied by region and policy support.",

    "Urban flooding events highlighted drainage infrastructure weaknesses in fast-growing neighborhoods.",

    "Ethiopia’s artisanal mining sector contributed to local incomes while posing environmental risks.",

    "Digital learning platforms expanded during school closures, exposing access inequalities.",

    "The expansion of expressway toll roads altered long-distance transport economics.",

    "Ethiopia’s seed certification reforms aimed to improve crop yields.",

    "Public–private partnerships gained attention in infrastructure financing debates.",

    "The use of drones for agricultural monitoring emerged in pilot programs.",

    "Ethiopia’s creative economy policy discussions emphasized cultural exports.",

    "Urban water supply projects struggled with non-revenue water losses.",

    "The rise of local podcasting reflected youth-led media experimentation.",

    "Ethiopia’s climate adaptation planning integrated community-based early warning systems.",

    "The retail sector saw gradual adoption of point-of-sale technologies.",

    "Cross-regional labor migration influenced urban housing demand.",

    "Ethiopia’s judicial digitization pilots aimed to reduce case backlogs.",

    "Public transport fare adjustments reflected inflationary pressures.",

    "The expansion of technical vocational training targeted manufacturing skills gaps.",

    "Ethiopia’s horticulture exports relied on cold-chain logistics improvements.",

    "Urban air quality concerns grew alongside vehicle imports.",

    "Ethiopia’s e-commerce sector faced last-mile delivery challenges.",

    "The integration of traditional dispute resolution into formal systems persisted.",

    "Renewable energy financing increasingly involved blended finance models.",

    "Ethiopia’s census delays affected planning accuracy.",

    "The expansion of call centers created new urban employment niches.",

    "Digital mapping projects improved disaster response coordination.",

    "Ethiopia’s film industry experimented with international co-productions.",

    "Urban green spaces influenced neighborhood social interactions.",

    "The rise of online education influencers shaped student aspirations.",

    "Ethiopia’s rail freight pricing affected industrial competitiveness.",

    "Public sector performance contracts aimed to improve accountability.",

    "The growth of coworking spaces reflected changing work culture.",

    "Ethiopia’s border trade posts integrated digital customs systems.",

    "Agricultural insurance pilots addressed climate risk exposure.",

    "Urban noise pollution became a policy discussion topic.",

    "Ethiopia’s diaspora remittances shifted toward digital channels.",

    "The expansion of data centers supported emerging cloud services.",

    "Public libraries experimented with digital resource access.",

    "Ethiopia’s urban cycling initiatives remained limited but symbolic.",

    "The coordination of humanitarian logistics relied on shared data platforms.",

    "Ethiopia’s innovation hubs fostered early-stage tech experimentation.",

    "Urban planning debates increasingly included climate resilience metrics."
]


In [56]:
def embed_text(text,tokenizer,model):
    inputs = tokenizer(text,
                       return_tensors="pt",
                       padding=True,
                       truncation=True
                       
                      )
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state # last context riched tokens
        embeddings = embeddings.mean(dim=1) # pooling token embeddings into single sentence embedding for the seek of retrieval system
    return embeddings

In [62]:
gen_model,tokenizer

(BertModel(
   (embeddings): BertEmbeddings(
     (word_embeddings): Embedding(30522, 384, padding_idx=0)
     (position_embeddings): Embedding(512, 384)
     (token_type_embeddings): Embedding(2, 384)
     (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
     (dropout): Dropout(p=0.1, inplace=False)
   )
   (encoder): BertEncoder(
     (layer): ModuleList(
       (0-5): 6 x BertLayer(
         (attention): BertAttention(
           (self): BertSdpaSelfAttention(
             (query): Linear(in_features=384, out_features=384, bias=True)
             (key): Linear(in_features=384, out_features=384, bias=True)
             (value): Linear(in_features=384, out_features=384, bias=True)
             (dropout): Dropout(p=0.1, inplace=False)
           )
           (output): BertSelfOutput(
             (dense): Linear(in_features=384, out_features=384, bias=True)
             (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
             (dropout): Dropout(

In [73]:
# Initialize a list to store the bembeddings
document_embeddings = []

for doc in documents:
    doc_embeddings = embed_text(doc,tokenizer,gen_model)
    document_embeddings.append(doc_embeddings)


document_embeddings[1].shape
 

torch.Size([1, 384])

In [74]:
 document_embeddings = torch.cat(document_embeddings).cpu().numpy()
document_embeddings.shape, document_embeddings[:2]

((87, 384),
 array([[-1.60419613e-01, -4.24365066e-02, -1.68803677e-01,
         -2.31677130e-01,  1.48904890e-01, -9.44776237e-02,
         -2.03818262e-01, -6.98364573e-03, -9.51517522e-02,
          3.84154767e-01,  2.14787990e-01,  1.89004317e-01,
         -1.86711699e-01, -2.05783024e-02,  1.38443440e-01,
          2.30668366e-01, -2.14105502e-01, -3.23954791e-01,
         -1.45259991e-01, -5.45860529e-01, -1.52238719e-02,
         -4.24987942e-01, -2.30765656e-01,  1.05395913e-02,
          7.43249655e-02, -1.53370261e-01, -8.42300802e-03,
         -1.65376291e-01,  3.62123609e-01,  1.69244617e-01,
          2.39755839e-01,  4.90767688e-01,  4.46761921e-02,
          8.12464133e-02,  2.94345301e-02, -1.63322419e-01,
         -1.32990807e-01,  1.00306027e-01, -1.07720688e-01,
         -3.49458829e-02,  4.15672570e-01, -2.36408375e-02,
         -1.25645742e-01, -2.89088607e-01,  2.71396697e-01,
          1.29683524e-01,  1.49547711e-01,  3.80161345e-01,
         -2.33667120e-01, -3

### Build the Retrieval System

In [79]:
document_embeddings.shape[1]

384

In [80]:
 index = faiss.IndexFlatL2(document_embeddings.shape[1])
index.add(document_embeddings)
print(index)

In [86]:
faiss.downcast_index(index)
index.d, index.ntotal, index.is_trained,index.metric_type

(384, 87, True, 1)

In [90]:
# Retrieval -> Build a function to retrieve information
def retrieve(query,tokenizer,model,index,documents,top_k=3):
    query_embeddings = embed_text(query,tokenizer,model)
    distances, indices = index.search(query_embeddings,top_k)
    return [documents[i] for i in indices[0]],distances[0]


In [98]:
# Test the retrieval function
query = "what is news about ethiotelecom"
retrieved_docs,distances = retrieve(query,tokenizer,gen_model,index,documents)
for d in retrieved_docs:
    print(d)
    print("\n"*5)
print(distances)

Ethiopia’s telecom liberalization after 2021 introduced Safaricom Ethiopia alongside Ethio Telecom, reshaping mobile connectivity, digital payments, tower sharing, and SIM registration practices in secondary cities and border towns that previously relied on unreliable single-operator coverage.






Addis Ababa’s ride-hailing market evolved through local platforms and global competitors, navigating regulatory ambiguity, driver classification debates, and fare affordability constraints for middle- and low-income users.






The rapid spread of smartphones among Ethiopian youth accelerated social media usage, influencing language trends, music distribution, and informal online commerce despite periodic internet disruptions.






[50.294266 52.196934 53.72694 ]
