# Probablistic retrival model, Fundamental of RAG

In [99]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

In [100]:
# Initialize the tokenizer and the model
model_id = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

In [101]:
model.transformer.wte.weight

Parameter containing:
tensor([[-0.1101, -0.0393,  0.0331,  ..., -0.1364,  0.0151,  0.0453],
        [ 0.0403, -0.0486,  0.0462,  ...,  0.0861,  0.0025,  0.0432],
        [-0.1275,  0.0479,  0.1841,  ...,  0.0899, -0.1297, -0.0879],
        ...,
        [-0.0445, -0.0548,  0.0123,  ...,  0.1044,  0.0978, -0.0695],
        [ 0.1860,  0.0167,  0.0461,  ..., -0.0963,  0.0785, -0.0225],
        [ 0.0514, -0.0277,  0.0499,  ...,  0.0070,  0.1552,  0.1207]],
       requires_grad=True)

In [102]:
print(tokenizer.encode("<|endoftext|>",return_tensors="pt"))
print(tokenizer.decode(range(200)))
print(tokenizer.decode([20755]))

tensor([[50256]])
!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~���������������������������������������������������������������������������������������������� 	

 impacted


In [103]:
print(tokenizer)
print(f"model:{model}")


GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)
model:GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwi

In [104]:
# simplified text generation function
prompt = "Dear boss ..."

def simple_text_generation(prompt,model,tokenizer,max_length=100):
    input_ids =  tokenizer.encode(prompt,return_tensors="pt") # pt = pytorch
    # print(f"prompt:{prompt}\n input_ids: {input_ids}")
    outputs = model.generate(input_ids,max_length=100)
    
    # print(f"generated result: {outputs}")
    
    sentence = tokenizer.decode(outputs[0],skip_special_tokens=True)
    # print(f"outputs: {sentence}")
    return sentence
   

In [105]:
prompt = "Dear boss ..."
text_generated = simple_text_generation(prompt,
                                        model,
                                        tokenizer)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [106]:
print(text_generated)

Dear boss ... I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I'm not going to be able to do this anymore. I


In [107]:
print(tokenizer)

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
)


## Tokenization

In [108]:
data = sentences = [
    "this is all about tokenization",
    "Tokenization transforms raw text into structured units called tokens, enabling language models to process sentences numerically while preserving linguistic meaning through consistent mapping between text fragments and integer identifiers.",

    "Embedding layers convert token identifiers into dense continuous vectors, allowing neural networks to learn semantic similarity by placing related words closer together in high dimensional vector space during training.",

    "Subword tokenization techniques such as byte pair encoding help models represent rare or unseen words by decomposing them into smaller meaningful units that still receive informative embeddings.",

    "A tokenizer defines vocabulary size and token boundaries, directly influencing memory usage, sequence length, and the quality of embeddings learned by transformer based language models.",

    "Word embeddings are learned parameters that capture semantic relationships, enabling models to infer meaning, analogy, and contextual relevance rather than treating words as isolated symbols.",

    "When text is tokenized, punctuation, whitespace, and special characters are handled explicitly so the resulting token stream remains consistent across different inputs and training environments.",

    "Embedding vectors are typically initialized randomly and gradually optimized through gradient descent so that contextual patterns in language are reflected in their numerical representations.",

    "Tokenization choices affect downstream performance because poorly designed token splits can fragment meaning and make it harder for embedding layers to capture semantic coherence.",

    "In transformer models, each token embedding represents a combination of lexical meaning and learned structure before positional information is added to encode word order.",

    "Context independent embeddings assign one vector per token, while contextual embeddings adjust representations dynamically based on surrounding tokens within a sentence.",

    "Padding tokens are introduced during tokenization to align sequence lengths in a batch, and their embeddings are usually masked to avoid influencing model predictions.",

    "Embedding dimensions control the expressive power of a model, with larger dimensions allowing richer representations at the cost of increased computation and memory usage.",

    "Tokenizers map text to integers deterministically, ensuring reproducibility so the same input sentence always produces identical token sequences across experiments.",

    "Shared embedding spaces enable models to compare tokens mathematically, allowing cosine similarity or dot product operations to reveal semantic closeness between words.",

    "Special tokens such as start of sequence and end of sequence guide models during training by clearly marking sentence boundaries in the tokenized input.",

    "Tokenization errors propagate forward, meaning poorly segmented text can limit the quality of embeddings no matter how powerful the downstream neural architecture is.",

    "Embedding layers act as a lookup table where each row corresponds to a token vector that is continuously refined as the model learns from large text datasets.",

    "Character level tokenization avoids unknown words but increases sequence length dramatically, making embedding learning more computationally expensive for long inputs.",

    "Subword embeddings strike a balance between vocabulary size and semantic granularity, making them effective for multilingual and low resource language modeling tasks.",

    "During inference, tokenized text is passed through frozen embedding layers that transform symbolic input into numerical form suitable for matrix operations.",

    "Embedding similarity allows models to generalize, so words appearing in similar contexts produce related vectors even if they never appear together explicitly.",

    "Tokenization schemes differ across models, meaning embeddings trained with one tokenizer are generally incompatible with models expecting another vocabulary.",

    "Learned embeddings encode both syntactic and semantic information, allowing models to understand grammatical roles as well as conceptual meaning.",

    "Byte level tokenization ensures every possible input can be represented, but often produces longer token sequences requiring careful embedding optimization.",

    "Embedding matrices can be inspected directly in frameworks like PyTorch, revealing how tokens correspond to rows of trainable numerical parameters.",

    "Tokenization converts unstructured text into a discrete representation that neural networks can efficiently batch, embed, and process in parallel.",

    "Pretrained embeddings provide a strong initialization that helps models converge faster by starting from linguistically informed representations.",

    "Token embeddings are shared across all occurrences of a token, allowing consistent meaning to be reinforced across many training examples.",

    "Positional embeddings are added to token embeddings so models can distinguish between identical tokens appearing at different positions in a sequence.",

    "The quality of embeddings depends heavily on data diversity, since richer corpora expose tokens to varied contexts that shape their vector representations.",

    "Tokenization must balance linguistic accuracy with computational efficiency to avoid unnecessary fragmentation of common words.",

    "Embedding vectors live in continuous space, enabling smooth interpolation between meanings rather than rigid categorical distinctions.",

    "Tokenizers handle casing rules differently, meaning lowercasing text can significantly impact embedding reuse and vocabulary size.",

    "In causal language models, token embeddings are optimized to predict the next token, reinforcing contextual relationships through training objectives.",

    "Embedding lookup is one of the first operations in a language model forward pass, transforming integer inputs into floating point tensors.",

    "Subword tokenization helps reduce out of vocabulary issues while allowing embeddings to capture meaningful morphological patterns.",

    "Embedding layers are typically followed by attention mechanisms that refine token representations based on interactions with neighboring tokens.",

    "Tokenization defines how text is segmented, but embeddings determine how those segments are understood numerically by the model.",

    "Training embeddings jointly with the model allows them to adapt to task specific language usage rather than remaining static.",

    "Token embeddings encode prior knowledge learned during pretraining, enabling downstream tasks to benefit from general language understanding.",

    "Whitespace handling during tokenization affects how embeddings represent word boundaries and sentence structure.",

    "Embedding normalization techniques can improve stability by keeping vector magnitudes within reasonable bounds.",

    "Tokenizers must be deterministic so embedding lookup remains consistent across distributed training environments.",

    "Embedding similarity can reveal biases present in training data, as tokens reflecting similar contexts cluster together.",

    "Special tokens receive their own embeddings, allowing models to treat structural markers differently from regular text tokens.",

    "Tokenization errors often appear subtle but can degrade embedding quality in long sequences.",

    "Embedding matrices grow linearly with vocabulary size, making efficient tokenization essential for scaling large models.",

    "Contextual embeddings evolve across transformer layers, refining token meaning as more context is incorporated.",

    "Tokenization is language dependent, so multilingual models rely heavily on shared subword embeddings.",

    "Embedding layers are differentiable components that learn through gradient updates during backpropagation.",

    "Token frequency influences embedding quality, as rare tokens receive fewer updates during training.",

    "Embedding inspection helps researchers understand how models internalize linguistic structure.",

    "Tokenizers define how numbers, symbols, and punctuation are represented before embedding lookup.",

    "Embedding vectors allow models to compute relationships using linear algebra rather than symbolic rules.",

    "Tokenization choices influence sequence length, which directly impacts attention complexity.",

    "Embedding layers compress discrete token identities into dense numerical forms suitable for neural computation.",

    "Poor tokenization can increase sequence length unnecessarily, reducing embedding efficiency.",

    "Embedding sharing between input and output layers reduces parameters and improves generalization.",

    "Tokenization determines the granularity at which meaning is represented in embeddings.",

    "Embedding spaces often capture analogical relationships such as semantic similarity or oppositeness.",

    "Tokenization pipelines must remain consistent between training and inference to preserve embedding alignment.",

    "Embedding learning benefits from large corpora where tokens appear in diverse linguistic contexts.",

    "Tokenizers may split words differently depending on prefixes, suffixes, or frequency statistics.",

    "Embedding vectors are updated incrementally as models learn from prediction errors.",

    "Tokenization is a preprocessing step, but embeddings are learned representations within the model.",

    "Embedding dimensionality reflects a tradeoff between expressiveness and computational cost.",

    "Tokenization errors are difficult to correct after embedding lookup has occurred.",

    "Embedding layers serve as the bridge between symbolic language and numerical computation.",

    "Tokenizers encode language rules implicitly through their vocabulary construction process.",

    "Embedding similarity metrics enable semantic search and clustering applications.",

    "Tokenization defines model input structure, while embeddings define representational meaning.",

    "Embedding matrices can be visualized to analyze semantic clustering of tokens.",

    "Tokenization must handle edge cases like emojis, URLs, and code snippets consistently.",

    "Embedding vectors evolve during training to reflect task specific linguistic patterns.",

    "Tokenization consistency ensures embeddings remain meaningful across different datasets.",

    "Embedding layers translate discrete token indices into continuous feature representations.",

    "Tokenization granularity affects how efficiently embeddings encode meaning.",

    "Embedding quality is tightly coupled with tokenizer design decisions.",

    "Tokenization strategies influence how models generalize to unseen text.",

    "Embedding learning enables neural models to capture language structure without explicit rules.",

    "Tokenization converts language into a format embeddings can transform into meaning.",

    "Embedding layers are foundational components underlying modern natural language processing systems."
]


In [109]:
# all inputs must have the same length
# add a dummy token at the end
# Having the same length => this is called padding

tokenizer.pad_token = tokenizer.eos_token 

tokenized_data = [tokenizer.encode_plus(
    sentence,
    add_special_tokens= True,
    return_tensors="pt",
    padding="max_length",
    max_length=50,
    
) for sentence in data]

In [110]:
tokenized_data[:2]

[{'input_ids': tensor([[ 5661,   318,   477,   546, 11241,  1634, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0]])},
 {'input_ids': tensor([[30642,  1634, 31408,  8246,  2420,   656, 20793,  4991,  1444, 16326,
             11, 15882,  3303,  4981,   284,  1429, 13439,  5470,  1146,   981,
          23934, 29929,  3616,   832,  6414, 16855,  1022,  2420, 21441,   290,
          18253, 42814,    13, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 502

In [111]:
input_ids = [item["input_ids"].squeeze() for item in tokenized_data]
attention_masks = [mask["attention_mask"].squeeze() for mask in tokenized_data]
input_ids[:5],attention_masks[:5]

([tensor([ 5661,   318,   477,   546, 11241,  1634, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]),
  tensor([30642,  1634, 31408,  8246,  2420,   656, 20793,  4991,  1444, 16326,
             11, 15882,  3303,  4981,   284,  1429, 13439,  5470,  1146,   981,
          23934, 29929,  3616,   832,  6414, 16855,  1022,  2420, 21441,   290,
          18253, 42814,    13, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]),
  tensor([31567,  6048,   278, 11685, 10385, 11241, 42814,   656, 15715, 12948,
          30104,    11,  5086, 17019,  7686,   284,  2193, 37865, 26789,   416,
          12560,  3519,  2456,  5699

### Convert the input_ids and attention mask to tensors

In [112]:
inputs_ids_tensor = torch.stack(input_ids)
attention_masks_tensor = torch.stack(attention_masks)
inputs_ids_tensor[:3],attention_masks_tensor[:3]

print(inputs_ids_tensor.shape)
# print(input_ids.shape) # python list doesn't have shape property, that is why we change to torch tensor

torch.Size([83, 50])


In [113]:
padded_input_ids = pad_sequence(inputs_ids_tensor,
             batch_first=True,
             padding_value=tokenizer.eos_token_id)
padded_attention_masks = pad_sequence(attention_masks_tensor,
                                     batch_first=True,
                                     padding_value=0)

In [114]:
padded_input_ids[:2],padded_attention_masks[:2]

(tensor([[ 5661,   318,   477,   546, 11241,  1634, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256],
         [30642,  1634, 31408,  8246,  2420,   656, 20793,  4991,  1444, 16326,
             11, 15882,  3303,  4981,   284,  1429, 13439,  5470,  1146,   981,
          23934, 29929,  3616,   832,  6414, 16855,  1022,  2420, 21441,   290,
          18253, 42814,    13, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]]),
 tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0],
         [1, 1,

In [115]:
class TextDataset(Dataset):
    def __init__(self,input_ids,attention_masks):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = input_ids.clone()
    def __len__(self):
        return len(self.input_ids)
    def __getitem__(self,index):
        return {
            "input_ids":self.input_ids[index],
            "attention_mask":self.attention_masks[index],
            "labels":self.labels[index]
        }


dataset = TextDataset(inputs_ids_tensor,attention_masks_tensor)
    

In [116]:
len(dataset),dataset[2]

(83,
 {'input_ids': tensor([31567,  6048,   278, 11685, 10385, 11241, 42814,   656, 15715, 12948,
          30104,    11,  5086, 17019,  7686,   284,  2193, 37865, 26789,   416,
          12560,  3519,  2456,  5699,  1978,   287,  1029, 38517, 15879,  2272,
           1141,  3047,    13, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0]),
  'labels': tensor([31567,  6048,   278, 11685, 10385, 11241, 42814,   656, 15715, 12948,
          30104,    11,  5086, 17019,  7686,   284,  2193, 37865, 26789,   416,
          12560,  3519,  2456,  5699,  1978,   287,  1029, 38517, 15879,  2272,
           1141,  3047,    13, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
          50256, 50256, 50256, 50256, 50256, 50256

### Fine tuning the GPT2 model


In [117]:
data_loader = DataLoader(dataset,batch_size=2,shuffle=True)

In [118]:
# data_loader
# for batch in data_loader:
#     print(batch)
#     print("\n"*5)

In [119]:
model.parameters

<bound method Module.parameters of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)>

In [120]:
optimizer = torch.optim.AdamW(model.parameters(),lr=5e-5)

In [121]:
# Set the model to training mode
model.train()

# Training loop
for epoch in range(10):
    for batch in data_loader:
        # Unpacking the input and atttention mask ids
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        # Reset the gradients to zero
        optimizer.zero_grad()
        #forward pass
        outputs = model(input_ids=input_ids,
                       attention_mask=attention_mask,
                       labels=input_ids)
        loss = outputs.loss
        #backward pass
        loss.backward()
        #update the model parameters
        optimizer.step()
    # print the loss for the current epoch to monitor the progress
    print(f"Epoch {epoch+1} -Loss: {loss.item()}")
    
        

        



        

Epoch 1 -Loss: 1.5734491348266602
Epoch 2 -Loss: 1.4260951280593872
Epoch 3 -Loss: 0.9261785745620728
Epoch 4 -Loss: 1.0751062631607056
Epoch 5 -Loss: 0.6304660439491272
Epoch 6 -Loss: 0.7305591106414795
Epoch 7 -Loss: 0.4289112389087677
Epoch 8 -Loss: 0.3338474631309509
Epoch 9 -Loss: 0.1948249191045761
Epoch 10 -Loss: 0.3274318277835846


### Define funciton to generate text

In [137]:
def generate_text(prompt,model,tokenizer,max_length=100):
    inputs = tokenizer.encode_plus(prompt,return_tensors="pt")
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    outputs = model.generate(input_ids,
                             attention_mask=attention_mask,
                             max_length=max_length)
    return tokenizer.decode(outputs[0],skip_special_tokens=True)

prompt = "what is Embedding?"

text_generated = generate_text(prompt,model,tokenizer,max_length=500)
print(f"text_generated: {text_generated}")




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


text_generated: what is embedding? riverside [unused252] distance main 8 breaking sad [unused285] ₱ [unused12]


## Tokenization and Embeddings

In [138]:
# install the faiss-cpu library
!pip install faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [139]:
import faiss
import numpy as np
import torch
from transformers import AutoTokenizer,AutoModel

In [160]:
# Initialize the tokenizer and model for generating embeddings
model_id = "sentence-transformers/paraphrase-MiniLM-L6-V2"
embed_tokenizer = AutoTokenizer.from_pretrained(model_id)
embed_model = AutoModel.from_pretrained(model_id)

In [161]:
documents = [

# ===================== EDUCATION (20) =====================

"Ethiopia’s rapid university expansion after 2010 produced dozens of public institutions, but many campuses faced shortages of qualified faculty, laboratory equipment, and digital libraries, leading to uneven educational quality between flagship universities and newer regional institutions.",

"The introduction of modular curricula in Ethiopian universities aimed to improve practical skills, yet implementation varied widely due to limited industry partnerships and insufficient internship opportunities in smaller regional economies.",

"Secondary education reforms emphasized national exam standardization, but disparities persisted as rural schools struggled with teacher turnover, language-of-instruction transitions, and limited access to preparatory materials.",

"Ethiopia’s push to expand STEM education increased engineering enrollments, while insufficient computing infrastructure and outdated syllabi slowed alignment with modern software and AI industry requirements.",

"The growth of private universities widened access to higher education, but raised concerns regarding accreditation consistency, faculty qualifications, and graduate employability in Ethiopia’s constrained labor market.",

"Language policy in Ethiopian education required students to transition from regional languages to English instruction, creating comprehension gaps that disproportionately affected rural learners during secondary and tertiary education.",

"Teacher training colleges expanded rapidly, yet continuous professional development remained limited, affecting pedagogical quality in overcrowded classrooms across fast-growing urban and peri-urban areas.",

"The COVID-19 school closures accelerated digital learning experiments, revealing significant inequalities in device ownership, electricity access, and household learning environments across Ethiopian regions.",

"Technical and vocational education reforms aimed to address youth unemployment, but societal preference for university degrees reduced enrollment in TVET programs despite labor demand.",

"University research output increased modestly, though limited funding and heavy teaching loads constrained faculty engagement in internationally competitive research activities.",

"Ethiopia’s school feeding programs improved attendance in food-insecure regions, but funding sustainability and supply chain reliability remained ongoing operational challenges.",

"Postgraduate education expanded in Ethiopian universities, yet doctoral supervision capacity lagged behind enrollment growth, affecting completion timelines and research quality.",

"Curriculum decentralization allowed regions to adapt content to local contexts, though uneven capacity resulted in inconsistent learning outcomes nationwide.",

"Digital student information systems were introduced in some universities, but interoperability challenges limited nationwide academic data integration.",

"Gender parity initiatives improved female enrollment, yet retention gaps persisted due to early marriage, household labor expectations, and safety concerns.",

"National education roadmaps emphasized competency-based learning, though assessment practices often remained exam-centered due to institutional inertia.",

"Ethiopia’s boarding school models aimed to serve pastoralist communities, balancing mobility challenges with formal education continuity.",

"The expansion of community schools increased rural access, but infrastructure quality and teacher allocation remained uneven.",

"Graduate unemployment influenced student migration toward perceived marketable fields, sometimes oversaturating specific disciplines.",

"Education financing reforms debated cost-sharing models amid rising enrollment pressures and limited public budgets.",


# ===================== RELIGION / ORTHODOX (20) =====================

"The Ethiopian Orthodox Tewahedo Church remains deeply intertwined with national identity, yet internal administrative disputes increasingly intersected with ethnic and regional political dynamics.",

"Orthodox church education systems preserved Ge’ez literacy traditions, though declining enrollment among youth raised concerns about intergenerational knowledge transmission.",

"Recent disagreements over ecclesiastical jurisdiction reflected broader federal-regional tensions rather than purely theological differences.",

"The church’s vast land holdings influenced urban development negotiations, especially in expanding cities like Addis Ababa and Bahir Dar.",

"Religious festivals continued to structure communal calendars, even as urbanization altered participation patterns and ritual practices.",

"Monasteries in remote regions played roles in environmental conservation through traditional land stewardship practices.",

"Interfaith relations evolved as urban neighborhoods hosted Orthodox, Muslim, and Protestant communities in close proximity.",

"Digital media enabled clergy and lay scholars to disseminate teachings, altering traditional authority channels within the church.",

"The training of priests faced challenges due to economic pressures that diverted youth toward income-generating activities.",

"Disputes over language use in liturgy mirrored broader debates about cultural representation and inclusion.",

"Orthodox charitable organizations expanded social services during humanitarian crises, supplementing limited state capacity.",

"Church music traditions adapted to modern recording technologies while preserving liturgical structures.",

"The role of the Orthodox Church in mediation efforts varied across regional conflicts.",

"Restoration projects of ancient churches relied increasingly on diaspora funding.",

"Urban parish administration struggled with rapid population growth and resource constraints.",

"The church’s calendar influenced agricultural labor cycles in rural communities.",

"Heritage preservation debates emerged around modernization near historic religious sites.",

"Religious education curricula balanced doctrinal instruction with contemporary social issues.",

"The visibility of clergy in public discourse evolved with increased media exposure.",

"Monastic tourism raised sustainability and preservation concerns.",


# ===================== TECHNOLOGY (20) =====================

"Ethiopia’s telecom liberalization introduced competition, reshaping data pricing, network expansion priorities, and mobile financial service adoption beyond major urban centers.",

"Local software developers faced constraints from limited access to international payment systems, affecting participation in global digital marketplaces.",

"Fintech growth expanded mobile payments, though interoperability challenges persisted between platforms.",

"Startup hubs in Addis Ababa supported innovation but struggled with funding continuity.",

"Digital ID rollout aimed to unify service access while raising privacy and infrastructure concerns.",

"Ethiopia’s data center investments targeted government digitization needs.",

"Internet shutdowns disrupted digital businesses and remote education efforts.",

"Agri-tech pilots used satellite data for yield estimation.",

"E-commerce adoption remained constrained by logistics and addressing systems.",

"Open-source communities grew through university-linked tech clubs.",

"Cloud adoption was limited by bandwidth costs.",

"Ride-hailing apps navigated regulatory uncertainty.",

"AI research groups emerged within universities despite compute limitations.",

"Digital health platforms supported appointment scheduling pilots.",

"Payment APIs expanded merchant digitalization.",

"Drone technology was tested for land surveying.",

"Tech policy debates focused on data localization.",

"Digital literacy programs targeted youth employment.",

"Smart meter projects aimed to reduce utility losses.",

"Local language NLP research faced dataset scarcity.",


# ===================== AGRICULTURE (20) =====================

"Ethiopia’s wheat self-sufficiency drive expanded irrigated farming in lowland areas.",

"Smallholder farmers adopted improved seed varieties unevenly.",

"Extension services increasingly used mobile messaging.",

"Coffee traceability reforms enabled specialty exports.",

"Climate variability affected planting calendars.",

"Pastoralist mobility conflicted with fixed land use policies.",

"Post-harvest losses remained a major challenge.",

"Irrigation schemes altered traditional water-sharing norms.",

"Fertilizer distribution reforms affected timing and access.",

"Livestock exports depended on disease control systems.",

"Agro-processing parks aimed to increase value addition.",

"Soil degradation influenced long-term productivity.",

"Rainfall forecasting tools supported adaptive planning.",

"Market access varied due to road infrastructure.",

"Urban demand shaped peri-urban farming.",

"Seed certification improved quality assurance.",

"Cooperative governance affected farmer bargaining power.",

"Mechanization adoption was limited by cost.",

"Crop insurance pilots addressed climate risk.",

"Export crop diversification strategies expanded.",


# ===================== POLITICS (20) =====================

"Ethiopia’s federal system faced governance challenges amid shifting regional power dynamics.",

"Electoral processes were influenced by security conditions.",

"Decentralization policies complicated service delivery coordination.",

"Political party fragmentation affected coalition stability.",

"Administrative boundary disputes impacted local governance.",

"Security sector reforms intersected with regional authority.",

"Media liberalization expanded public discourse.",

"Emergency regulations affected civil liberties debates.",

"Peace negotiations shaped post-conflict transitions.",

"Federal–regional fiscal relations influenced budget allocation.",

"Identity politics shaped voter mobilization strategies.",

"Judicial reform aimed to strengthen independence.",

"Public sector reform targeted efficiency improvements.",

"Foreign policy balanced regional diplomacy priorities.",

"Legislative capacity faced resource constraints.",

"Civil society space evolved under new regulations.",

"Conflict resolution mechanisms varied by region.",

"Urban governance reforms addressed service delivery.",

"Policy continuity challenges followed leadership transitions.",

"Political dialogue platforms aimed to reduce polarization."
]


In [162]:
def embed_text(text,tokenizer,model):
    inputs = tokenizer(text,
                       return_tensors="pt",
                       padding=True,
                       truncation=True
                       
                      )
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state # last context riched tokens
        embeddings = embeddings.mean(dim=1) # pooling token embeddings into single sentence embedding for the seek of retrieval system
    return embeddings

In [163]:
embed_model,embed_tokenizer

(BertModel(
   (embeddings): BertEmbeddings(
     (word_embeddings): Embedding(30522, 384, padding_idx=0)
     (position_embeddings): Embedding(512, 384)
     (token_type_embeddings): Embedding(2, 384)
     (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
     (dropout): Dropout(p=0.1, inplace=False)
   )
   (encoder): BertEncoder(
     (layer): ModuleList(
       (0-5): 6 x BertLayer(
         (attention): BertAttention(
           (self): BertSdpaSelfAttention(
             (query): Linear(in_features=384, out_features=384, bias=True)
             (key): Linear(in_features=384, out_features=384, bias=True)
             (value): Linear(in_features=384, out_features=384, bias=True)
             (dropout): Dropout(p=0.1, inplace=False)
           )
           (output): BertSelfOutput(
             (dense): Linear(in_features=384, out_features=384, bias=True)
             (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
             (dropout): Dropout(

In [164]:
# Initialize a list to store the bembeddings
document_embeddings = []

for doc in documents:
    doc_embeddings = embed_text(doc,embed_tokenizer,embed_model)
    document_embeddings.append(doc_embeddings)


document_embeddings[1].shape
 

torch.Size([1, 384])

In [165]:
 document_embeddings = torch.cat(document_embeddings).cpu().numpy()
document_embeddings.shape, document_embeddings[:2]

((100, 384),
 array([[ 2.30880473e-02, -1.61137551e-01, -9.10027847e-02,
          5.19852489e-02,  1.82184979e-01, -3.95213604e-01,
         -7.83974648e-01,  1.36128411e-01,  8.63956138e-02,
          3.58952075e-01,  9.72939562e-03,  2.33186856e-01,
          4.01146524e-02,  1.65327922e-01, -4.39540818e-02,
          5.85592687e-02, -3.67548198e-01, -3.28183115e-01,
         -8.10759813e-02, -2.29089975e-01, -3.55545819e-01,
         -3.25489372e-01, -1.91534236e-01, -2.62745433e-02,
          2.69102871e-01,  1.42443748e-02, -1.06474422e-01,
         -4.77159202e-01,  1.95637405e-01, -3.46150219e-01,
          1.04308277e-01,  9.62997749e-02,  3.46982658e-01,
          2.00115263e-01,  3.26267540e-01,  3.76052469e-01,
          1.30603597e-01, -3.01188529e-01, -7.76710063e-02,
          1.83142051e-02,  1.98836282e-01, -1.43699870e-01,
         -1.14075430e-01, -1.63896292e-01,  2.06622362e-01,
         -2.16120958e-01,  1.85457379e-01, -4.88813967e-02,
         -2.96157867e-01, -

### Build the Retrieval System

In [166]:
document_embeddings.shape[1]

384

In [167]:
 index = faiss.IndexFlatL2(document_embeddings.shape[1])
index.add(document_embeddings)
print(index)

<faiss.swigfaiss_avx512.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x7b5c22464f60> >


In [168]:
faiss.downcast_index(index)
index.d, index.ntotal, index.is_trained,index.metric_type

(384, 100, True, 1)

In [169]:
# Retrieval -> Build a function to retrieve information
def retrieve(query,tokenizer,model,index,documents,top_k=3):
    query_embeddings = embed_text(query,tokenizer,model)
    distances, indices = index.search(query_embeddings,top_k)
    return [documents[i] for i in indices[0]],distances[0]


In [180]:
# Test the retrieval function
query = "ethiopain technology development journey"
retrieved_docs,distances = retrieve(query,embed_tokenizer,embed_model,index,documents)
for d in retrieved_docs:
    print(d)
    print("\n"*5)
print(distances)

Ethiopia’s push to expand STEM education increased engineering enrollments, while insufficient computing infrastructure and outdated syllabi slowed alignment with modern software and AI industry requirements.






AI research groups emerged within universities despite compute limitations.






The COVID-19 school closures accelerated digital learning experiments, revealing significant inequalities in device ownership, electricity access, and household learning environments across Ethiopian regions.






[53.034523 53.730743 54.004635]


### Integrating The Generative System

In [181]:
from transformers import AutoModelForCausalLM,AutoTokenizer

In [182]:
# Initialize the generative tokenizer and model
gen_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gen_model = AutoModelForCausalLM.from_pretrained("gpt2")
gen_tokenizer.pad_token = gen_tokenizer.eos_token

In [183]:
context = " ".join(retrieved_docs)
context

'Ethiopia’s push to expand STEM education increased engineering enrollments, while insufficient computing infrastructure and outdated syllabi slowed alignment with modern software and AI industry requirements. AI research groups emerged within universities despite compute limitations. The COVID-19 school closures accelerated digital learning experiments, revealing significant inequalities in device ownership, electricity access, and household learning environments across Ethiopian regions.'

In [214]:
# Function to generate context riched text
def generate_text(context,query,model,tokenizer):
    input_text =   f"Context: {context} \n Question:{query} \n Answer:"
    inputs = tokenizer(input_text,
                       return_tensors="pt",
                       padding=True,
                      truncation=True)
    inputs_ids = inputs["input_ids"]
    (inputs_ids != tokenizer.pad_token_id)
    attention_masks = inputs["attention_mask"]
    outputs = model.generate(inputs_ids,
                                attention_mask=attention_masks,
                                max_new_tokens=100,
                                 do_sample=True,
                                 temperature=0.9,
                                pad_token_id=gen_tokenizer.eos_token_id)
    outputs.shape,outputs
    
    
    return tokenizer.decode(outputs[0],skip_special_tokens=True)

In [216]:
generated_answer = generate_text(context,query,gen_model,gen_tokenizer)
print(f"generated anser:\n {generated_answer}")

generated anser:
 Context: Ethiopia’s push to expand STEM education increased engineering enrollments, while insufficient computing infrastructure and outdated syllabi slowed alignment with modern software and AI industry requirements. AI research groups emerged within universities despite compute limitations. The COVID-19 school closures accelerated digital learning experiments, revealing significant inequalities in device ownership, electricity access, and household learning environments across Ethiopian regions. 
 Question:ethiopain technology development journey 
 Answer: 

Fossil energy resources (including fossil fuels) that can sustain human civilizations may have limited potential for human enhancement. In some countries, increasing fossil fuels is largely a failure: an unsustainable situation with few alternatives. However, increasing economic growth and development must be based on the principles of sustainable development and the ability to meet the high standard of living n