In [52]:
# Example query and a paraphrased candidate document, used below to demonstrate
# token counting and embedding similarity.
question = "What is the biggest animal on earth"
document = "The most huge mammal in the world is elephant "

### A function that counts the tokens in a given string

In [28]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens that a tiktoken encoding produces for ``string``.

    Args:
        string: Text to tokenize.
        encoding_name: Name passed to ``tiktoken.get_encoding``
            (for example ``'cl100k_base'``).

    Returns:
        The number of token ids in the encoded text.

    Side effects:
        Prints the list of token ids so the tokenization can be inspected.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # Encode once and reuse the ids for both the printout and the count.
    token_ids = encoding.encode(string)
    print(token_ids)
    return len(token_ids)

# Count (and print) the token ids of the example question using cl100k_base.
tokens = num_tokens_from_string(string=question, encoding_name='cl100k_base')


[3923, 374, 279, 8706, 10065, 389, 9578]


In [53]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used for the similarity demo.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Each text is wrapped in a one-element list, so encode() receives a batch of
# one; the question is encoded first, then the document.
query_result, document_result = (
    model.encode([text]) for text in (question, document)
)

In [55]:
import numpy  as np

def cosine_similarity(vect1, vect2):
    """Return the cosine similarity between two vectors.

    Both inputs are converted to NumPy arrays and flattened to 1-D, so a
    single-row 2-D array of shape ``(1, dim)`` is accepted, as are plain
    Python sequences.

    Args:
        vect1: First vector (array-like).
        vect2: Second vector (array-like) with the same number of elements.

    Returns:
        ``dot(vect1, vect2) / (||vect1|| * ||vect2||)``. If either vector has
        zero norm the similarity is undefined and ``0.0`` is returned instead
        of dividing by zero.

    Raises:
        ValueError: Raised by ``np.dot`` when the flattened vectors differ
            in length.
    """
    vect1 = np.asarray(vect1).ravel()
    vect2 = np.asarray(vect2).ravel()
    denominator = np.linalg.norm(vect1) * np.linalg.norm(vect2)
    if denominator == 0:
        # A zero vector has no direction; avoid a NaN from 0/0.
        return 0.0
    return np.dot(vect1, vect2) / denominator

# Compare the question embedding with the document embedding.
similarity_score = cosine_similarity(vect1=query_result, vect2=document_result)
print("Cosine similarity:", similarity_score)

Cosine similarity: 0.7984286


## Build a text splitter for the PDF

In [5]:
# PyPDFLoader is imported from langchain_community (the package this notebook
# already uses for Chroma) because `langchain.document_loaders` is only a
# deprecated re-export of the same class.
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the source PDF into LangChain documents.
loader = PyPDFLoader("hungary_immigration.pdf")
docs = loader.load()

# Split into overlapping chunks; sizes are measured with len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

splits = text_splitter.split_documents(docs)

## Building vectorstore and retriever

In [None]:
# HuggingFaceEmbeddings is imported from langchain_community because
# `langchain.embeddings` is only a deprecated re-export of the same class.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# Embed every chunk and index the vectors in a Chroma vector store.
embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding)

# The retriever is configured with k=2 in its search arguments.
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

## Setting up the DeepSeek LLM

In [7]:
from langchain_openai import ChatOpenAI
import os
from dotenv import load_dotenv
# Load environment variables via python-dotenv before reading the key.
load_dotenv()

# The DeepSeek key comes from the environment; fail fast when it is absent
# or empty instead of sending unauthenticated requests later.
API_KEY = os.environ.get("DEEP_SEEK_API_KEY")
if API_KEY is None or API_KEY == "":
    raise ValueError("DEEP_SEEK_API_KEY not found in environment variables")

# ChatOpenAI is pointed at the DeepSeek endpoint through base_url.
llm = ChatOpenAI(
    model="deepseek-chat",
    base_url="https://api.deepseek.com/v1",
    api_key=API_KEY,
    temperature=0,
)

## Generating multiple sub-queries from a single question

In [8]:
from langchain.prompts import ChatPromptTemplate

# Prompt asking the LLM to rewrite the user's question as five alternative
# phrasings, one per line, to broaden what the similarity search can match.
template= """
You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}
"""
# Wrap the raw string in a chat prompt whose only placeholder is {question}.
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser

def _split_nonempty_lines(text: str) -> list[str]:
    """Split the LLM output into one query per line, dropping blank lines.

    The model often separates its alternative questions with empty lines;
    keeping them would send empty strings to the retriever as queries.
    """
    return [line.strip() for line in text.split("\n") if line.strip()]


# question -> prompt -> LLM -> plain text -> list of alternative queries
generate_queries = (
    prompt_perspectives
    | llm
    | StrOutputParser()
    | _split_nonempty_lines
)

In [None]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """Flatten per-query retrieval results into a list of unique documents.

    Args:
        documents: One list of retrieved documents per generated query.

    Returns:
        The distinct documents in order of first appearance, rebuilt from
        their serialized form with ``loads``.
    """
    # Serialize each document to a string so equal documents compare equal
    # and can be used as dictionary keys.
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys de-duplicates while keeping insertion order; a set would
    # return the documents in an arbitrary order that can differ between runs.
    unique_docs = dict.fromkeys(serialized)
    return [loads(doc) for doc in unique_docs]

# Retrieve: expand the question into several queries, run the retriever on
# each of them, then merge the per-query results into one de-duplicated list.
question = "What should I do if my residence permit expires ?"
retrieval_chain = (
    generate_queries
    | retriever.map()
    | get_unique_union
)
docs = retrieval_chain.invoke({"question": question})



[Document(metadata={'moddate': '2024-11-05T12:57:00+00:00', 'creationdate': '2024-11-05T12:57:00+00:00', 'producer': 'Skia/PDF m130', 'source': 'hungary_immigration.pdf', 'title': 'XC of 2023. law - National Legal Repository', 'total_pages': 97, 'page': 73, 'page_label': '74', 'creator': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'}, page_content='issued (extended) residence permit or mobility in the case of a certificate, it is processed until the end of the fifth calendar year\nfollowing its expiration or revocation.\n(4) The data specified in point n) of subsection (1) until the final and enforceable decision made by the immigration authority on\nthe application for a residence permit, or in points a) and b) of subsection (1) of Section 70 , and d) -f) is processed until the\nhumanitarian residence permit is issued, after which it is deleted immediately.\n(5) The immigration police register is a public register wit

  return [loads(doc) for doc in unique_docs]


## Notes

`retriever.map()` returns a runnable that takes a list of queries and applies the retriever to each one, producing one list of documents per query, rather than handling a single query.

What `ChatPromptTemplate.from_template` does

`ChatPromptTemplate` is a LangChain class for managing the prompts sent to chat LLMs.

`from_template(template_str)` takes a raw string template and converts it into a structured prompt object.


Summary

Input: single user question

Step 1: Generate multiple sub-questions using LLM

Step 2: Retrieve documents for each sub-question → list of lists

Step 3: Flatten and deduplicate documents using dumps/loads

Output: unique list of relevant documents

## Building the rag chain

### Creating the template

In [10]:
from langchain_core.prompts import PromptTemplate

# Define the RAG prompt: the retrieved context and the user's question are
# substituted into the {context} and {question} placeholders.
prompt = PromptTemplate.from_template(
    """
Use the following context to answer the question accurately.

Context:
{context}

Question:
{question}

Answer:
"""
)


### Building the RAG chain

In [11]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def _format_docs(documents):
    """Join the text of the retrieved documents into one context string.

    Without this step the list of Document objects is interpolated into the
    prompt as its repr, so every chunk's metadata is sent to the LLM too.
    """
    return "\n\n".join(doc.page_content for doc in documents)


# The incoming question feeds both branches: multi-query retrieval (formatted
# to plain text) for {context}, and the untouched question for {question}.
rag_chain = (
    {
        "context": retrieval_chain | _format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

# Run the full RAG chain on a new question and show the generated answer.
question = " I want to learn about family reunification"
result = rag_chain.invoke(input=question)
print(result)

Based on the provided context from the Hungarian immigration law, here is the information regarding family reunification:

**Purpose:** A residence permit can be issued for the purpose of ensuring family coexistence (family reunification).

**Eligible Persons:** A third-country national may receive this permit to join:
*   a) a person with a residence permit,
*   b) a person with an immigration, settlement, temporary settlement, national settlement or EC settlement permit.

**Application Process for Family Members:**
The context also specifies that a family member of such a "family reunifier" is eligible to apply for a residence permit via a combined application procedure, provided they meet certain conditions. Specifically, they must have been legally residing in Hungary with a valid residence permit issued for family coexistence for **at least one year** prior to the application, and the family reunifier must not be employed without a permit.

**Transitional Provisions:**
The law inc