In [95]:
import json
import os
import numpy as np
from openai import OpenAI
from qdrant_client import QdrantClient
from qdrant_client.http import models
from typing import List, Dict, Any
import nltk
from nltk.corpus import stopwords
import time

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Set up OpenRouter with OpenAI client
OPENROUTER_API_KEY = "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"
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY,
)

OPENAI_API_KEY = "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"
embeddings_client = OpenAI(api_key=OPENAI_API_KEY)

# Connect to Qdrant (local or cloud)
qdrant_client = QdrantClient(
    url="https://8b6857da-0682-417b-a31b-2a83bef2cab3.us-east-1-0.aws.cloud.qdrant.io",
    api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.dWjs7ZnPcyo0lbk1tvelYBim14HKNwDm1qfWTKaoVoQ"
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zohai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Embeddings Creation

In [4]:
def read_json_file(file_path: str) -> List[Dict[str, Any]]:
    """Read JSON file and return its contents."""
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

# Read the data
file_path = "../data/processed/admission_guide_PDF_extracted_text.json"
admission_data = read_json_file(file_path)
print(f"Loaded {len(admission_data)} entries from JSON file")

Loaded 41 entries from JSON file


In [101]:
def generate_embedding(text: str) -> List[float]:
    """Generate embedding for a text using OpenAI's text-embeddings-small-3 model."""
    start_time = time.time()
    response = embeddings_client.embeddings.create(
        input=text,
        model="text-embedding-3-small"
    )
    embedding_time = time.time() - start_time
    print(f"Embedding time: {embedding_time} seconds")
    return response.data[0].embedding

def create_payload(entry: Dict[str, Any]) -> Dict[str, Any]:
    """Create a payload with text, keywords, and metadata for Qdrant."""
    text = entry.get("text", "")
    keywords = entry.get("keywords", [])
    
    
    return {
        "text": text,
        "keywords": keywords,
    }

In [6]:
def create_collection(collection_name: str, vector_size: int = 1536):
    """Create a collection in Qdrant if it doesn't exist."""
    try:
        qdrant_client.get_collection(collection_name)
        print(f"Collection {collection_name} already exists")
    except Exception:
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(
                size=vector_size,
                distance=models.Distance.COSINE
            )
        )
        print(f"Created collection {collection_name}")

# Create collection for admission data
create_collection("admission_course_guide")

Created collection admission_course_guide


In [8]:
def process_and_upload_data(data: List[Dict[str, Any]], collection_name: str):
    """Process each entry, generate embedding, and upload to Qdrant."""
    batch_size = 10  # Process in batches to avoid API rate limits
    
    for i in range(0, len(data), batch_size):
        batch = data[i:i+batch_size]
        
        points = []
        for j, entry in enumerate(batch):
            # Create payload with text and keywords
            payload = create_payload(entry)
            
            # Generate embedding for text content
            embedding = generate_embedding(entry["text"])
            
            # Add to points
            points.append(models.PointStruct(
                id=i+j,
                vector=embedding,
                payload=payload
            ))
        
        # Upload batch to Qdrant
        qdrant_client.upsert(
            collection_name=collection_name,
            points=points
        )
        
        print(f"Uploaded batch {i//batch_size + 1}/{(len(data) + batch_size - 1)//batch_size}")

# Process and upload data
process_and_upload_data(admission_data, "admission_course_guide")
print("All data processed and uploaded to Qdrant")

Uploaded batch 1/5
Uploaded batch 2/5
Uploaded batch 3/5
Uploaded batch 4/5
Uploaded batch 5/5
All data processed and uploaded to Qdrant


## Searching and Testing Queries

In [9]:
def remove_stop_words(query: str) -> str:
    """Remove stop words from a query."""
    stop_words = set(stopwords.words('english'))
    words = query.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

In [47]:
def generate_query_variations(query: str) -> List[str]:
    """Generate variations of the query using OpenAI."""
    system_prompt = """
    Create one alternative versions of the user's query. 
    Each version should:
    1. Maintain the original meaning
    2. Use different wording or phrasing
    3. Be a complete, well-formed question
    
    Return ONLY two variations, one per line, with no additional text.
    """
    
    response = client.chat.completions.create(
        model="openai/gpt-4.1-nano",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query}
        ],
        temperature=0.7,
        max_tokens=200
    )
    
    variations_text = response.choices[0].message.content
    variations = [line.strip() for line in variations_text.split('\n') if line.strip()]
    
    # Ensure we have exactly 2 variations
    if len(variations) > 1:
        variations = variations[:1]
    while len(variations) < 1:
        variations.append(query)  # Use original query as fallback
        
    return variations

## Batch Search

In [66]:
def search_qdrant(queries: List[str], collection_name: str, limit: int = 3) -> List[Dict[str, Any]]:
    """Perform batch search in Qdrant for multiple queries."""
    # Generate embeddings for all queries
    query_embeddings = []
    for query in queries:
        embedding = generate_embedding(query)
        query_embeddings.append(embedding)
    
    # Perform batch search
    search_results = qdrant_client.query_batch_points(
        collection_name=collection_name,
        requests=[
            models.QueryRequest(
                query=embedding,
                limit=limit,
                with_payload=True
            )
            for embedding in query_embeddings
        ]
    )
    
    print(search_results)

    # Extract unique results
    unique_results = {}
    for result_batch in search_results:
        # Each result_batch is a QueryResponse with a 'points' attribute
        for scored_point in result_batch.points:  # Access the points attribute
            if scored_point.id not in unique_results:
                unique_results[scored_point.id] = {
                    "score": scored_point.score,
                    "payload": scored_point.payload
                }
    
    # Convert to list and sort by score
    results = [{"id": k, **v} for k, v in unique_results.items()]
    results.sort(key=lambda x: x["score"], reverse=True)
    
    return results[:limit]  # Return top N unique results

## Simple Search

In [183]:
def search_qdrant_simple(query: str, collection_name: str, limit: int = 3) -> List[Dict[str, Any]]:
    """Perform simple search in Qdrant for a single query."""
    # Generate embedding for the query
    embedding = generate_embedding(query)
    
    start_time = time.time()
    # Perform search
    search_results = qdrant_client.query_points(
        collection_name=collection_name,
        query=embedding,
        limit=limit,
        with_payload=True,
        score_threshold=0.3
    )
    print(search_results)
    search_time = time.time() - start_time
    print(f"Search time: {search_time} seconds")
    # Format results

    start_time_1 = time.time()
    results = []
    for scored_point in search_results.points:
        results.append({
            "id": scored_point.id,
            "score": scored_point.score,
            "payload": scored_point.payload
        })
    format_time = time.time() - start_time_1
    print(f"Format time: {format_time} seconds")

    return results

In [184]:
def generate_response(query: str, context: List[Dict[str, Any]]) -> str:
    """Generate a response using OpenAI based on retrieved context."""
    # Prepare context text from search results
    start_time = time.time()
    context_text = "\n\n".join([
        f"Document {i+1}:\nText: {item['payload']['text']}\nKeywords: {', '.join(item['payload']['keywords'])}"
        for i, item in enumerate(context)
    ])
    context_time = time.time() - start_time
    print(f"Context time: {context_time} seconds")
    system_prompt = """
    You are an authoritative academic assistant for Notre Dame University (NDU) providing precise information based on the retrieved documents.
    
    IMPORTANT GUIDELINES:
    1. Provide ONLY ONE definitive answer based on the highest relevance matches in the context.
    2. If multiple potential answers exist, choose the one with the strongest evidence in the retrieved documents.
    3. Focus exclusively on directly answering the user's question with specific facts from the context.
    4. Be extremely precise with numbers, credits, requirements, and program details.
    5. If a direct answer isn't clearly available in the context, state this clearly rather than speculating.
    6. Format your answer concisely using bold for key facts and figures.
    7. Avoid listing multiple possibilities or alternatives unless specifically requested.
    
    Your goal is to provide the single most accurate answer as if you were an official university representative.
    """
    
    user_prompt = f"Question: {query}\n\nContext:\n{context_text}"
    start_time_1 = time.time()
    response = client.chat.completions.create(
        model="openai/gpt-4o-mini",  # Using a powerful model for response generation
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.7,
        max_tokens=500
    )
    response_time = time.time() - start_time_1
    print(f"Response time: {response_time} seconds")

    return response.choices[0].message.content

## Batch Search Rag

In [65]:
def rag_pipeline(query: str):
    """Complete RAG pipeline from user query to response."""
    print(f"Original query: {query}")
    
    # # Generate query variations
    # variations = generate_query_variations(query)
    # all_queries = [query] + variations
    
    # print("Query variations:")
    # for i, q in enumerate(all_queries):
    #     print(f"{i+1}. {q}")
    
    # # Remove stop words from all queries
    # filtered_queries = [remove_stop_words(q) for q in all_queries]
    # print("Filtered queries: ",filtered_queries)
    
    # print("After stop word removal:")
    # for i, q in enumerate(filtered_queries):
    #     print(f"{i+1}. {q}")
    
    # Search Qdrant
    search_results = search_qdrant(query, "admission_course_guide", limit=5)
    
    # Generate response
    response = generate_response(query, search_results)
    
    return {
        "original_query": query,
        # "variations": variations,
        # "filtered_queries": filtered_queries,
        "search_results": search_results,
        "response": response
    }

## Simple Search Rag

In [185]:
def rag_pipeline_simple(query: str):
    """Complete RAG pipeline from user query to response."""
    print(f"Original query: {query}")
    
    # Search Qdrant with a single query
    search_results = search_qdrant_simple(query, "admission_course_guide", limit=3)
    
    # Generate response
    response = generate_response(query, search_results)
    
    return {
        "original_query": query,
        "search_results": search_results,
        "response": response
    }

In [187]:
# Test the pipeline with a sample query
start_time = time.time()
result = rag_pipeline_simple("What are the available faculties in the University?")
end_time = time.time()
print(f"Total time taken: {end_time - start_time} seconds")
# Display the response
print("\nFinal Response:")
print(result["response"])

Original query: What are the available faculties in the University?
Embedding time: 0.9884524345397949 seconds
points=[ScoredPoint(id=7, version=0, score=0.4404493, payload={'text': 'Degrees Offered - Faculty of Humanities (FH)\nUndergraduate:\n- Advertising & Marketing (BA) •○: 102 cr.\n- Basic Education (BA): 99 cr.\n- Communication Arts (BA): Journalism & Electronic Media • (102 cr.), Radio/TV •○ (103 cr.)\n- English Language (BA): 102 cr.\n- Physical Education & Sport (BA): 99 cr.\n- Psychology (BA): Clinical, Educational, Industrial (97 cr. each)\n- Translation & Interpretation (BA): 108 cr.\nTeaching Diploma (TD): 21 cr. each (Arabic Lang. & Lit., Basic Education, Chemistry, Computer Science, English Lang., Life Science, Mathematics, Physical Education & Sport, Physics)\nGraduate:\n- Education (MA): Educational Technology, School Mgt & Edu Leadership, Special Education (33 cr. each)\n- Educational Psychology (MA): 36 cr.\n- English Language & Literature (MA): Applied Linguistics 

In [173]:
# Test the pipeline with a sample query
result = rag_pipeline("Does the University have any program for Master's program?")

# Display the response
print("\nFinal Response:")
print(result["response"])

Original query: Does the University have any program for Master's program?
Embedding time: 0.4438457489013672 seconds
Embedding time: 0.7390108108520508 seconds
Embedding time: 1.2095799446105957 seconds
Embedding time: 0.4253523349761963 seconds
Embedding time: 0.8278212547302246 seconds
Embedding time: 0.702434778213501 seconds
Embedding time: 0.8332405090332031 seconds
Embedding time: 0.7532329559326172 seconds
Embedding time: 0.4740478992462158 seconds
Embedding time: 0.41017961502075195 seconds
Embedding time: 0.585721492767334 seconds
Embedding time: 0.5085210800170898 seconds
Embedding time: 0.42838287353515625 seconds
Embedding time: 0.5225872993469238 seconds
Embedding time: 0.6466877460479736 seconds


KeyboardInterrupt: 