In [1]:
import json
import os
import numpy as np
from openai import OpenAI
from qdrant_client import QdrantClient
from qdrant_client.http import models
from typing import List, Dict, Any
import nltk
from nltk.corpus import stopwords
import time

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Set up OpenRouter with OpenAI client
OPENROUTER_API_KEY = "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"
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY,
)

OPENAI_API_KEY = "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"
embeddings_client = OpenAI(api_key=OPENAI_API_KEY)

# Connect to Qdrant (local or cloud)
qdrant_client = QdrantClient(
    url="https://8b6857da-0682-417b-a31b-2a83bef2cab3.us-east-1-0.aws.cloud.qdrant.io",
    api_key="eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.dWjs7ZnPcyo0lbk1tvelYBim14HKNwDm1qfWTKaoVoQ"
)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zohai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def read_json_file(file_path: str) -> List[Dict[str, Any]]:
    """Read JSON file and return its contents."""
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

# Read the data
file_path = "../data/processed/Class_Schedule.json"
admission_data = read_json_file(file_path)
print(f"Loaded {len(admission_data)} entries from JSON file")

Loaded 18 entries from JSON file


In [3]:
def generate_embedding(text: str) -> List[float]:
    """Generate embedding for a text using OpenAI's text-embeddings-small-3 model."""
    start_time = time.time()
    response = embeddings_client.embeddings.create(
        input=text,
        model="text-embedding-3-small"
    )
    embedding_time = time.time() - start_time
    print(f"Embedding time: {embedding_time} seconds")
    return response.data[0].embedding

def create_payload(entry: Dict[str, Any]) -> Dict[str, Any]:
    """Create a payload with text, keywords, and metadata for Qdrant."""
    text = entry.get("text", "")
    keywords = entry.get("keywords", [])
    
    
    return {
        "text": text,
        "keywords": keywords,
    }

In [4]:
def create_collection(collection_name: str, vector_size: int = 1536):
    """Create a collection in Qdrant if it doesn't exist."""
    try:
        qdrant_client.get_collection(collection_name)
        print(f"Collection {collection_name} already exists")
    except Exception:
        qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(
                size=vector_size,
                distance=models.Distance.COSINE
            )
        )
        print(f"Created collection {collection_name}")

# Create collection for admission data
create_collection("class_schedule")

Created collection class_schedule


In [6]:
def process_and_upload_data(data: List[Dict[str, Any]], collection_name: str):
    """Process each entry, generate embedding, and upload to Qdrant."""
    batch_size = 10  # Process in batches to avoid API rate limits
    
    for i in range(0, len(data), batch_size):
        batch = data[i:i+batch_size]
        
        points = []
        for j, entry in enumerate(batch):
            # Create payload with text and keywords
            payload = create_payload(entry)
            
            # Generate embedding for text content
            embedding = generate_embedding(entry["text"])
            
            # Add to points
            points.append(models.PointStruct(
                id=i+j,
                vector=embedding,
                payload=payload
            ))
        
        # Upload batch to Qdrant
        qdrant_client.upsert(
            collection_name=collection_name,
            points=points
        )
        
        print(f"Uploaded batch {i//batch_size + 1}/{(len(data) + batch_size - 1)//batch_size}")

# Process and upload data
process_and_upload_data(admission_data, "class_schedule")
print("All data processed and uploaded to Qdrant")

Embedding time: 0.6798524856567383 seconds
Embedding time: 1.4361741542816162 seconds
Embedding time: 0.5141494274139404 seconds
Embedding time: 0.609935998916626 seconds
Embedding time: 0.6175282001495361 seconds
Embedding time: 0.41523170471191406 seconds
Embedding time: 0.8901839256286621 seconds
Embedding time: 1.161759376525879 seconds
Embedding time: 0.49821972846984863 seconds
Embedding time: 0.4081695079803467 seconds
Uploaded batch 1/2
Embedding time: 0.5092024803161621 seconds
Embedding time: 0.7208588123321533 seconds
Embedding time: 0.44847536087036133 seconds
Embedding time: 1.3901169300079346 seconds
Embedding time: 0.513908863067627 seconds
Embedding time: 0.6163101196289062 seconds
Embedding time: 1.0241820812225342 seconds
Embedding time: 0.5073039531707764 seconds
Uploaded batch 2/2
All data processed and uploaded to Qdrant


## Simple Searching

In [34]:
def search_qdrant_simple(query: str, collection_name: str, limit) -> List[Dict[str, Any]]:
    """Perform simple search in Qdrant for a single query."""
    # Generate embedding for the query
    embedding = generate_embedding(query)
    
    start_time = time.time()
    # Perform search
    search_results = qdrant_client.search(
        collection_name=collection_name,
        query_vector=embedding,
        limit=limit,
        with_payload=True
    )
    print(search_results)
    print(len(search_results))
    search_time = time.time() - start_time
    print(f"Search time: {search_time} seconds")
    # Format results

    # Format results
    results = []
    for scored_point in search_results:
        results.append({
            "id": scored_point.id,
            "score": scored_point.score,
            "payload": scored_point.payload
        })
    
    return results

In [22]:
def generate_response(query: str, context: List[Dict[str, Any]]) -> str:
    """Generate a response using OpenAI based on retrieved context."""
    # Prepare context text from search results
    start_time = time.time()
    context_text = "\n\n".join([
        f"Document {i+1}:\nText: {item['payload']['text']}\nKeywords: {', '.join(item['payload']['keywords'])}"
        for i, item in enumerate(context)
    ])
    context_time = time.time() - start_time
    print(f"Context time: {context_time} seconds")
    system_prompt = """
    You are an authoritative academic assistant for Notre Dame University (NDU) providing precise class schedule information.
    
    IMPORTANT GUIDELINES:
    1. For schedule queries, provide exact days, times, and course details from the retrieved documents.
    2. Format schedule information clearly with course code, name, days, and times in a structured format.
    3. When multiple schedules are retrieved, prioritize the specific course mentioned in the query.
    4. If schedule information is incomplete or not found in the context, state this explicitly and suggest contacting the registrar.
    5. Use bold formatting for key details like course codes, days, and times.

    Provide only factual information based on the retrieved documents. Never invent or assume schedule details.
    """
    
    user_prompt = f"Question: {query}\n\nContext:\n{context_text}"
    start_time_1 = time.time()
    response = client.chat.completions.create(
        model="openai/gpt-4o",  # Using a powerful model for response generation
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.7,
        max_tokens=500
    )
    response_time = time.time() - start_time_1
    print(f"Response time: {response_time} seconds")

    return response.choices[0].message.content

In [27]:
def rag_pipeline_simple(query: str):
    """Complete RAG pipeline from user query to response."""
    print(f"Original query: {query}")
    
    # Search Qdrant with a single query
    search_results = search_qdrant_simple(query, "class_schedule", limit=6)
    
    # Generate response
    response = generate_response(query, search_results)
    
    return {
        "original_query": query,
        "search_results": search_results,
        "response": response
    }

In [35]:
# Test the pipeline with a sample query
start_time = time.time()
result = rag_pipeline_simple("What classes do I have on tuesday?")
end_time = time.time()
print(f"Total time taken: {end_time - start_time} seconds")
# Display the response
print("\nFinal Response:")
print(result["response"])

Original query: What classes do I have on tuesday?
Embedding time: 1.2459297180175781 seconds


  search_results = qdrant_client.search(


[ScoredPoint(id=12, version=2, score=0.43175754, payload={'text': "Course Schedule: MTH 110 - Pre-Calculus Mathematics\nThe course MTH 110, titled 'Pre-Calculus Mathematics', for the Bachelor of Science in Computer Science program, is scheduled as follows: Sunday from 14:00 to 15:30, Monday from 14:00 to 15:30.", 'keywords': ['MTH 110', 'Pre-Calculus Mathematics', 'schedule', 'timetable', 'class times', 'BSCS', 'Sunday', 'Monday', '14:00-15:30']}, vector=None, shard_key=None, order_value=None), ScoredPoint(id=6, version=1, score=0.42570692, payload={'text': "Course Schedule: CSC 312 - Computer Architecture\nThe course CSC 312, titled 'Computer Architecture', for the Bachelor of Science in Computer Science program, is scheduled as follows: Sunday from 11:00 to 12:30, Monday from 11:00 to 12:30.", 'keywords': ['CSC 312', 'Computer Architecture', 'schedule', 'timetable', 'class times', 'BSCS', 'Sunday', 'Monday', '11:00-12:30']}, vector=None, shard_key=None, order_value=None), ScoredPoint