In [1]:
#Imports
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import openai
import pickle
from typing import List, Dict

The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.


  from tqdm.autonotebook import tqdm, trange


In [4]:
#Loading existing knowledge base
df = pd.read_csv('knowledge_base.csv')
print(f"Loaded {len(df)} chunks from the knowledge base")

# Create embeddings
def create_embeddings(texts: List[str]) -> np.ndarray:
    """Create embeddings for a list of texts using Sentence Transformers."""
    print("Loading the embedding model...")
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    
    print("Creating embeddings...")
    embeddings = model.encode(texts, show_progress_bar=True)
    
    print(f"Created embeddings with shape: {embeddings.shape}")
    return embeddings, model

# Create embeddings for our knowledge base
print("Creating embeddings for the knowledge base...")
embeddings, model = create_embeddings(df['text'].tolist())
# Save embeddings and dataframe
print("Saving embeddings...")
with open('cuda_embeddings.pkl', 'wb') as f:
    pickle.dump((df, embeddings), f)

Loaded 595 chunks from the knowledge base
Creating embeddings for the knowledge base...
Loading the embedding model...
Creating embeddings...


Batches:   0%|          | 0/19 [00:00<?, ?it/s]

Created embeddings with shape: (595, 384)
Saving embeddings...


In [6]:
def setup_faiss_index(embeddings: np.ndarray) -> faiss.IndexFlatL2:
    """Create and populate a FAISS index for similarity search."""
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings).astype('float32'))
    return index

def search_knowledge_base(query: str, 
                         model: SentenceTransformer,
                         index: faiss.IndexFlatL2,
                         df: pd.DataFrame,
                         top_k: int = 3) -> List[Dict]:
    """Search for relevant CUDA documentation based on the query."""
    # Create query embedding
    query_embedding = model.encode([query])[0]
    
    # Search in FAISS
    distances, indices = index.search(np.array([query_embedding]).astype('float32'), top_k)
    
    # Get the text of the most similar chunks
    results = [
        {
            'text': df.iloc[idx]['text'],
            'source': df.iloc[idx]['source'],
            'similarity': 1 - dist/2  # Convert L2 distance to similarity score
        }
        for dist, idx in zip(distances[0], indices[0])
    ]
    
    return results

# Create FAISS index
print("Setting up FAISS index...")
index = setup_faiss_index(embeddings)

# Let's test the search with some CUDA-related queries
test_queries = [
    "What is CUDA and how does it work?",
    "Explain CUDA thread hierarchy",
    "How does CUDA memory management work?"
]

def test_search_functionality(queries: List[str]):
   """
   Test search functionality with multiple queries and display results.
   
   Args:
       queries (List[str]): List of queries to test
   """
   print("\nTesting search functionality:")
   for query in queries:
       print(f"\nQuery: {query}")
       results = search_knowledge_base(query, model, index, df)
       print("\nRelevant passages:")
       for i, result in enumerate(results, 1):
           print(f"\n{i}. Text: {result['text'][:200]}...")  # Show first 200 characters
           print(f"Source: {result['source']}")
           print(f"Similarity Score: {result['similarity']:.3f}")

# Define test queries
test_queries = [
   "What is CUDA and how does it work?",
   "Explain CUDA thread hierarchy",
   "How does CUDA memory management work?"
]
test_search_functionality(test_queries)


Setting up FAISS index...

Testing search functionality:

Query: What is CUDA and how does it work?

Relevant passages:

1. Text: C Programming Guide Release 12 8 498 Chapter 19 Compute Capabilities Chapter 20 Driver API This section assumes knowledge of the concepts described in CUDA Runtime The driver API is implemented in the...
Source: CUDA Documentation
Similarity Score: 0.642

2. Text: introduces the low level driver API CUDA Environment Variables lists all the CUDA environment variables Unified Memory Programming introduces the Unified Memory programming model 9 CUDA C Programming ...
Source: CUDA Documentation
Similarity Score: 0.637

3. Text: cudaSetDevice will now explicitly initialize the runtime after changing the current device for the host thread Previous versions of CUDA delayed runtime initialization on the new device until the firs...
Source: CUDA Documentation
Similarity Score: 0.588

Query: Explain CUDA thread hierarchy

Relevant passages:

1. Text: Child Grids have 

In [20]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

def generate_response_hf(query: str, relevant_chunks: List[Dict], model_name="google/flan-t5-small"):
    """
    Generate response using Hugging Face T5 model
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        
        # Prepare context and prompt
        context = "\n\n".join([chunk['text'] for chunk in relevant_chunks])
        prompt = f"""Use the following CUDA documentation to answer the question.
        
        Documentation:
        {context}

        Question: {query}

        Answer:"""
        
        # Tokenize and generate
        inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
        outputs = model.generate(**inputs, max_length=200, num_beams=4, temperature=0.7)
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        return response
    except Exception as e:
        return f"Error generating response: {str(e)}"
# Now let's create a complete function that combines search and response generation
def cuda_rag_response(query: str, top_k: int = 3) -> Dict:
    """
    Complete RAG pipeline: Search relevant chunks and generate a response.
    
    Args:
        query (str): User's question
        top_k (int): Number of relevant chunks to retrieve
    
    Returns:
        Dict: Contains the generated answer and the relevant chunks used
    """
    # Get relevant chunks
    relevant_chunks = search_knowledge_base(query, model, index, df, top_k=top_k)
    
    # Generate response
    answer = generate_response(query, relevant_chunks)
    
    return {
        "query": query,
        "answer": answer,
        "relevant_chunks": relevant_chunks
    }

# To use this, you'll need to set your OpenAI API key
# openai.api_key = "your-api-key-here"

def test_local_rag_system(queries: List[str]):
    for query in queries:
        print(f"\n\nQuestion: {query}")
        print("-" * 80)
        
        # Get relevant chunks
        relevant_chunks = search_knowledge_base(query, model, index, df)
        
        # Generate response using local model
        response = generate_response_hf(query, relevant_chunks)
        
        print("\nAnswer:")
        print(response)
        
        print("\nRelevant Documentation Used:")
        for i, chunk in enumerate(relevant_chunks, 1):
            print(f"\n{i}. {chunk['text'][:200]}...")

In [21]:
test_queries = [
    "What is CUDA and how does it work?",
    "Explain how CUDA threads are organized",
    "What are the different types of memory in CUDA?"
]

test_local_rag_system(test_queries)



Question: What is CUDA and how does it work?
--------------------------------------------------------------------------------


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]




Answer:
CUDA Runtime The driver API is implemented in the cuda dynamic library cuda dll or cuda so which is copied on the system during the installation of the device driver All its entry points are prefixed with cu It is a handle based imperative API Most objects are referenced by opaque handles that may be spec ified to functions to manipulate the objects The objects available in the driver API are summarized in Table 22 Object Device Context Module Function Table 24 Object Device Context Module Function Table 24 Object Device Context Module Function Table 24 Object Device Context Module Function Table 24 Object Device Context Module Function Table 24 Object Device Context Module Function Table 24 Object Device Context Module Function Table 24 Object Device Context Module Function Table 24 Object Device Context Module Function Table 24 Object Device Context Module Function Table 24 Object Device Context Module Function Table 24 Object Device Context Module Function

Relevant Documen