# **Step-by-Step Process with GPT-3.5**

# **1. Install Necessary Libraries**

In [None]:
!pip install faiss-cpu pymupdf openai

Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymupdf
  Downloading PyMuPDF-1.24.5-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-1.33.0-py3-none-any.whl (325 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.5/325.5 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collecting PyMuPDFb==1.24.3 (from pymupdf)
  Downloading PyMuPDFb-1.24.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (15.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.8/15.8 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.0-py3-none-an

In [None]:
pip install --upgrade openai



# **2. Import Libraries and Setup Environment**

In [None]:
import os
import sqlite3

import faiss
import fitz  # PyMuPDF
import nltk
import numpy as np
import openai
import torch
from transformers import AutoTokenizer, AutoModel

# Newer NLTK releases make sent_tokenize look up the 'punkt_tab' resource instead of 'punkt'.
# Requesting both keeps sentence splitting working across versions; on older releases the
# unknown 'punkt_tab' request just reports an error and returns False.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# **3. Define Functions for Database Operations**

In [None]:
def create_database(db_name="documents.db"):
    """Create the SQLite database file and its ``chunks`` table if they do not exist.

    The table has an integer primary key ``id`` and a text column ``content``.
    Calling this on an existing database leaves its data untouched.

    Args:
        db_name: Path of the SQLite database file to open or create.

    Raises:
        sqlite3.Error: If the database cannot be opened or the table cannot be created.
    """
    conn = sqlite3.connect(db_name)
    try:
        # `with conn` commits on success and rolls back if the statement raises.
        with conn:
            conn.execute('''CREATE TABLE IF NOT EXISTS chunks
                      (id INTEGER PRIMARY KEY, content TEXT)''')
    finally:
        # Release the connection even when table creation fails.
        conn.close()
    print(f"Database '{db_name}' created with table 'chunks'.")

def insert_chunks(chunks, db_name="documents.db"):
    """Append text chunks to the ``chunks`` table, one row per chunk.

    Args:
        chunks: Sized collection of strings to store in the ``content`` column.
        db_name: Path of the SQLite database file that holds the ``chunks`` table.

    Raises:
        sqlite3.Error: If the table is missing or the insert fails; nothing is
            committed in that case.
    """
    conn = sqlite3.connect(db_name)
    try:
        # One transaction for the whole batch: all rows are stored, or none are.
        with conn:
            conn.executemany(
                "INSERT INTO chunks (content) VALUES (?)",
                ((chunk,) for chunk in chunks),
            )
    finally:
        # Release the connection even when the insert fails.
        conn.close()
    print(f"Inserted {len(chunks)} chunks into 'chunks' table.")

# **4. Define Functions for Text Extraction and Chunking**

In [None]:
def extract_and_chunk_text_from_pdf(pdf_path, chunk_size=200):
    """Extract the text of a PDF and group its sentences into chunks.

    The text of all pages is concatenated, split into sentences with NLTK, and
    every run of ``chunk_size`` consecutive sentences is joined with single
    spaces into one chunk (the last chunk may be shorter).

    Args:
        pdf_path: Path of the PDF file to read.
        chunk_size: Number of sentences per chunk; must be at least 1.

    Returns:
        List of chunk strings, in document order. Empty if no sentences were found.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A negative step would silently produce no chunks; a zero step fails in range().
        raise ValueError("chunk_size must be a positive integer")

    # The context manager closes the document even if text extraction fails.
    with fitz.open(pdf_path) as document:
        text = "".join(page.get_text() for page in document)

    sentences = nltk.sent_tokenize(text)
    chunks = [' '.join(sentences[i:i + chunk_size]) for i in range(0, len(sentences), chunk_size)]
    print(f"Extracted and chunked text from {pdf_path}. Number of chunks: {len(chunks)}")
    return chunks

# **5. Define Functions for Embedding and Retrieval**

In [None]:
# Load a tokenizer and model for embedding.
# Prefer the GPU when one is available; fall back to the CPU instead of failing at load time.
embedding_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
model = AutoModel.from_pretrained('distilbert-base-uncased').to(embedding_device)

def embed_text(texts, batch_size=32):
    """Embed texts with the module-level ``tokenizer`` and ``model``.

    Each text is tokenized (truncated to the model's maximum length) and
    represented by the mean of its last-layer hidden states. Padding positions
    are excluded from the mean, so a text gets the same vector whether it is
    embedded alone or alongside longer texts.

    Args:
        texts: A string or a sequence of strings.
        batch_size: Number of texts sent through the model at once; bounds
            peak memory when many chunks are embedded.

    Returns:
        NumPy array with one row per input text.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return np.zeros((0, model.config.hidden_size), dtype=np.float32)

    # Run on whatever device the model was placed on instead of assuming a GPU.
    device = next(model.parameters()).device

    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True).to(device)
        with torch.no_grad():
            hidden = model(**inputs).last_hidden_state
        # Zero out padding positions and divide by the real token count per text.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        pooled_batches.append(pooled.cpu().numpy())

    return np.concatenate(pooled_batches, axis=0)

# FAISS index used for L2 nearest-neighbour search over chunk embeddings.
# The index dimension has to equal the length of the vectors produced by embed_text.
EMBEDDING_DIM = 768
index = faiss.IndexFlatL2(EMBEDDING_DIM)

def load_chunks_and_index(db_name="documents.db"):
    """Load all stored chunks and rebuild the global FAISS ``index`` from them.

    The index is emptied before the embeddings are added, so after this call
    position ``i`` in the index corresponds to ``chunks[i]`` no matter how often
    the function has been called before.

    Args:
        db_name: Path of the SQLite database file that holds the ``chunks`` table.

    Returns:
        List of chunk strings in ``id`` order (empty if the table has no rows).
    """
    conn = sqlite3.connect(db_name)
    try:
        # Explicit ordering keeps the chunk list, and so the index positions, deterministic.
        chunks = [row[0] for row in conn.execute("SELECT content FROM chunks ORDER BY id")]
    finally:
        conn.close()

    # Drop vectors from earlier calls; otherwise repeated calls append duplicates
    # and index positions stop matching positions in `chunks`.
    index.reset()

    if chunks:
        embeddings = embed_text(chunks)
        index.add(embeddings)
        print(f"Loaded {len(chunks)} chunks and added to FAISS index.")
    else:
        print("No chunks loaded from the database.")

    return chunks

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

# **6. Define Functions for Retrieval and Ranking**

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def retrieve_and_rank(chunks, query, top_k=5):
    """Retrieve the chunks nearest to ``query`` and order them by cosine similarity.

    The global FAISS ``index`` proposes up to ``top_k`` candidates; these are then
    re-embedded and sorted by cosine similarity to the query, most similar first.

    Args:
        chunks: List of chunk strings; position ``i`` must correspond to vector
            ``i`` in the index.
        query: Query string.
        top_k: Maximum number of chunks to return.

    Returns:
        List of chunk strings, most similar first; empty if nothing could be retrieved.
    """
    if top_k <= 0 or index.ntotal == 0:
        print("No chunks retrieved from the index.")
        return []

    query_embedding = embed_text([query])
    # Never ask for more neighbours than the index holds.
    _, indices = index.search(query_embedding, min(top_k, index.ntotal))

    # FAISS reports missing neighbours as -1. Without the lower bound, -1 would pass
    # the `< len(chunks)` check and silently select the last chunk.
    retrieved_chunks = [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]

    if not retrieved_chunks:
        print("No valid chunks retrieved after filtering.")
        return []

    chunk_embeddings = embed_text(retrieved_chunks)
    similarities = cosine_similarity(query_embedding, chunk_embeddings)[0]
    ranked_chunks = [retrieved_chunks[i] for i in np.argsort(similarities)[::-1]]

    return ranked_chunks

# **7. Define Function to Generate Responses with OpenAI GPT-3.5**

In [None]:
# Set your OpenAI API key
# Source: https://github.com/PawanOsman/ChatGPT?tab=readme-ov-file and https://www.youtube.com/watch?v=giYejigUM9A

# The defaults presumably target the local proxy described in the links above, which must be
# running and reachable from this runtime. Set OPENAI_API_KEY / OPENAI_BASE_URL in the
# environment to use a different endpoint without editing the notebook or committing a key.
openai.api_key = os.environ.get("OPENAI_API_KEY", 'anything')
openai.base_url = os.environ.get("OPENAI_BASE_URL", "http://localhost:3040/v1/")

def generate_response(chunks, query, top_k=5, prompt="Answer the following question based on the provided context:"):
    """Answer ``query`` with gpt-3.5-turbo, using retrieved chunks as context.

    The top-ranked chunks are joined with spaces, followed by ``prompt`` and the
    query on separate lines, and sent as a single user message.

    Args:
        chunks: List of chunk strings aligned with the FAISS index.
        query: Question to answer.
        top_k: Maximum number of chunks to include as context.
        prompt: Instruction placed between the context and the query.

    Returns:
        The model's reply with surrounding whitespace removed, or a fixed
        message if no chunks were retrieved.
    """
    ranked_chunks = retrieve_and_rank(chunks, query, top_k)

    if not ranked_chunks:
        return "No relevant chunks found to generate a response."

    context = " ".join(ranked_chunks) + "\n" + prompt + "\n" + query

    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": context}
        ],
        max_tokens=150,
        temperature=0.7,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0
    )

    # The 1.x client returns message objects, not dicts: subscripting with ['content']
    # raises TypeError. `content` may also be None, so fall back to an empty string.
    reply = response.choices[0].message.content
    return (reply or "").strip()

In [None]:
# # Set your OpenAI API key
# openai.api_key = os.environ["OPENAI_API_KEY"]  # key redacted: load it from the environment and never commit a real key
# def generate_response(chunks, query, top_k=5, prompt="Answer the following question based on the provided context:"):
#     ranked_chunks = retrieve_and_rank(chunks, query, top_k)

#     if not ranked_chunks:
#         return "No relevant chunks found to generate a response."

#     context = " ".join(ranked_chunks) + "\n" + prompt + "\n" + query

#     response = openai.ChatCompletion.create(
#         model="gpt-3.5-turbo",
#         messages=[
#             {"role": "system", "content": "You are a helpful assistant."},
#             {"role": "user", "content": context}
#         ],
#         max_tokens=150,
#         temperature=0.7,
#         top_p=1.0,
#         frequency_penalty=0.0,
#         presence_penalty=0.0
#     )

#     return response.choices[0].message['content'].strip()

# **8. Define the RAG Pipeline Function**

In [None]:
def rag_pipeline(pdf_paths, query, top_k=5, chunk_size=200, prompt="Answer the following question based on the provided context:"):
    """Run the full pipeline: ingest PDFs, index their chunks, and answer ``query``.

    Every call starts from an empty chunk table and an empty FAISS index, so
    calling the pipeline repeatedly (e.g. once per query) does not accumulate
    duplicate chunks or vectors.

    Args:
        pdf_paths: Iterable of PDF file paths to ingest.
        query: Question to answer.
        top_k: Maximum number of chunks passed to the model as context.
        chunk_size: Number of sentences per chunk.
        prompt: Instruction placed between the context and the query.

    Returns:
        The generated answer string.
    """
    create_database()

    # Remove rows left by earlier calls (the database file persists between calls
    # and kernel restarts); "documents.db" is the default file used by the helpers.
    conn = sqlite3.connect("documents.db")
    try:
        with conn:
            conn.execute("DELETE FROM chunks")
    finally:
        conn.close()
    # Likewise discard vectors from earlier calls so index positions match the new chunk list.
    index.reset()

    for pdf_path in pdf_paths:
        chunks = extract_and_chunk_text_from_pdf(pdf_path, chunk_size)
        insert_chunks(chunks)

    chunks = load_chunks_and_index()

    response = generate_response(chunks, query, top_k, prompt)

    return response

# **9. Upload Multiple PDFs**

In [None]:
from google.colab import files

# Open Colab's upload dialog; the result is keyed by the uploaded file names.
uploaded = files.upload()

# The uploaded file names are used directly as the paths handed to the pipeline.
pdf_paths = [file_name for file_name in uploaded]

Saving Red_Hat_Enterprise_Linux-7-Migration_Planning_Guide-en-US.pdf to Red_Hat_Enterprise_Linux-7-Migration_Planning_Guide-en-US.pdf
Saving Red_Hat_Enterprise_Linux-7-Virtualization_Getting_Started_Guide-en-US.pdf to Red_Hat_Enterprise_Linux-7-Virtualization_Getting_Started_Guide-en-US.pdf
Saving Red_Hat_Enterprise_Linux-7-Virtualization_Security_Guide-en-US.pdf to Red_Hat_Enterprise_Linux-7-Virtualization_Security_Guide-en-US.pdf


# **10. Run the RAG Pipeline**

In [None]:
# Questions to ask against the uploaded PDFs.
queries = [
    "Configuration File Syntax in linux?",
    "network configuration utility (ncat)",
    "Basic requirements and setup for linux?",
    "Why Guest Security Matters in linux"
]

# Separator lines reused for every report.
star_rule = '*' * 100
dash_rule = '-' * 100

# Run the pipeline once per question and print a framed query/response report.
for query in queries:
    response = rag_pipeline(pdf_paths, query)
    print('\n', '\n')
    print(star_rule)
    print('Query: ', query)
    print(star_rule)
    print('\n')
    print(dash_rule)
    print('Response: ', response)
    print(star_rule)

Database 'documents.db' created with table 'chunks'.
Extracted and chunked text from Red_Hat_Enterprise_Linux-7-Migration_Planning_Guide-en-US.pdf. Number of chunks: 7
Inserted 7 chunks into 'chunks' table.
Extracted and chunked text from Red_Hat_Enterprise_Linux-7-Virtualization_Getting_Started_Guide-en-US.pdf. Number of chunks: 6
Inserted 6 chunks into 'chunks' table.
Extracted and chunked text from Red_Hat_Enterprise_Linux-7-Virtualization_Security_Guide-en-US.pdf. Number of chunks: 5
Inserted 5 chunks into 'chunks' table.
Loaded 18 chunks and added to FAISS index.


APIConnectionError: Connection error.