<a href="https://colab.research.google.com/github/nikitajos7/LLM-RAG-Workshop/blob/main/Nikita_Jos_WIC_LLM_RAG_WORKSHOP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Get Started
First things first: go to **'File' > 'Save a Copy in Drive'** to create a copy of this notebook in your own Drive. Otherwise, any changes you make in this parent notebook won't be saved!


## Some Basics on Using a Google Colab Notebook
Notebooks consist of multiple 'cells'. To run a cell, click the Play button on the left of the cell, or click into the cell and press 'Shift+Enter'.

In [None]:
# Install necessary libraries
!pip -q install chromadb
!pip -q install sentence-transformers
!pip -q install PyPDF2
!pip -q install google.generativeai
!pip install python-dotenv

In [None]:
# Import necessary libraries
import chromadb
from sentence_transformers import SentenceTransformer
import PyPDF2
import google.generativeai as genai
import os
import nltk
from nltk.tokenize import sent_tokenize
from dotenv import load_dotenv
# Sentence-tokenizer data for NLTK. Recent NLTK releases (3.9 and later) make
# sent_tokenize load the 'punkt_tab' resource instead of the older 'punkt'
# one, so fetch both to work with either generation of the library.
nltk.download('punkt')
nltk.download('punkt_tab')
# Load variables from a local .env file (if there is one) into the environment.
load_dotenv()
# api_key is None when GEMINI_API_KEY is not set.
api_key = os.getenv('GEMINI_API_KEY')

In [None]:
# Read pdf content
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path`` as one string.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        str: The extracted text of all pages, in page order, separated by a
        newline. A page that yields no text contributes an empty string.
    """
    with open(file_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Join with a newline so the last word of one page does not fuse with
        # the first word of the next; `or ""` guards against a page for which
        # extract_text() returns nothing.
        return "\n".join(page.extract_text() or "" for page in reader.pages)

# Replace "syllabus.pdf" with the name of the PDF file you imported.
pdf_path = "syllabus.pdf"
pdf_text = extract_text_from_pdf(pdf_path)

In [None]:
# Tokenize the extracted PDF text into a list of sentences.
sentences = sent_tokenize(pdf_text)
sentence_count = len(sentences)
print(sentence_count)  # expect a value greater than 1

In [None]:
# Initialize a ChromaDB client instance and a db collection.
# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when "my_collection" already exists (e.g. when this cell is run a
# second time in the same session).
client = chromadb.Client()
collection = client.get_or_create_collection("my_collection")

In [None]:
def fixed_size_chunking(text, chunk_size=5, overlap_size=2):
    """Group sentences into overlapping chunks of a fixed number of sentences.

    Args:
        text: The sentences to group, as a sequence of strings. A single
            string is also accepted and is split into sentences with
            ``sent_tokenize`` first.
        chunk_size: Maximum number of sentences per chunk.
        overlap_size: Number of sentences shared between one chunk and the
            next. Must be smaller than ``chunk_size``.

    Returns:
        list[str]: The chunks in order, each one the space-joined sentences
        of one window. Empty when ``text`` contains no sentences.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap_size`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap_size < chunk_size:
        raise ValueError("overlap_size must satisfy 0 <= overlap_size < chunk_size")

    # Work on the argument that was passed in, not on a notebook-level global.
    if isinstance(text, str):
        sentence_list = sent_tokenize(text)
    else:
        sentence_list = list(text)

    stride = chunk_size - overlap_size  # how far each window advances
    chunks = []
    for start in range(0, len(sentence_list), stride):
        chunks.append(' '.join(sentence_list[start:start + chunk_size]))
        # Once a window reaches the last sentence, any further window would
        # only repeat sentences that are already covered, so stop here.
        if start + chunk_size >= len(sentence_list):
            break

    return chunks

# Build overlapping chunks: 5 sentences per chunk, 2 shared with the next chunk.
chunks = fixed_size_chunking(sentences, chunk_size=5, overlap_size=2)

In [None]:
len(chunks) # number of chunks that will be embedded; should be greater than 1

In [None]:
# Download embedding model and use it to embed knowledge base
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can choose another model https://www.sbert.net/docs/sentence_transformer/pretrained_models.html
documents = chunks
embeddings = model.encode(documents)  # one embedding per chunk, in the same order

# Store embeddings in Chroma DB.
# upsert (rather than add) keeps this cell safe to re-run: an id that is
# already in the collection is overwritten with the current chunk instead of
# the new chunk being skipped.
for idx, embedding in enumerate(embeddings):
    print(f"===Adding Document {str(idx)} to Collection: ")
    print(documents[idx])
    collection.upsert(
        ids=[str(idx)],
        documents=[documents[idx]],
        embeddings=[embedding.tolist()]
    )

In [None]:
# Change to different queries!
# This question is used both to retrieve context from the collection and as
# the question inserted into the prompt.
query = "When is the midterm?"

In [None]:
# Look up the stored chunks that are closest to the query.
# NOTE(review): query_texts is embedded by the collection's own embedding
# function, not by the SentenceTransformer used for the documents - confirm
# the two match if you switch to another embedding model.
results = collection.query(
    n_results=2,  # retrieve top 2 most relevant document
    query_texts=[query],
)
# Only one query text was sent, so take the first list of matched documents.
retrieved_documents = results["documents"]
context = retrieved_documents[0]
print(context)

In [None]:
# Can also feel free to adjust the prompt as well (prompt tuning!!)
# `context` is a list of retrieved chunks. Join it into plain text first:
# interpolating the list directly would put its Python repr (brackets, quotes,
# escaped newlines) into the prompt.
context_text = "\n\n".join(context)
prompt = f"""
Answer the question based only on the following context:
{context_text}

Answer the question based on the above context: {query}
"""
print(prompt)

In [None]:
# Create a Gemini API key at https://aistudio.google.com/app
# DO NOT USE UCSD GMAIL ACCOUNT! (you won't be able to access the AI Studio)
# api_key was read from the GEMINI_API_KEY environment variable earlier in the
# notebook; keep the key there (or in a .env file) rather than pasting it here.
genai.configure(api_key=api_key)

# Create a Gemini model instance.
# It gets its own name so that `model` keeps referring to the embedding model
# instead of being rebound to a different kind of object.
gemini_model = genai.GenerativeModel("gemini-1.5-pro")

# Generate text
response = gemini_model.generate_content(prompt)

# Print the generated text
print(response.text)