# HubermanAI: Using RAG Based LLM

In [1]:
import os
from dotenv import load_dotenv

# Load the environment variables
load_dotenv()

MONGODB_CONNECTION_STRING = os.getenv("MONGODB_URI")
MONGODB_DATABASE = os.getenv("MONGODB_DATABASE")
MONGODB_COLLECTION = os.getenv("MONGODB_COLLECTION")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")


## Text extraction from Newsletter PDFs

In [2]:
import os
import re
import fitz  # PyMuPDF

# Function to preprocess and clean text
def preprocess_text_mupdf(text):
    text = re.sub(r'\n\s*\n', '\n', text)  # Remove empty lines
    text = re.sub(r'[^A-Za-z0-9.,;:!?()\'\"\n]+', ' ', text)  # Remove special characters but keep punctuation
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    return text.strip()

# Extract the ZIP file
raw_path = '/Users/mrinoyb2/git/Huberman_AI/Data/Raw/Newsletters'
clean_text_dir = '/Users/mrinoyb2/git/Huberman_AI/Data/Clean'
os.makedirs(clean_text_dir, exist_ok=True)

# Process each PDF file in the extracted directory
for root, dirs, files in os.walk(raw_path):
    for file in files:
        if file.endswith('.pdf'):
            pdf_path = os.path.join(root, file)
            pdf_document = fitz.open(pdf_path)

            cleaned_text_mupdf = ""
            for page_number in range(pdf_document.page_count):
                page = pdf_document.load_page(page_number)
                text = page.get_text()
                cleaned_text_mupdf += preprocess_text_mupdf(text)

            pdf_document.close()

            # Save the cleaned text to a corresponding file
            clean_text_path = os.path.join(clean_text_dir, os.path.splitext(file)[0] + '.txt')
            with open(clean_text_path, 'w') as file:
                file.write(cleaned_text_mupdf)



## Store chunks in MongoDB database

In [3]:
import os
import pymongo
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')  # Download the Punkt tokenizer models


# Connect to MongoDB
client = pymongo.MongoClient(MONGODB_CONNECTION_STRING)
db = client[MONGODB_DATABASE]
collection = db[MONGODB_COLLECTION]

# Function to chunk text by sentence
def chunk_by_sentence(text):
    """
    Improved function to chunk text by sentence using NLTK's sent_tokenize.
    
    Args:
    text (str): The input text to chunk.
    
    Returns:
    list: A list of sentences extracted from the input text.
    """
    sentences = sent_tokenize(text)
    return [sentence.strip() for sentence in sentences]

# Path to the directory containing cleaned text files
clean_text_dir = '/Users/mrinoyb2/git/Huberman_AI/Data/Clean'

# Initialize a counter for unique MongoDB document IDs
doc_id = 0

# Iterate through each cleaned text file
for file_name in os.listdir(clean_text_dir):
    if file_name.endswith('.txt'):
        file_path = os.path.join(clean_text_dir, file_name)

        # Read the cleaned text from the file
        with open(file_path, 'r') as file:
            cleaned_text = file.read()

        # Chunk the text
        chunks = chunk_by_sentence(cleaned_text)

        # Store chunks in MongoDB
        for chunk in chunks:
            # Create a document for each chunk
            document = {"_id": doc_id, "text": chunk}
            # Insert the document into the collection
            collection.insert_one(document)
            # Increment the doc_id for the next document
            doc_id += 1

print(f"Total chunks stored in MongoDB: {doc_id}")


[nltk_data] Downloading package punkt to /Users/mrinoyb2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Total chunks stored in MongoDB: 960


## Implement RAG

### Create word embeddings

In [4]:
from sentence_transformers import SentenceTransformer
import pymongo

# Connect to MongoDB
client = pymongo.MongoClient(MONGODB_CONNECTION_STRING)
db = client[MONGODB_DATABASE]
chunks_collection = db[MONGODB_COLLECTION]

# Load the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to update documents with embeddings
def update_documents_with_embeddings():
    for document in chunks_collection.find():
        # Generate embedding
        embedding = model.encode(document['text'], convert_to_tensor=False)
        # Update document with embedding
        chunks_collection.update_one({'_id': document['_id']}, {'$set': {'embedding': embedding.tolist()}})

# Uncomment the following line to run the embedding update
update_documents_with_embeddings()

# Check the first document to see if the embedding was added
print(chunks_collection.find_one())


{'_id': 0, 'text': '2 9 24, 5:29 PM Using Light for Health Huberman Lab https: www.hubermanlab.com newsletter using light for health 1 11 Neural Network Newsletter Using Light for Health Using Light for Health January 24, 2023 Thank you for joining the Huberman Lab Neural Network a once a month newsletter with science and science related tools for everyday life.', 'embedding': [-0.028163380920886993, -0.015030195005238056, 0.025770673528313637, 0.05299123749136925, 0.050226956605911255, 0.027826806530356407, -0.082654669880867, 0.05679101124405861, -0.031691670417785645, -0.023093437775969505, 0.03952784463763237, 0.001871513668447733, -0.01179493311792612, -0.00349311251193285, -0.03213930502533913, 0.0322083942592144, -0.018941238522529602, -0.03637561947107315, -0.007522422820329666, -0.010602681897580624, -0.009080360643565655, -0.0639241561293602, 0.046462662518024445, -0.015782494097948074, -0.031212473288178444, -0.002136523136869073, 0.05953914299607277, -0.04980574920773506, 0

### Semantic search retrieval

In [5]:
from sentence_transformers import SentenceTransformer
import pymongo
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Connect to MongoDB
client = pymongo.MongoClient(MONGODB_CONNECTION_STRING)
db = client[MONGODB_DATABASE]
chunks_collection = db[MONGODB_COLLECTION]

# Function to perform semantic search
def semantic_search(query, top_k=5):
    # Generate query embedding
    model = SentenceTransformer('all-MiniLM-L6-v2')
    query_embedding = model.encode(query, convert_to_tensor=False)
    
    # Retrieve all embeddings from MongoDB and calculate similarity
    similarities = []
    for document in chunks_collection.find():
        doc_embedding = np.array(document['embedding'])
        # Use scikit-learn's cosine_similarity function to calculate the cosine distance
        similarity = cosine_similarity([query_embedding], [doc_embedding])[0][0]
        # Store document ID and similarity score
        similarities.append((document['_id'], similarity, document['text']))
    
    # Sort by similarity score in descending order
    similarities.sort(key=lambda x: x[1], reverse=True)
    
    # Return top_k most similar documents
    return similarities[:top_k]

# Example usage
query = "Best sleep protocol?"
results = semantic_search(query)
for idx, (doc_id, similarity, text) in enumerate(results, start=1):
    print(f"Result {idx} (Score: {similarity:.3f}): {text}...")  # Print the first 100 characters for brevity



Result 1 (Score: 0.611): See Newsletter 1 (Toolkit for Sleep) for details....
Result 2 (Score: 0.611): Episode 2 of the Huberman Lab Podcast JMaster Your Sleep) is all about that topic, but I wanted to provide a succinct list of the key things for sleep....
Result 3 (Score: 0.567): So whatever your life and goals and schedule, master your sleep....
Result 4 (Score: 0.552): I shift to using red lights around 9 p.m., and it has greatly improved my sleep....
Result 5 (Score: 0.547): As do most sleep medications....


## Setting up Gemini Pro

In [6]:
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown


def to_markdown(text):
  text = text.replace('•', '  *')
  return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [7]:
# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.

genai.configure(api_key=GEMINI_API_KEY)

In [18]:
for m in genai.list_models():
  if 'generateContent' in m.supported_generation_methods:
    print(m.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-pro
models/gemini-pro-vision


## Connect LLM model

In [13]:
# Function to generate an answer using RAG enabled LLama2 from Replicate
def generate_RAG_answer(question, max_context_length=1000):
    # Assume semantic_search is defined and returns relevant context as a single string
    context_results = semantic_search(question, top_k=5)
    # Concatenate the top-k context results into a single string
    context = ' '.join([result[2] for result in context_results])
    # Truncate context if it exceeds the maximum length
    if len(context) > max_context_length:
        context = context[:max_context_length]

    rag_prompt = f"[INST]\nQuestion: {question}\nContext: {context}\n[/INST]"
    print(rag_prompt)

    model = genai.GenerativeModel('gemini-pro')
    response = model.generate_content(rag_prompt)
    # return text part of the response
    return response.text

# Function to generate an answer using LLama2 from Replicate
def generate_non_RAG_answer(question):
    # Assume semantic_search is defined and returns relevant context as a single string
    non_rag_prompt = f"[INST]\nQuestion: {question}\n[/INST]"  # Fallback in case no context is found
    print(non_rag_prompt)
    model = genai.GenerativeModel('gemini-pro')
    response = model.generate_content(non_rag_prompt)
    return response.text

In [15]:
# Example query
query = "You weild the knowledge of Dr. Andrew Huberman. Best sleep protocol?"
rag_answer = generate_RAG_answer(query)

print(rag_answer)

[INST]
Question: You weild the knowledge of Dr. Andrew Huberman. Best sleep protocol?
Context: 2 Huberman Lab Podcast, HLP interview w Matt Walker hubermanlab.com Huberman Lab The Huberman Lab podcast discusses science and scie 9 40 PM Sep 15, 2021 2.7K Reply Copy link Read 104 replies2 9 24, 5:33 PM Toolkit for Sleep Huberman Lab https: www.hubermanlab.com newsletter toolkit for sleep 3 10 It ended up being shared more than any other post, which told me that people likely want tools for getting better at sleeping. Enter your email Subscribe2 9 24, 5:33 PM Toolkit for Sleep Huberman Lab https: www.hubermanlab.com newsletter toolkit for sleep 9 10 Thank you for your interest in science In the world s 1 health podcast, Dr. Andrew Huberman explores science and science based tools to help listeners live a healthier, more fulfilling life. What an amazing wealth of2 9 24, 5:33 PM Toolkit for Sleep Huberman Lab https: www.hubermanlab.com newsletter toolkit for sleep 10 10 Podcast Newsletter T

In [10]:
# Example query
query = "Best sleep protocol?"
non_rag_answer = generate_non_RAG_answer(query)

print(non_rag_answer)

[INST]
Question: Best sleep protocol?
[/INST]
**Best Sleep Protocol**

**1. Establish a Regular Sleep-Wake Cycle**

* Go to bed and wake up around the same time each day, even on weekends.
* This helps regulate your body's natural sleep-wake cycle.

**2. Create a Relaxing Bedtime Routine**

* Engage in calming activities before bed, such as taking a warm bath, reading, or listening to soothing music.
* Avoid screen time (from phones, tablets, or computers) for at least an hour before bed.

**3. Optimize Your Sleep Environment**

* Make sure your bedroom is dark, quiet, and cool (around 60-67°F).
* Use blackout curtains, earplugs, or a white noise machine to minimize distractions.

**4. Get Regular Exercise**

* Aim for at least 30 minutes of moderate-intensity exercise most days of the week.
* However, avoid vigorous exercise close to bedtime, as it can interfere with sleep.

**5. Avoid Caffeine and Alcohol Before Bed**

* Caffeine and alcohol can disrupt your sleep cycle and make it h

## Evaluating RAG vs Non-RAG Approach

In order to quantify this, I will take the following steps:
1. Think of a query which has a well defined answer in the book.
2. Find the true answer to a query from the actual source (book pdf). 
3. Then pass the query through the RAG based LLM and the regular LLM.
4. Save both generated answers along with the true answer and find word embeddings for each. 
5. Finally, compare the word embeddings of the true answer with the RAG vs Non-RAG based LLM word embeddings using cosine similarity. 

Following these steps will help quantify the performace and accuracy of information in the two answers.

In [33]:
query = "Best sleep protocol?"

# True Answer from the Huberman Lab Newsletter
true_answer = """1) View sunlight by going outside within 30-60 minutes of waking. Do that again in the late afternoon, prior to sunset. If you wake up before the sun is out and you want to be awake, turn on artificial lights and then go outside once the sun rises.

On bright cloudless days: view morning and afternoon sun for 10 min; cloudy days: 20 min; very overcast days 30-60 min. If you live someplace with very minimal light, consider an artificial daytime simulator source.

Don’t wear sunglasses for this practice if you safely can, but contact lenses and eyeglasses are fine.

No, you don’t have to look directly at the sun, and never look at ANY light so bright it is painful to view! That said, you can’t wear a brimmed hat, sunglasses and remain in the shade and expect to “wake up” your circadian clock.
​
​2) Wake up at the same time each day and go to sleep when you first start to feel sleepy. Pushing through the sleepy late evening feeling and going to sleep too late (for you) is one reason people wake at 3 am and can’t fall back asleep.

3) Avoid caffeine within 8-10 hours of bedtime. Dr. Matt Walker (sleep expert from UC Berkeley) might even say 12-14 hours. I do fine with caffeine at 2 pm and I go to sleep at ~10-11 pm. Dr. Walker was on the Huberman Lab Podcast and we discussed this in detail.

4) If you have sleep disturbances, insomnia, or anxiety about sleep, try the research-supported protocols on the Reveri app (for iPhone). Do the Reveri sleep self-hypnosis 3x a week at any time of day. It’s only 10-15 min long and will help you rewire your nervous system to be able to relax faster.

5) Avoid viewing bright lights—especially bright overhead lights between 10 pm and 4 am. Here is a simple rule: only use as much artificial lighting as is necessary for you to remain and move about safely at night. Blue blockers can help a bit at night but still dim the lights. Viewing bright lights of all colors are a problem for your circadian system. Candlelight and moonlight are fine. (Shift workers should see the Huberman Lab Podcast on jetlag for offsetting shift work negative effects. Same for jetlagged travelers.)

6) Limit daytime naps to less than 90 min, or don’t nap at all. I love naps as do many of my colleagues. I tend to nap for 30 min most afternoons… maybe 45 min, but never longer.

7) If you wake up in the middle of the night (which, by the way, is normal to do once or so each night) but you can’t fall back asleep, consider doing an NSDR protocol when you wake up. Enter “NSDR” into YouTube and the top 3-4 options have different voices, durations for you to select from. Or simply do a “Yoga Nidra” protocol (enter “yoga nidra” to YouTube; 100s to select.)

8) You might consider taking (30-60 min before bed):

145mg Magnesium Threonate or 200mg Magnesium Bisglycinate
50mg Apigenin
100-400mg Theanine
(3-4 nights per week I also take 2g of Glycine and 100mg GABA.)
*I would start with one supplement (or none!) and then add one at a time as needed. Some people do not need any supplements, and some people like theanine but not magnesium, etc. so you have to determine what is best for you.

**Don’t take theanine if you have overly intense dreams, sleep-walk, or have night terrors.

***Also, some people (~5%), get an agitated stomach from magnesium supplementation, in which case, do not take it.

****I use supplements from Momentous for all of the above. You can get 20% off all Momentous supplements at https://www.livemomentous.com/huberman or you can pick another source you like and trust.

9) Expect to feel really alert ~1 hour before your natural bedtime. This is a naturally occurring spike in wakefulness that sleep researchers have observed.

Don’t freak out if it happens. It will pass!

10) Keep the room you sleep in cool and dark and layer on blankets that you can remove.

Your body needs to drop in temperature by 1-3 degrees to fall and stay asleep effectively. Body temperature increases are one reason you wake up. Thus, keep your room cool and remove blankets as needed. If it’s too hot you would have to use a cooling device and that’s harder than simply tossing off blankets if you get too warm."""

### Implementing functions to evaluate performance. 

In [34]:
import torch

def get_embedding(text):
    """
    Generate an embedding for a given text.

    Args:
    - text (str): The input text.
    
    Returns:
    - The sentence embedding.
    """
    # Load the sentence transformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')
    # Generate the sentence embeddings
    embeddings = model.encode(text, convert_to_tensor=False)
    return embeddings


def calculate_cosine_similarity(embedding1, embedding2):
    """
    Calculate the cosine similarity between two embeddings.

    Args:
    - embedding1 (torch.Tensor): The first embedding.
    - embedding2 (torch.Tensor): The second embedding.

    Returns:
    - The cosine similarity score.
    """
    # Calculate the cosine similarity
    similarity = 1 - cosine(embedding1, embedding2)
    return similarity


def calculate_similarity_scores(true_answer, rag_answer, non_rag_answer):
    """
    Calculate the cosine similarity scores between the true answer and both RAG-based and non-RAG-based answers.

    Args:
    - true_answer (str): The true answer text.
    - rag_answer (str): The RAG-based model's answer text.
    - non_rag_answer (str): The non-RAG-based model's answer text.

    Returns:
    - A dictionary with cosine similarity scores.
    """
    # Convert the answers to embeddings
    true_answer_embedding = get_embedding(true_answer)
    rag_answer_embedding = get_embedding(rag_answer)
    non_rag_answer_embedding = get_embedding(non_rag_answer)
    
    # Calculate cosine similarity scores
    rag_similarity = calculate_cosine_similarity(true_answer_embedding, rag_answer_embedding)
    non_rag_similarity = calculate_cosine_similarity(true_answer_embedding, non_rag_answer_embedding)

    
    # Return the scores
    return {
        "RAG Similarity Score": rag_similarity,
        "Non-RAG Similarity Score": non_rag_similarity
    }


# Calculate the similarity scores
similarity_scores = calculate_similarity_scores(true_answer, rag_answer, non_rag_answer)
print(similarity_scores)


{'RAG Similarity Score': 0.603172833011108, 'Non-RAG Similarity Score': 0.5438751740418429}
