# Get LlamaIndex and import libraries

In [None]:
!pip install llama_index

In [4]:
import os, itertools
import llama_index
import openai
from llama_index.llms.openai import OpenAI
from llama_index.core import VectorStoreIndex,SimpleDirectoryReader,ServiceContext
from llama_index.core import StorageContext, Settings, SimpleKeywordTableIndex, VectorStoreIndex
from llama_index.core import node_parser
from llama_index.core import Settings
from llama_index.core import retrievers
from llama_index.core import load_index_from_storage


In [5]:
# Replace the bracketed placeholder with your own OpenAI API key (as a quoted string) before running this cell.
os.environ['OPENAI_KEY'] = [YOUR OPENAI KEY HERE]
# Hand the same key to the openai client library.
openai.api_key = os.environ['OPENAI_KEY']


# Import data

In [None]:
# Download president_wikipedia_articles from https://github.com/nathanbos/blog_embeddings and save them locally
# (the folder name below is a relative path, so it is resolved against the current working directory).
documents = SimpleDirectoryReader("president_wikipedia_articles").load_data()


# Get embedding models


In [None]:
# Get OpenAI embeddings. This will only work if you entered an OpenAI key above.
from llama_index.embeddings.openai import OpenAIEmbedding

ada_embed_model = OpenAIEmbedding(model = "text-embedding-ada-002")  # default
large_embed_model = OpenAIEmbedding(model="text-embedding-3-large")

In [None]:
# get some more embedding models from huggingface
!pip install llama-index-embeddings-huggingface

In [None]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Two Hugging Face embedding models, selected by their model names.
bge_embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
st_embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

## Test the embed models (optional)

In [None]:
# The embedding models can also be called directly, without building an index.
sample_question = "Who was President during World War 2?"
large_embedding = large_embed_model.get_text_embedding(sample_question)
bge_embedding = bge_embed_model.get_text_embedding(sample_question)
st_embedding = st_embed_model.get_text_embedding(sample_question)
ada_embedding = ada_embed_model.get_text_embedding(sample_question)

# Print each vector's length to confirm every model returned an embedding.
for embedding in (bge_embedding, st_embedding, ada_embedding, large_embedding):
    print(len(embedding))


### Set chunk size to 128.
128 is very small for precise context matches; 512 would work better for a real RAG system.


In [None]:
from llama_index.core.node_parser import SentenceSplitter
# Register a sentence splitter with chunk size 128 and overlap 24 on llama_index's Settings.
Settings.text_splitter = SentenceSplitter(chunk_size=128, chunk_overlap=24)
# The same two values are also assigned to the Settings chunk fields directly.
Settings.chunk_size = 128
Settings.chunk_overlap = 24

# Define indexes
### Indexes are the databases: they store vectors and text passages


In [None]:
# index documents with an embedding model
# (one index per Hugging Face model; the "128" suffix refers to the chunk size set earlier)
bge_index128 = VectorStoreIndex.from_documents(documents, embed_model=bge_embed_model)
st_index128 = VectorStoreIndex.from_documents(documents, embed_model=st_embed_model)

In [None]:
# Build one index per OpenAI embedding model from the loaded documents.
ada_index128 = VectorStoreIndex.from_documents(documents, embed_model=ada_embed_model)
large_index128 = VectorStoreIndex.from_documents(documents, embed_model=large_embed_model)

# Save indexes for future use (optional)




In [None]:
# Write every index to its own folder under storage/ so it can be reloaded later.
for index, folder in (
    (bge_index128, "storage/bge_people_db128"),
    (st_index128, "storage/st_people_db128"),
    (ada_index128, "storage/ada_people_db128"),
    (large_index128, "storage/large_people_db128"),
):
    index.storage_context.persist(persist_dir=folder)

# This is how you would retrieve stored indexes later (optional)

In [None]:
from llama_index.core import load_index_from_storage
# Documentation:  https://docs.llamaindex.ai/en/latest/api_reference/storage/indices_save_load.html

# Each index is reloaded from the directory it was persisted to. The variable names are the
# same as the freshly built indexes, so running this cell rebinds them.
# NOTE(review): no embed_model is passed to load_index_from_storage here; the retriever and
# query-engine cells pass embed_model explicitly — confirm that is sufficient for your version.
ada_context = StorageContext.from_defaults(persist_dir="storage/ada_people_db128")
ada_index128 = load_index_from_storage(ada_context)

large_context = StorageContext.from_defaults(persist_dir="storage/large_people_db128")
large_index128 = load_index_from_storage(large_context)

st_context = StorageContext.from_defaults(persist_dir="storage/st_people_db128")
st_index128 = load_index_from_storage(st_context)

bge_context = StorageContext.from_defaults(persist_dir="storage/bge_people_db128")
bge_index128 = load_index_from_storage(bge_context)

# Define Retrievers
* Retrievers return just the matching chunks; if you want an LLM to use these chunks to formulate a response, use the query engines below.
* The k parameter (`similarity_top_k`) controls how many context nodes get returned.

In [None]:
from llama_index.core.retrievers import VectorIndexRetriever

# One retriever per index, each paired with the embedding model that built the index
# and returning the 20 best-matching chunks.
large_retriever128 = VectorIndexRetriever(
    index=large_index128,
    embed_model=large_embed_model,
    similarity_top_k=20,
)
ada_retriever128 = VectorIndexRetriever(
    index=ada_index128,
    embed_model=ada_embed_model,
    similarity_top_k=20,
)
st_retriever128 = VectorIndexRetriever(
    index=st_index128,
    embed_model=st_embed_model,
    similarity_top_k=20,
)
bge_retriever128 = VectorIndexRetriever(
    index=bge_index128,
    embed_model=bge_embed_model,
    similarity_top_k=20,
)


In [None]:
## Test one
# Run a single query through the sentence-transformers retriever; `test` holds the returned nodes.
test = st_retriever128.retrieve("Which US President's name rhymes with Sarac Rom-Com-A?")

In [None]:
# Show the text of every retrieved chunk, each followed by a divider line.
for hit in test:
    print(hit.node.text)
    print("---------------- END OF NODE -------------------")

# Or create query engines
The k parameter (`similarity_top_k`) controls how many context nodes get returned.

In [None]:
# Build one query engine per index; each uses the index's own embedding model
# and asks for the 20 best-matching chunks.
bge_query_engine128 = bge_index128.as_query_engine(
    embed_model=bge_embed_model,
    similarity_top_k=20,
)
st_query_engine128 = st_index128.as_query_engine(
    embed_model=st_embed_model,
    similarity_top_k=20,
)
ada_query_engine128 = ada_index128.as_query_engine(
    embed_model=ada_embed_model,
    similarity_top_k=20,
)
large_query_engine128 = large_index128.as_query_engine(
    embed_model=large_embed_model,
    similarity_top_k=20,
)

# Read in some questions


In [None]:
# prompt: Read in an Excel file 'Questions.xlsx' to a Pandas data frame 'qs'
# Note: the file actually read below is 'president_questions.xlsx', not 'Questions.xlsx'.

import pandas as pd
qs = pd.read_excel('president_questions.xlsx')


In [None]:
# prompt: Create a new questionID variable in qs sequentially numbered from 1

qs['questionID'] = range(1, len(qs) + 1)

# create a dictionary of qnums and questions
# (expects the spreadsheet to contain a 'Question' column)
question_dict = qs.set_index('questionID')['Question'].to_dict()


# If you want to use retrievers to just get returned context chunks
# Keys are the short embedding-model labels written to the output's embed_model column.
query_retrievers = {
    "bge": bge_retriever128,
    "st": st_retriever128,
    "ada": ada_retriever128,
    "large": large_retriever128
}

# If you want to use query engines with an LLM
# Same labels as above; note the key order differs, and the loops iterate in this order.
query_engines = {
    "st": st_query_engine128,
    "bge": bge_query_engine128,
    "ada": ada_query_engine128,
    "large": large_query_engine128,
}


# Query with retrievers (just get context passages)

In [None]:
from itertools import islice

# Number of source_N columns written per row; the retrievers are built with similarity_top_k=20.
MAX_SOURCES = 20

# One row per (question, embedding model) pair
data = []

for qnum, question in question_dict.items():
    print(f"question number: {qnum}")
    # Loop through each retriever in the dictionary
    for model, retriever in query_retrievers.items():
        # The try/except is per model so that one failing retriever does not
        # discard the remaining models for the same question.
        try:
            print(model)
            print(f"Model: {retriever}")
            print(f"QUESTION: {question}")
            # Retrieve the best-matching chunks (no LLM is involved here)
            response = retriever.retrieve(question)
            # Build each source string from a single node so file name, score and
            # text can never get out of step; a missing file name becomes ''.
            sources = [
                f"{hit.node.metadata.get('file_name', '')}\n score:{hit.score}\n text: {hit.node.text}"
                for hit in response
            ]
            # Pad or trim to exactly MAX_SOURCES entries so the row matches the column list
            sources = (sources + [''] * MAX_SOURCES)[:MAX_SOURCES]
            # Construct the row with qNum, question, embed_model, and sources
            row = [str(qnum), question, model] + sources
            print(row)
            data.append(row)
        except Exception as e:
            # Best-effort batch run: report the failure and carry on with the next model
            print(f"An error occurred with model {model} for question {qnum}: {e}")

# Define column names for your DataFrame
column_names = ['qNum', 'question', 'embed_model'] + [f'source_{i}' for i in range(1, MAX_SOURCES + 1)]

# Create the DataFrame
output = pd.DataFrame(data, columns=column_names)

# NOTE: the query-engine cell writes to this same file name, so running both
# cells leaves only the results of whichever ran last.
output.to_excel('President_answers.xlsx')

# Query with query engines (get LLM response)

In [None]:
from itertools import islice

# Number of source_N columns written per row; the query engines are built with similarity_top_k=20.
MAX_SOURCES = 20

# One row per (question, embedding model) pair
data = []

# To try only the first two questions, iterate over islice(question_dict.items(), 2) instead.
for qnum, question in question_dict.items():
    print(f"question number: {qnum}")
    for model, engine in query_engines.items():
        # The try/except is per model so that one failing engine does not
        # discard the remaining models for the same question.
        try:
            print(model)
            # Get the response from the query engine
            response = engine.query(question)
            llm_response = response.response  # the answer text produced by the LLM
            # Build each source string from a single node. Filtering file names and
            # texts separately would pair chunks with the wrong file whenever a
            # node lacks 'file_name'; here such a node just gets an empty name.
            sources = [
                f"{hit.metadata.get('file_name', '')}: {hit.text}"
                for hit in response.source_nodes
            ]
            # Pad or trim to exactly MAX_SOURCES entries so the row matches the column list
            sources = (sources + [''] * MAX_SOURCES)[:MAX_SOURCES]
            # Construct the row with qNum, question, embed_model, llm_response, and sources
            row = [str(qnum), question, model, llm_response] + sources
            print(row)
            data.append(row)
        except Exception as e:
            # Best-effort batch run: report the failure and carry on with the next model
            print(f"An error occurred with model {model} for question {qnum}: {e}")

# Define column names for your DataFrame
column_names = ['qNum', 'question', 'embed_model', 'llm_response'] + [f'source_{i}' for i in range(1, MAX_SOURCES + 1)]

# Create the DataFrame
output = pd.DataFrame(data, columns=column_names)

# NOTE: the retriever cell writes to this same file name, so running both
# cells leaves only the results of whichever ran last.
output.to_excel('President_answers.xlsx')

# If you want to get your own Wikipedia article set
* Will need a Wikipedia account
* Can substitute a different list for presidents. Make sure the names match the Wikipedia article names.

In [None]:
# Names are listed alphabetically by surname; the download loop passes each one
# to fetch_wikipedia_page as the article title.
presidents = ["John Adams", "John Quincy Adams", "Chester A. Arthur", "Joe Biden", "James Buchanan", "George H. W. Bush",
 "George W. Bush", "Jimmy Carter", "Grover Cleveland", "Bill Clinton", "Calvin Coolidge", "Dwight D. Eisenhower",
 "Millard Fillmore", "Gerald Ford", "James A. Garfield", "Ulysses S. Grant", "Warren G. Harding", "Benjamin Harrison",
 "William Henry Harrison", "Rutherford B. Hayes", "Herbert Hoover", "Andrew Jackson", "Thomas Jefferson", "Andrew Johnson",
 "Lyndon B. Johnson", "John F. Kennedy", "Abraham Lincoln", "James Madison", "William McKinley", "James Monroe", "Richard Nixon",
 "Barack Obama", "Franklin Pierce", "James K. Polk", "Ronald Reagan", "Franklin D. Roosevelt", "Theodore Roosevelt",
 "William Howard Taft", "Zachary Taylor", "Harry S. Truman", "Donald Trump", "John Tyler", "Martin Van Buren",
 "George Washington", "Woodrow Wilson"]

In [None]:
def fetch_wikipedia_page(title, user_agent):
    """
    Fetches the plain-text content of an English Wikipedia page given its title.

    Args:
        title: Title of the Wikipedia article to fetch.
        user_agent: Value sent as the ``User-Agent`` request header.

    Returns:
        The page's text extract, or "No content available" when the API
        response contains no extract for the page.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    # Imported here because the notebook's import cell never imports requests,
    # which made the original call fail with NameError.
    import requests

    URL = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "explaintext": True
    }
    headers = {
        'User-Agent': user_agent
    }
    # A timeout keeps one stalled request from hanging the whole download loop.
    response = requests.get(URL, headers=headers, params=params, timeout=30)
    # Surface HTTP errors directly instead of failing later while parsing the body.
    response.raise_for_status()
    data = response.json()
    # Only one title is requested, so the first entry under "pages" is the page we want.
    page = next(iter(data['query']['pages'].values()))
    return page.get('extract', "No content available")

In [None]:
def save_text(filename, content):
    """
    Write ``content`` to ``filename`` as UTF-8 text, replacing any existing file.
    """
    with open(filename, mode="w", encoding="utf-8") as out:
        out.write(content)

In [None]:
# User agent
user_agent = [YOUR WIKIPEDIA ACCOUNT INFO HERE]

# Directory to save the files
save_dir = 'president_wikipedia_articles'

# Process each president
for president in presidents:
    filename = f"{president.replace('.', '').replace(' ', '_')}.txt"
    path = os.path.join(save_dir, filename)
    content = fetch_wikipedia_page(president, user_agent)
    save_text(path, content)
    print(f"Saved: {path}")