In [1]:
# %pip install ragas openai datasets python-dotenv langchain_community 
# %pip install protobuf==3.20.0
# %pip install langchain-core

# %pip install langchain-openai
# %pip install --upgrade langchain
# %pip install selenium
# %pip install unstructured
# %pip install sentence_transformers

In [83]:
# %pip install webdriver_manager
#%pip install chromadb

In [7]:
import os
import openai
from ragas import evaluate
from datasets import Dataset 
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from ragas.metrics.critique import harmfulness
from langchain_community.vectorstores import Chroma
from langchain_experimental.text_splitter import SemanticChunker
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_community.document_loaders import SeleniumURLLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall, context_entity_recall, answer_similarity, answer_correctness
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document

In [9]:
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")
openai.api_key = api_key

urls = [
    "https://en.wikipedia.org/wiki/New_York_City",
    "https://en.wikipedia.org/wiki/Snow_leopard",
    "https://www.britannica.com/place/Galapagos-Islands",
    "https://www.birdlife.org/birds/penguins/#:~:text=The%20threats%20are%20numerous%2C%20including,is%20melting%20before%20their%20eyes."
]

In [84]:
from langchain.document_loaders import SeleniumURLLoader
from concurrent.futures import ThreadPoolExecutor
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

def load_content_with_selenium(url):
    # Setup Chrome options for faster performance
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run headless for speed
    chrome_options.add_argument("--disable-gpu")  # Disable GPU for headless
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    # Create a WebDriver with a timeout
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    driver.set_page_load_timeout(10)  # Set a page load timeout of 10 seconds
    driver.implicitly_wait(5)  # Set an implicit wait of 5 seconds
    #driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    driver.set_page_load_timeout(10)  # Set a timeout of 10 seconds
    try:
        driver.get(url)
        # Now pass the loaded page source to a loader or process as needed
        page_source = driver.page_source
        # Example: If you want to use the SeleniumURLLoader after loading:
        loader = SeleniumURLLoader(urls=[url])
        documents = loader.load()
        return documents
    except Exception as e:
        print(f"Error loading {url}: {e}")
        return None
    finally:
        driver.quit()

# List of URLs to scrape
urls = [
    "https://en.wikipedia.org/wiki/New_York_City",
    "https://en.wikipedia.org/wiki/Snow_leopard",
   # "https://www.britannica.com/place/Galapagos-Islands",
    "https://www.birdlife.org/birds/penguins/#:~:text=The%20threats%20are%20numerous%2C%20including,is%20melting%20before%20their%20eyes."
]

# Load content for each URL
documents = []
for url in urls:
    docs = load_content_with_selenium(url)
    if docs:
        documents.extend(docs)

In [85]:
documentList = []
for doc in documents:
    d = str(doc.page_content).replace("\\n", " ").replace("\\t"," ").replace("\n", " ").replace("\t", " ")
    documentList.append(d)

In [86]:

from langchain_community.embeddings import AzureOpenAIEmbeddings, OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
all_texts = []

embeddings_deployment = "text-embedding-3-large"
embeddings = OpenAIEmbeddings(
    model=embeddings_deployment,
    # With the `text-embedding-3` class
    # of models, you can specify the size
    # of the embeddings you want returned.
    # dimensions=1024
)

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
)
texts = text_splitter.split_text(''.join(documentList))

# Add the chunks and metadata to the list
all_texts.extend(texts)


In [87]:
vector_store = Chroma.from_texts(
    all_texts, embeddings#, metadatas=metadatas
)

In [88]:
PROMPT_TEMPLATE = """
Go through the context and answer given question strictly based on context. 
Context: {context}
Question: {question}
Answer:
"""

qa_chain = RetrievalQA.from_chain_type(
        llm = ChatOpenAI(temperature=0),
        # retriever=vector_store.as_retriever(search_kwargs={'k': 3}),
        retriever=vector_store.as_retriever(),
        return_source_documents=True,
        chain_type_kwargs={"prompt": PromptTemplate.from_template(PROMPT_TEMPLATE)}
    )
qa_chain = RetrievalQA.from_chain_type(llm=ChatOpenAI(),
                                       chain_type="stuff",
                                       retriever=vector_store.as_retriever(search_kwargs={"k": 1}),
                                       return_source_documents=True,
        chain_type_kwargs={"prompt": PromptTemplate.from_template(PROMPT_TEMPLATE)})

In [89]:
queries = [
    "Who discovered the Galapagos Islands and how?",
    "What is Brooklyn–Battery Tunnel?",
    "Are Penguins found in the Galapagos Islands?",
    "How many languages are spoken in New York?",
    "In which countries are snow leopards found?",
    "What are the threats to penguin populations?",
    "What is the economic significance of New York City?",
    "How did New York City get its name?",
    "How did Galapagos Islands get its name?",
    "What is the significance of the Statue of Liberty in New York City?",
    
]

ground_truths = [
    "The Galapagos Islands were discovered in 1535 by the bishop of Panama, Tomás de Berlanga, whose ship had drifted off course while en route to Peru. He named them Las Encantadas (“The Enchanted”), and in his writings he marveled at the thousands of large galápagos (tortoises) found there. Numerous Spanish voyagers stopped at the islands from the 16th century, and the Galapagos also came to be used by pirates and by whale and seal hunters. ",
    "The Brooklyn-Battery Tunnel (officially known as the Hugh L. Carey Tunnel) is the longest continuous underwater vehicular tunnel in North America and runs underneath Battery Park, connecting the Financial District in Lower Manhattan to Red Hook in Brooklyn.[586]",
    "Penguins live on the galapagos islands side by side with tropical animals.",
    "As many as 800 languages are spoken in New York.",
    "Siberia, Tajikistan, Kyrgyzstan, Uzbekistan, Kazakhstan, Afghanistan, Pakistan, India, Nepal, Bhutan, Mongolia, and Tibet.",
    "The threats are numerous, including habitat loss, pollution, disease, and reduced food availability due to commercial fishing. Climate change is of particular concern for many species of penguin, as the sea ice that they depend on to find food or build nests is melting before their eyes.",
    "New York City's economic significance is vast, as it serves as the global financial capital, housing Wall Street and major financial institutions. Its diverse economy spans technology, media, healthcare, education, and more, making it resilient to economic fluctuations. NYC is a hub for international business, attracting global companies, and boasts a large, skilled labor force. Its real estate market, tourism, cultural industries, and educational institutions further fuel its economic prowess. The city's transportation network and global influence amplify its impact on the world stage, solidifying its status as a vital economic player and cultural epicenter.",
    "New York City got its name when it came under British control in 1664. King Charles II of England granted the lands to his brother, the Duke of York, who named the city New York in his own honor.",
    "Tomás de Berlanga, who discovered the islands, named them Las Encantadas (“The Enchanted”), and in his writings he marveled at the thousands of large galápagos (tortoises) found there. Numerous Spanish voyagers stopped at the islands from the 16th century, and the Galapagos also came to be used by pirates and by whale and seal hunters.",
    "The Statue of Liberty in New York City holds great significance as a symbol of the United States and its ideals of liberty and peace. It greeted millions of immigrants who arrived in the U.S. by ship in the late 19th and early 20th centuries, representing hope and freedom for those seeking a better life. It has since become an iconic landmark and a global symbol of cultural diversity and freedom.",
    
]

In [90]:
results = []
contexts = []
for query in queries:
    result = qa_chain.invoke({"query": query})
   
    results.append(result['result'])
    sources = result["source_documents"]
    contents = []
    for i in range(len(sources)):
        contents.append(sources[i].page_content)
    contexts.append(contents)

In [91]:
result.keys()

dict_keys(['query', 'result', 'source_documents'])

In [92]:
d = {
    "question": queries,
    "answer": results,
    "contexts": contexts,
    "ground_truth": ground_truths
}

dataset = Dataset.from_dict(d)
score = evaluate(dataset,metrics=[faithfulness, answer_relevancy, context_precision, context_recall, context_entity_recall, answer_similarity, answer_correctness, harmfulness])
score_df = score.to_pandas()
score_df.to_csv("EvaluationScores.csv", encoding="utf-8", index=False)

Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

In [93]:
score_df[['faithfulness','answer_relevancy', 'context_precision', 'context_recall',
       'context_entity_recall', 'answer_similarity', 'answer_correctness',
       'harmfulness']].mean(axis=0)

faithfulness             0.987762
answer_relevancy         0.756036
context_precision        0.800000
context_recall           0.683333
context_entity_recall    0.335000
answer_similarity        0.917463
answer_correctness       0.678872
harmfulness              0.000000
dtype: float64

In [94]:
score_df

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_precision,context_recall,context_entity_recall,answer_similarity,answer_correctness,harmfulness
0,Who discovered the Galapagos Islands and how?,The context does not mention anything about th...,"[of the Hudson River, which he named Río de Sa...",The Galapagos Islands were discovered in 1535 ...,1.0,0.0,0.0,0.0,0.0,0.831708,0.207927,0
1,What is Brooklyn–Battery Tunnel?,"The Brooklyn-Battery Tunnel, now known as the ...",[Park Service. Archived from the original on J...,The Brooklyn-Battery Tunnel (officially known ...,1.0,0.820663,1.0,0.5,0.5,0.983916,0.695979,0
2,Are Penguins found in the Galapagos Islands?,"Yes, penguins can be spotted on the volcanic i...",[Learn more about each species of penguin and ...,Penguins live on the galapagos islands side by...,1.0,0.986037,1.0,1.0,0.0,0.914244,0.978561,0
3,How many languages are spoken in New York?,As many as 800 languages are spoken in New Yor...,"[2017. ""The immigrant share of the population ...",As many as 800 languages are spoken in New York.,1.0,0.992015,1.0,1.0,1.0,0.944669,0.736167,0
4,In which countries are snow leopards found?,"Snow leopards are found in southern Siberia, T...","[Baikal through southern Siberia, in the Kunlu...","Siberia, Tajikistan, Kyrgyzstan, Uzbekistan, K...",0.923077,0.968004,1.0,1.0,0.666667,0.872611,0.905653,0
5,What are the threats to penguin populations?,The threats to penguin populations include hab...,[ice or huddle together for warmth will melt t...,"The threats are numerous, including habitat lo...",1.0,1.0,1.0,1.0,0.25,0.958392,0.864598,0
6,What is the economic significance of New York ...,The economic significance of New York City is ...,"[metropolitan economy, with a gross metropolit...","New York City's economic significance is vast,...",0.954545,1.0,1.0,1.0,0.1,0.95987,0.972526,0
7,How did New York City get its name?,New York City was temporarily renamed New York...,[city in 1653. The city came under English con...,New York City got its name when it came under ...,1.0,0.876145,1.0,1.0,0.5,0.958414,0.614604,0
8,How did Galapagos Islands get its name?,The context does not mention how the Galapagos...,"[lynx), is where the Latin name uncia and the ...","Tomás de Berlanga, who discovered the islands,...",1.0,0.0,0.0,0.0,0.0,0.837755,0.209439,0
9,What is the significance of the Statue of Libe...,The Statue of Liberty was a reassuring sign fo...,"[States ever since."" ^ The Immigrant's Statue...",The Statue of Liberty in New York City holds g...,1.0,0.917498,1.0,0.333333,0.333333,0.913051,0.603263,0
