In [20]:
import os
import openai
from ragas import evaluate
from datasets import Dataset 
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from ragas.metrics.critique import harmfulness
from langchain_community.vectorstores import Chroma
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.document_loaders import SeleniumURLLoader
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall, context_entity_recall, answer_similarity, answer_correctness


In [23]:
# Load variables from a .env file into the process environment.
load_dotenv()

# Fail fast with a clear message rather than handing `None` to the OpenAI
# client when the key is missing or empty.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file before running this notebook."
    )
openai.api_key = api_key

In [24]:
# Web pages that make up the knowledge base, one named constant per source.
NYC_WIKIPEDIA_URL = "https://en.wikipedia.org/wiki/New_York_City"
SNOW_LEOPARD_WIKIPEDIA_URL = "https://en.wikipedia.org/wiki/Snow_leopard"
GALAPAGOS_BRITANNICA_URL = "https://www.britannica.com/place/Galapagos-Islands"
# The "#:~:text=..." suffix is part of the link exactly as it was collected.
PENGUINS_BIRDLIFE_URL = "https://www.birdlife.org/birds/penguins/#:~:text=The%20threats%20are%20numerous%2C%20including,is%20melting%20before%20their%20eyes."

# Order matters only in that the loader receives the pages in this sequence.
urls = [
    NYC_WIKIPEDIA_URL,
    SNOW_LEOPARD_WIKIPEDIA_URL,
    GALAPAGOS_BRITANNICA_URL,
    PENGUINS_BIRDLIFE_URL,
]

In [25]:
# Load every page listed in `urls` with the Selenium-based URL loader.
# `documents` receives the resulting Document objects (presumably one per
# successfully loaded page — confirm against the loader's behavior on failures).
loader = SeleniumURLLoader(urls=urls)
documents = loader.load()

In [26]:
# Inspect the raw loaded documents (displays the repr of the whole list).
documents

[Document(page_content='Toggle the table of contents\n\nContents\n\n(Top)\n\n1Etymology\n\n2History\n\t\t\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\tToggle History subsection\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t2.1Early history\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t2.2Dutch rule\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t2.3English rule\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t2.4American Revolution\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t2.5Post-revolutionary period and early 19th century\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t2.6American Civil War\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t2.7Late 19th and early 20th century\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\t2.8Late 20th and early 21st centuries\n\n3Geography\n\t\t\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\tToggle Geogr

In [27]:
# Flatten each scraped page onto a single line. Both the literal two-character
# sequences backslash-n / backslash-t and the real newline / tab characters are
# replaced by one space each, in that order.
documentList = [
    str(page.page_content)
    .replace("\\n", " ")
    .replace("\\t", " ")
    .replace("\n", " ")
    .replace("\t", " ")
    for page in documents
]

In [28]:
# Inspect the flattened page texts (displays the repr of the whole list).
documentList

['Toggle the table of contents  Contents  (Top)  1Etymology  2History                    Toggle History subsection                            2.1Early history                                    2.2Dutch rule                                    2.3English rule                                    2.4American Revolution                                    2.5Post-revolutionary period and early 19th century                                    2.6American Civil War                                    2.7Late 19th and early 20th century                                    2.8Late 20th and early 21st centuries  3Geography                    Toggle Geography subsection                            3.1Boroughs                                    3.2Climate                                    3.3Parks                                    3.4Environment  4Demographics                    Toggle Demographics subsection                            4.1Race and ethnicity                                    4.2LGBT 

In [21]:
# Embedding model shared by the semantic chunker and the Chroma vector store.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [29]:
# Split the flattened page texts into chunks. The chunker is constructed with
# the embedding model, and `create_documents` returns the chunks as Document
# objects.
text_splitter = SemanticChunker(embedding_function)
docs = text_splitter.create_documents(documentList)

In [30]:
# Inspect the chunked documents (displays the repr of the whole list).
docs

[Document(page_content='Toggle the table of contents  Contents  (Top)  1Etymology  2History                    Toggle History subsection                            2.1Early history                                    2.2Dutch rule                                    2.3English rule                                    2.4American Revolution                                    2.5Post-revolutionary period and early 19th century                                    2.6American Civil War                                    2.7Late 19th and early 20th century                                    2.8Late 20th and early 21st centuries  3Geography                    Toggle Geography subsection                            3.1Boroughs                                    3.2Climate                                    3.3Parks                                    3.4Environment  4Demographics                    Toggle Demographics subsection                            4.1Race and ethnicity                      

In [31]:
# storing embeddings in a folder
vector_store = Chroma.from_documents(docs, embedding_function, persist_directory="./chroma_db")

In [32]:
# use this to load vector database
vector_store = Chroma(persist_directory="./chroma_db", embedding_function=embedding_function)

In [33]:

# Prompt that restricts the model to the supplied context. The {context} and
# {question} placeholders are expected to be filled in by the QA chain.
PROMPT_TEMPLATE = """
Go through the context and answer given question strictly based on context. 
Context: {context}
Question: {question}
Answer:
"""

# Build the chain's parts separately so each can be inspected on its own.
qa_llm = ChatOpenAI(temperature=0)
# The retriever runs with its default search settings; an earlier variant of
# this cell passed search_kwargs={'k': 3} to as_retriever instead.
qa_retriever = vector_store.as_retriever()
qa_prompt = PromptTemplate.from_template(PROMPT_TEMPLATE)

qa_chain = RetrievalQA.from_chain_type(
    llm=qa_llm,
    retriever=qa_retriever,
    # Keep the retrieved chunks in the output; a later cell reads
    # "source_documents" to collect evaluation contexts.
    return_source_documents=True,
    chain_type_kwargs={"prompt": qa_prompt},
)

In [34]:
# Evaluation set. Each entry pairs a question with its reference answer, and
# both parallel lists are derived from the same pairs so they stay aligned.
qa_pairs = [
    (
        "Who discovered the Galapagos Islands and how?",
        "The Galapagos Islands were discovered in 1535 by the bishop of Panama, Tomás de Berlanga, whose ship had drifted off course while en route to Peru. He named them Las Encantadas (“The Enchanted”), and in his writings he marveled at the thousands of large galápagos (tortoises) found there. Numerous Spanish voyagers stopped at the islands from the 16th century, and the Galapagos also came to be used by pirates and by whale and seal hunters. ",
    ),
    (
        "What is Brooklyn–Battery Tunnel?",
        "The Brooklyn-Battery Tunnel (officially known as the Hugh L. Carey Tunnel) is the longest continuous underwater vehicular tunnel in North America and runs underneath Battery Park, connecting the Financial District in Lower Manhattan to Red Hook in Brooklyn.[586]",
    ),
    (
        "Are Penguins found in the Galapagos Islands?",
        "Penguins live on the galapagos islands side by side with tropical animals.",
    ),
    (
        "How many languages are spoken in New York?",
        "As many as 800 languages are spoken in New York.",
    ),
    (
        "In which countries are snow leopards found?",
        "Siberia, Tajikistan, Kyrgyzstan, Uzbekistan, Kazakhstan, Afghanistan, Pakistan, India, Nepal, Bhutan, Mongolia, and Tibet.",
    ),
    (
        "What are the threats to penguin populations?",
        "The threats are numerous, including habitat loss, pollution, disease, and reduced food availability due to commercial fishing. Climate change is of particular concern for many species of penguin, as the sea ice that they depend on to find food or build nests is melting before their eyes.",
    ),
    (
        "What is the economic significance of New York City?",
        "New York City's economic significance is vast, as it serves as the global financial capital, housing Wall Street and major financial institutions. Its diverse economy spans technology, media, healthcare, education, and more, making it resilient to economic fluctuations. NYC is a hub for international business, attracting global companies, and boasts a large, skilled labor force. Its real estate market, tourism, cultural industries, and educational institutions further fuel its economic prowess. The city's transportation network and global influence amplify its impact on the world stage, solidifying its status as a vital economic player and cultural epicenter.",
    ),
    (
        "How did New York City get its name?",
        "New York City got its name when it came under British control in 1664. King Charles II of England granted the lands to his brother, the Duke of York, who named the city New York in his own honor.",
    ),
    (
        "How did Galapagos Islands get its name?",
        "Tomás de Berlanga, who discovered the islands, named them Las Encantadas (“The Enchanted”), and in his writings he marveled at the thousands of large galápagos (tortoises) found there. Numerous Spanish voyagers stopped at the islands from the 16th century, and the Galapagos also came to be used by pirates and by whale and seal hunters.",
    ),
    (
        "What is the significance of the Statue of Liberty in New York City?",
        "The Statue of Liberty in New York City holds great significance as a symbol of the United States and its ideals of liberty and peace. It greeted millions of immigrants who arrived in the U.S. by ship in the late 19th and early 20th centuries, representing hope and freedom for those seeking a better life. It has since become an iconic landmark and a global symbol of cultural diversity and freedom.",
    ),
]

# Questions sent to the QA chain, in evaluation order.
queries = [question for question, reference in qa_pairs]

# Reference answers, index-aligned with `queries`.
ground_truths = [reference for question, reference in qa_pairs]


In [35]:
# Display the evaluation questions.
queries

['Who discovered the Galapagos Islands and how?',
 'What is Brooklyn–Battery Tunnel?',
 'Are Penguins found in the Galapagos Islands?',
 'How many languages are spoken in New York?',
 'In which countries are snow leopards found?',
 'What are the threats to penguin populations?',
 'What is the economic significance of New York City?',
 'How did New York City get its name?',
 'How did Galapagos Islands get its name?',
 'What is the significance of the Statue of Liberty in New York City?']

In [36]:
# Run every evaluation question through the QA chain, collecting two
# index-aligned lists:
#   results  - the chain's answer text for each query
#   contexts - for each query, the page_content of every returned source document
results = []
contexts = []
for query in queries:
    # `invoke` is the supported entry point; calling the chain object directly
    # (`qa_chain({...})`) is deprecated in the LangChain releases that ship the
    # split `langchain_openai` / `langchain_community` packages imported above.
    response = qa_chain.invoke({"query": query})

    results.append(response["result"])
    contexts.append([source.page_content for source in response["source_documents"]])

In [37]:
# Inspect the retrieved chunk texts (one inner list per query).
contexts

[['Because of subsequent evolutionary  adaptations, an amazing range of subspecies are found on the islands today. Galapagos finches, for example, have developed a multitude of adaptive types from one common ancestral type; their subspecies now differ mainly in beak shape and size. The swimming marine  iguanas, which feed on seaweed and in some places cover the coastal rocks by the hundreds, are unique and endemic. Another species of interest is the flightless cormorant. In addition, penguins and fur seals live on the islands side by side with tropical animals. A geologic study published in 1992 suggested that underwater seamounts near the Galapagos had formed islands between 5,000,000 and 9,000,000 years ago; this helped explain the great amount of endemic speciation, which many biologists believe could not have occurred in a lesser amount of time. The existing Galapagos Islands were formed between 700,000 and 5,000,000 years ago, making them geologically young. Galapagos Islands: Sie

In [39]:
# Retrieved chunks for the fourth query ("How many languages are spoken in New York?").
contexts[3]

['^ Jump up to: a b Lubin, Gus (February 15, 2017). "Queens has more languages than anywhere in the world—here\'s where they\'re found". Business Insider. Retrieved December 29, 2019. ^ "More Foreign-Born Immigrants Live in NYC Than There Are People in Chicago". HuffPost.',
 'p. 593. ISBN\xa09780786714360. Retrieved January 2, 2023. ^ Roberts, Sam (September 14, 2017). "When the World Called for a Capital". The New York Times. Retrieved January 2, 2023. ^ Jump up to: a b U.S. Census Bureau History: New York City and the New Year, United States Census Bureau. Accessed January 30, 2024. "In 2021, 3,079,776 New Yorkers identified themselves as foreign-born, including 1,542,413 Latin American, 910,151 Asian, and 443,113 European immigrants.... The 2020 Census found that New York City was home to 8,804,190 people. Los Angeles, CA, was the nation\'s distant second most populous city with 3,898,747 residents."  ^ Census Data for the New York-Newark-Jersey City, NY-NJ-PA Metro Area, United Sta

In [40]:
# Column-oriented evaluation records. The four lists are parallel: index i of
# each list describes the i-th query. A later cell turns this dict into a
# datasets.Dataset for scoring.
d = {
    "question": queries,  # evaluation questions
    "answer": results,  # answers produced by the QA chain
    "contexts": contexts,  # retrieved chunk texts, one list per question
    "ground_truth": ground_truths  # reference answers

}

In [41]:
# Inspect the assembled evaluation dict.
d

{'question': ['Who discovered the Galapagos Islands and how?',
  'What is Brooklyn–Battery Tunnel?',
  'Are Penguins found in the Galapagos Islands?',
  'How many languages are spoken in New York?',
  'In which countries are snow leopards found?',
  'What are the threats to penguin populations?',
  'What is the economic significance of New York City?',
  'How did New York City get its name?',
  'How did Galapagos Islands get its name?',
  'What is the significance of the Statue of Liberty in New York City?'],
 'answer': ['The Galapagos Islands were discovered by Tomás de Berlanga, whose ship had drifted off course while en route to Peru. He named them Las Encantadas (“The Enchanted”) and marveled at the thousands of large Santa María Island in 1832.',
  'The Brooklyn-Battery Tunnel is the former name of the Hugh L. Carey Tunnel, which is the longest continuous underwater vehicular tunnel in North America.',
  'Yes, penguins can be spotted on the volcanic islands of the Galapagos.',
  '

In [42]:
# Dataset, evaluate and every metric used here are already imported in the
# notebook's top import cell, so they are not re-imported in this cell.

# Wrap the collected questions, answers, contexts and references in a
# datasets.Dataset, the input passed to ragas' `evaluate`.
dataset = Dataset.from_dict(d)

# Metrics to compute; each one becomes a score column in the resulting frame.
evaluation_metrics = [
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
    context_entity_recall,
    answer_similarity,
    answer_correctness,
    harmfulness,
]

score = evaluate(dataset, metrics=evaluation_metrics)

# Per-question scores as a DataFrame; the bare expression displays it.
score_df = score.to_pandas()
score_df

Evaluating:   0%|          | 0/80 [00:00<?, ?it/s]

Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_precision,context_recall,context_entity_recall,answer_similarity,answer_correctness,harmfulness
0,Who discovered the Galapagos Islands and how?,The Galapagos Islands were discovered by Tomás...,[Because of subsequent evolutionary adaptatio...,The Galapagos Islands were discovered in 1535 ...,0.75,0.973157,0.916667,0.666667,0.363636,0.971687,0.617922,0
1,What is Brooklyn–Battery Tunnel?,The Brooklyn-Battery Tunnel is the former name...,"[(August 17, 2016). ""Marine Park, Brooklyn: Bl...",The Brooklyn-Battery Tunnel (officially known ...,1.0,0.811525,0.5,1.0,0.375,0.975722,0.672502,0
2,Are Penguins found in the Galapagos Islands?,"Yes, penguins can be spotted on the volcanic i...",[Because of subsequent evolutionary adaptatio...,Penguins live on the galapagos islands side by...,1.0,0.895909,1.0,1.0,0.5,0.914262,0.728566,0
3,How many languages are spoken in New York?,There are more languages spoken in New York th...,"[^ Jump up to: a b Lubin, Gus (February 15, 20...",As many as 800 languages are spoken in New York.,1.0,0.860364,1.0,1.0,0.0,0.929225,0.732332,0
4,In which countries are snow leopards found?,"Snow leopards are found in Tajikistan, Uzbekis...",[(eds.). Snow Leopards: Biodiversity of the Wo...,"Siberia, Tajikistan, Kyrgyzstan, Uzbekistan, K...",0.8,1.0,0.0,1.0,0.25,0.872957,0.482945,0
5,What are the threats to penguin populations?,The threats to penguin populations include hab...,[PENGUINS Watching penguins waddle across sli...,"The threats are numerous, including habitat lo...",1.0,1.0,1.0,1.0,0.5,0.95842,0.864605,0
6,What is the economic significance of New York ...,"New York City is a global city and a cultural,...","[^ Homberger, Eric (2005). The Historical Atla...","New York City's economic significance is vast,...",1.0,0.882608,1.0,1.0,0.333333,0.943795,0.84706,0
7,How did New York City get its name?,New York City got its name after King Charles ...,[City in the United States New York City Midt...,New York City got its name when it came under ...,1.0,1.0,1.0,1.0,0.166667,0.94856,0.61214,0
8,How did Galapagos Islands get its name?,The Galapagos Islands got its name from the Sp...,[Because of subsequent evolutionary adaptatio...,"Tomás de Berlanga, who discovered the islands,...",1.0,0.919018,0.916667,0.5,0.333333,0.952187,0.488047,0
9,What is the significance of the Statue of Libe...,The Statue of Liberty is a significant landmar...,"[October 19, 2015. Retrieved August 27, 2017. ...",The Statue of Liberty in New York City holds g...,1.0,0.889339,0.0,1.0,0.4,0.951105,0.612776,0


In [43]:
# Save the per-question scores as a UTF-8 CSV, omitting the DataFrame index.
score_df.to_csv("EvaluationScores.csv", encoding="utf-8", index=False)

In [49]:
# List the score frame's columns: the four input columns followed by one column per metric.
score_df.columns

Index(['question', 'answer', 'contexts', 'ground_truth', 'faithfulness',
       'answer_relevancy', 'context_precision', 'context_recall',
       'context_entity_recall', 'answer_similarity', 'answer_correctness',
       'harmfulness'],
      dtype='object')

In [50]:
# Average each metric over all evaluated questions (column-wise mean).
metric_columns = [
    'faithfulness',
    'answer_relevancy',
    'context_precision',
    'context_recall',
    'context_entity_recall',
    'answer_similarity',
    'answer_correctness',
    'harmfulness',
]
score_df[metric_columns].mean(axis=0)

faithfulness             0.955000
answer_relevancy         0.923192
context_precision        0.733333
context_recall           0.916667
context_entity_recall    0.322197
answer_similarity        0.941792
answer_correctness       0.665889
harmfulness              0.000000
dtype: float64