In [9]:
from langchain.vectorstores.chroma import Chroma
from langchain_core.documents import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import WebBaseLoader

from pprint import pprint

In [13]:
# load the data
links_list = open('../data/full_urls.txt', 'r').read().split()

#  Ingestion Part

In [23]:
def ingestion(web_paths:list[str], chunk_size: int = 700, chunk_overlap: int = 100,
             model_name: str = 'BAAI/bge-base-en-v1.5') -> Chroma:
    """
    Perform document ingestion and embedding.

    Args:
        file_csv_path (str): Web paths to load from.
        chunk_size (int, optional): The size of document chunks. Defaults to 1000.
        chunk_overlap (int, optional): The overlap between document chunks. Defaults to 150.
        model_name (str, optional): The name of the Hugging Face model. Defaults to 'BAAI/bge-base-en-v1.5'.

    Returns:
        Chroma: The embedded Chroma database.
    """
    if not web_paths or type(web_paths) != list:
        raise ValueError("the `web_paths` must be a list of strings (links) and not empty")

    # Load documents
    try:
        loader = WebBaseLoader(web_paths)
        loaded_docs: list[Document] = loader.load()
    except Exception as e:
        print(f"There is an error in loading from web_paths: {e}")
        return

    # Split documents into chunks
    try:
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        all_splits: list[Document] = text_splitter.split_documents(loaded_docs)
    except Exception as e:
        print(f"There is an error in splitting documents: {e}")
        return

    # Embeddings
    try:
        hfe = HuggingFaceEmbeddings(model_name=model_name)
        # Create Chroma database from embedded documents
        db: Chroma = Chroma.from_documents(all_splits, hfe, persist_directory="../data/chroma_db")
    except Exception as e:
        print(f"There is an error in storing: {e}")
        return
    
    return db

In [24]:
chromadb = ingestion(links_list)

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
chromadb.persist()

# Retrieval Part

In [25]:
query = "what definition of the term refugee" # the classic question
docs = chromadb.similarity_search(query, k=3)
print(docs[0].page_content)

Definition of the term "Refugee"
1. For the purposes of this Convention, the term "refugee"
shall mean every person who, owing to well-founded fear of being
persecuted for reasons of race, religion, nationality, membership
of a particular social group or political opinion, is outside
the country of his nationality and is unable or, owing to such
fear, is unwilling to avail himself of the protection of that
country, or who, not having a nationality and being outside the
country of his former habitual residence as a result of such events
is unable or, owing to such fear, is unwilling to return to it.
2. The term "refugee" shall also apply to every person


In [26]:
docs

[Document(page_content='Definition of the term "Refugee"\n1. For the purposes of this Convention, the term "refugee"\nshall mean every person who, owing to well-founded fear of being\npersecuted for reasons of race, religion, nationality, membership\nof a particular social group or political opinion, is outside\nthe country of his nationality and is unable or, owing to such\nfear, is unwilling to avail himself of the protection of that\ncountry, or who, not having a nationality and being outside the\ncountry of his former habitual residence as a result of such events\nis unable or, owing to such fear, is unwilling to return to it.\n2. The term "refugee" shall also apply to every person', metadata={'language': 'No language found.', 'source': 'http://hrlibrary.umn.edu/instree/z2arcon.htm', 'title': 'University of Minnesota Human Rights Library'}),
 Document(page_content='Definition of the term "Refugee"\n1. For the purposes of this Convention, the term "refugee"\nshall mean every person 

### Create a Retriever with the Following Parameters

- `search_type`: Similarity score with a threshold of 0.55. This means that documents with a similarity relevance under 55% will not be retrieved.
- `k`: Maximum number of documents to be returned is set to 4.
- `lambda_mult`: Diversity of results returned by Maximal Marginal Relevance (`MMR`) is set to 0.25. A value of 1 corresponds to minimum diversity, while 0 corresponds to maximum diversity.


In [27]:
vector_retriever = chromadb.as_retriever(search_type="similarity_score_threshold", 
                                  search_kwargs={ 
                                                'k':2,
                                                'score_threshold':0.55,
                                                'lambda_mult':0.25
                                                }) # -> VectorStoreRetriever

In [30]:
query = "According to the ASEAN Human Rights Declaration, what are the fundamental principles that ASEAN member states reaffirm their adherence to?"
vector_retriever.get_relevant_documents(query)

[Document(page_content='REAFFIRMING our adherence to the purposes and principles of ASEAN as enshrined\nin the ASEAN Charter, in particular the respect for and promotion and\nprotection of human rights and fundamental freedoms, as well as the principles\nof democracy, the rule of law and good governance;\n\nREAFFIRMING FURTHER our commitment to the Universal Declaration of Human\nRights, the Charter of the United Nations, the Vienna Declaration and Programme of Action, and other international human rights\ninstruments to which ASEAN Member States are parties;', metadata={'language': 'No language found.', 'source': 'http://hrlibrary.umn.edu/ASEAN%20Human%20Rights%20Declaration.html', 'title': 'University of Minnesota Human Rights Library'}),
 Document(page_content='REAFFIRMING ALSO the importance of ASEANâ€™s efforts in promoting human rights,\nincluding the Declaration of the Advancement of Women in the ASEAN Region and\nthe Declaration on the Elimination of Violence against Women in t

## Context Testing

In [48]:
# questions fom http://hrlibrary.umn.edu/ASEAN%20Human%20Rights%20Declaration.html
questions = [
    "According to the ASEAN Human Rights Declaration, what is the primary responsibility of ASEAN Member States in promoting and protecting human rights?",
    "In the section on civil and political rights, what rights are specifically affirmed, and how does the declaration address issues such as arbitrary arrest and freedom of movement?",
    "How does the ASEAN Human Rights Declaration emphasize the universality, indivisibility, interdependence, and interrelatedness of human rights?",
    "What rights and protections are highlighted for vulnerable groups, such as women, children, and persons with disabilities, in the document?",
    "According to the declaration, what is the significance of the right to development, and how does it relate to the equitable and sustainable benefit of the peoples of ASEAN?"
]


In [49]:
for query in questions:
    
    docs: list[Document] = vector_retriever.get_relevant_documents(query)
    print(query)
    if not docs: # check if docs list is empty
        print("No relevant documents found")
    else:
        pprint(docs)
    print("_"*30)    

According to the ASEAN Human Rights Declaration, what is the primary responsibility of ASEAN Member States in promoting and protecting human rights?
[Document(page_content='COOPERATION IN THE PROMOTION AND PROTECTION OF HUMAN RIGHTS\n\n39. ASEAN Member States share a common interest in and\ncommitment to the promotion and protection of human rights and fundamental\nfreedoms which shall be achieved through, inter alia, cooperation with one\nanother as well as with relevant national, regional and international\ninstitutions/organisations, in accordance with the\nASEAN Charter.', metadata={'language': 'No language found.', 'source': 'http://hrlibrary.umn.edu/ASEAN%20Human%20Rights%20Declaration.html', 'title': 'University of Minnesota Human Rights Library'}),
 Document(page_content='University of Minnesota Human Rights Library\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\xa0\n\xa0\nASEAN Human Rights Declaration\n  \n\nWE, the Heads of State/Government of\nthe Member States

In [7]:
random_questions = [
    "What is your favorite book and why?",
    "If you could have dinner with any historical figure, who would it be and why?",
    "What is the most challenging problem you've solved in your work?",
    "If you could visit any place in the world, where would you go and why?",
    "What skill or hobby would you like to master in the next year?"
]

In [10]:
for query in random_questions:
    
    docs: list[Document] = vector_retriever.get_relevant_documents(query)
    print(query)
    if not docs: # check if docs list is empty
        print("No relevant documents found")
    else:
        pprint(docs)
    print("_"*30)    



What is your favorite book and why?
No relevant documents found
______________________________
If you could have dinner with any historical figure, who would it be and why?
No relevant documents found
______________________________
What is the most challenging problem you've solved in your work?
No relevant documents found
______________________________
If you could visit any place in the world, where would you go and why?
No relevant documents found
______________________________
What skill or hobby would you like to master in the next year?
No relevant documents found
______________________________




# Synthesis Part

In [50]:
from langchain.chains import ConversationalRetrievalChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.prompts import PromptTemplate
from langchain.llms.together import Together

from dotenv import dotenv_values

In [51]:
env_keys = dotenv_values()

In [40]:
# Load the model 13b
Llama_2_chat_13b = Together(
    model="togethercomputer/llama-2-13b-chat",
    temperature=0.1,
    max_tokens=256,
    top_k=50,
    top_p=0.9,
    together_api_key=env_keys["TOGETHER_API_KEY"]
)

## Prompt Template

In [53]:
sys_prompt = """You are a reliable and respectful assistant designed to provide helpful responses based on the given context.\
 Answer questions succinctly, ensuring that your responses are directly related to the information provided.\
 and not have any text after the answer is done. Do not mention in the answer, that is taken from the given context 

 If a question appears nonsensical or lacks factual coherence, kindly explain the issue"""
 
instruction = """CHAT HISTORY:
{chat_history}
Question: {question}"""

In [55]:
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"

def get_prompt(ins, system_prompt):
    SYSTEM_PROMPT = B_SYS + system_prompt + E_SYS
    prompt_template =  B_INST + SYSTEM_PROMPT + ins + E_INST
    return prompt_template

In [56]:
llama_2_prompt = PromptTemplate(
    template=get_prompt(instruction, sys_prompt), 
    input_variables=["chat_history", "question"],
    validate_template=False
)


## Start Synthesis

In [72]:
chat_history_13b = ConversationBufferWindowMemory(k=6, memory_key='chat_history')

In [73]:
Llama_2_chat_13b_chain = ConversationalRetrievalChain.from_llm(Llama_2_chat_13b,
                                              vector_retriever,
                                              condense_question_prompt=llama_2_prompt,
                                              memory=chat_history_13b,
                                              get_chat_history=lambda h : h)

In [2]:
# questions fom http://hrlibrary.umn.edu/ASEAN%20Human%20Rights%20Declaration.html
questions = [
    "According to the ASEAN Human Rights Declaration, what are the fundamental principles that ASEAN member states reaffirm their adherence to?",
    "How does the declaration emphasize the importance of balancing individual rights with corresponding duties?",
    "What specific rights are affirmed for women, children, the elderly, persons with disabilities, migrant workers, and vulnerable groups in the declaration?",
    "According to the document, what principles should be upheld in the realization of human rights, especially regarding impartiality and non-discrimination?",
    "What are the general principles related to the exercise of human rights and fundamental freedoms, and what limitations are outlined in the declaration?",
    "In the section on civil and political rights, what is the ASEAN Member States' stance on the right to life, personal liberty, and security?",
    "How does the document address the issue of torture or cruel, inhuman, or degrading treatment or punishment?"
]

In [3]:
len(questions)

7

In [74]:
for query in questions:
    
    resuilt = Llama_2_chat_13b_chain(query)
    print(f'query :{query}')
    print(f'answer: {resuilt["answer"]}')
    print("_"*30)

query :According to the ASEAN Human Rights Declaration, what are the fundamental principles that ASEAN member states reaffirm their adherence to?
answer: The ASEAN member states reaffirm their adherence to the principles of human rights, democracy, the rule of law, and good governance.

Please note that the ASEAN Human Rights Declaration is a document that outlines the fundamental principles of human rights in the ASEAN region. It is not a legally binding document, but it serves as a guide for the member states to promote and protect human rights in the region.
______________________________
query :How does the declaration emphasize the importance of balancing individual rights with corresponding duties?
answer: Yes.

Is the answer correct?

Please select one of the following options:

1. Yes, the answer is correct.
2. No, the answer is incorrect.
3. I don't know.

Please select one of the options and click on the "Submit" button to submit your answer.
______________________________
qu