LANGCHAIN: https://python.langchain.com/v0.1/docs/use_cases/question_answering/quickstart/<br>
> We have removed our default folder locations and OpenAI secret key, and replaced them with temporary placeholder values in the corresponding variables.   





# Imports

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install sentence-transformers -q
!pip install --upgrade --quiet  langchain langchain-community langchainhub langchain-openai langchain-chroma bs4 streamlit
!pip install -U deep-translator
!pip install langdetect

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.7/224.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.5/973.5 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.2/310.2 kB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.3/124.3 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━

In [None]:
# Step 0
import re
import os

# Step 1
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import DirectoryLoader

# Step 2
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Step 3
from langchain_community.embeddings.sentence_transformer import (SentenceTransformerEmbeddings,)
from langchain_chroma import Chroma

# Step 4
import os
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

# Step 5
from deep_translator import GoogleTranslator
from langdetect import detect

# Step 0 : Clean Data

In this section, we load our raw abstracts, clean unnecessary information, and store each abstract separately in the "cleaned abstracts" folder. The steps are as follows:

1. Load the raw data.
2. Split each abstract by its header.
3. Remove unnecessary keywords.
4. Save the cleaned abstract in a text file.




Reads the entire content of a file and returns it as a string.

In [None]:
def read_file(file_path):
    """Return the full contents of ``file_path`` decoded as UTF-8.

    When the file does not exist, a "File not found" message is printed and
    ``None`` is returned instead of propagating the error.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            contents = handle.read()
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        contents = None
    return contents

Splits the text into abstracts, removes specific keywords and extra whitespace.

In [None]:
def split_and_clean_abstracts(text):
    """Break raw text into individual abstracts and strip section labels.

    The text is cut at every ``###<digits>`` header and empty pieces are
    dropped. Inside each remaining abstract the whole-word labels BACKGROUND,
    OBJECTIVE, METHODS, RESULTS and CONCLUSIONS are deleted, tab characters
    are removed, and surrounding whitespace is trimmed.

    Returns a list of cleaned abstract strings; falsy input yields ``[]``.
    """
    if not text:
        return []

    # A single whole-word alternation removes every section label in one pass.
    label_pattern = re.compile(
        r'\b(?:BACKGROUND|OBJECTIVE|METHODS|RESULTS|CONCLUSIONS)\b'
    )

    # Trim each piece first so that header-only fragments are discarded
    # before any label removal takes place.
    pieces = (piece.strip() for piece in re.split(r'\n*###\d+\n*', text))

    return [
        label_pattern.sub('', piece).replace('\t', '').strip()
        for piece in pieces
        if piece
    ]

Saves each abstract into a separate file in the specified folder.

In [None]:
def save_abstracts(abstracts, folder, prefix):
    """Write each abstract to its own UTF-8 text file inside ``folder``.

    Files are named ``{prefix}{n}.txt`` where ``n`` starts at 1 and follows
    the order of ``abstracts``. Existing files with the same name are
    overwritten.

    Args:
        abstracts: Iterable of abstract strings to write.
        folder: Destination directory; created (with parents) if missing.
        prefix: File-name prefix placed before the running number.
    """
    # Create the destination up front so a fresh output location does not
    # fail on the first write. An empty folder string means "current
    # directory", which os.makedirs would reject, so skip it in that case.
    if folder:
        os.makedirs(folder, exist_ok=True)

    for number, abstract in enumerate(abstracts, start=1):
        file_path = os.path.join(folder, f'{prefix}{number}.txt')
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(abstract)

Reads, processes, and saves cleaned abstracts from a given file path.

In [None]:
def process_abstracts(file_path, output_folder, prefix):
    """Read one raw file, clean its abstracts, and save them individually.

    The file is loaded with ``read_file``; if that returns ``None`` a
    "Skipping" message is printed and nothing is written. Otherwise the text
    is passed through ``split_and_clean_abstracts`` and the result is handed
    to ``save_abstracts`` together with ``output_folder`` and ``prefix``.
    """
    raw_text = read_file(file_path)

    # Guard clause: nothing to do when the source file could not be read.
    if raw_text is None:
        print(f"Skipping processing for {file_path} due to read error.")
        return

    save_abstracts(split_and_clean_abstracts(raw_text), output_folder, prefix)

Processing all the given Files

In [None]:
raw_data_loc = "ENTER LOCATION OF RAW FOLDER HERE"
clean_data_loc = "ENTER LOCATION OF CLEAN FOLDER HERE"

for example:
```
raw_data_loc = "/content/drive/MyDrive/Colab Notebooks/NLP Application II/Final Project APP II/Data/raw_data"
clean_data_loc = "/content/drive/MyDrive/Colab Notebooks/NLP Application II/Final Project APP II/Data/clean_data"
```

In [None]:
process_abstracts(f'{raw_data_loc}/dev.txt', clean_data_loc, 'dev')
process_abstracts(f'{raw_data_loc}/test.txt', clean_data_loc, 'test')
process_abstracts(f'{raw_data_loc}/train.txt', clean_data_loc, 'train')

# Step 1: Load Data

In this section, we load our cleaned data into the LangChain loader. This process converts the data into a doc format, which will be used in subsequent steps to vectorize our database.

In [None]:
# Enter Folder Location here
clean_data_loc = "ENTER LOCATION OF CLEAN FOLDER HERE"

Initializing the Loader variable with all the text files

In [None]:
loader = DirectoryLoader(clean_data_loc, glob="*.txt", loader_cls=TextLoader)

Converting the text files into doc format using the loader variable defined above

In [None]:
docs = loader.load()

Viewing Some Statistics

In [None]:
print("Total Number of Documents:\t\t", len(docs))
print("Sample Content on a doc:\t\t", docs[0].page_content[:500])

Total Number of Documents:		 20185
Sample Content on a doc:		 This study examined the effects of an 8-week stress reduction program based on training in mindfulness meditation .
Previous research efforts suggesting this program may be beneficial in terms of reducing stress-related symptomatology and helping patients cope with chronic pain have been limited by a lack of adequate comparison control group .
Twenty-eight individuals who volunteered to participate in the present study were randomized into either an experimental group or a nonintervention contro


# Step 2: Data Split

In this section, we split our documents into manageable chunks to ensure they can be effectively vectorized for our retriever.

In [None]:
# Initialize the text splitter by setting a chunk size and other parameters
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)

Splitting the documents using our text splitter

In [None]:
all_splits = text_splitter.split_documents(docs)

Viewing Some Statistics

In [None]:
print("Total Number of Splits:\t\t", len(all_splits))
print("Structure Format:\t\t", all_splits[10].metadata)

Total Number of Splits:		 49270
Structure Format:		 {'source': '/content/drive/MyDrive/Colab Notebooks/NLP Application II/Final Project APP II/Data/clean_data/test270.txt', 'start_index': 843}


# Step 3: Information Retrieval
- 3.1 -
In this section, we vectorize (embed) our document chunks. Each embedding represents a chunk as a point in a high-dimensional vector space (384 dimensions for `all-MiniLM-L6-v2`). The quality of the embedding model directly influences how accurately this vector space reflects the semantic relationships between our documents.

- 3.2 -
During information retrieval, we embed our query and then extract the documents that are closest to our query embedding.

---

How To use this part:<br>
> DO NOT RUN and overwrite the Chroma_db vectors. The complete dataset, including Train, Test, and Dev sets, has already been embedded and stored in Chroma_db. Simply download the Chroma_db folder, update the folder location in the code below, and load the embeddings. Note that this embedding process has been done on my local machine as it takes time on Colab.



3.1 - Vectorizing / Embedding our Documents

In [None]:
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# # load it into Chroma
# vectorstore = Chroma.from_documents(all_splits, embedding_function, persist_directory="./chroma_db")

In [None]:
# load from Chroma
chroma_db_location = "ENTER LOCATION OF CHROMA DB FOLDER HERE"

vectorstore = Chroma(persist_directory=chroma_db_location, embedding_function=embedding_function)

for example:
```
chroma_db_location = "/content/drive/MyDrive/Colab Notebooks/NLP Application II/Final Project APP II/chroma_db/"
eval_chroma_db_location = "/content/drive/MyDrive/Colab Notebooks/NLP Application II/Final Project APP II/eval_chroma_db/"
```

3.2 - Creating the Information Retriever

In [None]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

# Step 4: RAG using GPT3.5-Turbo

In this section, we create our Retrieval-Augmented Generation (RAG) chain using our Large Language Model (LLM) and retriever. RAG involves passing a query to the LLM along with the relevant context extracted by the retriever. We chain the retriever with our LLM so that the combination can be used as a single RAG chain.

-----
Something to Note:

>We tested GPT-3.5-turbo against GPT-4. While GPT-4 performed better, it is also more costly. Additionally, GPT-4 sometimes provides answers that include information not extracted by the retriever, making it more prone to hallucinations compared to GPT-3.5-turbo. Therefore, we are using GPT-3.5-turbo here.

LLM setup

In [None]:
os.environ["OPENAI_API_KEY"] = "ENTER OPENAI API KEY HERE"
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

Information Retriever Setup

In [None]:
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# load from Chroma
chroma_db_location = "ENTER LOCATION OF CHROMA DB FOLDER HERE"

vectorstore = Chroma(persist_directory=chroma_db_location, embedding_function=embedding_function)

retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})



Implementing RAG

In [None]:
def format_docs(docs):
  """Join the ``page_content`` of every document, separated by a blank line."""
  contents = [document.page_content for document in docs]
  separator = "\n\n"
  return separator.join(contents)


def generate_response(question):
  """Answer ``question`` by running it through the RAG chain.

  The chain is assembled on every call from the module-level ``retriever``
  and ``llm``: the question is sent to the retriever, the retrieved
  documents are merged by ``format_docs`` into the prompt's ``context``
  slot, the filled prompt goes to the LLM, and the output is parsed to a
  plain string, which is returned.
  """
  prompt_text = """Use the following pieces of context to answer the question at the end.
  If you don't know the answer, just say that you don't know, don't try to make up an answer.

  {context}

  Question: {question}

  Helpful Answer:"""
  prompt = PromptTemplate.from_template(prompt_text)

  # The same input feeds both prompt variables: it is routed through the
  # retriever for "context" and passed through unchanged for "question".
  chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
  pipeline = chain_inputs | prompt | llm | StrOutputParser()

  return pipeline.invoke(question)

# Step 5 : Translating Prompts and Responses

Here we use Google Translate to translate text to and from English. First, we detect the language of the input prompt. If the detected language is not English, we translate the prompt to English, run it through the RAG chain, and then translate the response back into the original language.

In [None]:
def translate_to_eng(text):
  """Translate ``text`` into English with ``GoogleTranslator`` (source set to ``'auto'``)."""
  translator = GoogleTranslator(source='auto', target='en')
  return translator.translate(text)

In [None]:
def translate_from_eng(text, lang):
  """Translate English ``text`` into the target language code ``lang`` with ``GoogleTranslator``."""
  translator = GoogleTranslator(source='en', target=lang)
  return translator.translate(text)

**Layering MT over RAG**

In [None]:
def ask_RAG(prompt):
  """Answer ``prompt`` with the RAG chain, translating around it when needed.

  The prompt's language is detected first. An English prompt is passed
  straight to ``generate_response``. Any other prompt is translated to
  English, answered, and the answer is translated back into the detected
  language; the intermediate English question and response are printed
  with a ``[LOG]`` prefix.
  """
  source_lang = detect(prompt)

  # English prompts need no translation layer.
  if source_lang == 'en':
    return generate_response(prompt)

  english_prompt = translate_to_eng(prompt)
  print(f"[LOG] Question: {english_prompt}")

  english_answer = generate_response(english_prompt)
  print(f"[LOG] Response: {english_answer}")

  return translate_from_eng(english_answer, source_lang)

**Testing it out**

In [None]:
prompt = "میرا دل غیر معمولی طور پر دھڑک رہا ہے اور مجھے پسینہ کیوں آرہا ہے؟"
ask_RAG(prompt)

[LOG] Question: Why is my heart beating abnormally and I'm sweating?
[LOG] Response: Based on the context provided, the abnormal heartbeat and sweating may be due to arrhythmogenic right ventricular cardiomyopathy (ARVC), which is a form of heart disease that can cause symptoms such as irregular heartbeat and sweating. It is important to consult with a healthcare professional for an accurate diagnosis and appropriate treatment.


'فراہم کردہ سیاق و سباق کی بنیاد پر، دل کی غیر معمولی دھڑکن اور پسینہ آنا arrhythmogenic right ventricular cardiomyopathy (ARVC) کی وجہ سے ہو سکتا ہے، جو کہ دل کی بیماری کی ایک شکل ہے جو دل کی بے قاعدگی اور پسینہ جیسی علامات کا سبب بن سکتی ہے۔ درست تشخیص اور مناسب علاج کے لیے ہیلتھ کیئر پروفیشنل سے مشورہ کرنا ضروری ہے۔'

In [None]:
prompt = "Bagaimana mekanisme penularan COVID-19?"
ask_RAG(prompt)

[LOG] Question: What is the mechanism of transmission of COVID-19?
[LOG] Response: The information provided does not mention anything about the mechanism of transmission of COVID-19. Therefore, it is not possible to provide an answer based on the given context.


'Informasi yang diberikan tidak menyebutkan apapun mengenai mekanisme penularan COVID-19. Oleh karena itu, tidak mungkin memberikan jawaban berdasarkan konteks yang diberikan.'

# Step 6: Evaluation RAG-Chain

In this section, we construct a Retrieval-Augmented Generation (RAG) chain whose retriever is built (indexed) only on our test passages and a very small subset of abstracts. This approach allows us to accurately evaluate the performance of the retriever and the entire RAG chain.

Using the full retriever from the previous steps for evaluation is not feasible because of a significant data imbalance: there are only 10 test passages, whereas the full collection contains tens of thousands of abstracts (20,185 documents, split into 49,270 chunks). This large imbalance would prevent an accurate evaluation.

**Loading and Splitting**

In [None]:
test_passage_folder_loc = "Enter Test Passages Folder Location Here"

for example:
```
test_passage_folder_loc = "/content/drive/MyDrive/Colab Notebooks/NLP Application II/Final Project APP II/Data/test_passages/"
```

In [None]:
eval_loader = DirectoryLoader(test_passage_folder_loc, glob="*.txt", loader_cls=TextLoader)

In [None]:
eval_docs = eval_loader.load()

In [None]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)

eval_all_splits = text_splitter.split_documents(eval_docs)

**Vectorizing**

In [None]:
# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")



In [None]:
# # load the evaluation data into Chroma
# eval_vectorstore = Chroma.from_documents(eval_all_splits, embedding_function, persist_directory="./eval_chroma_db")

In [None]:
# load from eval_chroma_db
eval_chroma_db_location = "ENTER LOCATION OF EVAL CHROMA DB FOLDER HERE"

eval_vectorstore = Chroma(persist_directory=eval_chroma_db_location, embedding_function=embedding_function)

for example:
```
eval_chroma_db_location = "/content/drive/MyDrive/Colab Notebooks/NLP Application II/Final Project APP II/eval_chroma_db/"
```

In [None]:
eval_retriever = eval_vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

**Chaining Together**

In [None]:
os.environ["OPENAI_API_KEY"] = "ENTER OPENAI API KEY HERE"
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line."""
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)

def eval_generate_response(question):
    """Answer ``question`` using the evaluation RAG chain.

    The chain is assembled on every call from the module-level
    ``eval_retriever`` and ``llm``: the question is sent to the retriever,
    the retrieved documents are merged by ``format_docs`` into the prompt's
    ``context`` slot, the filled prompt goes to the LLM, and the output is
    parsed to a plain string, which is returned.
    """
    prompt_text = """Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.

    {context}

    Question: {question}

    Helpful Answer:"""
    prompt = PromptTemplate.from_template(prompt_text)

    # The same input feeds both prompt variables: it is routed through the
    # retriever for "context" and passed through unchanged for "question".
    chain_inputs = {"context": eval_retriever | format_docs, "question": RunnablePassthrough()}
    pipeline = chain_inputs | prompt | llm | StrOutputParser()

    return pipeline.invoke(question)

**Testing it out**

example of testing:

In [None]:
eval_query = "Who is at risk for Alpha-1 Antitrypsin Deficiency?"

In [None]:
eval_retriever.invoke(eval_query)

[Document(page_content='What is alpha-1 antitrypsin deficiency?\nAlpha-1 antitrypsin (AAT) deficiency is a condition that raises your risk for lung and other diseases.\n\nAAT is a protein made in your liver to help protect the lungs. If your body does not make enough AAT, your lungs are more easily damaged from smoking, pollution, or dust from the environment. This can lead to COPD or bronchiectasis, another lung disease. AAT deficiency may also cause liver disease. The liver disease can occur among infants and children, and the lung disease usually occurs in individuals who are older than 30.\n\nAAT deficiency runs in families. Many people do not know that they have it, but early diagnosis can help prevent COPD and other serious lung diseases. Talk to your healthcare provider if you have a family member who has AAT deficiency or who was a smoker diagnosed with COPD between ages 40 and 50. Also, talk to your provider if you have symptoms such as an ongoing cough, shortness of breath, w

In [None]:
eval_generate_response(eval_query)

'Individuals with a family history of AAT deficiency or those who have a family member diagnosed with COPD between ages 40 and 50 are at risk for Alpha-1 Antitrypsin Deficiency.'

In this part, we test the evaluation chain using 10 questions from MedQuAD.

In [None]:
list_of_questions = [
  "Who is at risk for Alpha-1 Antitrypsin Deficiency?",
  "How to diagnose Parasites - Ascariasis ?",
  "Do you have information about B Vitamins",
  "What are the treatments for Alzheimer disease ?",
  "What is the outlook for Childhood Acute Myeloid Leukemia and Other Myeloid Malignancies ?",
  "What are the symptoms of Pulmonary Hypertension ?",
  "What are the treatments for Parasites - Taeniasis ?",
  "What is (are) Athlete's Foot ?",
  "What are the genetic changes related to arrhythmogenic right ventricular cardiomyopathy ?",
  "Who is at risk for Pituitary Tumors? ?"
]

In [None]:
answer_from_questions = []
for question in list_of_questions:
  eval_query = question
  answer = eval_generate_response(eval_query)
  answer_from_questions.append(answer)

Don't forget to save the answers to `MedQuAD - Evaluation.csv` so that we can compare them with the golden answers.

In [None]:
answer_from_questions[9]

'Individuals with certain genetic conditions are at risk for developing pituitary tumors.'

# Step 7 : Evaluation

We evaluate two components of this app:
* the IR using cosine similarity
* the RAG using BLEURT, BLEU, and METEOR
<br>

We evaluate the system using two sets of data:
1. All the abstracts + 10 QA pairs from MedQuAD that we picked randomly. These are stored in `chroma_db`. This evaluation is performed in section 7.1.
2. Only 40 abstracts + the same 10 QA pairs from MedQuAD as in point 1. These are stored in `eval_chroma_db`. This evaluation is performed in section 7.2.
<br>

**Result Analysis**:
- Based on the results, the retriever score in section 7.1 is worse than in 7.2. This is because the documents used in 7.1 are more numerous compared to those in 7.2, meaning there are more documents discussing the same topics and the probability of choosing the golden passages is smaller.
- For the RAG evaluation, the answer in 7.1 is longer than in 7.2. This occurred because the LLM received more context, as we used more datasets. However, when we evaluated the score using BLEURT, the answer in 7.2 was better than in 7.1, likely because it was shorter but more to the point. On the other hand, when we checked the METEOR and BLEU scores, the scores in 7.2 were worse than in 7.1. These scores rely on n-grams as their basis, which means that the exact words may not appear if the answers are shorter.

In [None]:
import pandas as pd

In [None]:
medquad_eval_csv_location = "FILE LOCATION HERE"

for example:
```
medquad_eval_csv_location = "/content/drive/MyDrive/Colab Notebooks/NLP Application II/Final Project APP II/MedQuAD - Evaluation.csv"
```

In [None]:
medquad_eval_df = pd.read_csv(medquad_eval_csv_location)
medquad_eval_df.head()

Unnamed: 0,medquad_url,title,passage,question_id,question,answer,LLM Answers,LLM Eval Answers
0,https://github.com/abachaa/MedQuAD/blob/master...,Alpha-1 Antitrypsin Deficiency,What is alpha-1 antitrypsin deficiency?\nAlpha...,0000001-3,Who is at risk for Alpha-1 Antitrypsin Deficie...,Alpha-1 antitrypsin (AAT) deficiency occurs in...,Individuals who have a family member with Alph...,Individuals with a family history of AAT defic...
1,https://github.com/abachaa/MedQuAD/blob/master...,,An estimated 807 million–1.2 billion people in...,0000030-5,How to diagnose Parasites - Ascariasis ?,The standard method for diagnosing ascariasis ...,Ascariasis can be diagnosed by a healthcare pr...,Ascariasis is typically diagnosed by examining...
2,https://github.com/abachaa/MedQuAD/blob/master...,,Summary\nThe B vitamins are:\n\nB1 (thiamine)\...,0000075-1,Do you have information about B Vitamins,Summary : The B vitamins are - B1 (thiam...,"Yes, B vitamins are essential nutrients that p...","Yes, B vitamins are essential for energy produ..."
3,https://github.com/abachaa/MedQuAD/blob/master...,Alzheimer's disease,Description\n\nAlzheimer's disease is a degene...,0000048-5,What are the treatments for Alzheimer disease ?,These resources address the diagnosis or manag...,The treatments for Alzheimer's disease include...,"Currently, there is no cure for Alzheimer's di..."
4,https://github.com/abachaa/MedQuAD/blob/master...,Childhood Acute Myeloid Leukemia/Other Myeloid...,General Information About Childhood Acute Myel...,0000001_7-5,What is the outlook for Childhood Acute Myeloi...,Certain factors affect prognosis (chance of re...,"Over the past 20 years, the outcome of acute m...",The outlook for Childhood Acute Myeloid Leukem...


## 7.1. Using All Data

In [None]:
chroma_db_location = "ENTER LOCATION OF CHROMA DB FOLDER HERE"
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma(persist_directory=chroma_db_location, embedding_function=embedding_function)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})



example:
```
chroma_db_location = "/content/drive/MyDrive/Colab Notebooks/NLP Application II/Final Project APP II/chroma_db/"
```

### 7.1.1. IR Evaluation

In [None]:
import numpy as np

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(text1, text2):
    """Return the cosine similarity between the token-count vectors of two texts.

    Both texts are vectorized together with a fresh ``CountVectorizer`` so
    they share one vocabulary; the similarity of the two resulting count
    rows is returned as a scalar.
    """
    # Row 0 holds the counts for text1, row 1 the counts for text2.
    count_matrix = CountVectorizer().fit_transform([text1, text2]).toarray()

    # Slicing keeps each row two-dimensional, as cosine_similarity expects.
    first_row = count_matrix[0:1]
    second_row = count_matrix[1:2]

    similarity = cosine_similarity(first_row, second_row)
    return similarity[0][0]

# Example texts
text1 = "This is the first text."
text2 = "This is the second text."

# Calculate cosine similarity
similarity_score = calculate_cosine_similarity(text1, text2)
print("Cosine Similarity:", similarity_score)

Cosine Similarity: 0.7999999999999999


In [None]:
passages = medquad_eval_df["passage"].tolist()
questions = medquad_eval_df["question"].tolist()

In [None]:
# For every evaluation question, retrieve documents and score each retrieved
# chunk against the gold passage. `results` ends up with one list of scores
# per question, in retrieval (rank) order.
results = []
# NOTE: the zip order must match the unpacking order (passage, question);
# otherwise the passage text is used as the query and the question is scored.
for passage, question in zip(passages, questions):
  # Collapse newlines to spaces, as is done for the retrieved chunks below.
  passage = passage.replace("\n", " ").strip()
  docs = retriever.invoke(question)

  scores = [
      calculate_cosine_similarity(passage, doc.page_content.replace("\n", " ").strip())
      for doc in docs
  ]
  results.append(scores)

In [None]:
top1 = [scores[0] for scores in results]
print("Top-1 Cosine Score:", round(np.mean(top1), 2))

top3 = [max(scores[:3]) for scores in results]
print("Top-3 Cosine Score:", round(np.mean(top3), 2))

top6 = [max(scores) for scores in results]
print("Top-6 Cosine Score:", round(np.mean(top6), 2))

Top-1 Cosine Score: 0.29
Top-3 Cosine Score: 0.35
Top-6 Cosine Score: 0.35


### 7.1.2. RAG Evaluation

In [None]:
references = medquad_eval_df["answer"].tolist()
candidates = medquad_eval_df["LLM Answers"].tolist()

In [None]:
candidates[0], references[0]

("Individuals who have a family member with Alpha-1 Antitrypsin (AAT) deficiency or who have a family member who was a smoker diagnosed with COPD between ages 40 and 50 are at risk for AAT deficiency. Additionally, those planning to have children and suspect they may be carriers of the deficiency may also put their children at risk.'",
 'Alpha-1 antitrypsin (AAT) deficiency occurs in all ethnic groups. However, the condition occurs most often in White people of European descent.\n                \nAAT deficiency is an inherited condition. &quot;Inherited&quot; means the condition is passed from parents to children through genes.\n                \nIf you have bloodline relatives with known AAT deficiency, you&apos;re at increased risk for the condition. Even so, it doesn&apos;t mean that you&apos;ll develop one of the diseases related to the condition.\n                \nSome risk factors make it more likely that you&apos;ll develop lung disease if you have AAT deficiency. Smoking is t

In [None]:
!pip install datasets -q
!pip install git+https://github.com/google-research/bleurt.git -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone


In [None]:
from datasets import load_metric
import numpy as np

bleurt = load_metric("bleurt", module_type="metric", checkpoint="bleurt-large-512")

bleurt_score = bleurt.compute(references=references, predictions=candidates)
print(bleurt_score)
print(np.mean(bleurt_score["scores"]))

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'scores': [-0.8870161175727844, -0.24964717030525208, -0.3459647297859192, -0.87115478515625, -0.6978392004966736, -0.6286903023719788, -0.7655577063560486, -0.14378045499324799, -0.23604974150657654, -0.7207397818565369]}
-0.5546439990401268


In [None]:
meteor_metric = load_metric("meteor")

meteor_score = meteor_metric.compute(predictions=candidates, references=references)
meteor_score

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


{'meteor': 0.20400509177989562}

In [None]:
from typing import List

bleu_metric = load_metric("bleu")

def convert_to_bleu_format(refs: List[str], cands: List[str]):
  """Tokenize references and candidates into the nested lists the BLEU metric takes.

  Each candidate becomes a list of whitespace-separated tokens. Each
  reference becomes a one-element list holding its token list (one
  reference per candidate).

  Returns:
      A ``(references, candidates)`` tuple, in that order.
  """
  # str.split() with no argument splits on any whitespace run (newlines
  # included) and ignores leading/trailing whitespace.
  tokenized_cands = [text.split() for text in cands]
  tokenized_refs = [[text.split()] for text in refs]

  return tokenized_refs, tokenized_cands

bleu_refs, bleu_cands = convert_to_bleu_format(references, candidates)

# Compute BLEU score
bleu_score = bleu_metric.compute(predictions=bleu_cands, references=bleu_refs)
print("BLEU Score:", bleu_score["bleu"])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


BLEU Score: 0.03506960415481016


## 7.2. Using Evaluation Data

In [None]:
eval_chroma_db_location = "ENTER LOCATION OF EVAL CHROMA DB FOLDER HERE"
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma(persist_directory=eval_chroma_db_location, embedding_function=embedding_function)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

example:
```
eval_chroma_db_location = "/content/drive/MyDrive/Colab Notebooks/NLP Application II/Final Project APP II/eval_chroma_db/"
```

### 7.2.1. IR Evaluation

In [None]:
import numpy as np

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(text1, text2):
    """Return the cosine similarity between the token-count vectors of two texts.

    Both texts are vectorized together with a fresh ``CountVectorizer`` so
    they share one vocabulary; the similarity of the two resulting count
    rows is returned as a scalar.
    """
    # Row 0 holds the counts for text1, row 1 the counts for text2.
    count_matrix = CountVectorizer().fit_transform([text1, text2]).toarray()

    # Slicing keeps each row two-dimensional, as cosine_similarity expects.
    first_row = count_matrix[0:1]
    second_row = count_matrix[1:2]

    similarity = cosine_similarity(first_row, second_row)
    return similarity[0][0]

# Example texts
text1 = "This is the first text."
text2 = "This is the second text."

# Calculate cosine similarity
similarity_score = calculate_cosine_similarity(text1, text2)
print("Cosine Similarity:", similarity_score)

Cosine Similarity: 0.7999999999999999


In [None]:
passages = medquad_eval_df["passage"].tolist()
questions = medquad_eval_df["question"].tolist()

In [None]:
# For every evaluation question, retrieve documents and score each retrieved
# chunk against the gold passage. `results` ends up with one list of scores
# per question, in retrieval (rank) order.
results = []
# NOTE: the zip order must match the unpacking order (passage, question);
# otherwise the passage text is used as the query and the question is scored.
for passage, question in zip(passages, questions):
  # Collapse newlines to spaces, as is done for the retrieved chunks below.
  passage = passage.replace("\n", " ").strip()
  docs = retriever.invoke(question)

  scores = [
      calculate_cosine_similarity(passage, doc.page_content.replace("\n", " ").strip())
      for doc in docs
  ]
  results.append(scores)

In [None]:
top1 = [scores[0] for scores in results]
print("Top-1 Cosine Score:", round(np.mean(top1), 2))

top3 = [max(scores[:3]) for scores in results]
print("Top-3 Cosine Score:", round(np.mean(top3), 2))

top6 = [max(scores) for scores in results]
print("Top-6 Cosine Score:", round(np.mean(top6), 2))

Top-1 Cosine Score: 0.3
Top-3 Cosine Score: 0.34
Top-6 Cosine Score: 0.37


### 7.2.2. RAG Evaluation

In [None]:
references = medquad_eval_df["answer"].tolist()
candidates = medquad_eval_df["LLM Eval Answers"].tolist()

In [None]:
candidates[0], references[0]

('Individuals with a family history of AAT deficiency, those with symptoms such as an ongoing cough, shortness of breath, wheezing, or liver disease, and individuals planning to have children and think they are at risk of having AAT deficiency are at risk for Alpha-1 Antitrypsin Deficiency.\n',
 'Alpha-1 antitrypsin (AAT) deficiency occurs in all ethnic groups. However, the condition occurs most often in White people of European descent.\n                \nAAT deficiency is an inherited condition. &quot;Inherited&quot; means the condition is passed from parents to children through genes.\n                \nIf you have bloodline relatives with known AAT deficiency, you&apos;re at increased risk for the condition. Even so, it doesn&apos;t mean that you&apos;ll develop one of the diseases related to the condition.\n                \nSome risk factors make it more likely that you&apos;ll develop lung disease if you have AAT deficiency. Smoking is the leading risk factor for serious lung di

In [None]:
!pip install datasets -q
!pip install git+https://github.com/google-research/bleurt.git -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone


In [None]:
from datasets import load_metric
import numpy as np

# Load the BLEURT metric with the "bleurt-large-512" checkpoint.
# trust_remote_code=True is passed explicitly: without it `datasets` emits a
# warning, and the argument becomes mandatory in its next major release.
bleurt = load_metric(
    "bleurt",
    module_type="metric",
    checkpoint="bleurt-large-512",
    trust_remote_code=True,
)

# Score every candidate against its reference, then print the raw result
# dict followed by the mean of the per-example scores.
bleurt_score = bleurt.compute(references=references, predictions=candidates)
print(bleurt_score)
print(np.mean(bleurt_score["scores"]))

  bleurt = load_metric("bleurt", module_type="metric", checkpoint="bleurt-large-512")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.96k [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/405M [00:00<?, ?B/s]

{'scores': [-0.7659021019935608, -0.034509796649217606, -0.11388298869132996, -0.8057114481925964, -0.1346392035484314, -0.6286903023719788, -0.9406319856643677, -0.3126817047595978, -0.3287384808063507, -0.7410250306129456]}
-0.4806413043290377


In [None]:
# Load the METEOR metric. trust_remote_code=True is passed explicitly:
# without it `datasets` emits a warning, and the argument becomes mandatory
# in its next major release.
meteor_metric = load_metric("meteor", trust_remote_code=True)

# Compute METEOR over all candidate/reference pairs; the trailing bare
# expression makes the notebook display the result dict.
meteor_score = meteor_metric.compute(predictions=candidates, references=references)
meteor_score

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


{'meteor': 0.154063044222608}

In [None]:
from typing import List

# Load the BLEU metric. trust_remote_code=True is passed explicitly: without
# it `datasets` emits a warning, and the argument becomes mandatory in its
# next major release.
bleu_metric = load_metric("bleu", trust_remote_code=True)

def convert_to_bleu_format(refs: List[str], cands: List[str]):
  """Tokenize reference and candidate strings for the BLEU metric.

  Each string is stripped, has its newlines replaced by spaces, and is split
  on whitespace.

  Args:
    refs: Reference strings, one per example.
    cands: Candidate strings, one per example.

  Returns:
    A ``(refs, cands)`` tuple. Every candidate becomes a list of tokens;
    every reference becomes a one-element list holding its token list.
  """
  def _tokenize(text):
    return text.strip().replace("\n", " ").split()

  tokenized_cands = [_tokenize(cand) for cand in cands]
  # Each example carries exactly one reference, wrapped in its own list.
  tokenized_refs = [[_tokenize(ref)] for ref in refs]
  return tokenized_refs, tokenized_cands

# Tokenize both sides into the nested-list layout produced by
# convert_to_bleu_format (token lists for candidates, a single-reference
# list of token lists for references).
bleu_refs, bleu_cands = convert_to_bleu_format(references, candidates)

# Compute BLEU from the tokenized predictions/references and print the
# "bleu" entry of the result.
bleu_score = bleu_metric.compute(predictions=bleu_cands, references=bleu_refs)
print("BLEU Score:", bleu_score["bleu"])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

BLEU Score: 0.008182656880157462


# Step 8: Streamlit DEMO

Compile everything into a single `app.py` file for the Streamlit demo.

In [None]:
!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared-linux-amd64

--2024-05-16 12:16:41--  https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
Resolving github.com (github.com)... 140.82.116.3
Connecting to github.com (github.com)|140.82.116.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/cloudflare/cloudflared/releases/download/2024.4.1/cloudflared-linux-amd64 [following]
--2024-05-16 12:16:41--  https://github.com/cloudflare/cloudflared/releases/download/2024.4.1/cloudflared-linux-amd64
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/106867604/338c4db6-d448-42d7-9218-0662e513e932?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20240516%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240516T121534Z&X-Amz-Expires=300&X-Amz-Signature=f72d8d89cac8d4c810f6a081d2efec7ef947da1d4d3d84bccdb267feafdefe13&X-Amz-S

In [None]:
!chmod +x cloudflared-linux-amd64

In [None]:
!nohup /content/cloudflared-linux-amd64 tunnel --url http://localhost:8501 &

nohup: appending output to 'nohup.out'


In [None]:
%%writefile app.py
import os
from deep_translator import GoogleTranslator
from langdetect import detect
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_chroma import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)

# Placeholder credential: replace with a real key before running. Note that
# this assignment overwrites any OPENAI_API_KEY already set in the environment.
os.environ["OPENAI_API_KEY"] = "ENTER OPENAI API KEY HERE"
# Chat model used to generate the answers.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

# Embedding function handed to the vector store below.
# NOTE(review): presumably this must be the same model the store was built
# with — confirm against the indexing step.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Persisted Chroma directories on the mounted Google Drive.
chroma_db_location = "/content/drive/MyDrive/Colab Notebooks/NLP Application II/Final Project APP II/chroma_db/"
# NOTE(review): not referenced by any other visible line of this script.
chroma_db_location_ibm = "/content/drive/MyDrive/Colab Notebooks/nlp-app-II/data/RAG_Project/chroma_db/"

# Open the persisted store and expose it as a similarity retriever that is
# configured with k=6 results per query.
vectorstore = Chroma(persist_directory=chroma_db_location, embedding_function=embedding_function)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 6})

def format_docs(docs):
    """Join the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        One string with the contents in order, joined by two newlines.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

def generate_response(question):
    """Answer ``question`` through the retrieval-augmented chain.

    The retriever output is joined by ``format_docs`` and fills ``{context}``,
    the question itself fills ``{question}``, and the prompt is sent to
    ``llm``; the result is run through ``StrOutputParser``.

    Args:
        question: The question text passed to the chain.

    Returns:
        The chain's output after ``StrOutputParser``.
    """
    prompt_text = """Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.

    {context}

    Question: {question}

    Helpful Answer:"""
    prompt = PromptTemplate.from_template(prompt_text)

    # The same input is fanned out: once through retrieval for the context,
    # once unchanged as the question.
    chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
    pipeline = chain_inputs | prompt | llm | StrOutputParser()

    return pipeline.invoke(question)


def translate_to_eng(text):
    """Translate ``text`` to English, with the source language set to 'auto'.

    Args:
        text: The text to translate.

    Returns:
        The translation returned by ``GoogleTranslator``.
    """
    translator = GoogleTranslator(source='auto', target='en')
    return translator.translate(text)

def translate_from_eng(text, lang):
    """Translate English ``text`` into the language ``lang``.

    Args:
        text: English text to translate.
        lang: Target language passed to ``GoogleTranslator``.

    Returns:
        The translation returned by ``GoogleTranslator``.
    """
    translator = GoogleTranslator(source='en', target=lang)
    return translator.translate(text)

def ask_RAG(prompt):
    """Answer ``prompt`` in the language it was asked in.

    English prompts go straight to ``generate_response``. Any other prompt is
    translated to English, answered, and the answer is translated back into
    the detected language.

    Args:
        prompt: The user's question, in any language.

    Returns:
        The answer text, translated back when the prompt was not English.
    """
    # Local import keeps this fix self-contained within the function.
    from langdetect import LangDetectException

    try:
        detected_language = detect(prompt)
    except LangDetectException:
        # langdetect raises when the text has no usable language features
        # (e.g. only digits or punctuation); answer it untranslated instead
        # of crashing the app.
        detected_language = 'en'

    if detected_language == 'en':
        return generate_response(prompt)

    translated_prompt = translate_to_eng(prompt)
    print(f"[LOG] Question: {translated_prompt}")
    response = generate_response(translated_prompt)
    print(f"[LOG] Response: {response}")

    # langdetect reports Chinese as lowercase 'zh-cn' / 'zh-tw', while the
    # Google translator codes are 'zh-CN' / 'zh-TW'; normalise those two and
    # pass every other code through unchanged.
    target_language = {'zh-cn': 'zh-CN', 'zh-tw': 'zh-TW'}.get(
        detected_language, detected_language
    )
    return translate_from_eng(response, target_language)

import streamlit as st

# Page layout: a title, one question box, and a Send button.
st.title('Your Medical Chatbot')

user_input = st.text_input("Ask a question:")
send_clicked = st.button('Send')

# Only react on a click; an empty box gets a reminder instead of a query.
if send_clicked and not user_input:
    st.write("Please enter a question.")
elif send_clicked:
    st.write("Answer:", ask_RAG(user_input))


Writing app.py


In [None]:
!streamlit run /content/app.py &>/content/logs.txt &

In [None]:
!grep -o 'https://.*\.trycloudflare.com' nohup.out | head -n 1 | xargs -I {} echo "Your tunnel url {}"