# Test notebook for generating embeddings
- https://huggingface.co/spaces/mteb/leaderboard

## Structure
- Get text
- Chunk text
- Embed text
- Retrieve text

## TODO
- https://stackoverflow.com/questions/10993612/how-to-remove-xa0-from-string-in-python
- Improve the embedding model. The current embedding model (all-MiniLM-L6-v2) is ranked 42nd on the MTEB leaderboard.
- Improve vector store. Qdrant?
- Write a benchmarking function to measure how long each embedding model takes.
- Improve prompt model. Refer to https://github.com/tinygrad/tinygrad/blob/master/examples/llama.py
- Use better model: Mistral-7B-instruct?? Refer to instruction format. https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1

## Get text

In [None]:
import os 
import glob

# Load data: read every markdown file under ./data into one raw string per file.
# sorted() keeps the order (and therefore documents[0]) reproducible between runs,
# because glob.glob returns paths in arbitrary order.
documents = []
for name in sorted(glob.glob('./data/*.md')):
    print(name)
    # The context manager guarantees the handle is closed after reading; the
    # explicit encoding keeps decoding independent of the platform default.
    with open(name, 'r', encoding='utf-8') as f:
        documents.append(f.read())

len(documents)

## Chunk texts

In [None]:
import re

# Work on the first loaded document only.
text = documents[0]

# Collect every line that starts with one or more '#', a space, and some text.
header_pattern = r'^#+ .+'
headers = re.findall(header_pattern, text, re.MULTILINE)

# Pair each matched header line with a positional label: (header_line, 'Header <index>').
headers_to_split_on = [
    (header, f'Header {i}') for i, header in enumerate(headers)
]

headers_to_split_on

In [None]:
from langchain.text_splitter import CharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# from langchain.vectorstores import Qdrant
# from langchain.document_loaders import TextLoader

# Build a splitter keyed on the (header_line, label) pairs collected in the previous cell.
# NOTE(review): LangChain's examples pass header *markers* such as ("#", "Header 1")
# rather than full header lines — confirm that splitting on full lines is intended.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
# Split the first document's text with that splitter.
md_header_splits = markdown_splitter.split_text(text)

# Optional: We can do recursive splitting within each document
# # Char-level splits

# chunk_size = 250
# chunk_overlap = 30
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=chunk_size, chunk_overlap=chunk_overlap
# )

# # Split
# splits = text_splitter.split_documents(md_header_splits)
# splits

In [None]:
# Putting it together as a function
# TODO: Document loading seems to remove the markdown headers by default. Is there any way to keep the headers?
from langchain.text_splitter import CharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re

def chunk(doc):
    """
    List the markdown header lines found in a document.

    Args:
    doc - either a Document-like object exposing `page_content`, or the raw
          markdown text itself as a `str`

    Output:
    headers_to_split_on - list of `(header_line, 'Header <i>')` tuples, one per
    header line in order of appearance; empty when the text has no headers.

    Note: the actual markdown / character-level splitting is still disabled
    (see the commented-out code after the return), so no chunks are produced yet.
    """
    # The loading cell at the top of the notebook stores plain strings, while
    # LangChain loaders yield Document objects; accept both.
    text = doc if isinstance(doc, str) else doc.page_content
    header_pattern = r'^#+ .+'
    headers = re.findall(header_pattern, text, re.MULTILINE)
    headers_to_split_on = [(header, f'Header {i}') for i, header in enumerate(headers)]
    return headers_to_split_on
    # TODO: enable the splitting step below once the header list looks right.
    # markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    # md_header_splits = markdown_splitter.split_text(text)

    # # Optional char-level splits within each document 
    # chunk_size = 250
    # chunk_overlap = 30
    # text_splitter = RecursiveCharacterTextSplitter(
    #     chunk_size=chunk_size, chunk_overlap=chunk_overlap
    # )
    # splits = text_splitter.split_documents(md_header_splits)
    # return splits


# Try the helper on the first loaded document; as the cell's last expression its result is displayed.
chunk(documents[0])

## Other embedding models

In [None]:
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Mean-pool hidden states along dim 1, ignoring masked-out positions.

    Positions where `attention_mask` is zero are zeroed before summing, and the
    sum is divided by the per-row count of non-masked positions.
    Assumes `last_hidden_states` is (batch, seq, hidden) and `attention_mask`
    is (batch, seq) — TODO confirm against the caller.
    """
    padding_positions = ~attention_mask[..., None].bool()
    masked_states = last_hidden_states.masked_fill(padding_positions, 0.0)
    token_counts = attention_mask.sum(dim=1)[..., None]
    return masked_states.sum(dim=1) / token_counts

# Sample inputs: the first entry is scored against the remaining three (see `scores` below).
input_texts = [
    "what is the capital of China?",
    "how to implement quick sort in python?",
    "Beijing",
    "sorting algorithms"
]

# Load the tokenizer and encoder for the "thenlper/gte-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-base")
model = AutoModel.from_pretrained("thenlper/gte-base")

# Tokenize the input texts as one padded batch, truncating anything beyond 512 tokens
batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

# NOTE(review): the forward pass is not wrapped in torch.no_grad(); confirm whether
# gradients are needed here, otherwise disabling them saves memory.
outputs = model(**batch_dict)
# Collapse the per-token states into one vector per input text.
embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# (Optionally) normalize embeddings to unit L2 norm along dim 1
embeddings = F.normalize(embeddings, p=2, dim=1)
# Dot product of the first embedding with each of the others, scaled by 100.
scores = (embeddings[:1] @ embeddings[1:].T) * 100
print(scores.tolist())


## Retriever

In [None]:
from langchain.vectorstores import Qdrant
# Build a Qdrant collection named "my_documents" from `docs`.
# NOTE(review): `docs` is not defined in any cell shown above, and `embeddings` at this
# point is the tensor produced by the previous cell — `from_documents` presumably expects
# an embedding-model object instead. Confirm before running on a fresh kernel.
qdrant = Qdrant.from_documents(
    docs, embeddings, 
    location=":memory:",  # Local mode with in-memory storage only
    collection_name="my_documents",
)


## TF-IDF

In [None]:
from langchain.retrievers import TFIDFRetriever
# Minimal TF-IDF retrieval demo over five toy strings.
retriever = TFIDFRetriever.from_texts(["foo", "bar", "world", "hello", "foo bar"])
# Query the retriever with "foo" and display what it returns.
result = retriever.get_relevant_documents("foo")
result

## Main

In [1]:
%%time
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough

# Step 1: Load the document(s) and split it into chunks
# (disabled — only needed when the vector store has to be rebuilt from ./data)
# loader = DirectoryLoader("./data", glob = "*.md")
# documents = loader.load()
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# chunks = text_splitter.split_documents(documents)

# Step 2: Create embeddings
# The same embedding model is handed to Chroma below as its embedding function.
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# chunks2 = [x.page_content for x in chunks] # HFEmbeddings only accepts str
# embeddings = embedding_model.embed_documents(chunks2)

# Step 3: Store embeddings in ChromaDB and save locally.
# The from_documents call that builds the store is disabled; the active line instead
# opens ./chroma_db — presumably populated by an earlier run (TODO confirm).
# db = Chroma.from_documents(chunks, embedding_model, persist_directory="./chroma_db")
db = Chroma(persist_directory="./chroma_db", embedding_function=embedding_model)

# Step 4: Create a retriever
retriever = db.as_retriever()

CPU times: user 3.34 s, sys: 2.07 s, total: 5.41 s
Wall time: 3.22 s


In [2]:
from llama_cpp import Llama

def create_completion(question, llm=None):
    """
    Answer `question` with the local Mistral-instruct model, using the best-matching
    chunk from `db` as context (RAG).

    Args:
    question - question text; also used as the similarity-search query against `db`
    llm - optional, already-loaded `Llama` instance. When omitted, a new model is
          loaded from disk on every call, which is slow; pass a shared instance
          when calling repeatedly.

    Output:
    output - the value returned by `Llama.create_completion`. Callers in this notebook
    read `output["choices"][0]["text"]`; with `echo=True` that text starts with the prompt.
    """
    # Step 5: Define the prompt template
    # source: https://chat.openai.com/share/c85e64f6-4dd2-4920-b82e-78f128898cbb
    template = """<s>[INST]You are now an expert in Jane Austen's novels. 

    
    Here is an excerpt of text you may refer to:
    {context}
    
    {question}[/INST]"""

    # Step 6: Generate a query and search for relevant chunks
    # Only the top hit is used as context.
    context = db.similarity_search(question)[0].page_content
    # The prompt is a single string, so plain str.format yields the same text as
    # building a chat template and unwrapping its first message.
    final_prompt = template.format(context=context, question=question)

    # Step 7: Use llama-cpp-python as a prototype.
    if llm is None:
        llm = Llama(model_path="../../llama.cpp/models/mistral-instruct/ggml-model-q4_0.gguf", n_ctx=8000)
    output = llm.create_completion(final_prompt,
                                   suffix=None,
                                   max_tokens=0, # set this to 0 for no limit on tokens (depend on n_ctx)
                                   temperature=0.8, # higher temperature = more random/diverse sampling (not more factual); lower it for steadier answers
                                   top_p=0.95,
                                   logprobs=None,
                                   echo=True, # include the prompt in the returned text
                                   stop=[],
                                   frequency_penalty=0.0,
                                   presence_penalty=0.0,
                                   repeat_penalty=1.1,
                                   top_k=40,
                                   stream=False,
                                   tfs_z=1.0,
                                   mirostat_mode=0,
                                   mirostat_tau=5.0,
                                   mirostat_eta=0.1,
                                   model=None,
                                   stopping_criteria=None,
                                   logits_processor=None)
    return output
# question = "Why is Lizzy Mr. Bennet’s favorite daughter?"
# create_completion(question)

## This time around, use more factual questions
Source : https://www.sparknotes.com/lit/pride/quiz/

### Question 1

In [None]:
final_outputs = []
question = "The Bennet family lives in the village of ..."
num_samples = 5                  # completions to collect for this question
max_attempts = 3 * num_samples   # hard cap so a persistent failure cannot spin forever
attempts = 0
last_error = None
while len(final_outputs) < num_samples and attempts < max_attempts:
    attempts += 1
    try:
        final_outputs.append(create_completion(question)["choices"][0]["text"])
    except Exception as exc:
        # Catch Exception (not a bare except) so a kernel interrupt still stops the cell.
        last_error = exc
        print(f"attempt {attempts} failed: {exc!r}")
if len(final_outputs) < num_samples:
    raise RuntimeError(
        f"only {len(final_outputs)} of {num_samples} completions succeeded "
        f"after {attempts} attempts"
    ) from last_error

In [4]:
# Print each collected completion between a numbered header line and a footer rule.
for i, output in enumerate(final_outputs):
    print(f'========{i}=======', output, '===================', sep='\n')

<s>[INST]You are now an expert in Jane Austen's novels. 

    
    Here is an excerpt of text you may refer to:
    Mr. Bennet’s property consisted almost entirely in an estate of two thousand a year, which, unfortunately for his daughters, was entailed, in default of heirs male, on a distant relation; and their mother’s fortune, though ample for her situation in life, could but ill supply the deficiency of his. Her father had been an attorney in Meryton, and had left her four thousand pounds.

She had a sister married to a Mr. Phillips, who had been a clerk to their father and succeeded him in the business, and a brother settled in London in a respectable line of trade.
    
    The Bennet family lives in the village of ...[/INST] The Bennet family lives in Meryton, an imaginary village in England.
<s>[INST]You are now an expert in Jane Austen's novels. 

    
    Here is an excerpt of text you may refer to:
    Mr. Bennet’s property consisted almost entirely in an estate of two thous

### Question 2

In [None]:
final_outputs = []
question = "Mr. Bingley, when he attends the ball in Meryton, seems to be quite taken with ..."
num_samples = 5                  # completions to collect for this question
max_attempts = 3 * num_samples   # hard cap so a persistent failure cannot spin forever
attempts = 0
last_error = None
while len(final_outputs) < num_samples and attempts < max_attempts:
    attempts += 1
    try:
        final_outputs.append(create_completion(question)["choices"][0]["text"])
    except Exception as exc:
        # Catch Exception (not a bare except) so a kernel interrupt still stops the cell.
        last_error = exc
        print(f"attempt {attempts} failed: {exc!r}")
if len(final_outputs) < num_samples:
    raise RuntimeError(
        f"only {len(final_outputs)} of {num_samples} completions succeeded "
        f"after {attempts} attempts"
    ) from last_error

In [6]:
# Print each collected completion between a numbered header line and a footer rule.
for i, output in enumerate(final_outputs):
    print(f'========{i}=======', output, '===================', sep='\n')

<s>[INST]You are now an expert in Jane Austen's novels. 

    
    Here is an excerpt of text you may refer to:
    Mr. Bingley and Jane remained at Netherfield only a twelvemonth. So near a vicinity to her mother and Meryton relations was not desirable even to his easy temper, or her affectionate heart. The darling wish of his sisters was then gratified; he bought an estate in a neighbouring county to Derbyshire, and Jane and Elizabeth, in addition to every other source of happiness, were within thirty miles of each other.
    
    Mr. Bingley, when he attends the ball in Meryton, seems to be quite taken with ...[/INST] As an expert in Jane Austen's novels, I can provide some context and analysis on the excerpt you provided.

The text you provided comes from "Pride and Prejudice," one of Austen's most famous works. It describes Mr. Bingley and his relationship with Jane Bennet, the protagonist of the novel. The two are initially acquainted at the beginning of the book, but their feeli

### Question 3

In [None]:
final_outputs = []
question = "How does Mr. Darcy offend Elizabeth at the first ball?"
num_samples = 5                  # completions to collect for this question
max_attempts = 3 * num_samples   # hard cap so a persistent failure cannot spin forever
attempts = 0
last_error = None
while len(final_outputs) < num_samples and attempts < max_attempts:
    attempts += 1
    try:
        final_outputs.append(create_completion(question)["choices"][0]["text"])
    except Exception as exc:
        # Catch Exception (not a bare except) so a kernel interrupt still stops the cell.
        last_error = exc
        print(f"attempt {attempts} failed: {exc!r}")
if len(final_outputs) < num_samples:
    raise RuntimeError(
        f"only {len(final_outputs)} of {num_samples} completions succeeded "
        f"after {attempts} attempts"
    ) from last_error

In [8]:
# Print each collected completion between a numbered header line and a footer rule.
for i, output in enumerate(final_outputs):
    print(f'========{i}=======', output, '===================', sep='\n')

<s>[INST]You are now an expert in Jane Austen's novels. 

    
    Here is an excerpt of text you may refer to:
    He shook his head. “I wish I could call her amiable. It gives me pain to speak ill of a Darcy. But she is too much like her brother—very, very proud. As a child, she was affectionate and pleasing, and extremely fond of me; and I have devoted hours and hours to her amusement. But she is nothing to me now. She is a handsome girl, about fifteen or sixteen, and, I understand, highly accomplished. Since her father’s death, her home has been London, where a lady lives with her, and superintends her education.”

After many pauses and many trials of other subjects, Elizabeth could not help reverting once more to the first, and saying:

“I am astonished at his intimacy with Mr. Bingley! How can Mr. Bingley, who seems good humour itself, and is, I really believe, truly amiable, be in friendship with such a man? How can they suit each other? Do you know Mr. Bingley?”

“Not at all.”


## Baseline: No RAG

In [9]:
from llama_cpp import Llama

def create_no_rag_completion(question, llm=None):
    """
    Baseline: answer `question` with the local Mistral-instruct model and no retrieved context.

    Args:
    question - question text inserted into the instruction prompt
    llm - optional, already-loaded `Llama` instance. When omitted, a new model is
          loaded from disk on every call, which is slow; pass a shared instance
          when calling repeatedly.

    Output:
    output - the value returned by `Llama.create_completion`. Callers in this notebook
    read `output["choices"][0]["text"]`; with `echo=True` that text starts with the prompt.
    """
    # Step 5: Define the prompt template
    # source: https://chat.openai.com/share/c85e64f6-4dd2-4920-b82e-78f128898cbb
    template = """<s>[INST]You are now an expert in Jane Austen's novels. 
    
    {question}[/INST]"""
    # The prompt is a single string, so plain str.format yields the same text as
    # building a chat template and unwrapping its first message.
    final_prompt = template.format(question=question)

    # Step 7: Use llama-cpp-python as a prototype.
    if llm is None:
        llm = Llama(model_path="../../llama.cpp/models/mistral-instruct/ggml-model-q4_0.gguf", n_ctx=8000)
    output = llm.create_completion(final_prompt,
                                   suffix=None,
                                   max_tokens=0, # set this to 0 for no limit on tokens (depend on n_ctx)
                                   temperature=0.8, # higher temperature = more random/diverse sampling (not more factual); lower it for steadier answers
                                   top_p=0.95,
                                   logprobs=None,
                                   echo=True, # include the prompt in the returned text
                                   stop=[],
                                   frequency_penalty=0.0,
                                   presence_penalty=0.0,
                                   repeat_penalty=1.1,
                                   top_k=40,
                                   stream=False,
                                   tfs_z=1.0,
                                   mirostat_mode=0,
                                   mirostat_tau=5.0,
                                   mirostat_eta=0.1,
                                   model=None,
                                   stopping_criteria=None,
                                   logits_processor=None)
    return output
# question = "Why is Lizzy Mr. Bennet’s favorite daughter?"
# create_no_rag_completion(question)

### Question 1

In [None]:
final_outputs = []
question = "The Bennet family lives in the village of ..."
num_samples = 5                  # completions to collect for this question
max_attempts = 3 * num_samples   # hard cap so a persistent failure cannot spin forever
attempts = 0
last_error = None
while len(final_outputs) < num_samples and attempts < max_attempts:
    attempts += 1
    try:
        final_outputs.append(create_no_rag_completion(question)["choices"][0]["text"])
    except Exception as exc:
        # Catch Exception (not a bare except) so a kernel interrupt still stops the cell.
        last_error = exc
        print(f"attempt {attempts} failed: {exc!r}")
if len(final_outputs) < num_samples:
    raise RuntimeError(
        f"only {len(final_outputs)} of {num_samples} completions succeeded "
        f"after {attempts} attempts"
    ) from last_error

In [11]:
# Print each collected completion between a numbered header line and a footer rule.
for i, output in enumerate(final_outputs):
    print(f'========{i}=======', output, '===================', sep='\n')

<s>[INST]You are now an expert in Jane Austen's novels. 
    
    The Bennet family lives in the village of ...[/INST] The Bennet family lives in the village of Meryton, which is located in the fictional county of Hertfordshire, England, in the late 18th century.
<s>[INST]You are now an expert in Jane Austen's novels. 
    
    The Bennet family lives in the village of ...[/INST] the Bennet family lives in the village of Meryton, located in the fictional county of Hertfordshire, England.
<s>[INST]You are now an expert in Jane Austen's novels. 
    
    The Bennet family lives in the village of ...[/INST] Meryton
<s>[INST]You are now an expert in Jane Austen's novels. 
    
    The Bennet family lives in the village of ...[/INST] The Bennet family, the central characters of Jane Austen's "Pride and Prejudice", resides in the village of Meryton in Hertfordshire, England during the Regency era.
<s>[INST]You are now an expert in Jane Austen's novels. 
    
    The Bennet family lives in th

### Question 2

In [None]:
final_outputs = []
question = "Mr. Bingley, when he attends the ball in Meryton, seems to be quite taken with ..."
num_samples = 5                  # completions to collect for this question
max_attempts = 3 * num_samples   # hard cap so a persistent failure cannot spin forever
attempts = 0
last_error = None
while len(final_outputs) < num_samples and attempts < max_attempts:
    attempts += 1
    try:
        final_outputs.append(create_no_rag_completion(question)["choices"][0]["text"])
    except Exception as exc:
        # Catch Exception (not a bare except) so a kernel interrupt still stops the cell.
        last_error = exc
        print(f"attempt {attempts} failed: {exc!r}")
if len(final_outputs) < num_samples:
    raise RuntimeError(
        f"only {len(final_outputs)} of {num_samples} completions succeeded "
        f"after {attempts} attempts"
    ) from last_error

In [13]:
# Print each collected completion between a numbered header line and a footer rule.
for i, output in enumerate(final_outputs):
    print(f'========{i}=======', output, '===================', sep='\n')

<s>[INST]You are now an expert in Jane Austen's novels. 
    
    Mr. Bingley, when he attends the ball in Meryton, seems to be quite taken with ...[/INST] Elizabeth Bennet, who attends the same ball. The two strike up a conversation and become fast friends, bonding over their shared love of books and intellectual pursuits. As the night wears on, it becomes clear that Mr. Bingley is smitten with Elizabeth, but she is hesitant to reciprocate his feelings due to her family's lower social standing. However, as they continue to engage in lively debates and discussions, Elizabeth begins to see Mr. Bingley in a new light and realizes that he may be the one for her after all.
<s>[INST]You are now an expert in Jane Austen's novels. 
    
    Mr. Bingley, when he attends the ball in Meryton, seems to be quite taken with ...[/INST] Mr. Darcy. Despite his initial aloofness and reserve, Mr. Bingley is intrigued by Mr. Darcy's intelligence, charm, and wit. He also recognizes Mr. Darcy's social stat

### Question 3

In [None]:
final_outputs = []
question = "How does Mr. Darcy offend Elizabeth at the first ball?"
num_samples = 5                  # completions to collect for this question
max_attempts = 3 * num_samples   # hard cap so a persistent failure cannot spin forever
attempts = 0
last_error = None
while len(final_outputs) < num_samples and attempts < max_attempts:
    attempts += 1
    try:
        final_outputs.append(create_no_rag_completion(question)["choices"][0]["text"])
    except Exception as exc:
        # Catch Exception (not a bare except) so a kernel interrupt still stops the cell.
        last_error = exc
        print(f"attempt {attempts} failed: {exc!r}")
if len(final_outputs) < num_samples:
    raise RuntimeError(
        f"only {len(final_outputs)} of {num_samples} completions succeeded "
        f"after {attempts} attempts"
    ) from last_error

In [15]:
# Print each collected completion between a numbered header line and a footer rule.
for i, output in enumerate(final_outputs):
    print(f'========{i}=======', output, '===================', sep='\n')

<s>[INST]You are now an expert in Jane Austen's novels. 
    
    How does Mr. Darcy offend Elizabeth at the first ball?[/INST] At the first ball, Mr. Darcy offends Elizabeth by behaving in a cold and aloof manner, which makes her feel uncomfortable and unimportant. He also makes an offensive comment about her family's connections to the lower class, which Elizabeth finds insulting. Additionally, he dances with another woman, causing Elizabeth to feel jealous and hurt. All of these actions contribute to Elizabeth forming a negative first impression of Mr. Darcy.
<s>[INST]You are now an expert in Jane Austen's novels. 
    
    How does Mr. Darcy offend Elizabeth at the first ball?[/INST] Mr. Darcy offends Elizabeth at the first ball by making her feel like she is not socially acceptable enough to dance with him. When they are introduced, he makes a point to say that he was only dancing with her because his friend, Bingley, had convinced him to do so. He also seems somewhat cold and dis

In [None]:
# # Sample code for perplexity taken from: https://www.educative.io/answers/what-is-perplexity-in-nlp
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from datasets import load_dataset
# from tqdm import tqdm

# # Setup device and model parameters
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model_name = "gpt2-large"

# # Load the model and the tokenizer
# # - https://stackoverflow.com/questions/64001128/load-a-pre-trained-model-from-disk-with-huggingface-transformers
# model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# # Load test dataset
# wikitext_test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

# # Text encoding
# encoded_text = tokenizer("\n\n".join(wikitext_test["text"]), return_tensors="pt")

# # Model configuration parameters
# max_length = model.config.n_positions
# stride = 512
# sequence_length = encoded_text.input_ids.size(1)

# # Initialize negative log likelihoods list and previous end location
# negative_log_likelihoods = []
# previous_end_loc = 0

# # Processing data in strides
# for start_loc in tqdm(range(0, sequence_length, stride)):
#     end_loc = min(start_loc + max_length, sequence_length)
#     target_length = end_loc - previous_end_loc

#     # Prepare input and target ids
#     input_ids = encoded_text.input_ids[:, start_loc:end_loc].to(device)
#     target_ids = input_ids.clone()
#     target_ids[:, :-target_length] = -100

#     # Calculate negative log likelihood without gradient computation
#     with torch.no_grad():
#         model_output = model(input_ids, labels=target_ids)
#         nll = model_output.loss
#         negative_log_likelihoods.append(nll)

#     previous_end_loc = end_loc
#     if end_loc == sequence_length:
#         break

# # Calculate perplexity
# perplexity = torch.exp(torch.stack(negative_log_likelihoods).mean())
