# Test notebook for generating embeddings
- https://huggingface.co/spaces/mteb/leaderboard

## Structure
- Get text
- Chunk text
- Embed text
- Retrieve text

## TODO
- https://stackoverflow.com/questions/10993612/how-to-remove-xa0-from-string-in-python
- Improve the embedding model. The current model (all-MiniLM-L6-v2) is ranked 42nd on the MTEB leaderboard.
- Improve the vector store. Qdrant?
- Write a benchmarking function to measure how long each embedding model takes.

## Get text

In [None]:
import os 
import glob

# Load data: read every Markdown file under ./data into memory as raw text.
documents = []
# glob returns paths in arbitrary order; sorting keeps documents[0] stable
# across runs and machines.
for name in sorted(glob.glob('./data/*.md')):
    print(name)
    # The context manager closes each handle promptly, and the explicit
    # encoding avoids depending on the platform default.
    with open(name, 'r', encoding='utf-8') as fh:
        documents.append(fh.read())

len(documents)

## Chunk texts

In [None]:
import re

# Work on the first loaded document only.
text = documents[0]

# Collect every Markdown header line (one or more '#', a space, then text)
# and pair each with a positional label: (header_line, 'Header <index>').
header_pattern = r'^#+ .+'
headers = re.findall(header_pattern, text, re.MULTILINE)
headers_to_split_on = [
    (header, f'Header {i}') for i, header in enumerate(headers)
]

headers_to_split_on

In [None]:
from langchain.text_splitter import CharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# from langchain.vectorstores import Qdrant
# from langchain.document_loaders import TextLoader

# Build a header-based splitter from the (header_line, label) pairs in
# headers_to_split_on and apply it to the raw text of the first document.
# NOTE(review): LangChain's examples pass header *markers* such as
# ("#", "Header 1") rather than full header lines — confirm that passing
# whole lines splits the text as intended.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(text)

# Optional: We can do recursive splitting within each document
# # Char-level splits

# chunk_size = 250
# chunk_overlap = 30
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=chunk_size, chunk_overlap=chunk_overlap
# )

# # Split
# splits = text_splitter.split_documents(md_header_splits)
# splits

In [None]:
# Putting it together as a function
# TODO: Document loading seems to remove the markdown headers by default. Is there any way to keep the headers?
from langchain.text_splitter import CharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re

def chunk(doc):
    """
    Collect the markdown header lines of a document.

    Args:
    doc - a Document-like object exposing `page_content`, or a plain string
          of markdown text (such as the raw file contents in `documents`)

    Output:
    headers_to_split_on - list of (header_line, 'Header <i>') tuples, one per
    line that starts with one or more '#' followed by a space, in order of
    appearance. Empty when the text contains no such line.
    """
    # Raw strings have no `page_content`; fall back to the value itself so
    # both Document objects and plain file contents are accepted.
    text = getattr(doc, 'page_content', doc)
    header_pattern = r'^#+ .+'
    headers = re.findall(header_pattern, text, re.MULTILINE)
    headers_to_split_on = [
        (header, f'Header {i}') for i, header in enumerate(headers)
    ]
    # TODO: the splitting step below is still disabled, so for now only the
    # header list is returned.
    return headers_to_split_on
    # markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    # md_header_splits = markdown_splitter.split_text(text)

    # # Optional char-level splits within each document 
    # chunk_size = 250
    # chunk_overlap = 30
    # text_splitter = RecursiveCharacterTextSplitter(
    #     chunk_size=chunk_size, chunk_overlap=chunk_overlap
    # )
    # splits = text_splitter.split_documents(md_header_splits)
    # return splits


# Run chunk on the first loaded file; `documents` holds the raw text of each
# file (str), not LangChain Document objects.
chunk(documents[0])

## Other embedding models

In [None]:
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Masked mean of `last_hidden_states` along dim 1.

    Positions where `attention_mask` is zero are set to 0.0 before summing
    over dim 1, and each row's sum is divided by that row's mask total, so
    masked-out positions do not contribute to the average.
    """
    keep = attention_mask.unsqueeze(-1).bool()
    summed = last_hidden_states.masked_fill(~keep, 0.0).sum(dim=1)
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return summed / token_counts

import torch

# The first entry is the query; the remaining entries are scored against it.
input_texts = [
    "what is the capital of China?",
    "how to implement quick sort in python?",
    "Beijing",
    "sorting algorithms"
]

tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-base")
model = AutoModel.from_pretrained("thenlper/gte-base")

# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

# Inference only: disable autograd so no computation graph is built or kept
# alive by the resulting embeddings.
with torch.no_grad():
    outputs = model(**batch_dict)
embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# (Optionally) normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
# Dot product of the first (unit-length) embedding with all the others,
# scaled by 100.
scores = (embeddings[:1] @ embeddings[1:].T) * 100
print(scores.tolist())


## Retriever

In [None]:
from langchain.vectorstores import Qdrant
# NOTE(review): `docs` is not defined in any cell shown in this notebook, and
# `embeddings` was last bound to the normalized tensor computed in the
# "Other embedding models" cell. Qdrant.from_documents presumably expects a
# list of Document objects and an embedding *model* — confirm before running.
qdrant = Qdrant.from_documents(
    docs, embeddings, 
    location=":memory:",  # Local mode with in-memory storage only
    collection_name="my_documents",
)


## TF-IDF

In [None]:
from langchain.retrievers import TFIDFRetriever
# Build a TF-IDF retriever from five toy strings and query it with "foo";
# the bare `result` expression displays the returned value in the notebook.
retriever = TFIDFRetriever.from_texts(["foo", "bar", "world", "hello", "foo bar"])
result = retriever.get_relevant_documents("foo")
result

## Main

In [1]:
%%time
from langchain.document_loaders import TextLoader, DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough

# Step 1: Load the document(s) and split it into chunks
# loader = DirectoryLoader("./data", glob = "*.md")
# documents = loader.load()
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
# chunks = text_splitter.split_documents(documents)

# Step 2: Create embeddings
# Only the embedding model is instantiated here; the explicit embed_documents
# call below is disabled.
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# chunks2 = [x.page_content for x in chunks] # HFEmbeddings only accepts str
# embeddings = embedding_model.embed_documents(chunks2)

# Step 3: Store embeddings in ChromaDB and save locally.
# The from_documents call that builds the store is disabled; the active line
# opens ./chroma_db with the same embedding model.
# NOTE(review): presumably the store was populated by an earlier run — confirm
# ./chroma_db exists and is non-empty before querying.
# db = Chroma.from_documents(chunks, embedding_model, persist_directory="./chroma_db")
db = Chroma(persist_directory="./chroma_db", embedding_function=embedding_model)

# Step 4: Create a retriever
# NOTE: create_completion queries `db.similarity_search` directly; this
# `retriever` object is not used by the cells shown in this notebook.
retriever = db.as_retriever()

CPU times: user 3.41 s, sys: 3.03 s, total: 6.44 s
Wall time: 2.77 s


In [None]:
from llama_cpp import Llama

# Loaded models keyed by (model_path, n_ctx), so repeated questions reuse one
# instance instead of re-reading the model file from disk on every call.
_LLM_CACHE = {}


def _get_llm(model_path, n_ctx):
    """Return the cached Llama instance for this path/context size, loading it on first use."""
    key = (model_path, n_ctx)
    if key not in _LLM_CACHE:
        _LLM_CACHE[key] = Llama(model_path=model_path, n_ctx=n_ctx)
    return _LLM_CACHE[key]


def create_completion(question):
    """
    Answer a question using the top retrieved excerpt and a local llama.cpp model.

    Args:
    question - the question string; it is used both as the similarity-search
               query against `db` and as the question inserted in the prompt

    Output:
    output - the object returned by `Llama.create_completion`; callers in this
    notebook read the generated text from output["choices"][0]["text"].
    echo=True is passed, so the prompt is included in that text.

    Raises IndexError if the similarity search returns no documents.
    """
    # Step 5: Define the prompt template
    # source: https://chat.openai.com/share/c85e64f6-4dd2-4920-b82e-78f128898cbb
    template = """You are now an expert in Jane Austen's novels. 
    
    Here is an excerpt of text you may refer to:
    {context}
    
    # Question: {question}
    # Answer:"""
    prompt = ChatPromptTemplate.from_template(template)

    # Step 6: Generate a query and search for relevant chunks
    # Only the single best-matching chunk is used as context.
    context = db.similarity_search(question)[0].page_content
    final_prompt = prompt.format_messages(context=context, question=question)[0].content

    # Step 7: Use llama-cpp-python as a prototype.
    # The model is loaded once and reused across calls (see _get_llm).
    llm = _get_llm("../../llama.cpp/models/mistral/ggml-model-q4_0.gguf", 8000)
    output = llm.create_completion(final_prompt,
                                   suffix=None,
                                   max_tokens=0, # set this to 0 for no limit on tokens (depend on n_ctx)
                                   temperature=0.8, # higher temperature -> more random/diverse output; lower it for more deterministic answers
                                   top_p=0.95,
                                   logprobs=None,
                                   echo=True,
                                   stop=[],
                                   frequency_penalty=0.0,
                                   presence_penalty=0.0,
                                   repeat_penalty=1.1,
                                   top_k=40,
                                   stream=False,
                                   tfs_z=1.0,
                                   mirostat_mode=0,
                                   mirostat_tau=5.0,
                                   mirostat_eta=0.1,
                                   model=None,
                                   stopping_criteria=None,
                                   logits_processor=None)
    return output
# Smoke test: run the retrieve-then-generate pipeline once on a single question.
question = "Why is Lizzy Mr. Bennet’s favorite daughter?"
create_completion(question)

### Question 1

In [None]:
final_outputs = []
question = "Why is Lizzy Mr. Bennet’s favorite daughter?"
# Collect 5 completions, retrying failed generations. Attempts are capped so a
# persistent failure cannot spin forever, and only Exception is caught so the
# cell can still be interrupted (a bare except also swallows KeyboardInterrupt).
attempts = 0
while len(final_outputs) < 5 and attempts < 20:
    attempts += 1
    try:
        final_outputs.append(create_completion(question)["choices"][0]["text"])
    except Exception as err:
        print(f'attempt {attempts} failed: {err!r}')

In [13]:
# Print every collected completion framed by a numbered banner and a footer.
for idx, completion in enumerate(final_outputs):
    print(f'========{idx}=======', completion, '===================', sep='\n')

You are now an expert in Jane Austen's novels. 
    
    Here is an excerpt of text you may refer to:
    “Sir, you quite misunderstand me,” said Mrs. Bennet, alarmed. “Lizzy is only headstrong in such matters as these. In everything else she is as good-natured a girl as ever lived. I will go directly to Mr. Bennet, and we shall very soon settle it with her, I am sure.”

She would not give him time to reply, but hurrying instantly to her husband, called out as she entered the library, “Oh! Mr. Bennet, you are wanted immediately; we are all in an uproar. You must come and make Lizzy marry Mr. Collins, for she vows she will not have him, and if you do not make haste he will change his mind and not have her.”

Mr. Bennet raised his eyes from his book as she entered, and fixed them on her face with a calm unconcern which was not in the least altered by her communication.

“I have not the pleasure of understanding you,” said he, when she had finished her speech. “Of what are you talking?”
 

### Question 2

In [None]:
final_outputs = []
question = "Why does Darcy dislike Wickham?"
# Collect 5 completions, retrying failed generations. Attempts are capped so a
# persistent failure cannot spin forever, and only Exception is caught so the
# cell can still be interrupted (a bare except also swallows KeyboardInterrupt).
attempts = 0
while len(final_outputs) < 5 and attempts < 20:
    attempts += 1
    try:
        final_outputs.append(create_completion(question)["choices"][0]["text"])
    except Exception as err:
        print(f'attempt {attempts} failed: {err!r}')

In [15]:
# Print every collected completion framed by a numbered banner and a footer.
for idx, completion in enumerate(final_outputs):
    print(f'========{idx}=======', completion, '===================', sep='\n')

You are now an expert in Jane Austen's novels. 
    
    Here is an excerpt of text you may refer to:
    Wickham’s alarm now appeared in a heightened complexion and agitated look; for a few minutes he was silent, till, shaking off his embarrassment, he turned to her again, and said in the gentlest of accents:

“You, who so well know my feeling towards Mr. Darcy, will readily comprehend how sincerely I must rejoice that he is wise enough to assume even the appearance of what is right. His pride, in that direction, may be of service, if not to himself, to many others, for it must only deter him from such foul misconduct as I have suffered by. I only fear that the sort of cautiousness to which you, I imagine, have been alluding, is merely adopted on his visits to his aunt, of whose good opinion and judgement he stands much in awe. His fear of her has always operated, I know, when they were together; and a good deal is to be imputed to his wish of forwarding the match with Miss de Bourgh,

### Question 3

In [None]:
final_outputs = []
question = "Why does Lizzy form a negative first impression of Darcy?"
# Collect 5 completions, retrying failed generations. Attempts are capped so a
# persistent failure cannot spin forever, and only Exception is caught so the
# cell can still be interrupted (a bare except also swallows KeyboardInterrupt).
attempts = 0
while len(final_outputs) < 5 and attempts < 20:
    attempts += 1
    try:
        final_outputs.append(create_completion(question)["choices"][0]["text"])
    except Exception as err:
        print(f'attempt {attempts} failed: {err!r}')

In [17]:
# Print every collected completion framed by a numbered banner and a footer.
for idx, completion in enumerate(final_outputs):
    print(f'========{idx}=======', completion, '===================', sep='\n')

You are now an expert in Jane Austen's novels. 
    
    Here is an excerpt of text you may refer to:
    “There certainly was some great mismanagement in the education of those two young men. One has got all the goodness, and the other all the appearance of it.”

“I never thought Mr. Darcy so deficient in the appearance of it as you used to do.”

“And yet I meant to be uncommonly clever in taking so decided a dislike to him, without any reason. It is such a spur to one’s genius, such an opening for wit, to have a dislike of that kind. One may be continually abusive without saying anything just; but one cannot always be laughing at a man without now and then stumbling on something witty.”

“Lizzy, when you first read that letter, I am sure you could not treat the matter as you do now.”

“Indeed, I could not. I was uncomfortable enough, I may say unhappy. And with no one to speak to about what I felt, no Jane to comfort me and say that I had not been so very weak and vain and nonsensica

### Question 4

In [None]:
final_outputs = []
question = "Why does Lizzy reject Darcy’s first proposal to her?"
# Collect 5 completions, retrying failed generations. Attempts are capped so a
# persistent failure cannot spin forever, and only Exception is caught so the
# cell can still be interrupted (a bare except also swallows KeyboardInterrupt).
attempts = 0
while len(final_outputs) < 5 and attempts < 20:
    attempts += 1
    try:
        final_outputs.append(create_completion(question)["choices"][0]["text"])
    except Exception as err:
        print(f'attempt {attempts} failed: {err!r}')

In [19]:
# Print every collected completion framed by a numbered banner and a footer.
for idx, completion in enumerate(final_outputs):
    print(f'========{idx}=======', completion, '===================', sep='\n')

You are now an expert in Jane Austen's novels. 
    
    Here is an excerpt of text you may refer to:
    “I am quite sorry, Lizzy, that you should be forced to have that disagreeable man all to yourself. But I hope you will not mind it: it is all for Jane’s sake, you know; and there is no occasion for talking to him, except just now and then. So, do not put yourself to inconvenience.”

During their walk, it was resolved that Mr. Bennet’s consent should be asked in the course of the evening. Elizabeth reserved to herself the application for her mother’s. She could not determine how her mother would take it; sometimes doubting whether all his wealth and grandeur would be enough to overcome her abhorrence of the man. But whether she were violently set against the match, or violently delighted with it, it was certain that her manner would be equally ill adapted to do credit to her sense; and she could no more bear that Mr. Darcy should hear the first raptures of her joy, than the first ve

### Question 5

In [None]:
final_outputs = []
question = "According to Mr. Darcy, what qualities make a woman “accomplished”?"
# Collect 5 completions, retrying failed generations. Attempts are capped so a
# persistent failure cannot spin forever, and only Exception is caught so the
# cell can still be interrupted (a bare except also swallows KeyboardInterrupt).
attempts = 0
while len(final_outputs) < 5 and attempts < 20:
    attempts += 1
    try:
        final_outputs.append(create_completion(question)["choices"][0]["text"])
    except Exception as err:
        print(f'attempt {attempts} failed: {err!r}')

In [21]:
# Print every collected completion framed by a numbered banner and a footer.
for idx, completion in enumerate(final_outputs):
    print(f'========{idx}=======', completion, '===================', sep='\n')

You are now an expert in Jane Austen's novels. 
    
    Here is an excerpt of text you may refer to:
    “Your list of the common extent of accomplishments,” said Darcy, “has too much truth. The word is applied to many a woman who deserves it no otherwise than by netting a purse or covering a screen. But I am very far from agreeing with you in your estimation of ladies in general. I cannot boast of knowing more than half-a-dozen, in the whole range of my acquaintance, that are really accomplished.”

“Nor I, I am sure,” said Miss Bingley.

“Then,” observed Elizabeth, “you must comprehend a great deal in your idea of an accomplished woman.”

“Yes, I do comprehend a great deal in it.”
    
    # Question: According to Mr. Darcy, what qualities make a woman “accomplished”?
    # Answer:

    
     

   

   

    

    

    

        - What does Elizabeth imply when she makes her comment?
You are now an expert in Jane Austen's novels. 
    
    Here is an excerpt of text you may refer to

## Evaluation methods
- Perplexity
- Fact checking - use Austen study notes

In [None]:
# # Sample code for perplexity taken from: https://www.educative.io/answers/what-is-perplexity-in-nlp
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from datasets import load_dataset
# from tqdm import tqdm

# # Setup device and model parameters
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model_name = "gpt2-large"

# # Load the model and the tokenizer
# # - https://stackoverflow.com/questions/64001128/load-a-pre-trained-model-from-disk-with-huggingface-transformers
# model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# # Load test dataset
# wikitext_test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

# # Text encoding
# encoded_text = tokenizer("\n\n".join(wikitext_test["text"]), return_tensors="pt")

# # Model configuration parameters
# max_length = model.config.n_positions
# stride = 512
# sequence_length = encoded_text.input_ids.size(1)

# # Initialize negative log likelihoods list and previous end location
# negative_log_likelihoods = []
# previous_end_loc = 0

# # Processing data in strides
# for start_loc in tqdm(range(0, sequence_length, stride)):
#     end_loc = min(start_loc + max_length, sequence_length)
#     target_length = end_loc - previous_end_loc

#     # Prepare input and target ids
#     input_ids = encoded_text.input_ids[:, start_loc:end_loc].to(device)
#     target_ids = input_ids.clone()
#     target_ids[:, :-target_length] = -100

#     # Calculate negative log likelihood without gradient computation
#     with torch.no_grad():
#         model_output = model(input_ids, labels=target_ids)
#         nll = model_output.loss
#         negative_log_likelihoods.append(nll)

#     previous_end_loc = end_loc
#     if end_loc == sequence_length:
#         break

# # Calculate perplexity
# perplexity = torch.exp(torch.stack(negative_log_likelihoods).mean())
