In [None]:
!pip install -q langchain==0.1.16 langchain-cohere==0.1.4 langchain-community==0.0.34 cohere==5.3.3

In [None]:
import os
from getpass import getpass

from langchain_cohere import CohereEmbeddings

# SECURITY: the API key must never be stored in the notebook. A key was previously
# hardcoded in this cell and is therefore exposed to anyone with access to the file or
# its history -- revoke/rotate it in the Cohere dashboard.
# The key is taken from the COHERE_API_KEY environment variable; when it is not set,
# prompt for it interactively (input is not echoed and is not saved in the notebook).
if not os.environ.get("COHERE_API_KEY"):
    os.environ["COHERE_API_KEY"] = getpass("Cohere API key: ")

# Embedding client shared by the cells below.
model = CohereEmbeddings(model="embed-english-light-v3.0")

In [None]:
# Smoke test: embed a sample paper abstract and display the length of the returned embedding.
embedding1 = model.embed_query('''Infineon has identified a need for engineers, account managers, and customers to rapidly obtain
product information. This problem is traditionally addressed with retrieval-augmented generation
(RAG) chatbots, but in this study, I evaluated the use of the newly popularized RAG-Fusion method.
RAG-Fusion combines RAG and reciprocal rank fusion (RRF) by generating multiple queries,
reranking them with reciprocal scores and fusing the documents and scores. Through manually
evaluating answers on accuracy, relevance, and comprehensiveness, I found that RAG-Fusion was
able to provide accurate and comprehensive answers due to the generated queries contextualizing the
original query from various perspectives. However, some answers strayed off topic when the generated
queries’ relevance to the original query is insufficient. This research marks significant progress in
artificial intelligence (AI) and natural language processing (NLP) applications and demonstrates
transformations in a global and multi-industry context''')
len(embedding1)
# Disabled scratch code: manual cosine similarity between two embeddings.
# It cannot run in this cell because `embedding2` is only assigned in a later cell.
# import numpy as np
# product = np.dot(embedding1, embedding2)
# norm = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)
# product/norm

1021

In [None]:
from enum import Enum
import numpy as np
from scipy.spatial.distance import jaccard, braycurtis, canberra, correlation
from scipy.stats import pearsonr
import typing as t
EMbedding = None

class SimilarityMode():
    """Modes for similarity/distance.

    Each constant is a ``[name, default_cutoff]`` pair: ``name`` identifies the
    measure for ``similarity_fn`` and ``default_cutoff`` is a suggested minimum
    score. Distance-based measures are negated by ``similarity_fn`` so that a
    larger score always means "more similar", hence their negative cutoffs.
    """
    DEFAULT = ['cosine', 0.7]
    DOT_PRODUCT = ['dot_product', 0.7]  # Similar range as cosine similarity
    EUCLIDEAN = ['euclidean', -0.5]  # Negative values indicate similarity
    PEARSON = ['pearson', 0.7]  # Similar range as cosine similarity
    TANIMOTO = ['tanimoto', 0.7]
    CANBERRA = ['canberra', -10.0]  # Negative values indicate similarity, adjust based on data
    BRAY_CURTIS = ['bray_curtis', -0.3]  # Negative values indicate similarity
    MAHALANOBIS = ['mahalanobis', -1.0]  # Negative values indicate similarity, adjust based on data
    MINKOWSKI = ['minkowski', -0.5]  # Negative values indicate similarity, similar to Euclidean


def _similarity_mode_name(mode) -> str:
    """Return the measure name for a mode given as a name or a ``[name, cutoff]`` pair."""
    if mode is None:
        return SimilarityMode.DEFAULT[0]
    if isinstance(mode, str):
        return mode
    if isinstance(mode, (list, tuple)) and len(mode) > 0 and isinstance(mode[0], str):
        return mode[0]
    raise ValueError(f"Unsupported similarity mode: {mode!r}")


def similarity_fn(embedding1: np.ndarray, embedding2: np.ndarray, mode: t.Union[str, t.Sequence, None] = SimilarityMode.DEFAULT, p: float = 2.0) -> float:
    """Get embedding similarity (larger always means more similar).

    Args:
        embedding1: First vector (array or sequence of numbers).
        embedding2: Second vector, same length as ``embedding1``.
        mode: Either a ``SimilarityMode`` constant (``[name, cutoff]`` pair) or just
            the measure name, e.g. ``'euclidean'``. Previously a bare name never
            matched any branch and silently fell back to cosine similarity.
            ``None`` selects the default (cosine).
        p: Order of the norm used by the Minkowski mode only.

    Returns:
        The score as a plain ``float``. Distance measures are returned negated.

    Raises:
        ValueError: If ``mode`` does not name a known measure.
    """
    name = _similarity_mode_name(mode)
    vec1 = np.asarray(embedding1, dtype=float)
    vec2 = np.asarray(embedding2, dtype=float)

    if name == SimilarityMode.EUCLIDEAN[0]:
        # Using -euclidean distance as similarity to achieve same ranking order
        return -float(np.linalg.norm(vec1 - vec2))
    if name == SimilarityMode.DOT_PRODUCT[0]:
        return float(np.dot(vec1, vec2))
    if name == SimilarityMode.PEARSON[0]:
        return float(pearsonr(vec1, vec2)[0])
    if name == SimilarityMode.TANIMOTO[0]:
        dot_product = float(np.dot(vec1, vec2))
        norm_sum = float(np.linalg.norm(vec1) ** 2 + np.linalg.norm(vec2) ** 2) - dot_product
        # Two all-zero vectors give a zero denominator; report "no similarity".
        return dot_product / norm_sum if norm_sum != 0 else 0.0
    if name == SimilarityMode.CANBERRA[0]:
        return -float(canberra(vec1, vec2))
    if name == SimilarityMode.BRAY_CURTIS[0]:
        return -float(braycurtis(vec1, vec2))
    if name == SimilarityMode.MAHALANOBIS[0]:
        # NOTE: no covariance matrix is used, so this is the negated *squared*
        # Euclidean distance rather than a true Mahalanobis distance.
        diff = vec1 - vec2
        return -float(np.dot(diff, diff))
    if name == SimilarityMode.MINKOWSKI[0]:
        return -float(np.linalg.norm(vec1 - vec2, ord=p))
    if name == SimilarityMode.DEFAULT[0]:
        product = float(np.dot(vec1, vec2))
        norm = float(np.linalg.norm(vec1) * np.linalg.norm(vec2))
        return product / norm if norm != 0 else 0.0
    # Fail loudly instead of silently computing cosine for a misspelled mode.
    raise ValueError(f"Unknown similarity mode: {name!r}")

In [None]:
import pandas as pd

# Load both evaluation sets, keeping only the question and reference-answer columns.
dev = pd.read_csv('dev.csv').loc[:, ['question', 'ground_truth']]
main = pd.read_csv('main.csv').loc[:, ['question', 'ground_truth']]

In [None]:
# Two near-identical sentences with shuffled word order, used to compare the measures.
embedding1 = model.embed_query('What is your name?')
embedding2 = model.embed_query('What names is your?')

# SimilarityMode is a plain class (it has no `.dict()`), so enumerate its upper-case
# constants via `vars()`; the scoring function defined in this notebook is `similarity_fn`.
for mode_name, mode_value in vars(SimilarityMode).items():
    if not mode_name.isupper():
        continue  # skip __module__, __doc__ and other non-mode attributes
    print(mode_name)
    print(similarity_fn(embedding1, embedding2, mode_value))

DEFAULT
0.8501373825198895
DOT_PRODUCT
0.8500870152612956
EUCLIDEAN
-0.5474556671078117
PEARSON
0.8501373957702609
TANIMOTO
0.7393380735457463
CANBERRA
-161.05231213625189
BRAY_CURTIS
-0.288078081677164
MAHALANOBIS
-0.2997077074484591
MINKOWSKI
-0.5474556671078117


In [None]:
# An embedding is a vector of floats (was `None`, which made the annotations meaningless).
Embedding = t.List[float]
def get_top_k_embeddings(
    query_embedding: Embedding,
    embeddings: t.List[Embedding],
    mode,
    similarity_cutoff = 0.7,
    similarity_top_k = 3+1,
    exclude_self: bool = True,
) -> t.Tuple[t.List[float], t.List]:
    """
    Get top nodes by similarity to the query.
    returns the scores and the embedding_ids of the nodes

    Args:
        query_embedding: Vector to compare against every entry of ``embeddings``.
        embeddings: Candidate vectors; the returned ids are positions in this list.
        mode: Similarity mode forwarded unchanged to ``similarity_fn``.
        similarity_cutoff: Only candidates scoring strictly above this value are
            kept; ``None`` disables the filter.
        similarity_top_k: Maximum number of candidates retained before the optional
            self-match removal; a falsy value keeps all of them. The default of
            ``3+1`` yields three neighbours plus the query's own entry.
        exclude_self: When True (default, the original behaviour) the single
            best-scoring hit is dropped, on the assumption that the query itself is
            one of ``embeddings``. Pass False when the query is not part of
            ``embeddings``, otherwise the best real match is discarded.

    Returns:
        ``(scores, ids)`` ordered from most to least similar.
    """
    import heapq

    query_vector = np.array(query_embedding)

    # Min-heap of (score, id): the weakest retained candidate sits at the root, so
    # popping after an overflow keeps only the `similarity_top_k` best seen so far.
    top_heap: t.List[t.Tuple[float, t.Any]] = []
    for embedding_id, candidate in enumerate(np.array(embeddings)):
        score = similarity_fn(query_vector, candidate, mode)
        if similarity_cutoff is not None and not score > similarity_cutoff:
            continue
        heapq.heappush(top_heap, (score, embedding_id))
        if similarity_top_k and len(top_heap) > similarity_top_k:
            heapq.heappop(top_heap)

    ranked = sorted(top_heap, key=lambda pair: pair[0], reverse=True)
    if exclude_self:
        # Drops the top hit only; with duplicate vectors that hit is not guaranteed to
        # be the query's own index, so callers may still want to filter by id.
        ranked = ranked[1:]

    result_similarities = [score for score, _ in ranked]
    result_ids = [embedding_id for _, embedding_id in ranked]
    return result_similarities, result_ids

In [None]:
# NOTE(review): this cell repeats the pandas import and the dev/main CSV loads already
# performed in an earlier cell of this notebook; one of the two cells is redundant.
import pandas as pd
# Keep only the question and reference-answer columns of each set.
main = pd.read_csv('main.csv')[['question', 'ground_truth']]
dev = pd.read_csv('dev.csv')[['question', 'ground_truth']]

In [None]:
# Questions of the main set as plain strings, embedded in a single batch call.
qm = main['question'].astype('str').tolist()
embedqm = model.embed_documents(qm)
# Number of embeddings returned (one per question).
len(embedqm)

20

In [None]:
# Display the [name, cutoff] pair stored for the Euclidean mode.
SimilarityMode.EUCLIDEAN

['euclidean', -0.5]

In [None]:
# For every question, list the other questions that lie within the Euclidean cutoff.
# The whole SimilarityMode entry is passed as the mode: passing only its name string
# (`SimilarityMode.EUCLIDEAN[0]`) does not match any branch of `similarity_fn` as defined
# above and silently produced cosine scores (the positive values in the earlier output).
euclidean_mode = SimilarityMode.EUCLIDEAN
# Iterate over however many questions were embedded instead of a hardcoded 20.
for i in range(len(embedqm)):
    x = get_top_k_embeddings(embedqm[i], embedqm, euclidean_mode, euclidean_mode[1])
    # Guard against the query's own index in case it was not the hit that was dropped.
    tems = [qm[j] for j in x[1] if i != j]
    if tems:
        print()
        print(i, qm[i])
        print(x)
        print(tems)


0 What event is depicted in the photograph of Eisenhower welcoming Ethiopian Emperor Haile Selassie to the White House in May 1954?
([0.20163637877942941, 0.19352341164026712, 0.1874259439791411], [7, 14, 1])
["What led to the end of the Cold War, considering missed peace opportunities, strengthened alliances, the Berlin Wall, and Willy Brandt's Ostpolitik?", "How do limited wars relate to Churchill's post-war planning and the experiences of American decision makers, specifically regarding Roosevelt's mortality, the USSR, the Atomic bomb, Pearl Harbor, surprise attack, and overwhelming war versus limited wars?", 'What happened in the Yugoslav airliner crash in 1955?']

1 What happened in the Yugoslav airliner crash in 1955?
([0.5038169804644439, 0.43468137545667, 0.39301285875945013], [10, 5, 16])
['What were the consequences of using atomic bombs in Hiroshima and Nagasaki and how did the world react?', 'How did the nuclear era impact the definition of a winner in the Cold War?', "Wha

In [None]:
from langchain_core.pydantic_v1 import BaseModel, root_validator
import typing as t
Example = t.Dict[str, t.Any]
import json
class Prompt(BaseModel):
    """
    Prompt is a class that represents a prompt for the ragas metrics.

    Attributes:
        name (str): The name of the prompt.
        instruction (str): The instruction for the prompt.
        output_format_instruction (str): The output format instruction for the prompt.
        examples (List[Dict[str, Any]]): List of example inputs and outputs for the prompt.
        input_keys (List[str]): List of input variable names.
        output_key (str): The output variable name.
        output_type (Literal["json", "str"]): The type of the output (default: "json").
        language (str): The language of the prompt (default: "english").
    """

    name: str
    instruction: str
    output_format_instruction: str = ""
    examples: t.List[Example] = []
    input_keys: t.List[str]
    output_key: str
    output_type: t.Literal["json", "str"] = "json"
    language: str = "english"

    @root_validator
    def validate_prompt(cls, values: t.Dict[str, t.Any]) -> t.Dict[str, t.Any]:
        """
        Validate the template string to ensure that it is in desired format.

        Checks that ``instruction``, ``input_keys`` and ``output_key`` are non-empty,
        that every example contains all input keys and the output key, and, for
        JSON prompts, that string-valued example outputs parse as JSON.

        Raises:
            ValueError: If any of the checks above fails.
        """
        if not values.get("instruction"):
            raise ValueError("instruction cannot be empty")
        # Fixed: the emptiness test used to compare `instruction` (not `input_keys`)
        # with [], so an empty `input_keys` list was accepted.
        if not values.get("input_keys"):
            raise ValueError("input_keys cannot be empty")
        if not values.get("output_key"):
            raise ValueError("output_key cannot be empty")

        if values.get("examples"):
            output_key = values["output_key"]
            # `output_type` is missing from `values` when its own field validation
            # failed; use .get() so that case does not surface as a KeyError here.
            expects_json = str(values.get("output_type", "")).lower() == "json"
            for no, example in enumerate(values["examples"]):
                for inp_key in values["input_keys"]:
                    if inp_key not in example:
                        raise ValueError(
                            f"example {no+1} does not have the variable {inp_key} in the definition"
                        )
                if output_key not in example:
                    raise ValueError(
                        f"example {no+1} does not have the variable {output_key} in the definition"
                    )
                # Only string outputs need parsing; dict/list outputs are serialised
                # later by to_string().
                if expects_json and isinstance(example[output_key], str):
                    try:
                        json.loads(example[output_key])
                    except ValueError as e:
                        raise ValueError(
                            f"{output_key} in example {no+1} is not in valid json format: {e}"
                        ) from e

        return values

    def to_string(self) -> str:
        """
        Generate the prompt string from the variables.

        The result contains the instruction, the optional output-format
        instruction, the examples, and finally one ``{key}`` placeholder per
        input key followed by the output key. Literal braces in the output-format
        instruction (and in example values for JSON prompts) are doubled so that
        only the input placeholders remain as single-brace fields.
        """
        prompt_elements = [self.instruction]
        if self.output_format_instruction:
            prompt_elements.append(
                "\n"
                + self.output_format_instruction.replace("{", "{{").replace("}", "}}")
            )
        prompt_str = "\n".join(prompt_elements) + "\n"

        if self.examples:
            prompt_str += "\nExamples:\n"
            # Format the examples to match the Langchain prompt template
            for example in self.examples:
                for key, value in example.items():
                    # dict/list values are wrapped in ``` fences below.
                    is_json = isinstance(value, (dict, list))
                    value = (
                        json.dumps(value, ensure_ascii=False).encode("utf8").decode()
                    )
                    value = (
                        value.replace("{", "{{").replace("}", "}}")
                        if self.output_type.lower() == "json"
                        else value
                    )
                    prompt_str += (
                        f"\n{key}: {value}"
                        if not is_json
                        else f"\n{key}: ```{value}```"
                    )
                prompt_str += "\n"

        prompt_str += "\nYour actual task:\n"

        if self.input_keys:
            prompt_str += "".join(f"\n{key}: {{{key}}}" for key in self.input_keys)
        if self.output_key:
            prompt_str += f"\n{self.output_key}: \n"

        return prompt_str


In [None]:
# Few-shot prompt: answer a question from a given context and emit a verdict
# ('1' when the answer is present in the context, '-1' when it is not).
# The first two examples show the '1' case, the third shows the '-1' case.
question_answer_prompt = Prompt(
    name="answer_formulate",
    instruction="""Answer the question using the information from the given context. Output verdict as '1' if answer is present '-1' if answer is not present in the context.""",
    examples=[
        {
            "context": """Climate change is significantly influenced by human activities, notably the emission of greenhouse gases from burning fossil fuels. The increased greenhouse gas concentration in the atmosphere traps more heat, leading to global warming and changes in weather patterns.""",
            "question": "How do human activities contribute to climate change?",
            "answer": {
                "answer": "Human activities contribute to climate change primarily through the emission of greenhouse gases from burning fossil fuels. These emissions increase the concentration of greenhouse gases in the atmosphere, which traps more heat and leads to global warming and altered weather patterns.",
                "verdict": "1",
            },
        },
        {
            "context": """The concept of artificial intelligence (AI) has evolved over time, but it fundamentally refers to machines designed to mimic human cognitive functions. AI can learn, reason, perceive, and, in some instances, react like humans, making it pivotal in fields ranging from healthcare to autonomous vehicles.""",
            "question": "What are the key capabilities of artificial intelligence?",
            "answer": {
                "answer": "Artificial intelligence is designed to mimic human cognitive functions, with key capabilities including learning, reasoning, perception, and reacting to the environment in a manner similar to humans. These capabilities make AI pivotal in various fields, including healthcare and autonomous driving.",
                "verdict": "1",
            },
        },
        {
            "context": """The novel "Pride and Prejudice" by Jane Austen revolves around the character Elizabeth Bennet and her family. The story is set in the 19th century in rural England and deals with issues of marriage, morality, and misconceptions.""",
            "question": "What year was 'Pride and Prejudice' published?",
            "answer": {
                "answer": "The answer to given question is not present in context",
                "verdict": "-1",
            },
        },
    ],
    # Placeholders filled in at call time, and the key under which the model answers.
    input_keys=["context", "question"],
    output_key="answer",
    output_type="json",
    language="english",
)

# Few-shot prompt: extract 3 to 5 keyphrases from a text; the expected output is a
# JSON object with a single "keyphrases" list.
keyphrase_extraction_prompt = Prompt(
    name="keyphrase_extraction",
    instruction="Extract the top 3 to 5 keyphrases from the provided text, focusing on the most significant and distinctive aspects. ",
    examples=[
        {
            "text": "A black hole is a region of spacetime where gravity is so strong that nothing, including light and other electromagnetic waves, has enough energy to escape it. The theory of general relativity predicts that a sufficiently compact mass can deform spacetime to form a black hole.",
            "output": {
                "keyphrases": [
                    "Black hole",
                    "Region of spacetime",
                    "Strong gravity",
                    "Light and electromagnetic waves",
                    "Theory of general relativity",
                ]
            },
        },
        {
            "text": "The Great Wall of China is an ancient series of walls and fortifications located in northern China, built around 500 years ago. This immense wall stretches over 13,000 miles and is a testament to the skill and persistence of ancient Chinese engineers.",
            "output": {
                "keyphrases": [
                    "Great Wall of China",
                    "Ancient fortifications",
                    "Northern China",
                ]
            },
        },
    ],
    # `language` is not passed here, so the Prompt class default applies.
    input_keys=["text"],
    output_key="output",
    output_type="json",
)


# Few-shot prompt: generate a question about a keyphrase that the given context can
# fully answer. Output is plain text (output_type="str"), not JSON.
seed_question_prompt = Prompt(
    name="seed_question",
    instruction="Generate a question that can be fully answered from given context. The question should be formed using topic",
    examples=[
        {
            "context": "Photosynthesis in plants involves converting light energy into chemical energy, using chlorophyll and other pigments to absorb light. This process is crucial for plant growth and the production of oxygen.",
            "keyphrase": "Photosynthesis",
            "question": "What is the role of photosynthesis in plant growth?",
        },
        {
            "context": "The Industrial Revolution, starting in the 18th century, marked a major turning point in history as it led to the development of factories and urbanization.",
            "keyphrase": "Industrial Revolution",
            "question": "How did the Industrial Revolution mark a major turning point in history?",
        },
        {
            "context": "The process of evaporation plays a crucial role in the water cycle, converting water from liquid to vapor and allowing it to rise into the atmosphere.",
            "keyphrase": "Evaporation",
            "question": "Why is evaporation important in the water cycle?",
        },
    ],
    # The instruction says "topic"; the corresponding input variable is "keyphrase".
    input_keys=["context", "keyphrase"],
    output_key="question",
    output_type="str",
)

In [None]:
# Render the assembled prompt; print() is used so the embedded newlines are shown as line breaks.
print(question_answer_prompt.to_string())

Answer the question using the information from the given context. Output verdict as '1' if answer is present '-1' if answer is not present in the context.

Examples:

context: "Climate change is significantly influenced by human activities, notably the emission of greenhouse gases from burning fossil fuels. The increased greenhouse gas concentration in the atmosphere traps more heat, leading to global warming and changes in weather patterns."
question: "How do human activities contribute to climate change?"
answer: ```{{"answer": "Human activities contribute to climate change primarily through the emission of greenhouse gases from burning fossil fuels. These emissions increase the concentration of greenhouse gases in the atmosphere, which traps more heat and leads to global warming and altered weather patterns.", "verdict": "1"}}```

context: "The concept of artificial intelligence (AI) has evolved over time, but it fundamentally refers to machines designed to mimic human cognitive fun

In [None]:
# Render the assembled prompt; print() is used so the embedded newlines are shown as line breaks.
print(keyphrase_extraction_prompt.to_string())

Extract the top 3 to 5 keyphrases from the provided text, focusing on the most significant and distinctive aspects. 

Examples:

text: "A black hole is a region of spacetime where gravity is so strong that nothing, including light and other electromagnetic waves, has enough energy to escape it. The theory of general relativity predicts that a sufficiently compact mass can deform spacetime to form a black hole."
output: ```{{"keyphrases": ["Black hole", "Region of spacetime", "Strong gravity", "Light and electromagnetic waves", "Theory of general relativity"]}}```

text: "The Great Wall of China is an ancient series of walls and fortifications located in northern China, built around 500 years ago. This immense wall stretches over 13,000 miles and is a testament to the skill and persistence of ancient Chinese engineers."
output: ```{{"keyphrases": ["Great Wall of China", "Ancient fortifications", "Northern China"]}}```

Your actual task:

text: {text}
output: 



In [None]:
# Render the assembled prompt; print() is used so the embedded newlines are shown as line breaks.
print(seed_question_prompt.to_string())

Generate a question that can be fully answered from given context. The question should be formed using topic

Examples:

context: "Photosynthesis in plants involves converting light energy into chemical energy, using chlorophyll and other pigments to absorb light. This process is crucial for plant growth and the production of oxygen."
keyphrase: "Photosynthesis"
question: "What is the role of photosynthesis in plant growth?"

context: "The Industrial Revolution, starting in the 18th century, marked a major turning point in history as it led to the development of factories and urbanization."
keyphrase: "Industrial Revolution"
question: "How did the Industrial Revolution mark a major turning point in history?"

context: "The process of evaporation plays a crucial role in the water cycle, converting water from liquid to vapor and allowing it to rise into the atmosphere."
keyphrase: "Evaporation"
question: "Why is evaporation important in the water cycle?"

Your actual task:

context: {con

In [None]:
# Few-shot prompt: given a question and numbered contexts, return the numbers of the
# contexts that are relevant for answering it.
find_relevant_context_prompt = Prompt(
    name="find_relevant_context",
    instruction="Given a question and set of contexts, find the most relevant contexts to answer the question.",
    examples=[
        {
            "question": "What is the capital of France?",
            "contexts": [
                "1. France is a country in Western Europe. It has several cities, including Paris, Lyon, and Marseille. Paris is not only known for its cultural landmarks like the Eiffel Tower and the Louvre Museum but also as the administrative center.",
                "2. The capital of France is Paris. It is also the most populous city in France, with a population of over 2 million people. Paris is known for its cultural landmarks like the Eiffel Tower and the Louvre Museum.",
                "3. Paris is the capital of France. It is also the most populous city in France, with a population of over 2 million people. Paris is known for its cultural landmarks like the Eiffel Tower and the Louvre Museum.",
            ],
            "output": {
                "relevant_contexts": [1, 2],
            },
        },
        {
            "question": "How does caffeine affect the body and what are its common sources?",
            "contexts": [
                "1. Caffeine is a central nervous system stimulant. It can temporarily ward off drowsiness and restore alertness. It primarily affects the brain, where it alters the function of neurotransmitters.",
                "2. Regular physical activity is essential for maintaining good health. It can help control weight, combat health conditions, boost energy, and promote better sleep.",
                "3. Common sources of caffeine include coffee, tea, cola, and energy drinks. These beverages are consumed worldwide and are known for providing a quick boost of energy.",
            ],
            # Fixed label: context 2 is about physical activity, not caffeine. The
            # contexts that answer the question are 1 (effects) and 3 (sources).
            "output": {"relevant_contexts": [1, 3]},
        },
    ],
    input_keys=["question", "contexts"],
    output_key="output",
    output_type="json",
    language="english",
)
# Render the assembled prompt to check the examples and placeholders.
print(find_relevant_context_prompt.to_string())

Given a question and set of contexts, find the most relevant contexts to answer the question.

Examples:

question: "What is the capital of France?"
contexts: ```["1. France is a country in Western Europe. It has several cities, including Paris, Lyon, and Marseille. Paris is not only known for its cultural landmarks like the Eiffel Tower and the Louvre Museum but also as the administrative center.", "2. The capital of France is Paris. It is also the most populous city in France, with a population of over 2 million people. Paris is known for its cultural landmarks like the Eiffel Tower and the Louvre Museum.", "3. Paris is the capital of France. It is also the most populous city in France, with a population of over 2 million people. Paris is known for its cultural landmarks like the Eiffel Tower and the Louvre Museum."]```
output: ```{{"relevant_contexts": [1, 2]}}```

question: "How does caffeine affect the body and what are its common sources?"
contexts: ```["1. Caffeine is a central 