In [1]:
import openai
from dotenv import load_dotenv
from tqdm import tqdm
import pandas as pd
import os

tqdm.pandas()
load_dotenv()

True

## Load QA data

In [573]:
qa_df = pd.read_csv('data/sample.tsv', sep='\t')

In [574]:
qa_df.drop(columns=['gpt-4o','gpt-4','gpt-3.5-turbo'], inplace=True)
qa_df.to_csv('data/sample.tsv',sep='\t',index=False)

## Zero-shot QA prompt

In [65]:
from langchain_openai import ChatOpenAI

chat_model = ChatOpenAI(model="gpt-4o")

In [572]:
from langchain_anthropic import ChatAnthropic

chat_model = ChatAnthropic(model="claude-3-5-sonnet-20240620")

In [67]:
from langchain_google_vertexai import ChatVertexAI

chat_model = ChatVertexAI(model="gemini-1.5-pro")



In [68]:
from langchain_mistralai import ChatMistralAI

chat_model = ChatMistralAI(model="mistral-large-latest", api_key= os.environ['MISTRAL_API_KEY'])

In [69]:
from langchain_fireworks import ChatFireworks

chat_model = ChatFireworks(model="accounts/fireworks/models/llama-v3p1-8b-instruct", api_key=os.environ['FIREWORKS_API_KEY'])

In [13]:
from langchain_core.prompts.chat import ChatPromptTemplate

template = "त्वया संस्कृत-भाषायाम् एव वक्तव्यम्। न तु अन्यासु भाषासु। अधः रामायण-सम्बन्धे पृष्ट-प्रश्नस्य प्रत्युत्तरं देहि। तदपि एकेनैव पदेन, यावद् लघु शक्यं तावद्, तं पुनः विवृतम् मा कुरु।"
human_template = "{question}"

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", human_template),
])

def output_parse(ai_message):
    return ai_message.content.replace('।','').strip()
    
zeroshot_chain = chat_prompt | chat_model | output_parse

In [14]:
question = qa_df['QUESTION'][9]
zeroshot_chain.invoke({'question': question})

'Valmiki'

In [7]:
working_df = pd.read_csv('results/predictions.tsv', sep='\t')
working_df['gpt-4o-mini'] = working_df.progress_apply(lambda x: zeroshot_chain.invoke({"question": x['QUESTION']}), axis=1)

100%|███████████████████████████████████████████| 60/60 [01:05<00:00,  1.10s/it]


In [8]:
models = working_df.columns
scores = {m: 0 for m in models}

for m in models:
    em = working_df.apply(lambda x: str(x['ANSWER']).strip() == str(x[m]).strip(), axis=1)
    scores[m] = round(em.sum()/len(em), 3)
scores

{'QUESTION': 0.0,
 'ANSWER': 1.0,
 'gpt-4o': 0.483,
 'gpt-4': 0.517,
 'gpt-3.5-turbo': 0.0,
 'gpt-4o-RAG': 0.55,
 'gpt-4o-KG-RAG': 0.383,
 'claude-3-5-sonnet': 0.717,
 'claude-3.5-sonnet-RAG': 0.6,
 'gemini-1.0-pro': 0.183,
 'gemini-1.5-flash': 0.267,
 'gemini-1.5-pro': 0.567,
 'mistral-large': 0.583,
 'llama-3.1-405B-instruct': 0.433,
 'mistral-large-RAG': 0.567,
 'llama-3.1-80b-i-KG-RAG': 0.2,
 'gpt-4o-mini': 0.017}

In [60]:
working_df.to_csv('results/predictions.tsv', index=False, sep='\t')

## RAG prompt

In [233]:
os.environ['LANGCHAIN_TRACING_V2'] = 'false'

In [575]:
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.text import TextLoader

# Load Documents
loader = TextLoader(file_path='data/ref/rAmAyaNa_dev.txt')
docs = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)
#text_splitter = RecursiveCharacterTextSplitter(chunk_size=450, chunk_overlap=90)
splits = text_splitter.split_documents(docs)

In [576]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, T5ForConditionalGeneration
from indic_transliteration.sanscript import IAST, DEVANAGARI, transliterate
import torch
import regex as re

checkpoint = 'mahesh27/t5lemmatizer'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
lemmatizer_model = T5ForConditionalGeneration.from_pretrained(checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

def lemmatize(chunk, translate_only=False):
    text = chunk.replace('\n\n','\n').split('\n')
    pattern = re.compile(r'[0-9]+,[0-9]+\|[0-9]+')
    lines = [pattern.sub('',transliterate(line, DEVANAGARI, IAST)) for line in text]
    if translate_only:
        return ' '.join(lines)
    tokenized_text = [{'input_ids': tokenizer(line)['input_ids'] + [tokenizer.eos_token_id]} for line in lines]
    
    inputs = data_collator(tokenized_text)
    with torch.no_grad():
        outputs = lemmatizer_model.generate(inputs=inputs['input_ids'], max_length=64)
    
    return ' '.join(tokenizer.batch_decode(outputs, skip_special_tokens=True))

In [None]:
for split in tqdm(splits):
    split.metadata['text'] = split.page_content
    split.page_content = lemmatize(split.page_content, translate_only=False)

In [237]:
from langchain_community.retrievers import BM25Retriever
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain.prompts import PromptTemplate
from langchain_chroma import Chroma

In [86]:
#### custom Embeddings ####
import importlib
from typing import Optional, cast, List
from itertools import batched
import numpy as np
import numpy.typing as npt
from chromadb.api.types import EmbeddingFunction, Documents, Embeddings
import math


class TransformerEmbeddingFunction(EmbeddingFunction[Documents]):
    def __init__(
            self,
            model_name: str = "mahesh27/vedicberta-base",
            cache_dir: Optional[str] = None,
    ):
        try:
            from transformers import AutoModel, AutoTokenizer, DataCollatorWithPadding

            self._torch = importlib.import_module("torch")
            self._tokenizer = AutoTokenizer.from_pretrained(model_name)
            print(self._tokenizer.pad_token_id)
            self._model = AutoModel.from_pretrained(model_name, cache_dir=cache_dir)
            self._datacollator = DataCollatorWithPadding(tokenizer=self._tokenizer, return_tensors='pt')
            self._max_length = self._model.config.max_position_embeddings
            self._eps = 1e-5
        except ImportError:
            raise ValueError(
                "The transformers and/or pytorch python package is not installed. Please install it with "
                "`pip install transformers` or `pip install torch`"
            )

    @staticmethod
    def _normalize(vector: npt.NDArray) -> npt.NDArray:
        """Normalizes a vector to unit length using L2 norm."""
        norm = np.linalg.norm(vector)
        if norm == 0:
            return vector
        return vector / norm

    def __call__(self, input: Documents) -> Embeddings:
        tokenized_text = [self._tokenizer(text, truncation=True, max_length=self._max_length) for text in input]
        inputs = self._datacollator(tokenized_text)
        with self._torch.no_grad():
            outputs = self._model(**inputs)
        embeddings = (outputs.last_hidden_state*inputs['attention_mask'].unsqueeze(-1)).sum(dim=1)/(inputs['attention_mask'].sum(dim=1).unsqueeze(1)+self._eps)  # mean pooling
        return [e.tolist() for e in self._normalize(embeddings)]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        batch_length = 4
        out = []
        for batch in tqdm(batched(texts, batch_length), total = math.ceil(len(texts)//batch_length)):
            out += self(batch)
        
        return out

    def embed_query(self, text: str) -> List[float]:
        return self([text])[0]

In [238]:
import importlib
from typing import Optional, cast, List
import numpy as np
import numpy.typing as npt
from chromadb.api.types import EmbeddingFunction, Documents, Embeddings
import fasttext as ft
from gensim.models import KeyedVectors as kv

class VectorEmbeddingFunction(EmbeddingFunction[Documents]):
    def __init__(
            self,
            model_name: str = "gl",
            cache_dir: Optional[str] = None,
    ):
        self._model_name = model_name
        if model_name == "gl":
            self._model = kv.load_word2vec_format("sa_embedding/models/glove/vectors.vec", binary=False)
            self._word_vec = lambda x: self._model[x]
        elif model_name == "ft":
            self._model = ft.load_model("sa_embedding/models/fasttext/vectors.bin")
            self._word_vec = lambda x: self._model.get_word_vector(x)
    
    
    @staticmethod
    def _normalize(vector: npt.NDArray) -> npt.NDArray:
        """Normalizes a vector to unit length using L2 norm."""
        norm = np.linalg.norm(vector)
        if norm == 0:
            return vector
        return vector / norm
        
    def sentence2vec(self, sentence: str) -> npt.NDArray:
        words = sentence.split()
        word_vectors = []
        
        for word in words:
            if self._model_name == 'ft' or word in self._model:
                word_vectors.append(self._word_vec(word))
        
        if not word_vectors:
            return np.zeros(self._model.vector_size)
        
        # Compute the average vector
        avg_vector = np.mean(word_vectors, axis=0)
        
        return avg_vector


    def __call__(self, input: Documents) -> Embeddings:
        embeddings = []
        for text in input:
            embeddings.append(self.sentence2vec(text))
        return [e.tolist() for e in self._normalize(embeddings)]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self(texts)

    def embed_query(self, text: str) -> List[float]:
        return self([text])[0]

In [239]:
embedding = VectorEmbeddingFunction(model_name='ft')
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding)
# Retrieve and generate using the relevant snippets of the blog.
retriever = vectorstore.as_retriever()

In [240]:
#### INDEXING with BM25 ####
retriever = BM25Retriever.from_documents(splits)

In [241]:
#question = qa_df['QUESTION'][26]
question = 'गवाक्षस्य सेनायाः का जातिः?'

In [242]:
docs = retriever.invoke(lemmatize(question, translate_only=False))
len(docs)

4

In [244]:
docs[0].metadata['text']

'श्रुत्वा हनूमतो वाक्यं यथावदनुपूर्वशः\nततोऽब्रवीन्महातेजा रामः सत्यपराक्रमः ६,००४।००१\n\nयां निवेदयसे लङ्कां पुरीं भीमस्य रक्षसः\nक्षिप्रमेनां वधिष्यामि सत्यमेतद्ब्रवीमि ते ६,००४।००२\n\nअस्मिन्मुहूर्ते सुग्रीव प्रयाणमभिरोचये\nयुक्तो मुहूर्तो विजयः प्राप्तो मध्यं दिवाकरः ६,००४।००३\n\nउत्तरा फल्गुनी ह्यद्य श्वस्तु हस्तेन योक्ष्यते\nअभिप्रयाम सुग्रीव सर्वानीकसमावृताः ६,००४।००४\n\nनिमित्तानि च धन्यानि यानि प्रादुर्भवन्ति मे\nनिहत्य रावणं सीतामानयिष्यामि जानकीम् ६,००४।००५\n\nउपरिष्टाद्धि नयनं स्फुरमाणमिदं मम\nविजयं समनुप्राप्तं शंसतीव मनोरथम् ६,००४।००६\n\nअग्रे यातु बलस्यास्य नीलो मार्गमवेक्षितुम्\nवृतः शतसहस्रेण वानराणां तरस्विनाम् ६,००४।००७\n\nफलमूलवता नील शीतकाननवारिणा\nपथा मधुमता चाशु सेनां सेनापते नय ६,००४।००८\n\nदूषयेयुर्दुरात्मानः पथि मूलफलोदकम्\nराक्षसाः परिरक्षेथास्तेभ्यस्त्वं नित्यमुद्यतः ६,००४।००९\n\nनिम्नेषु वनदुर्गेषु वनेषु च वनौकसः\nअभिप्लुत्याभिपश्येयुः परेषां निहतं बलम् ६,००४।०१०\n\nसागरौघनिभं भीममग्रानीकं महाबलाः\nकपिसिंहा प्रकर्षन्तु शतशोऽथ सहस्रशः ६,००४।०११\n\nगजश्च गिरि

In [245]:
#### RETRIEVAL and GENERATION ####
# Prompt 
template = """त्वया संस्कृत-भाषायाम् एव वक्तव्यम्। न तु अन्यासु भाषासु। अधः रामायण-सम्बन्धे पृष्ट-प्रश्नस्य प्रत्युत्तरं देहि। तदपि एकेनैव पदेन, यावद् लघु शक्यं तावद्, तं पुनः विवृतम् मा कुरु। अपि च यथाऽवश्यम् अधः दत्त-सन्दर्भेभ्यः सहाय्यं गृहाण। तत्तु सर्वदा साधु इति नाऽस्ति प्रतीतिः।
     सन्दर्भाः:{context}
     प्रश्नः:{question}
    """
prompt = PromptTemplate.from_template(template)
# LLM
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)
#llm = ChatAnthropic(model_name="claude-3-5-sonnet-20240620", temperature=0)
#llm = ChatVertexAI(model_name="gemini-1.5-pro", temperature=0)
#llm = ChatMistralAI(model_name="mistral-large-latest", api_key= os.environ['MISTRAL_API_KEY'], temperature=0)
#llm = ChatFireworks(model_name="accounts/fireworks/models/llama-v3p1-405b-instruct", api_key=os.environ['FIREWORKS_API_KEY'], temperature=0, max_tokens=20)

# Post-processing
def format_docs(docs):
    return '\n\n'.join([doc.metadata['text'] for doc in docs[:2]])

# Chain
rag_chain = (
    {"context": RunnableLambda(lambda x: lemmatize(x, translate_only=True)) | retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser() 
    | RunnableLambda(lambda x: x.replace('।','').strip())
)


rag_chain.invoke(question)

'वानरः'

In [246]:
working_df['gpt-4o-RAG-restricted'] = working_df.progress_apply(lambda x: rag_chain.invoke(x['QUESTION']), axis=1)

100%|███████████████████████████████████████████| 60/60 [00:54<00:00,  1.11it/s]


In [247]:
models = working_df.columns
scores = {m: 0 for m in models}

for m in models:
    em = working_df.apply(lambda x: str(x['ANSWER']).strip() == str(x[m]).strip(), axis=1)
    scores[m] = round(em.sum()/len(em), 3)

scores

{'QUESTION': 0.0,
 'ANSWER': 1.0,
 'gpt-4o': 0.483,
 'gpt-4': 0.517,
 'gpt-3.5-turbo': 0.0,
 'gpt-4o-RAG': 0.55,
 'gpt-4o-KG-RAG': 0.483,
 'claude-3-5-sonnet': 0.717,
 'claude-3.5-sonnet-RAG': 0.6,
 'gemini-1.0-pro': 0.183,
 'gemini-1.5-flash': 0.267,
 'gemini-1.5-pro': 0.567,
 'mistral-large': 0.583,
 'llama-3.1-405B-instruct': 0.433,
 'mistral-large-RAG': 0.567,
 'llama-3.1-80b-i-KG-RAG': 0.2,
 'gpt-4o-RAG-ft': 0.567,
 'gpt-4o-RAG-gl': 0.433,
 'gpt-4o-KG-RAG-responses': 0.0,
 'gpt-4o-KG-RAG-query': 0.0,
 'gpt-4o-RAG-restricted': 0.533}

In [50]:
working_df.to_csv('results/predictions.tsv', index=False, sep='\t')

In [74]:
rag_chain.invoke("किं नम सरोवरः यः समन्तान् योजनं विस्तीर्य तिष्ठति?")

'सुरसा'

## KG RAG

### Curate KG

In [426]:
nodes_df = pd.read_csv('data/kg/sample/nodes.csv')
relationships_df = pd.read_csv('data/kg/sample/edges.csv')

In [427]:
relationships_df.drop(columns=['annotator', 'line_id', 'line_text', 'detail'], inplace=True)

In [428]:
nodes_df.drop(columns=['annotator', 'line_id', 'line_text'], inplace=True)

In [429]:
nodes_df[':LABEL'] = nodes_df.apply(lambda x: x[':LABEL'].split('/')[0].capitalize().replace(' ','_'), axis=1)
nodes_df

Unnamed: 0,:ID,:LABEL,lemma
0,1,Town,अयोध्या
1,2,Kingdom,कोसल
2,3,River,शरयू
3,4,Human,अंशुमान
4,5,Human,राम
...,...,...,...
673,708,Loka,पितृलोक
674,709,Demon,पिशाच
675,710,Rakshasa,पिशाच 1
676,711,Nymph,पुण्डरीका


In [430]:
relationships_df[':TYPE'] = relationships_df.apply(lambda x: '_'.join(x[':TYPE'].replace('-', ' ').upper().split()), axis=1)
relationships_df

Unnamed: 0,:START_ID,:TYPE,:END_ID
0,1,IS_CAPITAL_OF,2
1,1,IS_SITUTATED_ON_THE_BANKS_OF,3
2,64,IS_FATHER_OF,21
3,21,IS_FATHER_OF,22
4,22,IS_FATHER_OF,24
...,...,...,...
712,707,IS_A_DOOR_KIPPER_OF,260
713,708,IS_SITUATED_NEAR,290
714,205,HAS_INVOKED_THE_PROTECTION_OF,709
715,128,WAS_ACCOMPANIED_BY,710


In [431]:
nodes_df.to_csv('data/kg/nodes_processed.csv',index=False)
relationships_df.to_csv('data/kg/relationships_processed.csv',index=False)

### Load KG

In [134]:
from langchain_community.graphs import Neo4jGraph

graph = Neo4jGraph()
graph.get_schema

'Node properties:\nTOWN {lemma: STRING}\nKINGDOM {lemma: STRING}\nRIVER {lemma: STRING}\nHUMAN {lemma: STRING}\nVILLAGE {lemma: STRING}\nRAKSHASA {lemma: STRING}\nDEVA {lemma: STRING}\nRISHI {lemma: STRING}\nEPITHET {lemma: STRING}\nKULA {lemma: STRING}\nMOUNTAIN {lemma: STRING}\nGANA {lemma: STRING}\n\u200d\u200dQUALITY {lemma: STRING}\nUNIT {lemma: STRING}\nROAD {lemma: STRING}\nPROFESSIONAL {lemma: STRING}\nWEAPON {lemma: STRING}\nGOD {lemma: STRING}\nMONKEY {lemma: STRING}\nKING {lemma: STRING}\nPRINCE {lemma: STRING}\nCAPITAL {lemma: STRING}\nCITY {lemma: STRING}\nRAKSHASHI {lemma: STRING}\nPRAJAPATI {lemma: STRING}\nHILL {lemma: STRING}\nELEPHANT {lemma: STRING}\nSHAPE {lemma: STRING}\nFOREST {lemma: STRING}\nPART OF A SEA {lemma: STRING}\nAPSARA {lemma: STRING}\nGODDESS {lemma: STRING}\nDANAVA {lemma: STRING}\nCOUNTRY {lemma: STRING}\nMESSENGER {lemma: STRING}\nMAHARSHI {lemma: STRING}\nTREE {lemma: STRING}\nINSECT {lemma: STRING}\nANIMAL {lemma: STRING}\nDEVINE BIRD {lemma: STR

In [327]:
def get_schema(G, structured=False):
    schema = G.get_structured_schema

    lines = []
    
    nodes = schema['node_props']
    node_list = list(nodes.keys())
    lines.append(f"All nodes have properties:\n{{{nodes[node_list[0]][0]['property']}:{nodes[node_list[0]][0]['type']}}}")
    lines.append(f"Node labels:\n{', '.join(node_list)}")
    
    edges = schema['relationships']
    relationship_list = [e['type'] for e in edges]
    relationship_list = list(set(relationship_list))
    ontology = {key: {'start':[], 'end':[]} for key in relationship_list}
    for e in edges:
        rel = e['type']
        for n in ['start', 'end']:
            if e[n] not in ontology[rel][n]:
                ontology[rel][n].append(e[n])


    ontology_list = [f"({'|'.join([':'+n for n in ends['start']])})-[:{rel}]->({'|'.join([':'+n for n in ends['end']])})" for rel, ends in ontology.items()]
    lines.append(f"Relationships:\n{'\n'.join(ontology_list)}")
    if structured:
        return {'ontology': ontology, 'nodes': node_list}
        
    return '\n\n'.join(lines)

### Entity Extraction

In [156]:
from typing import List, Optional

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field

llm = ChatOpenAI(model="gpt-4o", temperature=0)
#llm = ChatAnthropic(model="claude-3-5-sonnet-20240620", temperature=0)
#llm = ChatFireworks(model_name="accounts/fireworks/models/llama-v3p1-405b-instruct", api_key=os.environ['FIREWORKS_API_KEY'], temperature=0)

class Entities(BaseModel):
    """Identifying information about entities."""

    names: List[str] = Field(
        ...,
        description="All the named entities appearing in the text",
    )



prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are extracting named entities from the text to query from a knowledge graph. The entities should be lemmatized i.e. in prātipadika form.",
        ),
        (
            "human",
            "Use the given format to extract information from the following."
            "input: {question}",
        ),
    ]
)


entity_chain = prompt | llm.with_structured_output(Entities)

In [442]:
#question = qa_df['QUESTION'][27]
question = 'कस्याः नद्यः तिरे पञ्चवटी अस्ति यत्र सह पत्न्या सह भ्रात्रा च रामोऽवसत्?'
question

'कस्याः नद्यः तिरे पञ्चवटी अस्ति यत्र सह पत्न्या सह भ्रात्रा च रामोऽवसत्?'

In [443]:
entities = entity_chain.invoke({"question": question})
entities

Entities(names=['नदी', 'पञ्चवटी', 'पत्नी', 'भ्रातृ', 'राम'])

In [523]:
label_query = """MATCH (p)
WHERE p.lemma = $value
RETURN p.lemma AS result, labels(p)[0] AS type
"""

def map_to_database(entities: Entities) -> Optional[str]:
    result = ""
    for entity in entities.names:
        responses = graph.query(label_query, {"value": entity})
        for response in responses:
            result += f"{entity} maps to {response['result']} {response['type']} in database\n"
            mapped_entity = {'type': response['type'], 'lemma': response['result']}

            edge_outward = f"""MATCH (p:{mapped_entity['type']})-[r]->(q)
            WHERE p.lemma = '{mapped_entity['lemma']}'
            RETURN r AS relationship, labels(q)[0] AS type
            """
            
            edge_inward =f"""MATCH (p:{mapped_entity['type']})<-[r]-(q)
            WHERE p.lemma = '{mapped_entity['lemma']}'
            RETURN r AS relationship, labels(q)[0] AS type
            """

            def prune_edges(edges):
                out = {}
                for edge in edges:
                    rel = edge['relationship'][1]
                    if rel not in out:
                        out[rel] = []
                    out[rel].append(edge['type'])
                return out
                
            outward_edges = graph.query(edge_outward)
            result += f"Outgoing relationships from :{response['type']}{{lemma: '{response['result']}'}} are:\n"
            for e, t in prune_edges(outward_edges).items():
                result += f"-[:{e}->({'|'.join([':'+l for l in t])})]\n"
                
            inward_edges = graph.query(edge_inward)
            result += f"Incoming relationships to :{response['type']}{{lemma: '{response['result']}'}} are:\n"
            for e, t in prune_edges(inward_edges).items():
                result += f"<-[:{e}-({'|'.join([':'+l for l in t])})]\n"

            result += '\n'
    
    return result

### Query Generation

In [538]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Generate Cypher statement based on natural language input
cypher_template = """Based on the extracted entities and the Neo4j graph ontology involving the extracted entities given below, 
write a Cypher query that would answer the user's question. Do not use variable names in Sanskrit (Devanagari).
only lemmas are in Sanskrit.

Entities in the question map to the following database values and their respective allowed ontologies:
All nodes have properties: {{lemma: STRING}}
{entities_list}

Example:
Question: "रामस्य पिता कः?", given the entities "राम maps to राम Human in database", 
Cypher query:"MATCH (father)-[:IS_FATHER_OF]->(son:Human {{lemma: 'राम'}})\nRETURN father.lemma AS father\n", 

Since the knowledge graph is incomplete, give minimalistic queries with minimum entities. Thus generate cypher query to the question below:

Question: {question}
Cypher query:"""  # noqa: E501

cypher_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Given an input question, convert it to a Cypher query. Do not give any preamble.",
        ),
        ("human", cypher_template),
    ]
)

cypher_response = (
    RunnablePassthrough.assign(names=entity_chain)
    | RunnablePassthrough.assign(
        entities_list=lambda x: map_to_database(x["names"]),
        #schema=lambda _: get_schema(graph),
    )
    | cypher_prompt
    | llm.bind(stop=["\nCypherResult:"])
    | StrOutputParser()
)

In [542]:
cypher = cypher_response.invoke({"question": "कस्याः नद्यः तिरे पञ्चवटी अस्ति यत्र सह पत्न्या सह भ्रात्रा च रामोऽवसत्?"})
print(cypher)

```cypher
MATCH (forest:Forest {lemma: 'पञ्चवटी'})<-[:FLOWED_BY_THE]-(river:River)
RETURN river.lemma AS river
```


In [545]:
graph.query(cypher.replace('```','').replace('cypher\n',''))

[{'river': 'गोदावरी'}]

In [555]:
from langchain.chains.graph_qa.cypher_utils import CypherQueryCorrector, Schema

# Cypher validation tool for relationship directions
corrector_schema = [
    Schema(el["start"], el["type"], el["end"])
    for el in graph.structured_schema.get("relationships")
]
cypher_validation = CypherQueryCorrector(corrector_schema)

# Generate natural language response based on database results
response_template = """Mould the Cypher response in to proper case/vibhakti (not in prātipadika form) appropriate to the question given below. 
Final answer should remain in single word.
Question: {question}
Cypher Response: {response}"""  # noqa: E501

response_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Answer the input question taking support of cypher response if not empty. No preamble",
        ),
        ("human", response_template),
    ]
)


response_chain = response_prompt | llm | StrOutputParser()
def run_query(query_str):
    try:
        response = graph.query(query_str.replace('```','').replace('cypher\n',''))
    except:
        response = ''
    if type(response) is list:
        if len(response) > 0:
            response = response[0]
            if type(response) is dict:
                for key, item in response.items():
                    response = item
                    break
            elif type(response) is not str:
                response = ''
        else:
            response = ''
    return response
        
kgrag_chain = (
    RunnablePassthrough.assign(query=cypher_response)
    | RunnablePassthrough.assign(
        response=lambda x: run_query(x["query"]),
    )
    | RunnablePassthrough.assign(answer=response_chain)
)

In [557]:
def kg_rag_answer(question):
    res = kgrag_chain.invoke({"question": question})
    if res['response'] != '':
        return res['answer']
    else:
        return ''
kg_rag_answer("का सा राक्षसी या उदधेः उपरि प्लवन्तं हनूमन्तं निवार्य तस्य भक्षणे च मतिं चकार?")



''

In [558]:
working_df = pd.read_csv('results/predictions.tsv', sep='\t')

In [559]:
working_df['gpt-4o-KG-RAG-responses'] = working_df.progress_apply(lambda x: kg_rag_answer(x['QUESTION']), axis=1)

100%|███████████████████████████████████████████| 60/60 [03:42<00:00,  3.70s/it]


In [560]:
models = working_df.columns
scores = {m: 0 for m in models}

for m in models:
    em = working_df.apply(lambda x: str(x['ANSWER']).strip() == str(x[m]).strip(), axis=1)
    scores[m] = round(em.sum()/len(em), 3)

scores

{'QUESTION': 0.0,
 'ANSWER': 1.0,
 'gpt-4o': 0.483,
 'gpt-4': 0.517,
 'gpt-3.5-turbo': 0.0,
 'gpt-4o-RAG': 0.55,
 'gpt-4o-KG-RAG': 0.483,
 'claude-3-5-sonnet': 0.717,
 'claude-3.5-sonnet-RAG': 0.6,
 'gemini-1.0-pro': 0.183,
 'gemini-1.5-flash': 0.267,
 'gemini-1.5-pro': 0.567,
 'mistral-large': 0.583,
 'llama-3.1-405B-instruct': 0.433,
 'mistral-large-RAG': 0.567,
 'llama-3.1-80b-i-KG-RAG': 0.2,
 'gpt-4o-RAG-ft': 0.567,
 'gpt-4o-RAG-gl': 0.433,
 'gpt-4o-KG-RAG-responses': 0.133,
 'gpt-4o-KG-RAG-query': 0.0}

In [561]:
working_df.to_csv('results/predictions.tsv', index=False, sep='\t')

In [192]:
working_df['gpt-4o-KG-RAG'][0]

'पुष्पके'

### Evaluating generated cypher query quality

In [562]:
cypher_queries = working_df['gpt-4o-KG-RAG-query']

In [563]:
query_text = cypher_queries[0]

In [564]:
import regex as re

edge_forward = re.compile(r"\((.+?)\)\-\[:(\S+?)\]\->\((.+?)\),?")
edge_backward = re.compile(r"\((.+?)\)<\-\[:(\S+?)\]\-\((.+?)\),?")
label_pattern0 = re.compile(r"\S*?:(.+)")
label_pattern1 = re.compile(r"(\S+)\s*\{lemma:\s*(.+)\}")

In [565]:
match = edge_backward.findall(query_text)

In [566]:
query_text

"```cypher\nMATCH (राम:Human {lemma: 'राम'})-[:RETURNED_TO]->(अयोध्या:Town {lemma: 'अयोध्या'})\nRETURN राम.lemma AS राम, अयोध्या.lemma AS अयोध्या\n```"

In [567]:
match

[]

In [568]:
def label_parse(node):
    if ':' not in node:
        return {'label': '', 'lemma': ''}
    outer_match = label_pattern0.findall(node)[0]

    if 'lemma' not in outer_match:
        return {'label': outer_match, 'lemma': ''}

    label, lemma = label_pattern1.findall(outer_match)[0]
    return {'label': label, 'lemma': lemma.replace("'",'')}

In [569]:
relationships = []
nodes = []

for query in tqdm(cypher_queries):
    match_forward = edge_forward.findall(query)
    match_backward = edge_backward.findall(query)

    for match in match_forward:
        start = label_parse(match[0])
        end = label_parse(match[2])

        relationships.append({'start': start['label'], 'relationship': match[1], 'end': end['label']})
        if start['label'] != '' and start['lemma'] != '':
            nodes.append(start)
        if end['label'] != '' and end['lemma'] != '':
            nodes.append(end)

    for match in match_backward:
        start = label_parse(match[2])
        end = label_parse(match[0])

        relationships.append({'start': start['label'], 'relationship': match[1], 'end': end['label']})
        if start['label'] != '' and start['lemma'] != '':
            nodes.append(start)
        if end['label'] != '' and end['lemma'] != '':
            nodes.append(end)

100%|████████████████████████████████████████| 60/60 [00:00<00:00, 94254.02it/s]


In [570]:
### Verify nodes
valid_label = []
label_lemma = []
valid_lemma = []

schema = get_schema(graph, structured=True)
for node in nodes:
    if node['label'] in schema['nodes']:
        valid_label.append(1)
    else:
        valid_label.append(0)

    if node['lemma'] != '':
        lemma_match = graph.query(f"""MATCH (p)
                        WHERE p.lemma CONTAINS '{node['lemma']}'
                        RETURN p.lemma AS result, labels(p)[0] AS type
                        LIMIT 1
                        """)
        if len(lemma_match) == 0:
            valid_lemma.append(0)
        else:
            valid_lemma.append(1)
            if lemma_match[0]['type'] == node['label']:
                label_lemma.append(1)
            else:
                label_lemma.append(0)

valid_ontology = []
valid_relationship = []
valid_start_type = []
valid_end_type = []
for edge in relationships:
    rel, start, end = edge['relationship'], edge['start'], edge['end']
    valid_ontology.append((rel in list(schema['ontology'].keys())) and 
                          (start=='' or start in schema['ontology'][rel]['start']) and
                          (end=='' or end in schema['ontology'][rel]['end']))
    valid_relationship.append(rel in list(schema['ontology'].keys()))

    if rel in list(schema['ontology'].keys()):
        valid_start_type.append(start=='' or start in schema['ontology'][rel]['start'])
        valid_end_type.append(end=='' or end in schema['ontology'][rel]['end'])

results = {'valid_label': np.mean(valid_label), 'valid_lemma': np.mean(valid_lemma), 
           'valid_label_lemma': np.mean(label_lemma), 'valid_ontology': np.mean(valid_ontology),
           'valid_relationship': np.mean(valid_relationship), 'valid_start_type': np.mean(valid_start_type),
           'valid_end_type': np.mean(valid_end_type)}

for key, val in results.items():
    results[key] = round(val, 2)

results

{'valid_label': 0.0,
 'valid_lemma': 0.78,
 'valid_label_lemma': 0.94,
 'valid_ontology': 0.02,
 'valid_relationship': 0.69,
 'valid_start_type': 0.64,
 'valid_end_type': 0.02}

In [571]:
edge

{'start': '', 'relationship': 'IS_FATHER_OF', 'end': 'Human'}