In [1056]:
import openai
from dotenv import load_dotenv
from tqdm import tqdm
import pandas as pd
import os

tqdm.pandas()
load_dotenv(override=True)

True

## Load QA data

In [579]:
qa_df = pd.read_csv('data/sample.tsv', sep='\t')

In [574]:
# Strip earlier model-prediction columns from the QA sample and persist it.
# errors='ignore' keeps the cell re-runnable: once the TSV has been rewritten
# without these columns, a plain drop() would raise KeyError on the next run.
qa_df = qa_df.drop(columns=['gpt-4o','gpt-4','gpt-3.5-turbo'], errors='ignore')
qa_df.to_csv('data/sample.tsv',sep='\t',index=False)

In [580]:
qa_df

Unnamed: 0,QUESTION,ANSWER
0,यस्मिन् विमाने उपाविश्य रामः स्वपत्न्या भ्रात्...,पुष्पकम्
1,का राम-वनप्रस्थानाय भरत-राज्याभिषेकाय च कैकेयी...,मन्थरा
2,विश्वामित्रो यदा क्षत्रियः आसीत् तदा तस्य नाम ...,विश्वजित्
3,विश्वामित्रस्य तपसो विघ्नः कस्याः अप्सरसः जज्ञे?,मेनकायाः
4,देवासुरैः निर्मथित-क्षीरोदधेः योऽश्वो जातः तस्...,उच्चैःश्रवाः
5,का सा राक्षसी या उदधेः उपरि प्लवन्तं हनूमन्तं ...,सुरसा
6,आत्मनः पुत्रम् इच्छन् दशरथः केनेष्टिना याजयामास?,पुत्रेकामेष्ट्या
7,विदेहाधिपतेः जनकस्य पुरोहितः कः?,शतानन्दः
8,जनकस्य कनीयसो भ्रातुः नाम किम्?,कुशध्वजः
9,कः स ऋषिः यः समुद्रान् अपिबत्?,अगस्त्यः


## Zero-shot QA prompt

In [65]:
from langchain_openai import ChatOpenAI

chat_model = ChatOpenAI(model="gpt-4o")

In [572]:
from langchain_anthropic import ChatAnthropic

chat_model = ChatAnthropic(model="claude-3-5-sonnet-20240620")

In [67]:
from langchain_google_vertexai import ChatVertexAI

chat_model = ChatVertexAI(model="gemini-1.5-pro")



In [68]:
from langchain_mistralai import ChatMistralAI

chat_model = ChatMistralAI(model="mistral-large-latest", api_key= os.environ['MISTRAL_API_KEY'])

In [69]:
from langchain_fireworks import ChatFireworks

chat_model = ChatFireworks(model="accounts/fireworks/models/llama-v3p1-8b-instruct", api_key=os.environ['FIREWORKS_API_KEY'])

In [13]:
from langchain_core.prompts.chat import ChatPromptTemplate

template = "त्वया संस्कृत-भाषायाम् एव वक्तव्यम्। न तु अन्यासु भाषासु। अधः रामायण-सम्बन्धे पृष्ट-प्रश्नस्य प्रत्युत्तरं देहि। तदपि एकेनैव पदेन, यावद् लघु शक्यं तावद्, तं पुनः विवृतम् मा कुरु।"
human_template = "{question}"

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", template),
    ("human", human_template),
])

def output_parse(ai_message):
    """Return the message text with every danda ('।') removed and outer whitespace trimmed.

    `ai_message` only needs a string `.content` attribute.
    """
    reply_text = ai_message.content
    without_danda = ''.join(reply_text.split('।'))
    return without_danda.strip()
    
zeroshot_chain = chat_prompt | chat_model | output_parse

In [14]:
question = qa_df['QUESTION'][9]
zeroshot_chain.invoke({'question': question})

'Valmiki'

In [7]:
working_df = pd.read_csv('results/predictions.tsv', sep='\t')
working_df['gpt-4o-mini'] = working_df.progress_apply(lambda x: zeroshot_chain.invoke({"question": x['QUESTION']}), axis=1)

100%|███████████████████████████████████████████| 60/60 [01:05<00:00,  1.10s/it]


In [8]:
models = working_df.columns
scores = {m: 0 for m in models}

for m in models:
    em = working_df.apply(lambda x: str(x['ANSWER']).strip() == str(x[m]).strip(), axis=1)
    scores[m] = round(em.sum()/len(em), 3)
scores

{'QUESTION': 0.0,
 'ANSWER': 1.0,
 'gpt-4o': 0.483,
 'gpt-4': 0.517,
 'gpt-3.5-turbo': 0.0,
 'gpt-4o-RAG': 0.55,
 'gpt-4o-KG-RAG': 0.383,
 'claude-3-5-sonnet': 0.717,
 'claude-3.5-sonnet-RAG': 0.6,
 'gemini-1.0-pro': 0.183,
 'gemini-1.5-flash': 0.267,
 'gemini-1.5-pro': 0.567,
 'mistral-large': 0.583,
 'llama-3.1-405B-instruct': 0.433,
 'mistral-large-RAG': 0.567,
 'llama-3.1-80b-i-KG-RAG': 0.2,
 'gpt-4o-mini': 0.017}

In [60]:
working_df.to_csv('results/predictions.tsv', index=False, sep='\t')

## RAG prompt

In [233]:
os.environ['LANGCHAIN_TRACING_V2'] = 'false'

In [575]:
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.text import TextLoader

# Load Documents
loader = TextLoader(file_path='data/ref/rAmAyaNa_dev.txt')
docs = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=300)
#text_splitter = RecursiveCharacterTextSplitter(chunk_size=450, chunk_overlap=90)
splits = text_splitter.split_documents(docs)

In [1148]:
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, T5ForConditionalGeneration
from indic_transliteration.sanscript import IAST, DEVANAGARI, transliterate
import torch
import regex as re
import string

checkpoint = 'mahesh27/t5lemmatizer'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
lemmatizer_model = T5ForConditionalGeneration.from_pretrained(checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

# Maps "base letter + combining mark(s)" sequences to the single character
# they should be written as.
diatrics_corr = {'r'+'̣':'ṛ', 's'+'̣':'ṣ', 'r'+'̣'+'̄': 'ṝ', 't'+'̣':'ṭ', 'd'+'̣':'ḍ', 
                 'n'+'̣':'ṇ', 'l'+'̱':'ḻ', 'a'+'̄':'ā', 'i'+'̄':'ī', 'u'+'̄':'ū', 's'+'́':'ś',
                 'n'+'̇': 'ṅ', 'n'+'̃' : 'ñ',
                }

# Key lengths, longest first, so that a three-character sequence is tried
# before the two-character sequence that is its prefix.
_DIATRICS_KEY_LENGTHS = sorted({len(key) for key in diatrics_corr}, reverse=True)

def corr_diatrics(sent):
    """Replace every sequence listed in `diatrics_corr` with its single-character form.

    The text is scanned left to right; at each position the longest matching
    key wins. (Trying only two characters made the three-character key
    unreachable: its first two characters were always consumed first.)

    Args:
        sent: a string (or any iterable of single characters).

    Returns:
        The corrected string; characters that start no known sequence are
        copied through unchanged.
    """
    chars = list(sent)
    pieces = []
    i = 0
    while i < len(chars):
        for width in _DIATRICS_KEY_LENGTHS:
            candidate = ''.join(chars[i:i + width])
            if candidate in diatrics_corr:
                pieces.append(diatrics_corr[candidate])
                i += len(candidate)
                break
        else:
            # No key starts here: keep the character as it is.
            pieces.append(chars[i])
            i += 1
    return ''.join(pieces)
def lemmatize(chunk, translate_only=False, max_length=64):
    """Clean a Devanagari text chunk, transliterate it to IAST and optionally lemmatize it.

    Per line of `chunk` (blank-line pairs are collapsed first):
      1. ASCII letters and digits are removed,
      2. the line is transliterated from Devanagari to IAST,
      3. digits produced by the transliteration are removed,
      4. ASCII punctuation (`string.punctuation`) is removed.

    Args:
        chunk: text to process; lines are separated by '\\n'.
        translate_only: when True, stop after the steps above and return the
            cleaned IAST lines joined by single spaces (no model call).
        max_length: `max_length` passed to `lemmatizer_model.generate` for
            every line (default 64, the value that used to be hard-coded).

    Returns:
        A single space-separated string: the cleaned IAST text when
        `translate_only` is True, otherwise the decoded model output after
        `corr_diatrics`, with whitespace runs collapsed.
    """
    ascii_alnum = re.compile(r'[a-zA-Z0-9]+')
    digits = re.compile(r'[0-9]+')
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))

    lines = []
    for raw_line in chunk.replace('\n\n', '\n').split('\n'):
        line = ascii_alnum.sub('', raw_line)
        line = digits.sub('', transliterate(line, DEVANAGARI, IAST))
        lines.append(line.translate(drop_punctuation))

    if translate_only:
        return ' '.join(lines)

    # NOTE(review): an EOS id is appended on top of whatever the tokenizer
    # already returns; kept as-is because the checkpoint may expect it — confirm.
    tokenized_text = [
        {'input_ids': tokenizer(line)['input_ids'] + [tokenizer.eos_token_id]}
        for line in lines
    ]
    inputs = data_collator(tokenized_text)
    with torch.no_grad():
        outputs = lemmatizer_model.generate(inputs=inputs['input_ids'], max_length=max_length)

    decoded = ' '.join(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    # split()/join collapses any whitespace runs in the decoded text.
    return ' '.join(corr_diatrics(decoded).split())


In [1152]:
lemmatize("""

   "सुमेलयतु -
i. आमाम्रफलम्      1. वातहरम् श्लेष्मशुक्रवर्धनम्
ii. शुष्कामाम्र फल    2. वातहरम् पित्तप्रकोपनम् 
iii. पक्वाम्रफलम्       3. वातपित्तकृत् 
iv. वृक्षपक्वाम्रफलम्  4. कफवातजित्""")

'su lup māmra phala vāta hara śleṣman śukra vardhana śuṣka āmra phala vāta hara pitta prakopana pakva āmra phala vāta pitta kṛt vṛkṣa pakva āmra phala kapha vāta jit'

In [None]:
for split in tqdm(splits):
    # Store the untouched passage exactly once. On a re-run page_content is
    # already lemmatized, so it must neither overwrite metadata['text'] nor be
    # lemmatized a second time.
    original_text = split.metadata.setdefault('text', split.page_content)
    split.page_content = lemmatize(original_text, translate_only=False)

In [237]:
from langchain_community.retrievers import BM25Retriever
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain.prompts import PromptTemplate
from langchain_chroma import Chroma

In [86]:
#### custom Embeddings ####
import importlib
from typing import Optional, cast, List
from itertools import batched
import numpy as np
import numpy.typing as npt
from chromadb.api.types import EmbeddingFunction, Documents, Embeddings
import math


class TransformerEmbeddingFunction(EmbeddingFunction[Documents]):
    """Embeds texts by mean-pooling the last hidden state of a Hugging Face model.

    Each text is tokenized (truncated to the model's `max_position_embeddings`),
    run through the model, averaged over non-padding tokens and scaled to unit
    L2 length. Also exposes `embed_documents` / `embed_query`.
    """

    def __init__(
            self,
            model_name: str = "mahesh27/vedicberta-base",
            cache_dir: Optional[str] = None,
    ):
        """Load tokenizer and model.

        Args:
            model_name: Hugging Face checkpoint name.
            cache_dir: optional download cache directory.

        Raises:
            ValueError: if `transformers` or `torch` cannot be imported.
        """
        try:
            from transformers import AutoModel, AutoTokenizer, DataCollatorWithPadding

            self._torch = importlib.import_module("torch")
        except ImportError as exc:
            raise ValueError(
                "The transformers and/or pytorch python package is not installed. Please install it with "
                "`pip install transformers` or `pip install torch`"
            ) from exc

        self._tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        self._model = AutoModel.from_pretrained(model_name, cache_dir=cache_dir)
        self._datacollator = DataCollatorWithPadding(tokenizer=self._tokenizer, return_tensors='pt')
        # NOTE(review): for some architectures max_position_embeddings is larger
        # than the longest usable input — confirm for this checkpoint.
        self._max_length = self._model.config.max_position_embeddings
        # Keeps the mean-pooling denominator away from zero.
        self._eps = 1e-5

    @staticmethod
    def _normalize(vector: npt.NDArray) -> npt.NDArray:
        """Scale to unit L2 length along the last axis.

        Accepts one vector or a (batch, dim) matrix. Every row is normalised
        on its own, so the result does not depend on what else is in the batch;
        all-zero rows are returned unchanged.
        """
        arr = np.asarray(vector)
        norms = np.linalg.norm(arr, axis=-1, keepdims=True)
        return arr / np.where(norms == 0, 1.0, norms)

    def __call__(self, input: Documents) -> Embeddings:
        """Return one unit-length embedding (list of floats) per text in `input`."""
        if not input:
            return []
        tokenized_text = [self._tokenizer(text, truncation=True, max_length=self._max_length) for text in input]
        inputs = self._datacollator(tokenized_text)
        with self._torch.no_grad():
            outputs = self._model(**inputs)
            # Mean pooling: padded positions are zeroed by the attention mask
            # and excluded from the token count.
            mask = inputs['attention_mask'].unsqueeze(-1)
            summed = (outputs.last_hidden_state * mask).sum(dim=1)
            embeddings = summed / (mask.sum(dim=1) + self._eps)
        return [e.tolist() for e in self._normalize(embeddings.cpu().numpy())]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed `texts` in batches of four, showing a progress bar."""
        batch_length = 4
        out = []
        # True division inside ceil so a trailing partial batch is counted.
        for batch in tqdm(batched(texts, batch_length), total=math.ceil(len(texts) / batch_length)):
            out += self(list(batch))

        return out

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string."""
        return self([text])[0]

In [238]:
import importlib
from typing import Optional, cast, List
import numpy as np
import numpy.typing as npt
from chromadb.api.types import EmbeddingFunction, Documents, Embeddings
import fasttext as ft
from gensim.models import KeyedVectors as kv

class VectorEmbeddingFunction(EmbeddingFunction[Documents]):
    """Embeds a text as the unit-length average of its static word vectors.

    `model_name` selects the vector file: "gl" loads word2vec-format vectors
    through gensim, "ft" loads a fastText binary model.
    """

    def __init__(
            self,
            model_name: str = "gl",
            cache_dir: Optional[str] = None,
    ):
        """Load the word vectors.

        Args:
            model_name: "gl" or "ft".
            cache_dir: accepted for signature compatibility; not used.

        Raises:
            ValueError: for any other `model_name` (previously this left the
                object without a model and failed later on first use).
        """
        self._model_name = model_name
        if model_name == "gl":
            self._model = kv.load_word2vec_format("sa_embedding/models/glove/vectors.vec", binary=False)
            self._word_vec = lambda x: self._model[x]
            self._dim = self._model.vector_size
        elif model_name == "ft":
            self._model = ft.load_model("sa_embedding/models/fasttext/vectors.bin")
            self._word_vec = lambda x: self._model.get_word_vector(x)
            # fastText models report their size via get_dimension(), not vector_size.
            self._dim = self._model.get_dimension()
        else:
            raise ValueError(f"Unknown model_name {model_name!r}; expected 'gl' or 'ft'")

    @staticmethod
    def _normalize(vector: npt.NDArray) -> npt.NDArray:
        """Scale to unit L2 length along the last axis.

        Accepts one vector or a (batch, dim) matrix. Every row is normalised
        on its own; all-zero rows are returned unchanged.
        """
        arr = np.asarray(vector)
        norms = np.linalg.norm(arr, axis=-1, keepdims=True)
        return arr / np.where(norms == 0, 1.0, norms)

    def sentence2vec(self, sentence: str) -> npt.NDArray:
        """Average the vectors of the whitespace-separated words of `sentence`.

        For "gl", words missing from the vocabulary are skipped; for "ft" every
        word is looked up. Returns a zero vector when no word contributes.
        """
        word_vectors = [
            self._word_vec(word)
            for word in sentence.split()
            if self._model_name == 'ft' or word in self._model
        ]

        if not word_vectors:
            return np.zeros(self._dim)

        return np.mean(word_vectors, axis=0)

    def __call__(self, input: Documents) -> Embeddings:
        """Return one unit-length embedding (list of floats) per text in `input`."""
        if not input:
            return []
        matrix = np.stack([self.sentence2vec(text) for text in input])
        return [e.tolist() for e in self._normalize(matrix)]

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed a list of documents."""
        return self(texts)

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string."""
        return self([text])[0]

In [239]:
embedding = VectorEmbeddingFunction(model_name='ft')
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding)
# Expose the vector store as a retriever over the embedded text chunks.
retriever = vectorstore.as_retriever()

In [240]:
#### INDEXING with BM25 ####
retriever = BM25Retriever.from_documents(splits)

In [241]:
#question = qa_df['QUESTION'][26]
question = 'गवाक्षस्य सेनायाः का जातिः?'

In [242]:
docs = retriever.invoke(lemmatize(question, translate_only=False))
len(docs)

4

In [244]:
docs[0].metadata['text']

'श्रुत्वा हनूमतो वाक्यं यथावदनुपूर्वशः\nततोऽब्रवीन्महातेजा रामः सत्यपराक्रमः ६,००४।००१\n\nयां निवेदयसे लङ्कां पुरीं भीमस्य रक्षसः\nक्षिप्रमेनां वधिष्यामि सत्यमेतद्ब्रवीमि ते ६,००४।००२\n\nअस्मिन्मुहूर्ते सुग्रीव प्रयाणमभिरोचये\nयुक्तो मुहूर्तो विजयः प्राप्तो मध्यं दिवाकरः ६,००४।००३\n\nउत्तरा फल्गुनी ह्यद्य श्वस्तु हस्तेन योक्ष्यते\nअभिप्रयाम सुग्रीव सर्वानीकसमावृताः ६,००४।००४\n\nनिमित्तानि च धन्यानि यानि प्रादुर्भवन्ति मे\nनिहत्य रावणं सीतामानयिष्यामि जानकीम् ६,००४।००५\n\nउपरिष्टाद्धि नयनं स्फुरमाणमिदं मम\nविजयं समनुप्राप्तं शंसतीव मनोरथम् ६,००४।००६\n\nअग्रे यातु बलस्यास्य नीलो मार्गमवेक्षितुम्\nवृतः शतसहस्रेण वानराणां तरस्विनाम् ६,००४।००७\n\nफलमूलवता नील शीतकाननवारिणा\nपथा मधुमता चाशु सेनां सेनापते नय ६,००४।००८\n\nदूषयेयुर्दुरात्मानः पथि मूलफलोदकम्\nराक्षसाः परिरक्षेथास्तेभ्यस्त्वं नित्यमुद्यतः ६,००४।००९\n\nनिम्नेषु वनदुर्गेषु वनेषु च वनौकसः\nअभिप्लुत्याभिपश्येयुः परेषां निहतं बलम् ६,००४।०१०\n\nसागरौघनिभं भीममग्रानीकं महाबलाः\nकपिसिंहा प्रकर्षन्तु शतशोऽथ सहस्रशः ६,००४।०११\n\nगजश्च गिरि

In [245]:
#### RETRIEVAL and GENERATION ####
# Prompt 
template = """त्वया संस्कृत-भाषायाम् एव वक्तव्यम्। न तु अन्यासु भाषासु। अधः रामायण-सम्बन्धे पृष्ट-प्रश्नस्य प्रत्युत्तरं देहि। तदपि एकेनैव पदेन, यावद् लघु शक्यं तावद्, तं पुनः विवृतम् मा कुरु। अपि च यथाऽवश्यम् अधः दत्त-सन्दर्भेभ्यः सहाय्यं गृहाण। तत्तु सर्वदा साधु इति नाऽस्ति प्रतीतिः।
     सन्दर्भाः:{context}
     प्रश्नः:{question}
    """
prompt = PromptTemplate.from_template(template)
# LLM
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)
#llm = ChatAnthropic(model_name="claude-3-5-sonnet-20240620", temperature=0)
#llm = ChatVertexAI(model_name="gemini-1.5-pro", temperature=0)
#llm = ChatMistralAI(model_name="mistral-large-latest", api_key= os.environ['MISTRAL_API_KEY'], temperature=0)
#llm = ChatFireworks(model_name="accounts/fireworks/models/llama-v3p1-405b-instruct", api_key=os.environ['FIREWORKS_API_KEY'], temperature=0, max_tokens=20)

# Post-processing
def format_docs(docs):
    """Build the prompt context from the first two retrieved documents.

    Uses each document's `metadata['text']` rather than `page_content`,
    separated by a blank line.
    """
    top_two = docs[:2]
    stored_texts = (doc.metadata['text'] for doc in top_two)
    return '\n\n'.join(stored_texts)

# Chain
rag_chain = (
    {"context": RunnableLambda(lambda x: lemmatize(x, translate_only=True)) | retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser() 
    | RunnableLambda(lambda x: x.replace('।','').strip())
)


rag_chain.invoke(question)

'वानरः'

In [246]:
working_df['gpt-4o-RAG-restricted'] = working_df.progress_apply(lambda x: rag_chain.invoke(x['QUESTION']), axis=1)

100%|███████████████████████████████████████████| 60/60 [00:54<00:00,  1.11it/s]


In [247]:
models = working_df.columns
scores = {m: 0 for m in models}

for m in models:
    em = working_df.apply(lambda x: str(x['ANSWER']).strip() == str(x[m]).strip(), axis=1)
    scores[m] = round(em.sum()/len(em), 3)

scores

{'QUESTION': 0.0,
 'ANSWER': 1.0,
 'gpt-4o': 0.483,
 'gpt-4': 0.517,
 'gpt-3.5-turbo': 0.0,
 'gpt-4o-RAG': 0.55,
 'gpt-4o-KG-RAG': 0.483,
 'claude-3-5-sonnet': 0.717,
 'claude-3.5-sonnet-RAG': 0.6,
 'gemini-1.0-pro': 0.183,
 'gemini-1.5-flash': 0.267,
 'gemini-1.5-pro': 0.567,
 'mistral-large': 0.583,
 'llama-3.1-405B-instruct': 0.433,
 'mistral-large-RAG': 0.567,
 'llama-3.1-80b-i-KG-RAG': 0.2,
 'gpt-4o-RAG-ft': 0.567,
 'gpt-4o-RAG-gl': 0.433,
 'gpt-4o-KG-RAG-responses': 0.0,
 'gpt-4o-KG-RAG-query': 0.0,
 'gpt-4o-RAG-restricted': 0.533}

In [50]:
working_df.to_csv('results/predictions.tsv', index=False, sep='\t')

In [74]:
rag_chain.invoke("किं नम सरोवरः यः समन्तान् योजनं विस्तीर्य तिष्ठति?")

'सुरसा'

## KG RAG

### Curate KG

In [1132]:
dataset = 'ayurveda'

In [1133]:
nodes_df = pd.read_csv(f'data/kg/{dataset}/nodes.csv')
relationships_df = pd.read_csv(f'data/kg/{dataset}/edges.csv')

In [1134]:
relationships_df.drop(columns=['annotator', 'line_id', 'line_text', 'detail'], inplace=True)

In [1135]:
nodes_df.drop(columns=['annotator', 'line_id', 'line_text'], inplace=True)

In [1136]:
nodes_df.fillna('',inplace=True)

In [1137]:
nodes_df[':LABEL'] = nodes_df.apply(lambda x: x[':LABEL'].split('/')[0].capitalize().replace(' ','_'), axis=1)
nodes_df

Unnamed: 0,:ID,:LABEL,lemma,auto
0,4459,Substance,हरीतकी,
1,4460,Substance,अमृता,
2,4461,Substance,पूतना,
3,4462,Substance,कायस्था,
4,4463,Substance,पथ्या,
...,...,...,...,...
4677,63,,,True
4678,61,,,True
4679,60,,,True
4680,64,,,True


In [1138]:
relationships_df[':TYPE'] = relationships_df.apply(lambda x: '_'.join(x[':TYPE'].replace('-', ' ').upper().split()), axis=1)
relationships_df

Unnamed: 0,:START_ID,:TYPE,:END_ID
0,3325,IS_ABSOLUTE_QUANTITY_OF,4459
1,4460,IS_SYNONYM_OF,4459
2,4461,IS_SYNONYM_OF,4459
3,4462,IS_SYNONYM_OF,4459
4,4463,IS_SYNONYM_OF,4459
...,...,...,...
10613,4453,IS_CURED_BY,4448
10614,4453,IS_CURED_BY,4449
10615,4453,IS_CURED_BY,4450
10616,4453,IS_CURED_BY,4451


In [1139]:
nodes_df.to_csv(f'data/kg/{dataset}/nodes_processed.csv',index=False)
relationships_df.to_csv(f'data/kg/{dataset}/relationships_processed.csv',index=False)

### Load KG

In [587]:
from langchain_community.graphs import Neo4jGraph

graph = Neo4jGraph()
graph.get_schema

'Node properties:\nTown {lemma: STRING}\nKingdom {lemma: STRING}\nRiver {lemma: STRING}\nHuman {lemma: STRING}\nEpithet {lemma: STRING}\nVillage {lemma: STRING}\nRakshasa {lemma: STRING}\nProfessional {lemma: STRING}\nDeva {lemma: STRING}\nRishi {lemma: STRING}\nFamily {lemma: STRING}\nMountain {lemma: STRING}\nGana {lemma: STRING}\n\u200d\u200dquality {lemma: STRING}\nUnit {lemma: STRING}\nRoad {lemma: STRING}\nWeapon {lemma: STRING}\nShape {lemma: STRING}\nMonkey {lemma: STRING}\nKing {lemma: STRING}\nHill {lemma: STRING}\nGod {lemma: STRING}\nLinage {lemma: STRING}\nMaharshi {lemma: STRING}\nPrince {lemma: STRING}\nCapital {lemma: STRING}\nCity {lemma: STRING}\nRakshashi {lemma: STRING}\nPrajapati {lemma: STRING}\nElephant {lemma: STRING}\nForest {lemma: STRING}\nPart_of_a_sea {lemma: STRING}\nApsara {lemma: STRING}\nGoddess {lemma: STRING}\nDanava {lemma: STRING}\nCountry {lemma: STRING}\nMessenger {lemma: STRING}\nDevine_bird {lemma: STRING}\nDirection {lemma: STRING}\nAditya {lem

In [588]:
def get_schema(G, structured=False):
    """Summarise a graph's structured schema as text or as a dict.

    Args:
        G: object exposing `get_structured_schema`, a dict with 'node_props'
            (label -> list of {'property', 'type'}) and 'relationships'
            (list of {'start', 'type', 'end'}).
        structured: when True return
            {'ontology': {rel: {'start': [...], 'end': [...]}}, 'nodes': [...]}.

    Returns:
        The dict described above, or a text block with three sections: node
        properties, node labels, and one `(:A|:B)-[:REL]->(:C)` line per
        relationship type. Relationship types and their end labels appear in
        first-seen order, so the output is stable between runs.
    """
    schema = G.get_structured_schema
    nodes = schema['node_props']
    node_list = list(nodes.keys())

    # relationship type -> distinct start/end labels, in first-seen order
    ontology = {}
    for edge in schema['relationships']:
        ends = ontology.setdefault(edge['type'], {'start': [], 'end': []})
        for side in ('start', 'end'):
            if edge[side] not in ends[side]:
                ends[side].append(edge[side])

    if structured:
        return {'ontology': ontology, 'nodes': node_list}

    def alternatives(labels):
        return '|'.join(':' + label for label in labels)

    lines = []
    # Only the first property of the first label is reported, on the assumption
    # that every label carries the same property. Skipped for an empty schema.
    first_props = nodes[node_list[0]] if node_list else []
    if first_props:
        prop = first_props[0]
        lines.append(f"All nodes have properties:\n{{{prop['property']}:{prop['type']}}}")
    lines.append(f"Node labels:\n{', '.join(node_list)}")

    ontology_list = [
        f"({alternatives(ends['start'])})-[:{rel}]->({alternatives(ends['end'])})"
        for rel, ends in ontology.items()
    ]
    lines.append("Relationships:\n" + '\n'.join(ontology_list))

    return '\n\n'.join(lines)

In [606]:
get_schema(graph, structured= True)

{'ontology': {'DID_INQUIRE_ABOUT': {'start': ['Human'], 'end': ['Human']},
  'WAS_FED_BY': {'start': ['God'], 'end': ['Devi']},
  'BURNT_THE_CARCASS_OF': {'start': ['Human'], 'end': ['Devine_bird']},
  'BELONGED_TO': {'start': ['Human', 'Residential_place'],
   'end': ['Family', 'Maharshi', 'Monkey']},
  'PROTECTION_INVOKED_FOR': {'start': ['Aditya'], 'end': ['Human']},
  'IS_SITUATED_AFTER': {'start': ['Sea'], 'end': ['Sea']},
  'ARE_CREATED_BY': {'start': ['Race', 'Military', 'Soldier'],
   'end': ['Devine_cow']},
  'IS_CONQUERED_BY': {'start': ['City'], 'end': ['Human']},
  'IS_REFERRED_BY': {'start': ['King'], 'end': ['Human']},
  'WAS_PROTECTED_BY': {'start': ['Forest'], 'end': ['Monkey']},
  'IS_PLACED_BY': {'start': ['Human'], 'end': ['Rakshashi']},
  'ARE_APPOINTED_BY': {'start': ['Devi'], 'end': ['God']},
  'IS_MATERNAL_UNCLE_OF': {'start': ['Human', 'Rakshasa', 'Monkey'],
   'end': ['Human', 'Rakshasa', 'Monkey']},
  'IS_MESSENGER_OF': {'start': ['Human'], 'end': ['Maharshi']

### Entity Extraction

In [1076]:
from typing import List, Optional

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from SktQA.utils import corr_diatrics

llm = ChatOpenAI(model="gpt-4o", temperature=0)
#llm = ChatAnthropic(model="claude-3-5-sonnet-20240620", temperature=0)
#llm = ChatFireworks(model_name="accounts/fireworks/models/llama-v3p1-405b-instruct", api_key=os.environ['FIREWORKS_API_KEY'], temperature=0)
#llm = ChatVertexAI(model_name="gemini-1.5-pro", temperature=0)
#llm = ChatMistralAI(model_name="mistral-large-latest", api_key= os.environ['MISTRAL_API_KEY'], temperature=0)

class Entities(BaseModel):
    """Identifying information about entities."""
    # Structured-output schema for the entity-extraction chains.
    # NOTE(review): the docstring and the Field descriptions are presumably
    # forwarded to the LLM as part of the schema — confirm before rewording.

    # Entity names; `prune` pairs names[i] with scores[i] by position.
    names: List[str] = Field(
        ...,
        description="All the named entities appearing in the text",
    )
    # One relevance score per entry in `names`.
    scores: List[float] = Field(
        ...,
        description="Relevance scores of the named entities appearing in the text",
    )

    # Path prefixes per entity. Declared as required, but `entity_lemmas`
    # overwrites whatever value arrives here.
    paths: List[List[str]] = Field(
        ...,
        description="Previous route"
    )



topic_entity_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """त्वम् knowledge-graph-तः उत्तराणि निष्कर्षयितुं प्रश्नात् entities विन्दसि च तानि सह relevance-score (0-1 मध्ये) समर्पयसि ।
            output उदाहरणम् ('रामः', 0.8), ('सीता', 0.7) । ततो विवृतं मा कुरु । """,
        ),
        (
            "human",
            "प्रश्नः: {question}",
        ),
    ]
)

def entity_lemmas(entities: Entities, lemmatized=False):
    """Normalise entity names to Devanagari lemmas and reset their paths.

    Whitespace inside each name is removed. Unless `lemmatized` is True, the
    names are then lemmatized with `lemmatize` (which returns IAST) and
    transliterated back to Devanagari. All names are first sent in one call;
    if that does not yield exactly one word per name, each name is lemmatized
    on its own so the result stays aligned with `entities.scores`.

    Args:
        entities: object with `names`, `scores` and `paths`; mutated in place.
        lemmatized: True when the names are already lemmas.

    Returns:
        The same `entities` object, with `names` replaced and `paths` set to
        one independent `['']` list per name.
    """
    names = [''.join(name.split()) for name in entities.names]

    if not lemmatized and names:
        joined = lemmatize(' '.join(names)).replace('\n', ' ')
        lemmas = transliterate(corr_diatrics(joined), IAST, DEVANAGARI).split()
        if len(lemmas) != len(names):
            # A name was split into several lemmas (or names were merged).
            # lemmatize() returns one space-joined string, so per-name results
            # cannot be recovered from a single call; run it once per name and
            # glue that name's pieces back together.
            lemmas = [
                transliterate(corr_diatrics(''.join(lemmatize(name).split())), IAST, DEVANAGARI)
                for name in names
            ]
        names = lemmas

    entities.names = names
    # A fresh list per entity: [['']] * n would share one inner list.
    entities.paths = [[''] for _ in names]
    return entities
    
topic_entity_chain = topic_entity_prompt | llm.with_structured_output(Entities) | RunnableLambda(entity_lemmas)

In [1077]:
question = qa_df['QUESTION'][0]
#question = 'कस्याः नद्यः तिरे पञ्चवटी अस्ति यत्र सह पत्न्या सह भ्रात्रा च रामोऽवसत्?'
question

'यस्मिन् विमाने उपाविश्य रामः स्वपत्न्या भ्रात्रा च सह अयोध्यां प्रत्याजगाम तस्य विमानस्य किं नाम?'

In [1078]:
entities = topic_entity_chain.invoke({"question": question})
entities

Entities(names=['राम', 'स्वपत्नी', 'भ्रातृ', 'अयोध्या', 'विमान'], scores=[0.8, 0.6, 0.6, 0.7, 0.9], paths=[[''], [''], [''], [''], ['']])

In [1079]:
def prune(items, th=3):
    """Keep only the `th` highest-scoring entries of `items`, in place.

    `items.names`, `items.scores` and, when the attribute exists,
    `items.paths` are treated as parallel lists. Entries are ranked by score,
    highest first; ties keep their original relative order. If the lists have
    different lengths (possible with model-generated output), only positions
    present in all of them are considered instead of raising IndexError.

    Args:
        items: object with `names` and `scores` (and optionally `paths`).
        th: maximum number of entries to keep.

    Returns:
        The same `items` object. It is returned untouched when `names` is empty.
    """
    if len(items.names) == 0:
        return items

    has_paths = 'paths' in vars(items)
    parallel = [items.names, items.scores] + ([items.paths] if has_paths else [])
    size = min(len(column) for column in parallel)

    # One stable ranking of positions keeps all lists aligned.
    ranked = sorted(range(size), key=lambda i: items.scores[i], reverse=True)
    keep = ranked[:max(th, 0)]

    kept_names = [items.names[i] for i in keep]
    kept_scores = [items.scores[i] for i in keep]
    if has_paths:
        items.paths = [items.paths[i] for i in keep]
    items.names = kept_names
    items.scores = kept_scores
    return items

entities = prune(entities)

In [1080]:
label_query = """MATCH (p)
WHERE p.lemma = $value
RETURN p.lemma AS result, labels(p)[0] AS type
"""

def map_entities_to_database(entities: Entities) -> Optional[str]:
    """Describe, for each entity found in the graph, which relationship types touch it.

    Every name in `entities.names` is looked up by `lemma` with `label_query`.
    For each matching node the text lists its outgoing and then its incoming
    relationship types, each with the distinct labels of the neighbouring nodes.

    Lemmas are passed to Neo4j as query parameters and the node label is
    backtick-quoted, so quotes or unusual characters in graph data cannot break
    or alter the Cypher statement.

    Args:
        entities: object whose `names` are lemmas to look up.

    Returns:
        The description text; an empty string when nothing matched.
    """
    def labels_by_relationship(rows):
        # relationship type -> distinct neighbour labels, in first-seen order
        grouped = {}
        for row in rows:
            neighbour = f"{row['type']}"
            seen = grouped.setdefault(row['relationship'], [])
            if neighbour not in seen:
                seen.append(neighbour)
        return grouped

    parts = []
    for entity in entities.names:
        for response in graph.query(label_query, {"value": entity}):
            parts.append(f"{entity} knowledge-base-अन्तः भवति :{response['type']} {{'lemma': {response['result']}}} इति । अस्य सम्बन्धानि (relationships) अधः वर्तन्ते\n")

            # Labels cannot be query parameters; quote and escape them instead.
            quoted_label = "`" + str(response['type']).replace("`", "``") + "`"
            params = {"lemma": response['result']}

            edge_outward = (
                f"MATCH (p:{quoted_label})-[r]->(q) "
                "WHERE p.lemma = $lemma "
                "RETURN type(r) AS relationship, labels(q)[0] AS type, q.lemma AS lemma"
            )
            edge_inward = (
                f"MATCH (p:{quoted_label})<-[r]-(q) "
                "WHERE p.lemma = $lemma "
                "RETURN type(r) AS relationship, labels(q)[0] AS type, q.lemma AS lemma"
            )

            outward_edges = graph.query(edge_outward, params)
            parts.append(f"अस्मात् बहिः गच्छन्ति (outgoing) सम्बन्धानि -\n")
            for e, t in labels_by_relationship(outward_edges).items():
                parts.append(f"-[:{e}]->({'|'.join([':'+l for l in t])})\n")

            inward_edges = graph.query(edge_inward, params)
            parts.append(f"अस्मात् अन्तः गच्छन्ति (incoming) सम्बन्धानि -\n")
            for e, t in labels_by_relationship(inward_edges).items():
                parts.append(f"<-[:{e}]-({'|'.join([':'+l for l in t])})\n")

            parts.append('\n')

    return ''.join(parts)

mapping = map_entities_to_database(entities)
print(mapping, len(mapping))

राम knowledge-base-अन्तः भवति :Human {'lemma': राम} इति । अस्य सम्बन्धानि (relationships) अधः वर्तन्ते
अस्मात् बहिः गच्छन्ति (outgoing) सम्बन्धानि -
-[:WENT_TO]->(:Town)
-[:RETURNED_TO]->(:Town)
-[:HAS_TRAVELLED_TO]->(:Town)
-[:WAS_APPROACHED_TO_BE_THE_KING_BY]->(:Human)
-[:FORMED_AN_ACQUAINTANCE_WITH]->(:Monkey)
-[:BURNT_THE_CARCASS_OF]->(:Devine_bird)
-[:DID_INQUIRE_ABOUT]->(:Human)
-[:IS_DIRECTED_BY]->(:Rakshasa)
-[:IS_DIRECTED_TO]->(:Tribe)
-[:WAS_ATTACKED_BY]->(:Rakshasa)
-[:DID_INQUIRE_TO]->(:River)
-[:SET_UP_HIS_ABODE_AT]->(:Hill)
-[:HAS_TRAVELLED_BY]->(:Vimana)
-[:VISITED_THE_TEMPLE_OF]->(:Deva|:God)
-[:IS_VISITED_BY]->(:Human)
-[:FORMED_AN_ACQUAINTANCE_AT_THE_BANK_OF]->(:Lake)
-[:HAS_DESCRIBED_THE_BEAUTY_OF]->(:Hill)
अस्मात् अन्तः गच्छन्ति (incoming) सम्बन्धानि -
<-[:IS_FATHER_OF]-(:Epithet)
<-[:IS_MOTHER_OF]-(:Human)
<-[:MET_WITH]-(:Tribe)
<-[:IS_KILLED_BY]-(:Rakshasa)
<-[:IS_VISITED_BY]-(:Temple|:Forest|:Hill)
<-[:IS_THE_DIRECTOR_TO_POSSES_DIVINE_WEAPONS]-(:Rishi)
<-[:HIS_AS

### Relationships pruning

In [1081]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

class Relationships(BaseModel):
    """Identifying information about entities."""
    # Structured-output schema for the relationship-selection chain.
    # NOTE(review): the docstring above still talks about entities although this
    # model holds relationships. It is presumably forwarded to the LLM as the
    # schema description, so rewording it may change model output — confirm.

    # Relationship type names; `prune` pairs names[i] with scores[i] by position.
    names: List[str] = Field(
        ...,
        description="All the relationships"
    ) 
    
    
    # One relevance score per entry in `names`.
    scores: List[float] = Field(
        ...,
        description="Relevance scores of the relationships",
    )



relation_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """त्वम् दत्त-प्रश्नस्य उत्तराणि knowledge-graph-तः निष्कर्षितुं knowledge-graph-तः इदानीं पर्यन्तं निष्कर्षित-सम्बन्धेभ्यः अवश्यानि सम्बन्धानि सह relevance-score (0-1 मध्ये) समर्पयसि ।
            output उदाहरणम् ('IS_FATHER_OF', 0.8), ('IS_CROSSED_BY', 0.7), ... । ततो विवृतं मा कुरु । """,
        ),
        (
            "human",
            """प्रश्नः: {question}
            निष्कर्षितानि सम्बन्धानि: {relations}""",
        ),
    ]
)


relation_chain = relation_prompt | llm.with_structured_output(Relationships)

In [1082]:
relations = relation_chain.invoke({"question": question, "relations": mapping})
relations = prune(relations)
print(relations)

names=['HAS_TRAVELLED_BY', 'RETURNED_TO', 'IS_FATHER_OF'] scores=[1.0, 0.9, 0.8]


In [1083]:
label_query = """MATCH (p)
WHERE p.lemma = $value
RETURN p.lemma AS result, labels(p)[0] AS type
"""

def map_relations_to_database(relations: Relationships, entities: Entities) -> tuple[str, dict]:
    """Follow the selected relationship types from each entity and record the hops.

    Every entity name is looked up by `lemma` with `label_query`. For each
    matching node and each relationship type in `relations.names`, the
    outgoing and then the incoming neighbours are queried.

    Relationship names and lemmas are sent to Neo4j as query parameters, so a
    malformed relationship name simply matches nothing instead of breaking the
    Cypher statement. The node label is backtick-quoted.

    Args:
        relations: object whose `names` are relationship types to follow.
        entities: object with parallel `names` and `paths`; `paths[i]` holds
            the path prefixes that led to `names[i]`.

    Returns:
        (text, track_paths): `text` lists, per matched node, one line per
        relationship/direction that has neighbours; `track_paths` maps each
        neighbour lemma to the extended path prefixes reaching it.
    """
    directions = (
        # (pattern after the start node, arrow template used in the output)
        ("-[r]->(q)", "-[:{relation}]->"),
        ("<-[r]-(q)", "<-[:{relation}]-"),
    )

    parts = []
    track_paths = {}
    for entity, path in zip(entities.names, entities.paths):
        for response in graph.query(label_query, {"value": entity}):
            node_repr = f"(:{response['type']} {{'lemma': {response['result']}}})"
            parts.append(f"{node_repr} इति अस्य सम्बन्धानि (relationships) अधः वर्तन्ते\n")

            # Labels cannot be query parameters; quote and escape them instead.
            quoted_label = "`" + str(response['type']).replace("`", "``") + "`"

            for relation in relations.names:
                params = {"lemma": response['result'], "relation": relation}
                for pattern, arrow_template in directions:
                    neighbours = graph.query(
                        f"MATCH (p:{quoted_label}){pattern} "
                        "WHERE p.lemma = $lemma AND type(r) = $relation "
                        "RETURN labels(q)[0] AS type, q.lemma AS lemma",
                        params,
                    )
                    arrow = arrow_template.format(relation=relation)

                    # Every existing prefix of this entity is extended by the new hop.
                    for edge in neighbours:
                        track_paths.setdefault(edge['lemma'], []).extend(
                            [f"{p}{node_repr}{arrow}" for p in path]
                        )

                    ends = [f":{edge['type']} {{'lemma': {edge['lemma']}}}" for edge in neighbours]
                    if len(ends) > 0:
                        parts.append(f"{arrow}({'|'.join(ends)})\n")

            parts.append('\n')

    return ''.join(parts), track_paths

mapping_relations, track_paths = map_relations_to_database(relations, entities)
print(mapping_relations, len(mapping_relations))
print(track_paths)

(:Human {'lemma': राम}) इति अस्य सम्बन्धानि (relationships) अधः वर्तन्ते
-[:HAS_TRAVELLED_BY]->(:Vimana {'lemma': पुष्पक})
-[:RETURNED_TO]->(:Town {'lemma': अयोध्या})
<-[:IS_FATHER_OF]-(:Epithet {'lemma': दशरथ})

(:Town {'lemma': अयोध्या}) इति अस्य सम्बन्धानि (relationships) अधः वर्तन्ते
<-[:RETURNED_TO]-(:Human {'lemma': राम})

 331
{'पुष्पक': ["(:Human {'lemma': राम})-[:HAS_TRAVELLED_BY]->"], 'अयोध्या': ["(:Human {'lemma': राम})-[:RETURNED_TO]->"], 'दशरथ': ["(:Human {'lemma': राम})<-[:IS_FATHER_OF]-"], 'राम': ["(:Town {'lemma': अयोध्या})<-[:RETURNED_TO]-"]}


### Entity chain

In [1084]:
entity_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """त्वम् दत्त-प्रश्नस्य उत्तराणि knowledge-graph-तः निष्कर्षितुं knowledge-graph-तः इदानीं पर्यन्तं निष्कर्षित-सम्बन्धेभ्यः अवश्यानि nodes (lemmas) सह relevance-score (0-1 मध्ये) समर्पयसि ।
            output उदाहरणम् ('राम', 0.8), ('सीता', 0.7) । ततो विवृतं मा कुरु । """,
        ),
        (
            "human",
            """प्रश्नः: {question}
            निष्कर्षितानि सम्बन्धानि: {relations}""",
        ),
    ]
)


entity_chain = entity_prompt | llm.with_structured_output(Entities) | RunnableLambda(lambda x: entity_lemmas(x, lemmatized=True))

In [1085]:
def append_paths(entities, track_paths):
    """Keep only the entities whose lemma is a key of ``track_paths`` and attach their paths.

    Each name is stripped of surrounding whitespace before the lookup. The
    ``entities`` object is updated in place (``names``, ``scores`` and a new
    ``paths`` attribute, all aligned by position) and returned.

    Parameters
    ----------
    entities : object with ``names`` and ``scores`` sequences
    track_paths : dict
        Maps a lemma to its list of path-prefix strings.

    Returns
    -------
    The same ``entities`` object that was passed in.
    """
    kept = [
        (name.strip(), score)
        for name, score in zip(entities.names, entities.scores)
        if name.strip() in track_paths
    ]

    entities.names = [lemma for lemma, _ in kept]
    entities.scores = [score for _, score in kept]
    # The path lists are shared with `track_paths`, not copied.
    entities.paths = [track_paths[lemma] for lemma, _ in kept]

    return entities

In [1086]:
# Ask the model for relevant lemmas, keep only those present in `track_paths` (attaching
# their paths), then apply `prune`.
# NOTE(review): this rebinds `entities`, which is also an input to
# `map_relations_to_database` above, so re-running that earlier cell afterwards would see
# this new value.
entities = entity_chain.invoke({"question": question, "relations": mapping_relations})
entities = append_paths(entities, track_paths)
entities = prune(entities)
print(entities)

names=['पुष्पक', 'राम', 'अयोध्या'] scores=[0.9, 0.8, 0.7] paths=[["(:Human {'lemma': राम})-[:HAS_TRAVELLED_BY]->"], ["(:Town {'lemma': अयोध्या})<-[:RETURNED_TO]-"], ["(:Human {'lemma': राम})-[:RETURNED_TO]->"]]


### Reasoning

In [1087]:
# Prompt asking the model whether what has been extracted from the knowledge graph so far is
# sufficient (1) or insufficient (0) to answer the question; the model is told to output only
# 1 or 0. Template variables: {question}, {path_info}.
reasoning_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """त्वम् दत्त-प्रश्नस्य उत्तराणि knowledge-graph-तः निष्कर्षितुं knowledge-graph-तः इदानीं पर्यन्तं निष्कर्षितं यत्-किञ्चिद् प्रश्नस्य उत्तरं दातुं अलम् (1) वा नालम् (0) इति वक्तव्यम्।
               output 1 अथवा 0 । न अन्यत् वदसि""",
        ),
        (
            "human",
            """प्रश्नः: {question}
            निष्कर्षितम्: {path_info}""",
        ),
    ]
)

def path_info_format(inp):
    """Render the entity paths as text and store them under ``inp['path_info']``.

    For every entity name and each of its path prefixes, one line of the form
    ``<prefix>({'lemma': <name>})`` is produced; the lines are joined with
    newlines. The input dict is modified in place and returned.

    Parameters
    ----------
    inp : dict
        Must contain ``'entities'``, an object with aligned ``names`` and
        ``paths`` sequences (``paths[i]`` is the list of prefixes for ``names[i]``).

    Returns
    -------
    dict
        The same ``inp`` dict, now carrying the ``'path_info'`` key.
    """
    found = inp['entities']
    lines = [
        f"{prefix}({{'lemma': {name}}})"
        for name, prefixes in zip(found.names, found.paths)
        for prefix in prefixes
    ]

    inp['path_info'] = '\n'.join(lines)
    return inp

# Pipeline: add `path_info` to the input dict -> fill the reasoning prompt -> call the model.
reasoning_chain = RunnableLambda(path_info_format) | reasoning_prompt | llm

In [1088]:
# The reasoning prompt instructs the model to reply with only 1 (enough) or 0 (not enough).
decision = reasoning_chain.invoke({'question': question, 'entities': entities})
print(decision.content)

1


## Answering

In [1097]:
# Prompt for the final answer to a Ramayana question: the answer should carry the case ending
# the question calls for (not the bare stem) and be a single word unless a reason is needed;
# "how"/"why"-type questions get one short sentence. The knowledge-graph extract is offered as
# help, with a warning that it is not always correct. Template variables: {question}, {path_info}.
answer_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """अधः रामायण-सम्बन्धे पृष्ट-प्रश्नस्य प्रत्युत्तरं देहि। तदपि प्रश्नोचितविभक्तौ भवेत् न तु प्रातिपदिक रूपे । तदपि एकेनैव पदेन यदि उत्तरे कारणं नापेक्षितम्। कथम् किमर्थम् इत्यादिषु एकेन लघु वाक्येन उत्तरं देहि अत्र तु एक-पद-नियमः नास्ति। 
            अपि च यथाऽवश्यम् अधः दत्तैः knowledge-graph-तः निष्कर्षित-विषयैः सहाय्यं गृहाण। तत्तु सर्वदा साधु इति नाऽस्ति प्रतीतिः।""",
        ),
        (
            "human",
            """प्रश्नः: {question}
            निष्कर्षितम्: {path_info}""",
        ),
    ]
)

# Pipeline: add `path_info` -> fill the prompt -> call the model -> take the text ->
# remove every danda ('।') and trim surrounding whitespace.
answer_chain = RunnableLambda(path_info_format) | answer_prompt | llm | StrOutputParser() | RunnableLambda(lambda x: x.replace('।','').strip())


In [1098]:
# Generate the final answer for the current question from the pruned entity paths.
answer = answer_chain.invoke({'question': question, 'entities': entities})
print(answer)

पुष्पकम्


In [558]:
# Load the tab-separated predictions table; it is extended and written back to this path below.
working_df = pd.read_csv('results/predictions.tsv', sep='\t')

In [559]:
# Run the KG-RAG pipeline on every question (one call per row, with a progress bar) and
# store the answers as a new column.
working_df['gpt-4o-KG-RAG-responses'] = working_df['QUESTION'].progress_apply(kg_rag_answer)

100%|███████████████████████████████████████████| 60/60 [03:42<00:00,  3.70s/it]


In [560]:
# Exact-match accuracy of each column against the gold ANSWER column, comparing the
# whitespace-trimmed string form of every cell.
# NOTE(review): `models` is every column of the table, so QUESTION and ANSWER are scored too.
models = working_df.columns
gold = working_df['ANSWER'].map(str).str.strip()

scores = {}
for m in models:
    em = working_df[m].map(str).str.strip() == gold
    scores[m] = round(em.sum()/len(em), 3)

scores

{'QUESTION': 0.0,
 'ANSWER': 1.0,
 'gpt-4o': 0.483,
 'gpt-4': 0.517,
 'gpt-3.5-turbo': 0.0,
 'gpt-4o-RAG': 0.55,
 'gpt-4o-KG-RAG': 0.483,
 'claude-3-5-sonnet': 0.717,
 'claude-3.5-sonnet-RAG': 0.6,
 'gemini-1.0-pro': 0.183,
 'gemini-1.5-flash': 0.267,
 'gemini-1.5-pro': 0.567,
 'mistral-large': 0.583,
 'llama-3.1-405B-instruct': 0.433,
 'mistral-large-RAG': 0.567,
 'llama-3.1-80b-i-KG-RAG': 0.2,
 'gpt-4o-RAG-ft': 0.567,
 'gpt-4o-RAG-gl': 0.433,
 'gpt-4o-KG-RAG-responses': 0.133,
 'gpt-4o-KG-RAG-query': 0.0}

In [561]:
# Write the table (without the index) back to the same tab-separated file it was loaded from.
working_df.to_csv('results/predictions.tsv', index=False, sep='\t')

In [192]:
# Peek at the KG-RAG answer stored for the row labelled 0.
working_df.loc[0, 'gpt-4o-KG-RAG']

'पुष्पके'

### Evaluating generated Cypher query quality

In [562]:
# Column holding the generated query text for each row of the predictions table.
cypher_queries = working_df['gpt-4o-KG-RAG-query']

In [563]:
# Query for the row labelled 0; used below as a sample input for the edge patterns.
query_text = cypher_queries[0]

In [564]:
import regex as re

# Edge patterns: each captures (left node text, relationship type, right node text) and
# tolerates a trailing comma.
#   edge_forward  matches  (left)-[:TYPE]->(right)
#   edge_backward matches  (left)<-[:TYPE]-(right)
# The relationship must be written exactly as `[:TYPE]`: a relationship variable (`[r:TYPE]`)
# or any whitespace inside the brackets prevents a match.
# NOTE(review): `findall` returns non-overlapping matches, so in a chained path such as
# (a)-[:R]->(b)-[:S]->(c) the second hop is not captured. The lazy `.+?` may also run across
# a `)` when an unrelated node pattern precedes the edge on the same line.
edge_forward = re.compile(r"\((.+?)\)\-\[:(\S+?)\]\->\((.+?)\),?")
edge_backward = re.compile(r"\((.+?)\)<\-\[:(\S+?)\]\-\((.+?)\),?")
# Skips an optional variable name up to a colon and captures the remainder of the node text.
label_pattern0 = re.compile(r"\S*?:(.+)")
# Label followed by a property map whose first key is `lemma`; captures the label and the raw
# lemma text (quotes included, greedy up to the last `}`).
label_pattern1 = re.compile(r"(\S+)\s*\{lemma:\s*(.+)\}")

In [565]:
# Try the backward-edge pattern on the sample query.
match = edge_backward.findall(query_text)

In [566]:
# Display the sample query text that the patterns are applied to.
query_text

"```cypher\nMATCH (राम:Human {lemma: 'राम'})-[:RETURNED_TO]->(अयोध्या:Town {lemma: 'अयोध्या'})\nRETURN राम.lemma AS राम, अयोध्या.lemma AS अयोध्या\n```"

In [567]:
# Display the backward-edge matches; the list is empty when the query has no `<-[:TYPE]-` edge.
match

[]

In [568]:
def label_parse(node):
    """Split the text inside a Cypher node pattern into its label and lemma.

    Parameters
    ----------
    node : str
        Text captured between the parentheses of a node pattern, e.g.
        ``"राम:Human {lemma: 'राम'}"``.

    Returns
    -------
    dict
        ``{'label': ..., 'lemma': ...}``. A value is ``''`` when it cannot be
        extracted; the function never raises on unparseable node text.
    """
    unparsed = {'label': '', 'lemma': ''}

    if ':' not in node:
        return unparsed

    # A colon with nothing after it (e.g. "n:") yields no match; treat it as unparsed
    # instead of failing with IndexError on `[0]`.
    outer = label_pattern0.findall(node)
    if not outer:
        return unparsed
    outer_match = outer[0]

    if 'lemma' not in outer_match:
        return {'label': outer_match.strip(), 'lemma': ''}

    inner = label_pattern1.findall(outer_match)
    if not inner:
        # 'lemma' is mentioned but not as the first key of the property map:
        # keep the text before the map as the label and report no lemma.
        return {'label': outer_match.split('{', 1)[0].strip(), 'lemma': ''}

    label, lemma = inner[0]
    # Cypher string literals may use single or double quotes; drop both, plus stray spaces.
    return {'label': label, 'lemma': lemma.replace("'", '').replace('"', '').strip()}

In [569]:
# Collect every edge found in the generated queries, plus every node that has both a label
# and a lemma.
relationships = []
nodes = []

for query in tqdm(cypher_queries):
    match_forward = edge_forward.findall(query)
    match_backward = edge_backward.findall(query)

    # (matches, index of the start-node text, index of the end-node text): in a backward
    # edge the right-hand node is the start of the relationship.
    directed = ((match_forward, 0, 2), (match_backward, 2, 0))

    for matches, start_idx, end_idx in directed:
        for match in matches:
            start = label_parse(match[start_idx])
            end = label_parse(match[end_idx])

            relationships.append({'start': start['label'], 'relationship': match[1], 'end': end['label']})
            # Start node first, then end node, each only when fully specified.
            nodes.extend(n for n in (start, end) if n['label'] != '' and n['lemma'] != '')

100%|████████████████████████████████████████| 60/60 [00:00<00:00, 94254.02it/s]


In [570]:
### Verify nodes
# For every (label, lemma) node parsed from the generated queries, record 0/1 flags:
#   valid_label - the label is listed in schema['nodes']
#   valid_lemma - some graph node has a lemma containing this lemma
#   label_lemma - the first such graph node carries the same label
#                 (recorded only when a graph node was found)
valid_label = []
label_lemma = []
valid_lemma = []

# The lemma comes from LLM-generated query text, so it is passed as a query parameter
# instead of being formatted into the Cypher source (a quote or backslash in the lemma can
# no longer break or alter the query).
# Assumes `graph.query` accepts a `params` mapping, as LangChain's Neo4jGraph.query does - confirm.
lemma_lookup = """MATCH (p)
WHERE p.lemma CONTAINS $lemma
RETURN p.lemma AS result, labels(p)[0] AS type
LIMIT 1
"""

schema = get_schema(graph, structured=True)
for node in nodes:
    valid_label.append(1 if node['label'] in schema['nodes'] else 0)

    if node['lemma'] == '':
        continue

    lemma_match = graph.query(lemma_lookup, params={'lemma': node['lemma']})
    if len(lemma_match) == 0:
        valid_lemma.append(0)
        continue

    valid_lemma.append(1)
    label_lemma.append(1 if lemma_match[0]['type'] == node['label'] else 0)

# Check every parsed edge against the schema ontology:
#   valid_relationship - the relationship type exists in the ontology
#   valid_start_type / valid_end_type - endpoint label is empty or allowed for that type
#                                       (recorded only for known relationship types)
#   valid_ontology - known type AND both endpoints acceptable
valid_ontology = []
valid_relationship = []
valid_start_type = []
valid_end_type = []

ontology = schema['ontology']
for edge in relationships:
    rel, start, end = edge['relationship'], edge['start'], edge['end']

    known = rel in ontology
    valid_relationship.append(known)

    if not known:
        valid_ontology.append(False)
        continue

    start_ok = start == '' or start in ontology[rel]['start']
    end_ok = end == '' or end in ontology[rel]['end']
    valid_start_type.append(start_ok)
    valid_end_type.append(end_ok)
    valid_ontology.append(start_ok and end_ok)

# Mean of each flag list, rounded to two decimals.
flag_lists = {
    'valid_label': valid_label,
    'valid_lemma': valid_lemma,
    'valid_label_lemma': label_lemma,
    'valid_ontology': valid_ontology,
    'valid_relationship': valid_relationship,
    'valid_start_type': valid_start_type,
    'valid_end_type': valid_end_type,
}

results = {key: round(np.mean(flags), 2) for key, flags in flag_lists.items()}

results

{'valid_label': 0.0,
 'valid_lemma': 0.78,
 'valid_label_lemma': 0.94,
 'valid_ontology': 0.02,
 'valid_relationship': 0.69,
 'valid_start_type': 0.64,
 'valid_end_type': 0.02}

In [571]:
# Leftover loop variable: the last relationship examined by the verification loop above.
edge

{'start': '', 'relationship': 'IS_FATHER_OF', 'end': 'Human'}