In [19]:
# Select the LLM used for the RAG pipeline.
# Names that get_answer_RAG dispatches on:
#   Ollama models: "llama3.1", "llama3:70B", "codestral"
#   OpenAI models: "gpt-4o", "gpt-4o-mini"
model_select= "llama3.1"

# Set up environment and functions

import os
import glob
import rdflib
import concurrent.futures

def read_and_parse_file(file_path):
    """Parse a single JSON-LD file into its own RDFLib graph.

    Args:
        file_path: Path of the JSON-LD document; it is read as UTF-8 text.

    Returns:
        A new ``rdflib.Graph`` containing whatever was parsed from the file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        jsonld_text = handle.read()

    parsed = rdflib.Graph()
    parsed.parse(data=jsonld_text, format='json-ld')
    return parsed

def load_jsonld_files_to_graph(folder_path):
    """Merge every ``*.json`` file of a folder into one RDFLib graph.

    Each file is parsed by ``read_and_parse_file`` on a thread pool; the
    per-file graphs are then added together.

    Args:
        folder_path: Directory that is searched (non-recursively) for
            files matching ``*.json``.

    Returns:
        A single ``rdflib.Graph`` holding the union of all parsed files.
        It is empty when no file matches.
    """
    pattern = os.path.join(folder_path, "*.json")
    matching_files = glob.glob(pattern)

    # Parse the files in parallel; leaving the ``with`` block waits for all workers.
    with concurrent.futures.ThreadPoolExecutor() as pool:
        parsed_graphs = pool.map(read_and_parse_file, matching_files)

    # Fold the per-file graphs into one combined graph.
    merged = rdflib.Graph()
    for parsed in parsed_graphs:
        merged += parsed

    return merged


import requests

# Load the JSON-LD context file from the URL.
# A timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() reports HTTP errors directly instead of letting an error
# page fail later inside .json().
context_url = "https://w3id.org/emmo/domain/battery/context"
_context_response = requests.get(context_url, timeout=30)
_context_response.raise_for_status()
context_data = _context_response.json()

def add_Iri_context(query,context_data):
    """Fill the ``{Term}`` placeholders of a SPARQL template with context IRIs.

    Every ``{Term}`` field in ``query`` is replaced by the IRI that the
    JSON-LD context assigns to ``Term``. Literal braces must be written as
    ``{{`` and ``}}`` (str.format escaping), e.g. around the WHERE body.

    Terms are looked up on demand, so any term defined in the context can be
    used, not just a fixed list. A context entry may be either a plain IRI
    string or a dict carrying the IRI under ``"@id"``.

    Args:
        query: SPARQL template, e.g. ``"... <{hasProperty}> ?p ..."``.
        context_data: Parsed JSON-LD context document; the term definitions
            are read from ``context_data["@context"]``.

    Returns:
        The query string with all placeholders substituted.

    Raises:
        KeyError: If the document has no ``"@context"`` or the query uses a
            term that the context does not define.
    """
    terms = context_data["@context"]

    class _ContextTerms(dict):
        """Mapping that resolves a placeholder name the first time it is requested."""

        def __missing__(self, term):
            entry = terms[term]
            if isinstance(entry, dict):
                # Expanded term definition: the IRI is stored under "@id".
                entry = entry["@id"]
            # The IRI is inserted as plain text between the template's <...>.
            return str(entry)

    return query.format_map(_ContextTerms())

from openai import OpenAI
import ollama
import re
import openai
import json
 
# The OpenAI client is only needed for the OpenAI models. Read the key with
# .get() so that a missing OPENAI_API_KEY does not abort the whole setup cell
# when only Ollama models are going to be used.
_openai_api_key = os.environ.get("OPENAI_API_KEY")

if _openai_api_key:
    openai.api_key = _openai_api_key
    client = OpenAI()
else:
    # The *_openai functions read this global; they cannot work without a key.
    client = None
    print("OPENAI_API_KEY is not set: OpenAI models are unavailable, Ollama models can still be used.")

# Few-shot prompt shared by both SPARQL generators; the user's question is
# appended directly after the trailing "question = ".
# The double braces and <{Term}> placeholders are intentional: the model is
# asked to return a str.format-style template that add_Iri_context fills in.
# NOTE(review): "MaximumContinuousDischargeCurrent" is advertised here, but the
# matching lookup in add_Iri_context is commented out - confirm the term exists
# in the context before relying on it.
_SPARQL_PROMPT = """You are an intelligent assistant with knowledge of SPARQL and RDF graphs. Generate a SPARQL query to answer the following question based on the RDF graph schema. Return only the query and nothing else. You are also provided the context of the keywords in order to generate the right query. Relevant Keywords and their Explanations:
    - schema:name: The name of the entity.
    - schema:manufacturer: The manufacturer of the battery.
    - schema:subjectOf: instances of the battery being cited in literature
    - hasPositiveElectrode: The positive electrode of the battery.
    - hasActiveMaterial: The active material used in the electrode.
    - hasCase: The case type of the battery.
    - hasProperty: Various properties of the battery.

    Properties:
    - RatedCapacity: The rated capacity of the battery.
    - CycleLife: The cycle life of the battery.
    - NominalVoltage: The nominal voltage of the battery.
    - UpperVoltageLimit: The upper voltage limit of the battery.
    - LowerVoltageLimit: The lower voltage limit of the battery.
    - DischargingCurrent: The discharge current of the battery.
    - MaximumContinuousDischargeCurrent: The maximum continuous discharge current of the battery.
    - Mass: The mass of the battery.
    - ChargingCurrent: The charging current of the battery.
    - Height: The height of the battery.
    - Diameter: The diameter of the battery.

    These properties are loaded beforehand with rdflib.URIRef so be sure to put them in brackets like <{hasNumericalPart}>. Also make sure to have double {{ }} with the WHERE statement.

    Example Questions and SPARQL Queries:
    1. Question: What is the nominal voltage of the INR21700 M50 battery?
    SPARQL Query:
    PREFIX schema: <https://schema.org/>
    SELECT ?value
    WHERE {{
    ?thing schema:name "INR21700 M50T" .
    ?thing <{hasProperty}> ?property .
    ?property a <{NominalVoltage}> ;
                <{hasNumericalPart}> ?numericalPart .
    ?numericalPart <{hasNumericalValue}> ?value .
    }}

    2. Question: Who is the manufacturer of the INR21700 M50 battery?
    SPARQL Query:
    PREFIX schema: <https://schema.org/>
    SELECT ?manufacturerName
    WHERE {{
        ?thing schema:name "INR21700 M50" .
        ?thing schema:manufacturer ?manufacturer.
        ?manufacturer schema:name ?manufacturerName.
    }}

    3.  Question: What is the capacity of the INR21700 M50 battery from LG Chem?
    SPARQL Query:
    PREFIX schema: <https://schema.org/>
    SELECT ?value
    WHERE {{
    ?thing schema:name "INR21700 M50T" .
    ?thing schema:manufacturer ?manufacturer.
    ?manufacturer schema:name "LG Chem".
    ?thing <{hasProperty}> ?property .
    ?property a <{RatedCapacity}> ;
                <{hasNumericalPart}> ?numericalPart .
    ?numericalPart <{hasNumericalValue}> ?value .
    }}

    question = """


def generate_sparql_query_openai(question,model_select):
    """Ask an OpenAI chat model for a SPARQL query template answering ``question``.

    Args:
        question: Natural-language question about the battery knowledge graph.
        model_select: OpenAI model name passed to the chat completions API.

    Returns:
        The model's reply with surrounding whitespace stripped. Markdown code
        fences, if any, are left in place for the caller to remove.
    """
    response = client.chat.completions.create(
        model=model_select,
        messages=[
            {"role": "user", "content": _SPARQL_PROMPT + question},
        ]
    )
    return response.choices[0].message.content.strip()


def generate_sparql_query_ollama(question,model_select):
    """Ask an Ollama chat model for a SPARQL query template answering ``question``.

    Args:
        question: Natural-language question about the battery knowledge graph.
        model_select: Ollama model name passed to ``ollama.chat``.

    Returns:
        The reply starting at the first ``PREFIX`` keyword, which drops any
        introductory text the model put before the query. When the reply
        contains no ``PREFIX`` the stripped reply is returned as-is, so the
        result is always a string (a query that uses only full IRIs is valid
        without prefix declarations).
    """
    response = ollama.chat(
        model=model_select,
        messages=[
            {"role": "user", "content": _SPARQL_PROMPT + question},
        ]
    )
    query = response['message']['content']

    prefix_start = query.find('PREFIX')
    if prefix_start == -1:
        # No PREFIX declaration: hand the reply back rather than None, which
        # callers cannot post-process.
        return query.strip()
    return query[prefix_start:]



def generate_final_response_openai(question, sparql_result,model_select):
    """Turn SPARQL results into a natural-language answer with an OpenAI model.

    Args:
        question: The user's original question.
        sparql_result: Query results; they are interpolated into the prompt text.
        model_select: OpenAI model name passed to the chat completions API.

    Returns:
        The model's answer with surrounding whitespace stripped.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant with access to SPARQL query results from an RDF graph. Use the query results to answer the following question and do not make up additional information or context.",
    }
    user_message = {
        "role": "user",
        "content": f"Question: {question}\n\nSPARQL Query Result: {sparql_result}",
    }

    completion = client.chat.completions.create(
        model=model_select,
        messages=[system_message, user_message],
    )
    answer = completion.choices[0].message.content
    return answer.strip()

def generate_final_response_ollama(question, sparql_result,model_select):
    """Turn SPARQL results into a natural-language answer with an Ollama model.

    Args:
        question: The user's original question.
        sparql_result: Query results; they are interpolated into the prompt text.
        model_select: Ollama model name passed to ``ollama.chat``.

    Returns:
        The content of the model's reply message, unmodified.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant with access to SPARQL query results from an RDF graph. Use the query results to answer the following question and do not make up additional information or context.",
    }
    user_message = {
        "role": "user",
        "content": f"Question: {question}\n\nSPARQL Query Result: {sparql_result}",
    }

    reply = ollama.chat(
        model=model_select,
        messages=[system_message, user_message],
    )
    return reply['message']['content']

# Model names that get_answer_RAG routes to each backend.
_OPENAI_MODELS = ("gpt-4o", "gpt-4o-mini")
_OLLAMA_MODELS = ("llama3.1", "llama3:70B", "codestral")


def _strip_code_fences(text):
    """Remove Markdown fence markers (```, ```sparql, ```sparql:) from LLM output."""
    # One pass handles every fence variant; chained str.replace calls are
    # order-sensitive and leave "sparql:" behind once the bare ``` is removed.
    return re.sub(r"```(?:sparql:?)?[ \t]*\n?", "", text, flags=re.IGNORECASE).strip()


def get_answer_RAG(question,model_select):
    """Answer a question by querying the knowledge graph and summarising the result.

    Steps: (1) the selected LLM writes a SPARQL query template, (2) the
    template is filled by ``add_Iri_context`` and run against the module-level
    ``graph``, (3) the same LLM phrases an answer from the query results.

    Reads the module-level ``graph`` and ``context_data``. Prints the
    generated query and the raw SPARQL result as a side effect.

    Args:
        question: Natural-language question.
        model_select: One of the OpenAI models ("gpt-4o", "gpt-4o-mini") or
            Ollama models ("llama3.1", "llama3:70B", "codestral").

    Returns:
        The LLM's final answer, or ``None`` (after printing 'Model unknown')
        when ``model_select`` is not a supported name.
    """
    if model_select in _OPENAI_MODELS:
        generate_query = generate_sparql_query_openai
        generate_answer = generate_final_response_openai
    elif model_select in _OLLAMA_MODELS:
        generate_query = generate_sparql_query_ollama
        generate_answer = generate_final_response_ollama
    else:
        print('Model unknown')
        return

    # Step 1: Generate the SPARQL query with the selected LLM.
    # A generator may return None when it finds no query in the reply; treat
    # that as an empty query so the pipeline degrades to "no results".
    sparql_query = _strip_code_fences(generate_query(question, model_select) or "")

    print('Using query:')
    print(sparql_query + '\n')

    # Step 2: Query the RDF graph.
    try:
        qres = graph.query(add_Iri_context(sparql_query,context_data))
        sparql_result = [
            {str(var): str(row[var]) for var in row.labels}
            for row in qres
        ]
        # Convert the SPARQL results to a string for the LLM prompt.
        sparql_result_str = json.dumps(sparql_result, indent=2)
    except Exception as error:
        # LLM-written queries are often invalid (syntax errors, unknown
        # placeholders). Keep going with an empty result, but say why instead
        # of hiding the failure; Ctrl-C is no longer swallowed.
        print(f'SPARQL query failed: {error!r}')
        sparql_result_str = "[]"

    print('Sparql answer:')
    print(sparql_result_str)
    print('\n')

    # Step 3: Generate the final response from the SPARQL query results.
    return generate_answer(question, sparql_result_str, model_select)


In [2]:
# Create the knowledge graph in RDFLib that will be queried.
# folder_path is the directory searched for the battery-type JSON-LD files
# (load_jsonld_files_to_graph picks up every "*.json" file in it).
folder_path = 'BatteryTypeJson'

# Load the JSON-LD files into one graph.
# get_answer_RAG reads this module-level `graph`, so this cell must run
# before any question is asked.
graph = load_jsonld_files_to_graph(folder_path)

In [18]:
# Sanity check: run a hand-written query template directly against the graph.
# The template uses the same conventions the LLM is asked to follow:
# <{Term}> placeholders that add_Iri_context fills from the JSON-LD context,
# and doubled braces {{ }} so the WHERE block survives string formatting.
sparql_query="""
    PREFIX schema: <https://schema.org/>
    SELECT ?value
    WHERE {{
    ?thing schema:name "INR21700 M50T" .
    ?thing schema:manufacturer ?manufacturer.
    ?manufacturer schema:name "LG Chem".
    ?thing <{hasProperty}> ?property .
    ?property a <{RatedCapacity}> ;
                <{hasNumericalPart}> ?numericalPart .
    ?numericalPart <{hasNumericalValue}> ?value .
    }}
"""
qres = graph.query(add_Iri_context(sparql_query,context_data))

# row.value is the ?value binding of each result row; its own .value is
# presumably the plain Python value of the RDF literal - TODO confirm.
for row in qres:
    print(row.value.value)

4.8


In [20]:
# Ask the RAG pipeline a single question. The LLM first writes a SPARQL query
# for the knowledge graph and then forms its answer from the query results.
# Ollama alternative:
#model_select= "llama3.1" # ollama models: "llama3:70B","llama3.1" open AI models: "gpt-4o","gpt-4o-mini"

model_select="gpt-4o-mini"
# Example questions; leave exactly one uncommented.
#question = "What is the rated capacity of the INR21700 M50 battery?"
#question = "What papers are using the INR21700 M50 battery?"
question = "What is the rated capacity of the A123 20AH battery from A123?"
#question = "Which batteries have a rated capacity between 3 and 4 Ah? Get the name and manufacturer"


# get_answer_RAG prints the generated query and the raw SPARQL result itself;
# the print below shows the final natural-language answer.
print(get_answer_RAG(question,model_select))

Using query:
PREFIX schema: <https://schema.org/>
SELECT ?value
WHERE {{
    ?thing schema:name "A123 20AH" .
    ?thing schema:manufacturer ?manufacturer .
    ?manufacturer schema:name "A123" .
    ?thing <{hasProperty}> ?property .
    ?property a <{RatedCapacity}> ;
                <{hasNumericalPart}> ?numericalPart .
    ?numericalPart <{hasNumericalValue}> ?value .
}}

Sparql answer:
[
  {
    "value": "20.0"
  }
]


The rated capacity of the A123 20AH battery from A123 is 20.0 AH (amp-hours).


In [4]:
# sparql_query=""" 
# PREFIX schema: <https://schema.org/>
# SELECT ?Name
# WHERE {{
#     ?thing schema:name ?Name.
# }}
# """
# qres = graph.query(add_Iri_context(sparql_query,context_data))

# ManufacturerList=[]
# for row in qres:
#     ManufacturerList.append(row.Name.value)
#     #print(row.manufacturerName)

# ManufacturerList = list(set(ManufacturerList))
# print(ManufacturerList)

In [13]:
import pandas as pd

# Benchmark questions; the evaluation loop reads them from the 'Query' column.
question_df=pd.read_csv('questions.csv')
# Collect the answers in a separate copy. A plain assignment would only alias
# question_df, so every added answer column would silently change the input
# frame as well.
LLM_results=question_df.copy()

In [14]:
# Batch evaluation: answer every benchmark question with the selected model.
model_select="llama3.1"
#model_select="gpt-4o-mini"
#LLM_results=pd.DataFrame()



# One answer per row of question_df, in row order. get_answer_RAG returns None
# for an unsupported model name, so such entries end up as None.
answer_LLM=[]
for index, row in question_df.iterrows():
    # The question text is taken from the 'Query' column.
    answer=get_answer_RAG(row['Query'],model_select)
    answer_LLM.append(answer)
    print(answer)
# Store the answers in a per-model column, e.g. 'answer_llama3.1', so runs
# with different models can sit side by side in LLM_results.
LLM_results['answer_' + model_select]=answer_LLM

Using query:
PREFIX schema: <https://schema.org/>

SELECT ?ratedCapacity
WHERE {{
	?thing schema:name "A123 A123 20Ah" .
	?thing <{hasProperty}> ?property1 .
	?property1 a <{RatedCapacity}> ;
			<{hasNumericalPart}> ?numericalPart1 .
	?numericalPart1 <{hasNumericalValue}> ?ratedCapacity .
}}

Sparql answer:
[]


Unfortunately, the SPARQL query result is empty, which means that there is no data in the RDF graph that matches the query. Therefore, I cannot answer the question about the rated capacity of the A123 A123 20Ah battery based on the provided query results. If you would like to modify the query or provide more context, I'd be happy to try and help!
Using query:
PREFIX schema: <https://schema.org/>
SELECT ?ratedCapacity
WHERE {{
	?thing schema:name "A123 A123 26Ah" .
	?thing <{hasProperty}> ?property .
	?property a <{RatedCapacity}> ;
				<{hasNumericalPart}> ?numericalPart .
	?numericalPart <{hasNumericalValue}> ?ratedCapacity .
}}

Sparql answer:
[]


There is no result from a S

In [None]:
# LLM_results.to_parquet('LLM_results.parquet')