In [None]:
import os
import pandas as pd
import numpy as np
from neo4j import Query, GraphDatabase, RoutingControl, Result 
from graphdatascience import GraphDataScience
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [None]:
## Change this
# Settings for the OpenAI client and the Neo4j connection.
# Paste a key into OPENAI_API_KEY only for local experiments; when it is left
# empty, a key that is already exported in the environment is kept instead of
# being overwritten with an empty string.
OPENAI_API_KEY = ""
if OPENAI_API_KEY or 'OPENAI_API_KEY' not in os.environ:
    os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
# Each connection setting can be overridden through an environment variable;
# the literals are the local-development fallbacks.
# NOTE: the name DB_ULR (sic) is kept because the driver cell references it.
DB_ULR = os.environ.get('NEO4J_URI', "neo4j://localhost:7687")
DB_USER = os.environ.get('NEO4J_USERNAME', "neo4j")
DB_PASS = os.environ.get('NEO4J_PASSWORD', "test1234")
DB_NAME = os.environ.get('NEO4J_DATABASE', "sdtm")

In [None]:
# Open a Neo4j driver with the configured URL and credentials, then check
# right away that the server can be reached.
neo4j_auth = (DB_USER, DB_PASS)
driver = GraphDatabase.driver(DB_ULR, auth=neo4j_auth)
driver.verify_connectivity()

In [None]:
# Build the Graph Data Science client on top of the driver created above and
# point it at the configured database before any query is run.
gds = GraphDataScience(driver)
gds.set_database(DB_NAME)
# Last expression of the cell, so its return value is shown as the cell output.
gds.version()

In [None]:
## Utility
def split_dataframe(df, chunk_size = 5000):
    """Split ``df`` into consecutive row slices of at most ``chunk_size`` rows.

    Parameters:
        df: Any object that supports ``len()`` and ``df[start:stop]`` slicing
            (a pandas DataFrame in this notebook).
        chunk_size: Maximum number of rows per chunk; must be at least 1.

    Returns:
        A list of slices in original order. Every chunk is non-empty, so an
        empty ``df`` yields an empty list and a length that is an exact
        multiple of ``chunk_size`` does not produce a trailing empty chunk.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    # Stepping by chunk_size visits exactly the start offsets that hold data.
    return [df[start:start + chunk_size] for start in range(0, len(df), chunk_size)]

# Generate embeddings
![image](./images/generate_embeddings.png)

In [None]:
# Fetch the id and name of every PharmacologicalSubgroup node into a frame;
# the names are the texts that get embedded further down.
subgroup_query = '''
    match (n:PharmacologicalSubgroup)
    return n.id as id, n.name as name 
'''
df_pharmacological_group = gds.run_cypher(subgroup_query)
df_pharmacological_group.shape

In [None]:
# Preview the first rows of the query result.
df_pharmacological_group.head()

In [None]:
# Embedding client used for both the node names and the user question.
# NOTE(review): `deployment` is a placeholder value; presumably it only
# matters together with the Azure options commented out below -- confirm.
embeddings = OpenAIEmbeddings(
    deployment="your-embeddings-deployment-name",
    model='text-embedding-3-small', # 1536 -- presumably the vector length; the index cell uses the same number
    # Azure OpenAI settings, left disabled:
    #openai_api_base="https://your-endpoint.openai.azure.com/",
    #openai_api_type="azure",
)

In [None]:
# Embed every subgroup name with a single embed_documents call instead of one
# call per row; the vectors come back in the same order as the input texts.
subgroup_names = df_pharmacological_group['name'].tolist()
df_pharmacological_group['embedding'] = pd.Series(
    embeddings.embed_documents(subgroup_names),
    index=df_pharmacological_group.index,  # align the vectors with the frame's own row labels
    dtype=object,  # each cell holds one list of floats
)

In [None]:
# Preview again: the frame now also carries the `embedding` column.
df_pharmacological_group.head()

# Store embeddings and index

In [None]:
# Write each row's embedding onto the PharmacologicalSubgroup node with the
# same id.
# The result of the write query is intentionally NOT assigned back to
# df_pharmacological_group: that would replace the id/name/embedding frame
# with the query result, so re-running this cell would send no rows.
# Only the columns the query reads (id, embedding) are sent as parameters.
gds.run_cypher('''
    unwind $data as row
    match (n:PharmacologicalSubgroup{id: row.id})
    set n.embedding = row.embedding                                                                             
''',
params = { 'data': df_pharmacological_group[['id', 'embedding']].to_dict(orient='records') })

In [None]:
# Settings shared by the index-creation, index-check and vector-search cells.
# `dimension` is passed as `vector.dimensions` when the index is created; the
# embeddings cell notes 1536 next to the model name.
dimension=1536
# Name under which the vector index is created and later queried.
index_name = 'pharmacological'

In [None]:
# Create the cosine-similarity vector index over n.embedding, unless an index
# with that name already exists.
create_index_query = '''
    CREATE VECTOR INDEX $index_name if not exists
    for (n:PharmacologicalSubgroup) on (n.embedding)
    OPTIONS {
        indexConfig: {
            `vector.dimensions`: $dimension,
            `vector.similarity_function`: 'cosine'       
        }
    }
'''
create_index_params = {'index_name': index_name, 'dimension': dimension}
gds.run_cypher(create_index_query, params=create_index_params)

In [None]:
# Verify that index is online and populated
# Only $index_name is referenced by the query, so it is the only parameter sent.
gds.run_cypher('''
    show index yield name, state, populationPercent, type, labelsOrTypes, properties
    where name = $index_name
    return *
''',
    params = { 'index_name': index_name }
).head()

# Vector search

![image](./images/vector_search.png)

In [None]:
# The user's question, and its embedding from the same client that embedded
# the node names.
question = 'What drugs are used to treat type 2 diabetes?'
q_vector = embeddings.embed_query(question)

In [None]:
# Ask the vector index for the 5 nodes closest to the question vector and
# show their id, name and similarity score.
nearest_subgroups_query = ''' 
    call db.index.vector.queryNodes($index_name, 5, $vector) yield node, score
    return node.id as id, node.name as name, score
'''
nearest_subgroups_params = {'index_name': index_name, 'vector': q_vector}
gds.run_cypher(nearest_subgroups_query, params=nearest_subgroups_params).head()

In [None]:
# Same vector search, extended with a traversal: from each matched node follow
# 1 to 10 DrugClassHierarchy relationships to Drug nodes and collect the drug
# names per matched node.
group_and_drugs_query = ''' 
    call db.index.vector.queryNodes($index_name, 5, $vector) yield node, score
    match (node)-[:DrugClassHierarchy*1..10]->(d:Drug)
    return node.id as id, node.name as name, score, collect(d.name) as drugs
'''
group_and_drugs_params = {'index_name': index_name, 'vector': q_vector}
df_group_and_drugs = gds.run_cypher(group_and_drugs_query, params=group_and_drugs_params)

In [None]:
# Preview the matched groups together with their collected drug names.
df_group_and_drugs.head()

# Augmented Generation

In [None]:
# Chat model that writes the final answer from the retrieved context.
# temperature=0 is presumably chosen to keep answers as repeatable as possible.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [None]:
# Prompt with two placeholders: {data} for the retrieved context and
# {question} for the user's question.
promt = ChatPromptTemplate.from_template('''
    Answer the question based only on the following context: {data}
                                         
    Question: {question}
''')
# Pipeline: fill the prompt, call the chat model, parse the reply to a string.
chain = promt | llm | StrOutputParser()
# Only the group name and its drug list are passed as context.
context_records = df_group_and_drugs[['name','drugs']].to_dict(orient='records')
message = chain.invoke({'question': question, 'data': context_records})

In [None]:
# Print the generated answer as plain text.
print(message)