In [1]:
import pandas as pd
import json
from datetime import datetime
import time
import os
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage

from neo4j import GraphDatabase
from py2neo.data import Relationship, Node
from py2neo import Graph, NodeMatcher, RelationshipMatcher
from typing import Iterable

from wawr.llm_interface import KGPromptWriter, KGFeedback, KGGenerator, KGRecomposer


# Secrets are read from the environment so that none are stored in the notebook.
# Export OPENAI_API_KEY and AURA_PASSWORD before starting the kernel.
# NOTE(review): the API key and database password were previously written here
# in plain text and should be treated as exposed -- rotate both.
# Each value is None when its variable is unset; the clients that receive it are
# then expected to reject the credentials -- TODO confirm the failure is loud.
OPENAI_API = os.environ.get("OPENAI_API_KEY")
AURA_PASSWORD = os.environ.get("AURA_PASSWORD")

# Non-secret connection settings keep their earlier values as defaults and can
# be overridden through the environment.
AURA_CONNECTION_URI = os.environ.get(
    "AURA_CONNECTION_URI", "neo4j+s://6af62c90.databases.neo4j.io"
)
AURA_USERNAME = os.environ.get("AURA_USERNAME", "neo4j")

In [2]:
# Chat model handle used by the cells below.
cllm = ChatOpenAI(
    model_name='gpt-3.5-turbo-16k-0613',
    openai_api_key=OPENAI_API,
)
memory = ConversationBufferMemory()

# Two handles onto the same Aura URI: the official neo4j driver and a py2neo
# Graph, both built with the same (username, password) pair.
aura_auth = (AURA_USERNAME, AURA_PASSWORD)
n4jdriver = GraphDatabase.driver(AURA_CONNECTION_URI, auth=aura_auth)
graph = Graph(AURA_CONNECTION_URI, auth=aura_auth)

In [3]:
# Load the paper metadata, flatten line breaks inside abstracts, parse the
# update timestamp, order newest first and keep only rows updated in 2023.
ds = (
    pd.read_csv('../data/2023_09_20_17_39_lm.csv')
    .assign(
        abstract=lambda frame: frame['abstract'].str.replace('\n', ' '),
        update_date=lambda frame: pd.to_datetime(frame['update_date']),
    )
    .sort_values(by='update_date', ascending=False)
    .loc[lambda frame: frame['update_date'].dt.year == 2023]
)

# Text that query_one_abstract places in front of every abstract it sends.
with open('../data/kg_extraction_examples/1', 'r') as examples_file:
    examples = examples_file.read()

In [4]:
def query_one_abstract(abstract):
    """Ask the chat model for a knowledge-graph representation of one abstract.

    The human message is the module-level ``examples`` text, followed by the
    abstract and a cue line for the model to continue with the graph. The
    system message is sent empty, so ``examples`` is the only guidance in the
    prompt.

    Args:
        abstract: Text of the paper abstract (str).

    Returns:
        The ``content`` of the model's reply, returned as-is without parsing.
    """
    # Section separators are real newlines, not the two-character sequence
    # backslash + "n", so the model sees the sections on separate lines.
    prompt = (
        examples
        + '\nAbstract:\n'
        + abstract
        + '\nKnowledge graph representation:\n'
    )
    messages = [
        SystemMessage(content=""),
        HumanMessage(content=prompt),
    ]
    # A low sampling temperature is requested for this call.
    return cllm(messages, temperature=0.1).content


In [6]:

# One agent object per pipeline role, all built on the same chat model.
prompt_writer = KGPromptWriter(cllm)
feedback_generator = KGFeedback(cllm)
kg_generator = KGGenerator(cllm)
kg_recomposer = KGRecomposer(cllm)

# Paper under test: the row at position 6 of the date-sorted frame.
paper = ds.iloc[6]
abstract_text = paper['abstract']

# Step 1: abstract -> knowledge graph through the few-shot prompt.
# prompt_writer and kg_generator are instantiated but not used on this path.
generated_kg = query_one_abstract(abstract_text)
print("KG:\n", generated_kg)

# Step 2: feed the graph to the recomposer and show its output next to the
# original abstract.
recomposed_abstract = kg_recomposer.act(generated_kg)
print(
    "Abstract:\n", abstract_text,
    "\nRecomposition:\n", recomposed_abstract,
)

# Step 3: hand the original abstract and the recomposition to the feedback agent.
feedback = feedback_generator.act(abstract_text, recomposed_abstract)
print("Feedback:\n", feedback)

KG:
 [
{"from":{"type":"model", "name":"diffusion model"}, "relation":{"name":"generate", "summary":"diffusion models demonstrate a remarkable capability for generating high-quality images"}, "to":{"type":"concept","name":"high-quality images"}},

{"from":{"type":"model", "name":"diffusion model"}, "relation":{"name":"replicate", "summary":"their tendency to 'replicate' training data raises privacy concerns"}, "to":{"type":"concept","name":"privacy concerns"}},

{"from":{"type":"model", "name":"caption generality score"}, "relation":{"name":"measures", "summary":"our paper first introduces a generality score that measures the caption generality"}, "to":{"type":"concept","name":"caption generality"}},

{"from":{"type":"model", "name":"large language model"}, "relation":{"name":"generalize", "summary":"employ large language model (LLM) to generalize training captions"}, "to":{"type":"concept","name":"generalized captions"}},

{"from":{"type":"model", "name":"dual fusion enhancement appro