In [1]:
import pandas as pd
import json
from datetime import datetime
import time
import os
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage

from neo4j import GraphDatabase
from py2neo.data import Relationship, Node
from py2neo import Graph, NodeMatcher, RelationshipMatcher
from typing import Iterable
from langchain.callbacks import get_openai_callback


# Secrets are read from the environment so they never end up in the saved
# notebook or in version control. The API key and database password that used
# to be hard-coded here must be treated as leaked and rotated.
# A missing variable yields None; the connection URI and user name keep their
# previous values as defaults because they are not secret.
OPENAI_API = os.environ.get("OPENAI_API_KEY")
AURA_CONNECTION_URI = os.environ.get("AURA_CONNECTION_URI", "neo4j+ssc://6af62c90.databases.neo4j.io")
AURA_USERNAME = os.environ.get("AURA_USERNAME", "neo4j")
AURA_PASSWORD = os.environ.get("AURA_PASSWORD")

In [2]:
# Chat model used for every prompt in this notebook, plus a conversation buffer.
cllm = ChatOpenAI(model_name='gpt-4-1106-preview', openai_api_key=OPENAI_API)
memory = ConversationBufferMemory()

# Two handles on the same Aura instance: the official neo4j driver and a py2neo Graph.
n4jdriver = GraphDatabase.driver(AURA_CONNECTION_URI, auth=(AURA_USERNAME, AURA_PASSWORD))
graph = Graph(AURA_CONNECTION_URI, auth=(AURA_USERNAME, AURA_PASSWORD))

In [3]:
cllm.predict(
        'What is your name?'
        )

"I am an AI language model created by OpenAI, known as ChatGPT. I don't have a personal name like a human, but you can call me ChatGPT or simply AI if you prefer. How can I assist you today?"

In [4]:
# Load the paper metadata, flatten and lower-case the abstracts, parse the
# dates and order the frame newest-first.
ds = (
    pd.read_csv('../data/2023_12_09_16_24_lm.csv')
    .assign(
        abstract=lambda frame: frame['abstract'].str.replace('\n', ' ').str.lower(),
        date=lambda frame: pd.to_datetime(frame['date']),
    )
    .sort_values(by='date', ascending=False)
)


In [5]:
# Keep only papers dated 2014 or later.
ds = ds.loc[ds['date'].dt.year.ge(2014)]

In [6]:
ds.shape

(15664, 17)

In [27]:
# 2311.13095
# 2311.12699
ds[ds['id'] == '2311.12699']['abstract'].iloc[0]

In [7]:
# Few-shot examples that are prepended to every abstract sent to the model.
with open('../data/kg_extraction_prompt.txt') as prompt_file:
    examples = prompt_file.read()

In [18]:
def query_one_abstract(abstract):
    """Ask the chat model to turn one abstract into a JSON knowledge graph.

    The user prompt is the module-level few-shot ``examples`` text followed by
    ``abstract`` and a trailer asking for the graph as fenced JSON.

    Args:
        abstract: Abstract text appended to the few-shot examples.

    Returns:
        A ``(response, usage)`` tuple: whatever calling ``cllm`` returned and
        the object yielded by ``get_openai_callback()`` around that call.
    """
    system_message = SystemMessage(
            content="""
                You are a helpful assistant generating a knowledge graph in json format, according to the examples received. Here are a few additional guidelines:

                1. Transform node names to their most abstract form. For instance: "large language models", "large language model (llm)", "very large language model" should all resolve to "large language model"

                2. Respect the style from the examples and reuse as many nodes as you can.
            """
        )
    # NOTE(review): the trailer contains literal backslash-n sequences rather
    # than newline characters -- confirm this matches the examples file.
    user_message = HumanMessage(
        content=examples + abstract + '\\n\\nResulting knowledge graph in json format:```json\\n\\n'
    )
    with get_openai_callback() as usage:
        response = cllm([system_message, user_message], temperature=0.1)
    return response, usage

def get_or_create_node(node_dict, existing_nodes):
    """Return the cached node for ``node_dict['name']``, creating it if needed.

    Args:
        node_dict: Mapping that must contain both 'name' and 'type'.
        existing_nodes: Name -> node cache; it is updated in place.

    Returns:
        The cached or newly created node, or None when 'name' or 'type' is
        missing from ``node_dict``.
    """
    if not ('name' in node_dict and 'type' in node_dict):
        return None
    node_name = node_dict['name']
    node = existing_nodes.get(node_name)
    if not node:
        # Nothing usable cached under this name yet: build a fresh node.
        node = Node(node_dict['type'], name=node_name)
    existing_nodes[node_name] = node
    return node

def create_relation(node1, rel, node2, existing_relations, properties):
    """Create a relationship between two nodes and record it.

    Args:
        node1: Start node of the relationship.
        rel: Relation dict; ``rel['name']`` is used as the relationship type
            and every key of ``rel`` is stored as a relationship property.
        node2: End node of the relationship.
        existing_relations: List the new relationship is appended to.
        properties: Extra properties for the relationship. A key that is also
            present in ``rel`` keeps the value from ``rel``. May be None.

    Returns:
        The newly created ``Relationship``.

    Raises:
        KeyError: If ``rel`` has no 'name' key.
    """
    # `properties` used to be accepted but silently dropped. Merge it
    # underneath `rel` so values coming from the relation itself always win.
    merged_properties = {**(properties or {}), **rel}
    to_ret = Relationship(node1, rel['name'], node2, **merged_properties)
    existing_relations.append(to_ret)
    return to_ret

def convert_to_kg(response, existing_nodes, existing_relations, properties):
    """Parse a JSON list of relations and add them to the in-memory graph.

    Every entry is read through its 'from', 'to' and 'relation' keys. Nodes are
    resolved with ``get_or_create_node`` and the link is built with
    ``create_relation``.

    Args:
        response: JSON text holding a list of relation entries.
        existing_nodes: Name -> node cache, updated in place.
        existing_relations: List of relationships, extended in place.
        properties: Dict handed to ``create_relation``; its 'summary' key is
            overwritten for every relation.

    Raises:
        json.JSONDecodeError: If the text cannot be parsed even after all
            backslashes have been stripped.
        KeyError: If an entry lacks 'from', 'to' or 'relation'.
    """
    try:
        response_json = json.loads(response)
    except json.JSONDecodeError:
        # Second chance: retry with every backslash removed from the text.
        response_json = json.loads(response.replace('\\', ''))
    for rel in response_json:
        node1 = get_or_create_node(rel['from'], existing_nodes)
        if not node1:
            # Skip only this relation; previously the whole remainder of the
            # file was silently dropped by an early `return`.
            continue
        node2 = get_or_create_node(rel['to'], existing_nodes)
        if not node2:
            continue
        properties['summary'] = rel['relation'].get('summary', 'No summary provided')
        create_relation(node1, rel['relation'], node2, existing_relations, properties)
         
def get_existing_rel_names(existing_relations):
    """Return the set of class names of the given relationship objects."""
    return {type(relation).__name__ for relation in existing_relations}

def get_existing_node_names(existing_nodes):
    """Return the set of 'name' values of all nodes held in the cache."""
    return {node['name'] for node in existing_nodes.values()}

def recreate_db(graph, nodes, relations):
    """Wipe the graph database and re-create it from the in-memory graph.

    Args:
        graph: Graph handle providing ``begin``, ``delete_all`` and ``commit``.
        nodes: Mapping whose values are the nodes to create.
        relations: List of relationships, created after all the nodes.

    A progress counter is printed every 1000 created elements.
    """
    # The wipe is issued on `graph` itself, bracketed by its own transaction.
    wipe_tx = graph.begin()
    graph.delete_all()
    graph.commit(wipe_tx)

    # Everything is created inside one second transaction.
    load_tx = graph.begin()
    elements = list(nodes.values()) + relations
    for position, element in enumerate(elements):
        if not position % 1000:
            print(position)
        load_tx.create(element)
    graph.commit(load_tx)
    
def relations_to_str(rel):
    """Render one relationship, or an iterable of them, as readable text.

    Each relationship becomes "<start node> - <class name> - <end node>",
    using ``r.nodes[0]``, ``type(r).__name__`` and ``r.nodes[1]``.

    Args:
        rel: A single relationship object or an iterable of them.

    Returns:
        A single string when ``rel`` is not iterable, otherwise a list with
        one string per relationship.
    """
    # NOTE(review): a relationship class that is itself iterable takes the
    # list branch -- confirm that is the intended behaviour for py2neo objects.
    is_single = not isinstance(rel, Iterable)
    # The original built this wrapper but then iterated `rel` directly, so a
    # non-iterable single relationship raised TypeError.
    rel_itr = [rel] if is_single else rel
    to_ret = [f"{r.nodes[0]} - {type(r).__name__} - {r.nodes[1]}" for r in rel_itr]
    return to_ret[0] if is_single else to_ret

In [9]:
ds['id']

15718    2311.13601
15717    2311.13581
15716    2311.13577
15715    2311.13565
15714    2311.13562
            ...    
59        1402.1128
58        1402.0574
57        1401.5382
56        1401.3896
55        1401.2258
Name: id, Length: 15664, dtype: object

In [15]:
# Batch of papers handed to the workers: positions 2000-2999 of the date-sorted frame.
args = [ds.iloc[position] for position in range(2000, 3000)]

In [None]:
def date_for_json(d: datetime):
    """Format a datetime as 'YYYY-MM-DD'; used as the ``default`` hook of ``json.dumps``."""
    iso_day_format = "%Y-%m-%d"
    return d.strftime(iso_day_format)

def fix_response(response):
    """Strip the Markdown code fence from a model answer.

    Text before the first 'json```' marker (or, failing that, the first
    '```json' marker) is discarded, then everything from the next '```' on is
    cut off. Text without any fence is returned unchanged.
    """
    for opening_fence in ('json```', '```json'):
        if opening_fence in response:
            response = response.split(opening_fence)[1]
            break
    # Splitting on an absent separator yields the whole text as first element.
    return response.split('```')[0]

def process_paper(paper, max_attempts=10, retry_delay=5):
    """Extract a knowledge graph for one paper and store it as a JSON file.

    The abstract is sent to the model through ``query_one_abstract``, the
    answer is un-fenced with ``fix_response`` and parsed, and the paper's
    'date', 'title' and 'id' are merged into every entry's 'relation' dict.
    The result is written to ``../data/kg_json_v2/<id>.json``. Papers whose
    file already exists are skipped.

    Args:
        paper: Row-like object with 'id', 'title', 'abstract' and 'date'.
        max_attempts: Maximum number of tries before giving up.
        retry_delay: Seconds to sleep after a failed attempt.

    Returns:
        None. Failures are logged and retried; nothing is raised to the caller
        from inside the retry loop.
    """
    file_name = f'../data/kg_json_v2/{paper["id"]}.json'
    for atts in range(1, max_attempts + 1):
        # Reset on every attempt so the error handler below never reads an
        # unbound name (first-attempt failure) or a stale answer.
        llm_response = None
        try:
            if os.path.exists(file_name):
                logging.info(f'Skipping {paper["id"]}')
                break
            print("Starting attempt ",atts, " ", paper['id'], paper['title'])
            llm_response, _ = query_one_abstract(paper['abstract'])
            llm_response = fix_response(llm_response.content)
            kg = json.loads(llm_response)
            properties = dict(paper[['date', 'title', 'id']])
            for rel in kg:
                rel['relation'].update(properties)
            # Serialize before opening the file: opening first would leave an
            # empty file behind on failure, and later attempts would skip it.
            serialized = json.dumps(kg, default=date_for_json)
            with open(file_name, 'w') as f:
                f.write(serialized)
            # NOTE(review): this writes to the shared module-level `ds` frame
            # from worker threads -- confirm that is acceptable.
            ds.loc[ds['id'] == paper['id'], 'processed'] = True
            break
        except Exception as e:
            logging.error(e)
            logging.info(llm_response)
            time.sleep(retry_delay)
    print("Ending ", paper['id'], paper['title'])


from multiprocessing.pool import ThreadPool
import logging
logging.basicConfig(level=logging.INFO)

# Run `process_paper` over every entry of `args` on 5 worker threads and wait
# for all of them to finish before printing the usage object.
pool = ThreadPool(5)
# NOTE(review): a `usage` display later in this notebook shows all-zero
# counters; presumably the callback does not observe requests issued from the
# worker threads -- confirm before relying on these numbers.
with get_openai_callback() as usage:
    pool.map(process_paper, args)
    pool.close()
    pool.join()
print(usage)
 

In [14]:
 usage

Tokens Used: 0
	Prompt Tokens: 0
	Completion Tokens: 0
Successful Requests: 0
Total Cost (USD): $0.0

In [19]:
# Rebuild the in-memory graph from every cached per-paper JSON file.
ds['processed'] = False
existing_nodes = {}
existing_relations = []
# Build the directory path once with os.path.join so that listing and opening
# use the same location on every OS (the open() call used to hard-code
# Windows '\\' separators while listdir used '/').
kg_dir = os.path.join('..', 'data', 'kg_json_v2')
files = os.listdir(kg_dir)
for i, file in enumerate(files):
    if (i+1)%1000 == 0:
        print(i)
    with open(os.path.join(kg_dir, file), 'r') as f:
        data_str = f.read()
    convert_to_kg(data_str, existing_nodes, existing_relations, {})
    

999
1999
2999


In [21]:
len(existing_relations)

29529

In [81]:
# One row per node: its label string and its name.
nodes = pd.DataFrame(
    [[str(node.labels), node['name']] for node in existing_nodes.values()],
    columns=['labels', 'name'],
)


In [82]:
def fix_name(x):
    """Return ``x`` with every single-quote character removed."""
    return ''.join(x.split("'"))

In [89]:
# Drop single quotes from the names.
nodes['name'] = nodes['name'].map(fix_name)
# Count of names still containing a quote; the value is computed but not displayed.
nodes['name'].str.contains("'").sum()
# Strip the ':' and back-tick characters from the label strings.
nodes['labels'] = nodes['labels'].str.replace(':', '').str.replace('`', '')

In [90]:
nodes

Unnamed: 0,labels,name
0,task,clinical trial eligibility identification
1,characteristic,natural language eligibility criteria
2,solution,text classification methods
3,dataset,Phase III cancer trial exclusions
4,data,764 Phase III cancer trials
...,...,...
29234,input,reference image segments
29235,capability,referring and generic segmentation
29236,model,universal visual in-context prompting model
29237,metric,PQ (Panoptic Quality)


In [91]:
# os.path.join keeps the export location valid on every OS; the previous
# literal used Windows-only '\\' separators.
nodes.to_csv(os.path.join('..', 'data', 'nodes.csv'), index=False, quotechar='"')

In [52]:
existing_relations[0].nodes[0]

Node('task', name='clinical trial eligibility identification')

In [59]:
# One flat record per relationship: cleaned endpoint names first, then every
# property stored on the relationship (which wins on a key clash).
rels = [
    {
        'from': fix_name(relation.nodes[0]['name']),
        'to': fix_name(relation.nodes[1]['name']),
        **dict(relation),
    }
    for relation in existing_relations
]
rels

[{'from': 'clinical trial eligibility identification',
  'to': 'natural language eligibility criteria',
  'name': 'is complicated by',
  'research': 'problem',
  'summary': 'Automatic identification of clinical trials for which a patient is eligible is complicated by the fact that trial eligibility is stated in natural language',
  'date': '2023-09-14',
  'title': 'Text Classification of Cancer Clinical Trial Eligibility Criteria',
  'id': '2309.07812'},
 {'from': 'text classification methods',
  'to': 'clinical trial eligibility identification',
  'name': 'employed for',
  'research': 'approach',
  'summary': 'Text classification methods are employed for common types of eligibility criteria in clinical trials',
  'date': '2023-09-14',
  'title': 'Text Classification of Cancer Clinical Trial Eligibility Criteria',
  'id': '2309.07812'},
 {'from': 'Phase III cancer trial exclusions',
  'to': '764 Phase III cancer trials',
  'name': 'consists of',
  'research': 'context',
  'summary': 'T

In [60]:
rels=pd.DataFrame.from_dict(rels)

In [61]:
rels

Unnamed: 0,from,to,name,research,summary,date,title,id
0,clinical trial eligibility identification,natural language eligibility criteria,is complicated by,problem,Automatic identification of clinical trials fo...,2023-09-14,Text Classification of Cancer Clinical Trial E...,2309.07812
1,text classification methods,clinical trial eligibility identification,employed for,approach,Text classification methods are employed for c...,2023-09-14,Text Classification of Cancer Clinical Trial E...,2309.07812
2,Phase III cancer trial exclusions,764 Phase III cancer trials,consists of,context,The dataset consists of 764 Phase III cancer t...,2023-09-14,Text Classification of Cancer Clinical Trial E...,2309.07812
3,Transformer,exclusion criteria classification,experimented with,method,Common Transformer models are experimented wit...,2023-09-14,Text Classification of Cancer Clinical Trial E...,2309.07812
4,Clinical Trial BERT,Transformer,is a,context,A new pre-trained Clinical Trial BERT model is...,2023-09-14,Text Classification of Cancer Clinical Trial E...,2309.07812
...,...,...,...,...,...,...,...,...
29524,universal visual in-context prompting,reference image segments,enhanced to take,method,The framework is enhanced to take an arbitrary...,2023-11-22,Visual In-Context Prompting,2311.13601
29525,referring and generic segmentation,universal visual in-context prompting,elicited by,finding,The proposed visual in-context prompting elici...,2023-11-22,Visual In-Context Prompting,2311.13601
29526,competitive performance,universal visual in-context prompting,achieved by,finding,The model yields competitive performance to cl...,2023-11-22,Visual In-Context Prompting,2311.13601
29527,universal visual in-context prompting model,PQ (Panoptic Quality),achieves,result,"By joint training on COCO and SA-1B, the model...",2023-11-22,Visual In-Context Prompting,2311.13601


In [101]:
node_con = rels[['from', 'to']].stack().reset_index(drop=True)
node_con.value_counts().iloc[20:50]

reinforcement learning from human feedback    62
pretrained language model                     60
instruction tuning                            58
artificial intelligence                       57
NLP tasks                                     55
reinforcement learning                        53
code generation                               53
hallucinations                                52
state-of-the-art                              50
LLaMA                                         49
human evaluation                              48
extensive experiments                         47
downstream tasks                              47
BERT                                          47
logical reasoning                             45
GPT-3                                         45
text generation                               43
this study                                    41
prompting                                     41
few-shot learning                             40
accuracy            

In [88]:
# os.path.join keeps the export location valid on every OS; the previous
# literal used Windows-only '\\' separators.
rels.to_csv(os.path.join('..', 'data', 'rels.csv'), index=False, quotechar='"')

In [None]:
"""
LOAD CSV WITH HEADERS FROM 'https://drive.google.com/uc?export=download&id=1eKfvVWMlbtRZCV3HZHtwDAnW6GAhOsVm' AS row CALL {with row call apoc.create.node(row.label, {name:row.name}) yield node return null as n} return count(n)

create index node_name if not exists for (n:ALL) on (n.name)

LOAD CSV WITH HEADERS FROM 'https://drive.google.com/uc?export=download&id=1AAgs7x9DLO7lecNcVfL-Zgz02ZamAAFv' AS row MATCH (n1 {name:row.from}) MATCH (n2 {name:row.to}) CALL apoc.create.relationship(n1, row.name, {research: row.research, summary:row.summary, date:row.date, title:row.title, id:row.id}, n2) YIELD rel RETURN count(rel)

"""

In [23]:
recreate_db(graph, existing_nodes, existing_relations)


0
1000
2000


In [65]:
len(existing_relations)

8983

In [21]:
recreate_db(graph, existing_nodes, existing_relations)


In [None]:
existing_nodes

In [None]:
# Ask the model which of the first 500 node names mean the same thing.
node_names = set([node['name'] for node in existing_nodes.values()][:500])
len(node_names)
messages = [
    SystemMessage(content='This is a list of nodes in a graph database. Identify the ones that are identical in meaning and could be merged into a single node.'),
    HumanMessage(content=str(node_names)),
]
response = cllm(messages, temperature=0.1).content
response

In [68]:
ds['abstract'][3 ]

'  We present an approach to improving the precision of an initial document\nranking wherein we utilize cluster information within a graph-based framework.\nThe main idea is to perform re-ranking based on centrality within bipartite\ngraphs of documents (on one side) and clusters (on the other side), on the\npremise that these are mutually reinforcing entities. Links between entities\nare created via consideration of language models induced from them.\n  We find that our cluster-document graphs give rise to much better retrieval\nperformance than previously proposed document-only graphs do. For example,\nauthority-based re-ranking of documents via a HITS-style cluster-based approach\noutperforms a previously-proposed PageRank-inspired algorithm applied to\nsolely-document graphs. Moreover, we also show that computing authority scores\nfor clusters constitutes an effective method for identifying clusters\ncontaining a large percentage of relevant documents.\n'

In [77]:
# Split a "Result: {key: value}, {key: value}, ..." style answer into
# [key, value] lists by cutting on ',' and then on ':'.
# NOTE(review): this needs `response` to expose `.content`; confirm which cell
# is expected to have produced it.
# NOTE(review): an item yields more than two parts when its text itself
# contains ':' -- the recorded ValueError for the commented-out DataFrame call
# looks related; confirm before re-enabling it.
texts = response.content.replace('Result: ', '').split(',')
df_list = [t.replace('{', '').replace('}','').split(':') for t in texts]
print(df_list)
#pd.DataFrame(df_list, columns=['Key', 'Value'])

[['topic', 'graphical modeling'], [' topic', 'non-negative matrix factorization'], [' application', 'computational auditory scene analysis'], [' application', 'music transcription'], [' application', 'source separation'], [' application', 'speech recognition'], [' contribution', 'inference and learning algorithms'], [' contribution', 'hierarchical state transition model'], [' contribution', 'target tracking'], [' contribution', 'extracting hierarchical features'], [' contribution', 'language modeling\nAbstract', '   We present a novel approach for sentiment analysis using deep learning\ntechniques. Our model utilizes a convolutional neural network (CNN) to capture\nlocal patterns in text data and a long short-term memory (LSTM) network to\ncapture long-range dependencies. We also incorporate word embeddings to\nrepresent the semantic meaning of words. The model is trained on a large\nlabeled dataset and achieves state-of-the-art performance on several benchmark\ndatasets. We conduct ex

ValueError: 2 columns passed, passed data had 4 columns

In [36]:
# One-shot question about a single abstract (row label 3 of `ds`).
content = f'What does this paper improve? Answer in as few words as possible. Abstract: {ds["abstract"][3]}'
messages = [SystemMessage(content='None'), HumanMessage(content=content)]
cllm(messages, temperature=0.4)

AIMessage(content='This paper improves document ranking precision using cluster information in a graph-based framework.', additional_kwargs={}, example=False)

In [38]:
# One-shot question about a single abstract (row label 3 of `ds`).
# The prompt previously misspelled "paper" as "paperr".
content = f'What are the main contributions of this paper? Answer in as few words as possible. Abstract: {ds["abstract"][3]}'
messages = [
    SystemMessage(
        content='None'
    ),
    HumanMessage(
        content=content
    ),
]
cllm(messages, temperature=0.4)

AIMessage(content='The main contributions of this paper are:\n- Utilizing cluster information to improve document ranking\n- Re-ranking based on centrality in bipartite graphs of documents and clusters\n- Creating links between entities using language models\n- Outperforming previously proposed algorithms for document ranking\n- Identifying clusters with a high percentage of relevant documents using authority scores.', additional_kwargs={}, example=False)

In [9]:

system_msg = 'Answer each question about the given abstract in as few words as possible, without repeating words from the question: 1. What is the main topic or subject of the research?\n2. What is the objective or purpose of the research?\n3. What are the key findings or results mentioned in the abstract?\n4. What methodologies or techniques were used in the study?\n5. What are the datasets or resources mentioned?\n6. What are the implications or applications of the research mentioned? Example: "This paper describes experiments on identifying the language of a single name\nin isolation or in a document written in a different language. A new corpus has\nbeen compiled and made available, matching names against languages. This corpus\nis used in a series of experiments measuring the performance of general\nlanguage models and names-only language models on the language identification\ntask. Conclusions are drawn from the comparison between using general language\nmodels and names-only language models and between identifying the language of\nisolated names and the language of very short document fragments. Future\nresearch directions are outlined.\n" should result in "1. language_identification.names; 2.comparison; 3. not specified; 4. not specified; 5. new_dataset.noname; 6. clarify '

# Step 1: answer the numbered questions held in `system_msg` for one abstract
# (row label 2 of `ds`) and print the model's answer.
messages = [
    SystemMessage(
        content=system_msg
    ),
    HumanMessage(
        content=ds['abstract'][2]
    ),
]
response = cllm(messages, temperature=0.1)
print(response.content)
# Step 2: `system_msg` is overwritten here, and the first answer is fed back to
# the model as the human message.
system_msg = 'Get the entities from the response'

messages = [
    SystemMessage(
        content=system_msg
    ),
    HumanMessage(
        content=response.content
    ),
]
cllm(messages, temperature=0.1)

1. What is the main topic or subject of the research?
word prediction systems and the use of Latent Semantic Analysis (LSA)
2. What is the objective or purpose of the research?
to explore the predictive powers of LSA and evaluate its integration with a standard language model
3. What are the key findings or results mentioned in the abstract?
all methods integrating LSA-based information showed significant improvements compared to the baseline and a simple cache model
4. What methodologies or techniques were used in the study?
Latent Semantic Analysis (LSA) and integration with a standard language model
5. What are the datasets or resources mentioned?
not specified
6. What are the implications or applications of the research mentioned?
improving word prediction systems by integrating LSA-based information


AIMessage(content='Entities:\n1. Latent Semantic Analysis (LSA)\n2. word prediction systems\n3. standard language model\n4. baseline\n5. cache model', additional_kwargs={}, example=False)

In [9]:
ds['abstract'][1]

'  This paper describes experiments on identifying the language of a single name\nin isolation or in a document written in a different language. A new corpus has\nbeen compiled and made available, matching names against languages. This corpus\nis used in a series of experiments measuring the performance of general\nlanguage models and names-only language models on the language identification\ntask. Conclusions are drawn from the comparison between using general language\nmodels and names-only language models and between identifying the language of\nisolated names and the language of very short document fragments. Future\nresearch directions are outlined.\n'

In [10]:
# Ask for the main entities of one abstract (row label 1 of `ds`), newlines flattened.
system_msg = 'List the main entities from a research perspective.'
messages = [
    SystemMessage(content=system_msg),
    HumanMessage(content=ds['abstract'][1].replace('\n', ' ')),
]
cllm(messages, temperature=0.1)


AIMessage(content='1. Language identification task\n2. Names\n3. Languages\n4. Corpus\n5. General language models\n6. Names-only language models\n7. Performance measurement\n8. Document fragments\n9. Comparison analysis\n10. Conclusions\n11. Future research directions', additional_kwargs={}, example=False)

In [12]:
# Ask for relations between two named entities in one abstract (row label 1 of `ds`).
system_msg = 'List relations between the following 2 entities from the given abstract, using at most 2 words per relation: 1. Language identification task\n2. Names'
messages = [
    SystemMessage(content=system_msg),
    HumanMessage(content=ds['abstract'][1].replace('\n', ' ')),
]
cllm(messages, temperature=0.1)


AIMessage(content='1. Identification task\n2. Language\n3. Name\n4. Experiments\n5. Isolation\n6. Document\n7. Corpus\n8. Performance\n9. Models\n10. Comparison\n11. Fragments\n12. Research', additional_kwargs={}, example=False)