In [1]:
import requests
import pdf2image
import pytesseract

In [2]:
# Download the article PDF. The timeout bounds the wait, and raise_for_status()
# stops here on a 4xx/5xx answer instead of handing an error page to pdf2image.
pdf = requests.get('https://arxiv.org/pdf/2110.03526.pdf', timeout=60)
pdf.raise_for_status()
# Convert the downloaded PDF bytes into page images for the OCR step.
doc = pdf2image.convert_from_bytes(pdf.content)

In [34]:
# Keep only the first six pages (indices 0-5); the original note says the
# later pages contain only references. Slicing before OCR avoids running
# Tesseract on pages whose text would be discarded anyway.
article = [pytesseract.image_to_string(page_data) for page_data in doc[:6]]
article_txt = " ".join(article)

In [35]:
import nltk
nltk.download('punkt')
def clean_text(txt):
    """Drop short rows and figure/panel caption rows from ``txt``.

    A row (text between newlines) is kept only when splitting it on single
    spaces yields more than three pieces and it does not start with "(a)"
    or "Figure". Kept rows are re-joined with newlines.
    """
    kept_rows = []
    for line in txt.split("\n"):
        # Rows with three or fewer space-separated pieces are discarded.
        if len(line.split(" ")) <= 3:
            continue
        # Rows starting with a panel label or a figure caption are discarded.
        if line.startswith(("(a)", "Figure")):
            continue
        kept_rows.append(line)
    return "\n".join(kept_rows)
# Keep everything after the first "INTRODUCTION" marker. maxsplit=1 matters:
# without it a later occurrence of the word would cut the body short.
txt = article_txt.split("INTRODUCTION", 1)[1]
ctext = clean_text(txt)
# Split the cleaned text into sentences with NLTK's sentence tokenizer.
sentences = nltk.tokenize.sent_tokenize(ctext)
print(sentences)

['Many people with skin diseases such as chronic wounds, non-healing and diabetic\nulcers need reconstruction and regeneration of their skin.', 'In addition, the medical industry also\nneeded a method of skin rejuvenation and reconstruction for cosmetic purposes, even for\nhealthy people.', 'Reconstructive medicine used the method to deliver pluripotent stem cells to the\n33 years after the introduction of bone marrow stem cells, fat-derived stem cells have\nbecome an excellent source for cell therapy.', 'In 1961, two Canadian scientists first introduced\nstem cells.', 'These cells, later found to be hematopoietic stem cells, have been used successfully\nto treat leukemia and some severe autoimmune diseases called bone marrow transplants.', 'In\n1968, another stem cell was introduced into the bone marrow, which has been shown to be\neffective due to its high ability to regulate immunity in many diseases, including skin, bone, joint\ndiseases, heart, brain, nerves, and kidney.', 'Nevert

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Olivier\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import hashlib

def query_plain(text, url='http://bern2.korea.ac.kr/plain', timeout=60):
    """POST ``text`` to the BERN2 plain-text endpoint and return the parsed JSON.

    Args:
        text: Passage to annotate; coerced with ``str()`` before sending.
        url: Endpoint that receives the POST request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The JSON body of the response, decoded by ``requests``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On timeouts or connection failures.
    """
    response = requests.post(url, json={'text': str(text)}, timeout=timeout)
    # Surface HTTP errors here rather than as a confusing JSON decode failure.
    response.raise_for_status()
    return response.json()

# Query the annotation service once per sentence, skipping the final sentence;
# responses are stored in sentence order.
entity_list = [query_plain(sentence) for sentence in sentences[:-1]]
    
# Turn each raw annotation response into a record with the sentence text, its
# SHA-256 hex digest and, when annotations are present, one dict per entity.
parsed_entities = []
for response in entity_list:
    sentence_text = response['text']
    record = {
        'text': sentence_text,
        'text_sha256': hashlib.sha256(sentence_text.encode('utf-8')).hexdigest(),
    }
    annotations = response.get('annotations')
    if annotations:
        mentions = []
        for annotation in annotations:
            ids = annotation['id']
            bern_ids = [i for i in ids if i.startswith('BERN')]
            span = annotation['span']
            # The surface form is sliced out of the sentence using the span offsets.
            surface = sentence_text[span['begin']:span['end']]
            mentions.append({
                # Use the first BERN-prefixed id; fall back to the surface form.
                'entity_id': bern_ids[0] if bern_ids else surface,
                'other_ids': [i for i in ids if not i.startswith('BERN')],
                'entity_type': annotation['obj'],
                'entity': surface,
            })
        # 'entities' goes first so the key order matches the printed output.
        record = {'entities': mentions, **record}
    parsed_entities.append(record)
print(parsed_entities)

In [None]:
from neo4j import GraphDatabase
import pandas as pd

import getpass
import os

# Connection settings are read from the environment so the password is not
# stored in the notebook. Host and user fall back to the values used before.
host = os.environ.get('NEO4J_URI', 'bolt://3.222.189.78:7687')
user = os.environ.get('NEO4J_USER', 'neo4j')
# Prompt interactively when NEO4J_PASSWORD is not set.
password = os.environ.get('NEO4J_PASSWORD') or getpass.getpass('Neo4j password: ')
driver = GraphDatabase.driver(host, auth=(user, password))

In [None]:
def neo4j_query(query, params=None):
    """Run ``query`` in a new driver session and return the records as a DataFrame.

    Args:
        query: Query text passed to ``session.run``.
        params: Optional parameters passed alongside the query.

    Returns:
        A ``pandas.DataFrame`` with one row per record and the result's keys
        as columns.
    """
    with driver.session() as session:
        result = session.run(query, params)
        # Materialise the records first, then read the column names.
        rows = [dict(record) for record in result]
        return pd.DataFrame(rows, columns=result.keys())

# Author and title are taken by position from the extracted text: line 0 is
# used as the author, lines 2 and 3 joined by a space as the title.
ocr_lines = article_txt.split('\n')
author = ocr_lines[0]
title = " ".join(ocr_lines[2:4])

In [None]:
# Store the author and the article as nodes and link them with a WROTE
# relationship. MERGE is used for all three, so re-running the cell reuses
# nodes/relationships that already exist.
neo4j_query("""
    MERGE (a:Author{name:$author})
    MERGE (b:Article{title:$title})
    MERGE (a)-[:WROTE]->(b)
    """, {'title':title, 'author':author})

In [None]:
# Import the parsed sentences and their entities, attached to this article only.
# Matching on $title matters: a bare MATCH (a:Article) would link every sentence
# to every Article node once more than one article is stored.
# Sentences are keyed by the SHA-256 of their text; entities by their id.
# m.count is set to 1 when a MENTIONS relationship is created and incremented
# each time the same (sentence, entity) pair is merged again.
neo4j_query("""
    MATCH (a:Article {title: $title})
    UNWIND $data as row
    MERGE (s:Sentence{id:row.text_sha256})
    SET s.text = row.text
    MERGE (a)-[:HAS_SENTENCE]->(s)
    WITH s, row.entities as entities
    UNWIND entities as entity
    MERGE (e:Entity{id:entity.entity_id})
    ON CREATE SET e.other_ids = entity.other_ids,
    e.name = entity.entity,
    e.type = entity.entity_type
    MERGE (s)-[m:MENTIONS]->(e)
    ON CREATE SET m.count = 1
    ON MATCH SET m.count = m.count + 1
    """, {'data': parsed_entities, 'title': title})

In [None]:
# Return the text of every sentence that mentions the entity named
# "autoimmune diseases".
neo4j_query("""
MATCH (e:Entity)<-[:MENTIONS]-(s:Sentence)
WHERE e.name = "autoimmune diseases"
RETURN s.text as result
""")

In [None]:
# Top three entity pairs by co-occurrence, i.e. by how often both entities are
# mentioned from the same node. The id(e1) < id(e2) filter keeps one ordering
# of each pair so (A, B) and (B, A) are not both counted.
neo4j_query("""
MATCH (e1:Entity)<-[:MENTIONS]-()-[:MENTIONS]->(e2:Entity)
WHERE id(e1) < id(e2)
RETURN e1.name as entity1, e2.name as entity2, count(*) as cooccurrence
ORDER BY cooccurrence
DESC LIMIT 3
""")

In [None]:
from transformers import AutoTokenizer
from zero_shot_re import RelTaggerModel, RelationExtractor
# Load the pretrained relation-tagging model by name.
model = RelTaggerModel.from_pretrained("fractalego/fewrel-zero-shot")
# Load the tokenizer by name (presumably the one this model expects - TODO confirm).
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
# Relation labels handed to the extractor; presumably the candidate labels it
# ranks for a head/tail pair - verify against the zero_shot_re documentation.
relations = ['associated', 'interacts']
# The extractor bundles model, tokenizer and labels; its rank() method is used
# later in the notebook.
extractor = RelationExtractor(model, tokenizer, relations)

In [None]:
import itertools

import warnings

# The top-ranked relation is kept only when its score exceeds this value.
REL_SCORE_THRESHOLD = 0.85

# Only sentences with at least two recognised entities can yield an entity pair.
candidates = [s for s in parsed_entities if (s.get('entities')) and (len(s['entities']) > 1)]
predicted_rels = []
for c in candidates:
    mentions = [{'name': x['entity'], 'id': x['entity_id']} for x in c['entities']]
    # Commas are stripped from the sentence before ranking; done once per sentence.
    text = c['text'].replace(",", "")
    for head, tail in itertools.combinations(mentions, 2):
        try:
            ranked_rels = extractor.rank(text=text, head=head['name'], tail=tail['name'])
            # ranked_rels[0] is read as (relation label, score).
            if ranked_rels[0][1] > REL_SCORE_THRESHOLD:
                predicted_rels.append({'head': head['id'], 'tail': tail['id'], 'type': ranked_rels[0][0], 'source': c['text_sha256']})
        except Exception as exc:
            # Still best effort: a failing pair is skipped, but it is reported
            # instead of being silently swallowed, and KeyboardInterrupt is no
            # longer caught.
            warnings.warn(f"Skipping pair ({head['name']!r}, {tail['name']!r}): {exc!r}")

In [None]:
# Store each predicted relation as an intermediate Relation node:
# (head entity)-[:REL]->(Relation {type})-[:REL]->(tail entity),
# and link the sentence it was extracted from to that Relation node via MENTIONS.
# Rows whose head, tail or sentence id match no existing node produce nothing.
neo4j_query("""
UNWIND $data as row
MATCH (source:Entity {id: row.head})
MATCH (target:Entity {id: row.tail})
MATCH (text:Sentence {id: row.source})
MERGE (source)-[:REL]->(r:Relation {type: row.type})-[:REL]->(target)
MERGE (text)-[:MENTIONS]->(r)
""", {'data': predicted_rels})

In [None]:
# List every stored relation with its source entity, target entity, relation
# type and the text of the sentence that mentions it.
neo4j_query("""
MATCH (s:Entity)-[:REL]->(r:Relation)-[:REL]->(t:Entity), (r)<-
[:MENTIONS]-(st:Sentence)
RETURN s.name as source_entity, t.name as target_entity, r.type as
type, st.text as source_text
""")

In [28]:
# Download the second article PDF. The timeout bounds the wait, and
# raise_for_status() stops here on a 4xx/5xx answer instead of handing an
# error page to pdf2image.
pdf = requests.get('https://arxiv.org/ftp/arxiv/papers/2311/2311.15662.pdf', timeout=60)
pdf.raise_for_status()
# Convert the downloaded PDF bytes into page images for the OCR step.
doc = pdf2image.convert_from_bytes(pdf.content)

In [31]:
# OCR only the pages with index 2..28, the same pages the previous filter
# (page_number > 1 and page_number < 29) kept. Slicing before OCR avoids
# running Tesseract on pages whose text would be discarded.
article = [pytesseract.image_to_string(page_data) for page_data in doc[2:29]]
article_txt = " ".join(article)
print(article_txt)

Abstract

The epidermis is a specialized epithelium that constitutes the outermost layer of the
skin, and it provides a protective barrier against environmental assaults. Primarily
consisting of multi-layered keratinocytes, the epidermis is continuously renewed by
proliferation of stem cells and the differentiation of their progeny, which undergo
terminal differentiation as they leave the basal layer and move upward toward the
surface, where they die and slough off. Basal keratinocytes rest on a basement
membrane at the dermal-epidermal junction that is composed of specific extracellular
matrix proteins organized into interactive and mechanically supportive networks. Firm
attachment of basal keratinocytes, and their dynamic regulation via focal adhesions
and hemidesmosomes, are essential for maintaining major skin processes, such as
self-renewal, barrier function, and resistance to physical and chemical stresses. The
adhesive integrin receptors expressed by epidermal cells serve struct

In [32]:
import nltk
nltk.download('punkt')
def clean_text(txt):
    """Filter ``txt`` row by row, dropping short rows and caption rows.

    A row survives only if splitting it on single spaces gives more than
    three pieces and it starts with neither "(a)" nor "Figure". Surviving
    rows are joined back together with newlines.
    """
    def _keep(row):
        # Length test first, then the two caption prefixes.
        return len(row.split(" ")) > 3 and not row.startswith(("(a)", "Figure"))

    return "\n".join(filter(_keep, txt.split("\n")))
# Keep everything after the first "Abstract" marker. maxsplit=1 matters:
# without it a later occurrence of the word would cut the body short.
txt = article_txt.split("Abstract", 1)[1]
ctext = clean_text(txt)
# Split the cleaned text into sentences with NLTK's sentence tokenizer.
sentences = nltk.tokenize.sent_tokenize(ctext)
print(sentences)

['The epidermis is a specialized epithelium that constitutes the outermost layer of the\nskin, and it provides a protective barrier against environmental assaults.', 'Primarily\nconsisting of multi-layered keratinocytes, the epidermis is continuously renewed by\nproliferation of stem cells and the differentiation of their progeny, which undergo\nterminal differentiation as they leave the basal layer and move upward toward the\nsurface, where they die and slough off.', 'Basal keratinocytes rest on a basement\nmembrane at the dermal-epidermal junction that is composed of specific extracellular\nmatrix proteins organized into interactive and mechanically supportive networks.', 'Firm\nattachment of basal keratinocytes, and their dynamic regulation via focal adhesions\nand hemidesmosomes, are essential for maintaining major skin processes, such as\nself-renewal, barrier function, and resistance to physical and chemical stresses.', 'The\nadhesive integrin receptors expressed by epidermal cel

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Olivier\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [36]:
import hashlib

def query_plain(text, url='http://bern2.korea.ac.kr/plain', timeout=60):
    """POST ``text`` to the BERN2 plain-text endpoint and return the parsed JSON.

    Args:
        text: Passage to annotate; coerced with ``str()`` before sending.
        url: Endpoint that receives the POST request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The JSON body of the response, decoded by ``requests``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On timeouts or connection failures.
    """
    response = requests.post(url, json={'text': str(text)}, timeout=timeout)
    # Surface HTTP errors here rather than as a confusing JSON decode failure.
    response.raise_for_status()
    return response.json()

# Query the annotation service once per sentence, skipping the final sentence;
# responses are stored in sentence order.
entity_list = [query_plain(sentence) for sentence in sentences[:-1]]
    
# Turn each raw annotation response into a record with the sentence text, its
# SHA-256 hex digest and, when annotations are present, one dict per entity.
parsed_entities = []
for response in entity_list:
    sentence_text = response['text']
    record = {
        'text': sentence_text,
        'text_sha256': hashlib.sha256(sentence_text.encode('utf-8')).hexdigest(),
    }
    annotations = response.get('annotations')
    if annotations:
        mentions = []
        for annotation in annotations:
            ids = annotation['id']
            bern_ids = [i for i in ids if i.startswith('BERN')]
            span = annotation['span']
            # The surface form is sliced out of the sentence using the span offsets.
            surface = sentence_text[span['begin']:span['end']]
            mentions.append({
                # Use the first BERN-prefixed id; fall back to the surface form.
                'entity_id': bern_ids[0] if bern_ids else surface,
                'other_ids': [i for i in ids if not i.startswith('BERN')],
                'entity_type': annotation['obj'],
                'entity': surface,
            })
        # 'entities' goes first so the key order matches the printed output.
        record = {'entities': mentions, **record}
    parsed_entities.append(record)
print(parsed_entities)

ConnectionError: ('Connection aborted.', ConnectionAbortedError(10053, 'Une connexion établie a été abandonnée par un logiciel de votre ordinateur hôte', None, 10053, None))