This notebook prototypes how OpenIE6 will be used to annotate raw text.

For a fuller explanation, see the references on Notion (TODO: the link is missing here):

In [8]:
# Download data in the form of pubmed files and convert it to text.
# The work is delegated to the project-local `lit_data.pdf_to_text` module;
# 'test' is the query item handed to it.
# NOTE(review): presumably this writes the .txt files that later cells read
# from the `text/` directory -- confirm against pdf_to_text.
from lit_data import pdf_to_text

pdf_to_text.process_query_item('test') 


# By now we should have text

test


In [7]:
# Turn the extracted article text into one sentence per line.
#
# NOTE: this is a deliberately naive splitter that cuts on every ".", so
# abbreviations and decimal numbers are split as well.

with open('text/nihms-1536901.txt', 'r') as f:
    raw_text = f.read()

# Replace line breaks with a space instead of deleting them, so the last word
# of one line is not fused with the first word of the next line.
flat_text = raw_text.replace("\n", " ")

# Split on periods, trim surrounding whitespace and drop empty fragments
# (e.g. the one after the final period), which would otherwise be written out
# as a line containing only ".".
sentences = [fragment.strip() for fragment in flat_text.split(".")]
sentences = [sentence for sentence in sentences if sentence]

with open('text/nihms-1536901-sentences.txt', 'w') as f2:
    for sentence in sentences:
        # Re-attach the period that split(".") consumed.
        f2.write(sentence + '.' + '\n')

    

In [13]:
%load_ext autoreload
%autoreload 2
# Preprocess the data

# Imports for preprocessing

import os
from PyPDF2 import PdfReader
import PyPDF2
import requests
from bs4 import BeautifulSoup
from scholarly import scholarly
from scholarly import ProxyGenerator
import pandas as pd
import chunk
import boto3
import nltk
from collections import defaultdict
import numpy
from numpy import real

from io import StringIO # python3; python2: BytesIO 
import boto3
import json
import gzip
import multiprocessing
import nltk
from nltk.corpus import stopwords
import random
import string
import spacy
from celery import Celery
from celery.utils.log import get_task_logger
import traceback

# Two spaCy pipelines are loaded once and shared by the helper functions:
# find_key_words reads named entities (doc.ents) from `nlp` and noun chunks
# and lemmas from `nlp2`.
# NOTE(review): "en_core_sci_lg" looks like a scispaCy biomedical model that
# has to be installed separately from spaCy itself -- confirm.
nlp = spacy.load("en_core_sci_lg")
nlp2 = spacy.load("en_core_web_sm")

In [14]:
# Helper functions


def approve(i):
    """Decide whether sentence ``i`` is worth keeping for extraction.

    A sentence is rejected when any whitespace-separated word is exactly one
    of a few citation tokens, or contains a URL / licence / reference
    fragment. It is also rejected when it is shorter than 25 characters, has
    more than 6 commas, or has more than 2 periods.

    Args:
        i: The sentence text.

    Returns:
        True if the sentence passes every filter, False otherwise.
    """
    # Words that disqualify the sentence only on an exact match.
    rejected_words = ("Vol.", "Journal", "2019", "2018", "2020")
    # Fragments that disqualify the sentence when found anywhere in a word.
    rejected_fragments = (
        "www", ".com", ".edu", "License", "doi", "ncbi", "Vol.",
    )

    for token in i.split():
        if token in rejected_words:
            return False
        if any(fragment in token for fragment in rejected_fragments):
            return False

    long_enough = len(i) >= 25
    few_commas = i.count(",") <= 6
    few_periods = i.count(".") <= 2
    return long_enough and few_commas and few_periods


def retrieve_metadata(content, from_path = None):
    """Split ``content`` into sentences and tabulate the ones worth extracting from.

    The text is tokenized with ``nltk.sent_tokenize`` (NLTK's recommended
    sentence tokenizer). Every sentence accepted by ``approve`` becomes one
    row of the result.

    If an accepted sentence starts with a short label followed by a colon
    (fewer than 20 characters before the first ":"), the label is removed from
    the sentence and remembered as the current tag. The tag is sticky: it is
    also recorded for the following sentences until another label replaces it.
    NOTE(review): presumably these labels are section headings such as
    "Methods:" -- confirm.

    Only the leading label is removed, and the whitespace that followed the
    colon is trimmed so the stored sentence starts with its first real
    character.

    Args:
        content: A string containing the text to be parsed.
        from_path: Optional value stored unchanged in the "path:String" column
            of every row (for pubmed publications, a pointer to where the
            plaintext of the article is).

    Returns:
        A pandas DataFrame with the columns "context:String" (previous
        sentence, the sentence itself and next sentence joined by spaces;
        the neighbours are taken as tokenized, whether or not they were
        approved), "sentence:String", "tag:String" and "path:String". The
        columns are present even when no sentence was accepted.
    """
    columns = ["context:String", "sentence:String", "tag:String", "path:String"]
    max_label_length = 20

    sent_text = nltk.sent_tokenize(content)
    last_index = len(sent_text) - 1

    tag = ""
    records = []

    for count, sentence in enumerate(sent_text):
        if not approve(sentence):
            continue

        # Detect a short "Label:" prefix and strip only that leading
        # occurrence; later colons in the sentence are left untouched.
        label, colon, remainder = sentence.partition(":")
        if colon and len(label) < max_label_length:
            tag = label
            sentence = remainder.lstrip()

        context = []
        if count > 0:
            context.append(sent_text[count - 1])
        context.append(sentence)
        if count < last_index:
            context.append(sent_text[count + 1])

        records.append({
            "context:String": " ".join(context),
            "sentence:String": sentence,
            "tag:String": tag,
            "path:String": from_path,
        })

    # Passing `columns` keeps the schema stable even for an empty result.
    return pd.DataFrame(records, columns=columns)

def find_key_words(dictionary):
    """Reduce the two arguments of an extraction to a key word or phrase each.

    For each of "arg1:String" and "arg2:String" the replacement is chosen as
    follows:

    1. Default: the argument lower-cased.
    2. If an entity found by ``nlp`` in the sentence is a substring of the
       argument, the entity text is used (the last matching entity wins).
    3. Otherwise, if the root of a noun chunk found by ``nlp2`` is a substring
       of the argument, the lemma of that root is used (the last matching
       chunk wins).

    If the first key word begins with the same character as the sentence and
    its second character is lower-case, it is lower-cased.
    NOTE(review): presumably this undoes sentence-initial capitalisation --
    confirm.

    Args:
        dictionary: A mapping (URANUS_redone passes a DataFrame row) with the
            string entries "sentence:String", "arg1:String" and "arg2:String".

    Returns:
        ``(True, key1, key2)`` when the pair is usable, or
        ``(False, None, None)`` when both key words are identical or either
        one is an English stop word.
    """
    sentence = dictionary["sentence:String"]
    arg1 = dictionary["arg1:String"]
    arg2 = dictionary["arg2:String"]

    real_arg1 = arg1.lower()
    real_arg2 = arg2.lower()
    found_first = False
    found_second = False

    # Entities are preferred; a later entity overwrites an earlier one.
    for ent in nlp(sentence).ents:
        if ent.text in arg1:
            real_arg1 = ent.text
            found_first = True
        if ent.text in arg2:
            real_arg2 = ent.text
            found_second = True

    # Fall back to noun-chunk roots only for arguments without an entity.
    if not (found_first and found_second):
        for noun_chunk in nlp2(sentence).noun_chunks:
            root_text = noun_chunk.root.text
            if not found_first and root_text in arg1:
                real_arg1 = nlp2(root_text)[0].lemma_
            if not found_second and root_text in arg2:
                real_arg2 = nlp2(root_text)[0].lemma_

    # Explicit length checks replace the former bare `except: pass`, which was
    # only needed to absorb the IndexError on empty or one-character strings.
    if (
        len(real_arg1) > 1
        and sentence
        and real_arg1[0] == sentence[0]
        and real_arg1[1].islower()
    ):
        real_arg1 = real_arg1.lower()

    # Build the stop-word collection once per call and use set membership.
    english_stopwords = set(stopwords.words('english'))
    if (
        real_arg1 == real_arg2
        or real_arg1.lower() in english_stopwords
        or real_arg2.lower() in english_stopwords
    ):
        return False, None, None
    return True, real_arg1, real_arg2

def return_random_string(N=20):
    """Return a random string of ``N`` upper-case ASCII letters and digits.

    Characters are drawn with ``random.SystemRandom``.

    Args:
        N: Number of characters to generate (default 20).

    Returns:
        A string of length ``N``; empty when ``N`` is 0.
    """
    alphabet = string.ascii_uppercase + string.digits
    picked = []
    for _ in range(N):
        picked.append(random.SystemRandom().choice(alphabet))
    return ''.join(picked)


# This formats data into data frames necessary for functioning
def URANUS_redone(results):
    """Turn extraction rows into a node table and an edge table.

    Every row of ``results`` is passed to ``find_key_words``. Rows it rejects
    are skipped. For every accepted row, two node entries are recorded (one
    per key word, id ``"LIT=" + key word``) and one edge entry is created from
    the row's own fields plus "~from", "~to" and a "~id" of the form
    ``<random string>==LIT=<key1>_TO_LIT=<key2>``.

    Nodes are not de-duplicated: a key word that occurs in several rows
    appears several times in the node table.

    Rows are collected in plain lists and the frames are built once at the
    end, which avoids ``DataFrame.append`` (removed in pandas 2.0, and
    quadratic when called in a loop).

    Args:
        results: DataFrame whose rows provide at least the "sentence:String",
            "arg1:String" and "arg2:String" entries read by find_key_words.

    Returns:
        ``(nodes_all, df_edges)``: ``nodes_all`` has the columns "~id" and
        "name:String"; ``df_edges`` has the columns of ``results`` followed by
        "~from", "~to" and "~id". ``df_edges`` is a completely empty DataFrame
        when no row was accepted.
    """
    node_ids = []
    node_names = []
    edge_records = []

    for _, row in results.iterrows():
        keep, real_arg1, real_arg2 = find_key_words(row)
        if not keep:
            continue

        source_name = str(real_arg1)
        target_name = str(real_arg2)
        source_id = "LIT=" + source_name
        target_id = "LIT=" + target_name

        node_ids.extend([source_id, target_id])
        node_names.extend([source_name, target_name])

        # Copy the row into a fresh dict so the edge fields are never written
        # back into the row object.
        edge = row.to_dict()
        edge["~from"] = source_id
        edge["~to"] = target_id
        edge["~id"] = return_random_string() + "==" + source_id + "_TO_" + target_id
        edge_records.append(edge)

    nodes_all = pd.DataFrame({"~id": node_ids, "name:String": node_names})
    df_edges = pd.DataFrame(edge_records)
    return nodes_all, df_edges


In [19]:
# Process every plain-text article in `input_dir`: select sentences, run the
# extraction step, and write one node CSV and one edge CSV per article.
input_dir = "text"

# NOTE(review): `output_dir` was used below without being defined anywhere in
# this notebook, so every attempt to save raised a NameError that the `except`
# swallowed. "output" is a placeholder default -- adjust as needed.
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): the wildcard import supplies kill_bill_and_get_extractions. If
# REL_HELPERS exports names that are also defined in this notebook, it will
# shadow them -- consider importing the needed names explicitly.
from lit_data.REL_HELPERS import *

for filename in os.listdir(input_dir):
    # The output names are derived by dropping the ".txt" suffix, so skip
    # anything else found in the directory.
    stem, extension = os.path.splitext(filename)
    if extension.lower() != ".txt":
        continue

    total_nodes = pd.DataFrame()
    total_edges = pd.DataFrame()
    try:
        filepath = os.path.join(input_dir, filename)
        with open(filepath, 'r') as file:
            # Join the lines with a space so words on either side of a line
            # break are not fused before sentence tokenization.
            data_lines = ' '.join(line.strip() for line in file)

        # TODO: pass the paper's DOI as `from_path` once a DOI lookup exists.
        df = retrieve_metadata(data_lines)
        print('metadata retrieved')
        # Here is the step for calling the extraction functions
        results = kill_bill_and_get_extractions(df)
        print("kill bill done")

        # This is to format the results
        temp_node_csv, temp_edge_csv = URANUS_redone(results)
        temp_node_csv["~label"] = "Scientific Literature"
        temp_edge_csv["~label"] = "Scientific Literature"

        # The frames are per file, so no accumulation is needed; this also
        # avoids DataFrame.append, which was removed in pandas 2.0.
        total_nodes = temp_node_csv.reset_index(drop=True)
        total_edges = temp_edge_csv.reset_index(drop=True)
        print(total_nodes.shape)
        print(total_edges.shape)

        if total_edges.shape[0] > 0: #remove after testing
            # Save processed data to file in output_dir
            output_filename = f"{stem}_processed.csv"
            output_filepath_nodes = os.path.join(output_dir, f"df_nodes_{output_filename}")
            output_filepath_edges = os.path.join(output_dir, f"df_edges_{output_filename}")
            total_nodes.to_csv(output_filepath_nodes, index=False)
            total_edges.to_csv(output_filepath_edges, index=False)
    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"Error processing {filename}: {e}")
        tb_str = traceback.format_exc()
        print(tb_str)


  0%|          | 0/326 [00:00<?, ?it/s]

  0%|          | 0/326 [00:00<?, ?it/s]

metadata retrieved
Kill bill called
Extractor called
not enough values to unpack (expected 2, got 0)
kill bill done
(0, 3)
(0, 1)





In [None]:
# TODO: run OpenIE6 (not implemented in this notebook yet)