In [97]:
import gzip
import io
import json
from tqdm import tqdm
import argparse
import re
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import numpy as np
import math

device = torch.device(4)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if __name__=='__main__':
    print('Using device:', DEVICE)
    
folder = '/shared/data3/yanzhen4/MAPLE_CS/environments/pdf_parser/testFiles'
with open(f'{folder}/Agarwal_et_al._-_2015_-_Chemoenzymatic_Synthesis_of_Acyl_Coenzyme_A_Substr.json') as fin:
    contents = fin.read()

    # Strip any leading/trailing whitespace
    contents = contents.strip()

    # Parse the JSON data
    data = json.loads(contents)
    
    content = data['content']

Using device: cuda


In [93]:
from transformers import AutoTokenizer, AutoModel
mpnet_tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
mpnet_model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2', output_hidden_states = True).to(device)

In [94]:
def mpnet_emb(text):
    input_ids = torch.tensor(mpnet_tokenizer.encode(text.lower(), max_length=512, truncation=True)).unsqueeze(0).to(device)
    input_ids = input_ids[:, :512]
    outputs = mpnet_model(input_ids)
    hidden_states = outputs[2][-1][0]
    emb = torch.mean(hidden_states, dim=0).to(device)
    
    emb = np.array([float(x) for x in emb])
    magnitude = np.linalg.norm(emb)
    if math.isnan(magnitude):
        emb = np.zeros(len(emb))
    else:
        emb = emb / magnitude
            
    return emb

def similarity(fragment1, fragment2):
    # convert sequence to id
    fragment1_emb = mpnet_emb(fragment1)
    fragment2_emb = mpnet_emb(fragment2)
    
    cosine_similarity = np.dot(fragment1_emb, fragment2_emb)
    
    return cosine_similarity

def isFigureDiscription(text):
    
    text_lower = text.lower()
    tokens = text_lower.split(' ')
    if tokens[0] == 'scheme' or tokens[0] == 'figure' :
        if len(tokens) < 15:
            return True
    
    return False

In [98]:
for idx, paragraph in enumerate(content):
    print(idx, len(paragraph), paragraph)
    print()

0 111 Chemoenzymatic Synthesis of Acyl Coenzyme A Substrates Enables in Situ Labeling of Small Molecules and Proteins

1 492 Vinayak Agarwal,†,⊥ Stefan Diethelm,‡,⊥ Lauren Ray,‡,⊥ Neha Garg,§ Takayoshi Awakawa,‡,∥ Pieter C. Dorrestein,§ and Bradley S. Moore*,†,‡,§ †Center for Oceans and Human Health and ‡Center for Marine Biotechnology and Biomedicine, Scripps Institution of Oceanography, §Skaggs School of Pharmacy and Pharmaceutical Sciences, University of California San Diego, La Jolla, California 92093, United States ∥Graduate School of Pharmaceutical Sciences, The University of Tokyo, Tokyo 113-0033, Japan

2 25 *S Supporting Information

3 537 ABSTRACT: A chemoenzymatic approach to generate fully functional acyl coenzyme A molecules that are then used as substrates to drive in situ acyl transfer reactions is described. Mass spectrometry based assays to verify the identity of acyl coenzyme A enzymatic products are also illustrated. The approach is responsive to a diverse array of c

In [88]:
def clean_paragraphs(paragraphs):
    
    cleaned_paragraphs = []

    curr_paragraph = ''
    curr_paragraph_idx = -1

    decay_rate = 0.99

    for idx, paragraph in enumerate(content):

        if isFigureDiscription(paragraph):
            continue

        distance = idx - curr_paragraph_idx
        score = similarity(curr_paragraph, paragraph) * (decay_rate ** distance)

        if score > 0.3 or curr_paragraph_idx == -1:
            cleaned_paragraphs.append(paragraph)
            curr_paragraph = paragraph
            curr_paragraph_idx = idx
    
    return cleaned_paragraphs

cleaned_paragraphs = clean_paragraphs(contents)

for idx, paragraph in enumerate(cleaned_paragraphs):
    print(idx, paragraph)
    print()

0 Chemoenzymatic Synthesis of Acyl Coenzyme A Substrates Enables in Situ Labeling of Small Molecules and Proteins

1 ABSTRACT: A chemoenzymatic approach to generate fully functional acyl coenzyme A molecules that are then used as substrates to drive in situ acyl transfer reactions is described. Mass spectrometry based assays to verify the identity of acyl coenzyme A enzymatic products are also illustrated. The approach is responsive to a diverse array of carboxylic acids that can be elaborated to their corresponding coenzyme A thioesters, with potential applications in wide-ranging chemical biology studies that utilize acyl coenzyme A substrates.

2 In biological chemistry, coenzyme A (CoA, 1, Scheme 1) actsas a molecular shuttle for carboxylic acids linked to its

3 terminal thiol. S-Acylated derivatives of 1 (acyl-CoAs, Scheme 1) participate in numerous enzymatic reactions, including primary energy metabolism, synthesis of biomolecules, post-translational modification of proteins, an

In [87]:
def concat_paragraphs(paragraphs):    
    complete_paragraphs = []
    curr_idx = 0
    paragraph = ''

    for idx, current_segment in enumerate(paragraphs):

        terminals = ['.', '?', '!']

        if (current_segment[0].isupper() and current_segment[-1] in terminals) and paragraph != '': #clean up trash
            paragraph = ''
            complete_paragraphs.append(current_segment)
        elif (current_segment[0].isupper() and current_segment[-1] in terminals) or idx == 0 or idx == len(cleaned_paragraphs) - 1: #Complete a paragraph
            complete_paragraphs.append(current_segment)
        elif current_segment[0].isupper() and current_segment[-1] not in terminals: #Start of a paragraph
            paragraph = current_segment
        elif current_segment[0].isupper() == False and current_segment[-1] not in terminals: #Middle of a paragraph
            paragraph += current_segment
        elif current_segment[0].isupper() == False and current_segment[-1] in terminals: #End of a paragraph
            paragraph += current_segment
            complete_paragraphs.append(paragraph)
            paragraph = ''
    
    return complete_paragraphs
    
complete_paragraphs = concat_paragraphs(cleaned_paragraphs)

for idx, paragraph in enumerate(complete_paragraphs):
    print(idx, paragraph)
    print()

0 Chemoenzymatic Synthesis of Acyl Coenzyme A Substrates Enables in Situ Labeling of Small Molecules and Proteins

1 ABSTRACT: A chemoenzymatic approach to generate fully functional acyl coenzyme A molecules that are then used as substrates to drive in situ acyl transfer reactions is described. Mass spectrometry based assays to verify the identity of acyl coenzyme A enzymatic products are also illustrated. The approach is responsive to a diverse array of carboxylic acids that can be elaborated to their corresponding coenzyme A thioesters, with potential applications in wide-ranging chemical biology studies that utilize acyl coenzyme A substrates.

2 In biological chemistry, coenzyme A (CoA, 1, Scheme 1) actsas a molecular shuttle for carboxylic acids linked to itsterminal thiol. S-Acylated derivatives of 1 (acyl-CoAs, Scheme 1) participate in numerous enzymatic reactions, including primary energy metabolism, synthesis of biomolecules, post-translational modification of proteins, and ot