## Import and Data Load

In [None]:
import pandas as pd
import numpy as np
import json
import requests
import time

## BERT - Pretrained models

In [None]:
# Load the positive dataset and keep only the rows that actually have an abstract.
with_pmc = pd.read_excel("DatasetPositivo.xlsx")

# read_excel turns empty cells into NaN rather than "", so both cases are tested;
# a NaN abstract would otherwise reach text_process() and fail on `.lower()`.
has_abstract = with_pmc.Abstract.notna() & (with_pmc.Abstract != "")

# .copy() detaches the filtered frame from the original one, so the column
# assignments made further down do not raise SettingWithCopyWarning.
with_pmc = with_pmc[has_abstract].copy()
with_pmc

In [None]:
# Plain Python lists of the raw titles and abstracts, in row order.
titles, abstracts = (with_pmc[column].tolist() for column in ('Title', 'Abstract'))

In [None]:
# PubMed identifiers as a NumPy array, aligned row-by-row with `titles`/`abstracts`.
id_list = with_pmc['PMID'].to_numpy()
#real_list = with_pmc['Target'].values

In [None]:
from unidecode import unidecode

def text_process(text, type_='list'):
    """Lower-case ``text`` and transliterate it to ASCII with ``unidecode``.

    Parameters
    ----------
    text : str
        Raw text to normalise.
    type_ : str, default 'list'
        When equal to ``'list'`` the normalised string is split on commas and
        the resulting list is returned; any other value returns the
        normalised string unchanged.

    Returns
    -------
    list[str] | str
        Comma-split pieces (``type_ == 'list'``) or the normalised string.
    """
    normalised = unidecode(text.lower())
    return normalised.split(',') if type_ == 'list' else normalised

# Normalise every abstract (lower-case, ASCII); 'notlist' keeps each one as a
# single string instead of splitting it on commas.
abstracts = [
    text_process(raw_abstract, type_='notlist')
    for raw_abstract in with_pmc['Abstract'].tolist()
]
with_pmc['Abstract_preproc'] = abstracts

In [None]:
import re

def compare_rows(text, textlist):
    """Tell whether ``text`` (or a simple variant of it) occurs in ``textlist``.

    The variants tried, in order, are:

    1. ``text`` itself and its naive plural ``text + 's'``;
    2. if ``text`` contains ``'('``: ``text`` with every ``(...)`` group
       removed, both as-is and with the whitespace left behind by the removal
       collapsed/stripped, each again with and without a trailing ``'s'``.

    Parameters
    ----------
    text : str
        Candidate name.
    textlist : container of str
        Anything supporting ``in`` (list, set, ...).

    Returns
    -------
    bool
        ``True`` if any variant is found, ``False`` otherwise.
    """
    if text in textlist or text + 's' in textlist:
        return True
    if '(' not in text:
        return False

    newt = re.sub(r'\([^)]*\)', '', text)
    # Removing "(...)" from "aspirin (asa)" leaves "aspirin " and from
    # "beta (b) lactam" leaves "beta  lactam"; without normalising that
    # whitespace the stripped name can never match its plain spelling.
    normalised = ' '.join(newt.split())
    return any(
        candidate in textlist or candidate + 's' in textlist
        for candidate in (newt, normalised)
    )

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer

# Fallback tokenizer, used when a model should not be loaded with its own tokenizer.
base_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Already-built NER pipelines keyed by (model_name, use_model_tokenizer).
# Building a pipeline loads the whole model, so doing it once per abstract
# (as a naive call-per-text would) dominates the run time.
# Note: every model used stays in memory until the kernel is restarted or
# this cell is re-run.
_ner_pipelines = {}

def hugging_model_process(model_name, tokenizer, text, offset=0):
    """Run a Hugging Face NER model on ``text`` and keep the chemical entities.

    Parameters
    ----------
    model_name : str
        Model identifier passed to ``transformers.pipeline``.
    tokenizer : bool-like
        Truthy -> load the tokenizer published under ``model_name``;
        falsy -> use the module-level ``base_tokenizer``.
    text : str
        Text to annotate.
    offset : int, default 0
        Value added to every entity's ``start``/``end`` character position.

    Returns
    -------
    list[dict]
        One ``{'text': <word>, 'location': {'start': int, 'end': int}}`` per
        entity whose ``entity_group`` (case-insensitive) is ``chemical``,
        ``bio`` or ``b``.
    """
    cache_key = (model_name, bool(tokenizer))
    ner_model = _ner_pipelines.get(cache_key)
    if ner_model is None:
        token = model_name if tokenizer else base_tokenizer
        ner_model = pipeline(task="ner", model=model_name, tokenizer=token)
        _ner_pipelines[cache_key] = ner_model

    model_res = ner_model(text, aggregation_strategy="first")

    array_res = []
    for dic in model_res:
        annot_type = dic['entity_group']
        # Presumably these are the labels the different models use for a
        # chemical mention -- TODO confirm against each model card.
        if annot_type.lower() in ['chemical', 'bio', 'b']:
            annot_loc = {'start': dic['start'] + offset, 'end': dic['end'] + offset}
            array_res.append({'text': dic['word'], 'location': annot_loc})
    return array_res

In [None]:
# Hugging Face Hub identifiers of the chemical NER models compared below.
# The sections that follow pick a model by its position in this list.
hf_models = [
    "raynardj/ner-chemical-bionlp-bc5cdr-pubmed",  # [0] Raynardj
    "alvaroalon2/biobert_chemical_ner",            # [1] AlvaroAlon
    "aitslab/biobert_huner_chemical_v1",           # [2] AitsLab
    "pruas/BENT-PubMedBERT-NER-Chemical",          # [3] Pruas
]

#### Raynardj

In [None]:
# Run the Raynardj model (with its own tokenizer) on every pre-processed
# abstract; entity positions are shifted by the length of the article title.
raylist = [
    {
        'id': id_list[idx],
        'chemical': hugging_model_process(hf_models[0], True, abstract, len(titles[idx])),
    }
    for idx, abstract in enumerate(abstracts)
]

In [None]:
# Lower-cased entity strings per article for the Raynardj model.
# NOTE(review): this model's aggregated words appear to come back with a
# leading space (hence the former `[1:]` slice) -- confirm. `.strip()` removes
# that space when present, but does not chop the first real character when a
# word has none.
ray_list = [
    [chem['text'].strip().lower() for chem in record['chemical']]
    for record in raylist
]

print(len(ray_list))

In [None]:
# Sanity check: number of per-article entity lists collected for Raynardj.
len(ray_list)

In [None]:
# Store the Raynardj entity lists as a new column (one list per row, in row order).
with_pmc['Raynardj'] = ray_list

#### AlonBioBert

In [None]:
# Run the AlvaroAlon BioBERT model (with its own tokenizer) on every
# pre-processed abstract; entity positions are shifted by the title length.
alonlist = [
    {
        'id': id_list[idx],
        'chemical': hugging_model_process(hf_models[1], True, abstract, len(titles[idx])),
    }
    for idx, abstract in enumerate(abstracts)
]

In [None]:
# Lower-cased entity strings per article for the AlvaroAlon model.
alon_list = [
    [chem['text'].lower() for chem in record['chemical']]
    for record in alonlist
]

print(len(alon_list))

In [None]:
# Sanity check: number of per-article entity lists collected for AlvaroAlon.
len(alon_list)

In [None]:
# Store the AlvaroAlon entity lists as a new column (one list per row, in row order).
with_pmc['AlvaroAlon'] = alon_list

#### AitsLab

In [None]:
# Run the AitsLab model on every pre-processed abstract. The `False` flag makes
# hugging_model_process use the shared `base_tokenizer` instead of a tokenizer
# loaded from the model repository.
aitslist = [
    {
        'id': id_list[idx],
        'chemical': hugging_model_process(hf_models[2], False, abstract, len(titles[idx])),
    }
    for idx, abstract in enumerate(abstracts)
]

In [None]:
# Lower-cased entity strings per article for the AitsLab model.
aits_list = [
    [chem['text'].lower() for chem in record['chemical']]
    for record in aitslist
]

print(len(aits_list))

In [None]:
# Store the AitsLab entity lists as a new column (one list per row, in row order).
with_pmc['AitsLab'] = aits_list

#### Pruas

In [None]:
# Smoke test: raw (unfiltered) Pruas output for the first abstract.
# The tokenizer is loaded from the model repository, the same choice the full
# Pruas run makes via hugging_model_process(hf_models[3], True, ...), so this
# preview is representative of that run.
# NOTE(review): previously this cell used `base_tokenizer` (bert-base-cased);
# revert if comparing tokenizers was the intent.
ner_model = pipeline(task="ner", model=hf_models[3], tokenizer=hf_models[3])
model_res = ner_model(abstracts[0], aggregation_strategy="first")
print(model_res)

In [None]:
# Run the Pruas model (with its own tokenizer) on every pre-processed abstract.
# The run is best-effort: an abstract that makes the model fail gets an empty
# entity list, but the failure is reported instead of being silently dropped.
pruaslist = []
for i, text in enumerate(abstracts):
    offset = len(titles[i])
    try:
        pruaschemres = hugging_model_process(hf_models[3], True, text, offset)
    except Exception as exc:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops this long loop.
        print(f"Pruas NER failed for PMID {id_list[i]}: {exc!r}")
        pruaschemres = []
    pruaslist.append({'id': id_list[i], 'chemical': pruaschemres})

In [None]:
# Inspect the first Pruas result record ({'id': ..., 'chemical': ...}).
pruaslist[0]

In [None]:
# Lower-cased entity strings per article for the Pruas model; a record whose
# 'chemical' entry is empty simply yields an empty list.
pruas_list = [
    [chem['text'].lower() for chem in record['chemical']]
    for record in pruaslist
]

print(len(pruas_list))

In [None]:
# Store the Pruas entity lists as a new column (one list per row, in row order).
with_pmc['Pruas'] = pruas_list

## Export

In [None]:
def reduce(l):
    """Return the sorted unique values of ``l`` as a NumPy array."""
    return np.unique(l)


# De-duplicate the entities found by each model, row by row.
for model_col in ('Raynardj', 'AlvaroAlon', 'AitsLab', 'Pruas'):
    with_pmc[model_col] = with_pmc[model_col].apply(reduce)

In [None]:
# Display the table after per-model de-duplication.
with_pmc

In [None]:
def process_response(response):
    """Clean a collection of entity strings and return the unique results.

    Each string goes through these substitutions, in order:

    1. commas and full stops are removed;
    2. single quotes are removed;
    3. a hyphen surrounded by whitespace becomes a bare ``-``;
    4. at most one whitespace character on each side of ``(`` is removed;
    5. at most one whitespace character on each side of ``)`` is removed;
    6. runs of two or more whitespace characters become a single space.

    Parameters
    ----------
    response : sequence of str
        Entity strings (list or array); must support ``len``.

    Returns
    -------
    numpy.ndarray
        Sorted unique cleaned strings, or an empty array for empty input.
    """
    if len(response) == 0:
        return np.array([])

    # (pattern, replacement) pairs; the order matters because later rules
    # operate on the output of earlier ones.
    substitutions = (
        (r",|\.", ''),
        (r"'", ''),
        (r"\s\-\s", '-'),
        (r"\s?\(\s?", '('),
        (r"\s?\)\s?", ')'),
        (r"\s{2,}", ' '),
    )

    cleaned = []
    for entity in response:
        for pattern, replacement in substitutions:
            entity = re.sub(pattern, replacement, entity)
        cleaned.append(entity)
    return np.unique(cleaned)

In [None]:
# Clean the entity strings produced by each model, row by row.
for model_col in ('Raynardj', 'AlvaroAlon', 'AitsLab', 'Pruas'):
    with_pmc[model_col] = with_pmc[model_col].apply(process_response)

In [None]:
# For every row, pool the cleaned entities of the four models into one array
# (duplicates kept: an entity found by k models appears k times).
all_res = []
for i in list(with_pmc.index):
    per_model = tuple(
        with_pmc[name][i].astype(list)
        for name in ('Raynardj', 'AlvaroAlon', 'AitsLab', 'Pruas')
    )
    all_res.append(np.concatenate(per_model))

# "1/4": entities reported by at least one model.
one_res = [np.unique(pooled) for pooled in all_res]
with_pmc['Combined words 1/4'] = one_res

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  with_pmc['Combined words 1/4'] = one_res


In [None]:
# Display the table with the 'Combined words 1/4' column added.
with_pmc

Unnamed: 0,PMC_ID,PMID,Title,Abstract,Target,Abstract_preproc,Raynardj,AlvaroAlon,AitsLab,Pruas,Combined words 1/4
1,4790266,27029317,Pipeline of Known Chemical Classes of Antibiot...,Many approaches are used to discover new antib...,"[fy-901, aminoglycoside]",many approaches are used to discover new antib...,"[lipopeptides, quinolones aminoglycosides macr...","[aminoglycosides, beta-lactams, lipoglycopepti...","[aminoglycosides, beta-lactams, macrolides, ox...","[aminoglycosides, beta, cyclic, lipoglycopepti...","[aminoglycosides, beta, beta-lactams, cyclic, ..."
31,3318344,22232283,In vitro potential of equine DEFA1 and eCATH1 ...,"Rhodococcus equi, the causal agent of rhodococ...","[ecath1, antimicrobial peptide]","rhodococcus equi, the causal agent of rhodococ...","[ecath1, rifampin]",[rifampin],[rifampin],[rifampin],"[ecath1, rifampin]"
34,4249419,25199778,Effective control of Salmonella infections by ...,We successfully produced two human β-defensins...,"[hbd-1, antimicrobial peptide]",we successfully produced two human b-defensins...,[],[],[],[],[]
35,4249419,25199778,Effective control of Salmonella infections by ...,We successfully produced two human β-defensins...,"[hbd-2, antimicrobial peptide]",we successfully produced two human b-defensins...,[],[],[],[],[]
36,4249444,25199777,In vitro potential of Lycosin-I as an alternat...,The resistance of multidrug-resistant Acinetob...,"[lycosin 1, antimicrobial peptide]",the resistance of multidrug-resistant acinetob...,"[ca(2+)), d-lycosin-i, l-, lycosin-i, salt]","[ca(2 +), l-and d-lycosin-i, lycosin-i, mg(2 +)]","[ca(2 +), l-and d-lycosin-i, lycosin-i, mg(2 +)]","[ca, l, lycosin, mg]","[ca, ca(2 +), ca(2+)), d-lycosin-i, l, l-, l-a..."
...,...,...,...,...,...,...,...,...,...,...,...
2716,3667366,23741637,Rhodomycin analogues from Streptomyces purpura...,During a screening program for bioactive natur...,"[rhodomycin, anthracycline]",during a screening program for bioactive natur...,"[a2-rhodomycin ii, aglycones, ethyl acetate, o...","[a2-rhodomycin ii, ethyl acetate, obelmycin, r...","[a2-rhodomycin ii, ethyl acetate, obelmycin, r...","[a2, ethyl, obelmycin, rhodomycin, sugars]","[a2, a2-rhodomycin ii, aglycones, ethyl, ethyl..."
2725,4102832,23931281,Salinomycin: a novel anti-cancer agent with kn...,"Salinomycin, traditionally used as an anti-coc...","[salinomycin, polyether antibiotic (ionophore)]","salinomycin, traditionally used as an anti-coc...",[salinomycin],[salinomycin],[salinomycin],[salinomycin],[salinomycin]
2746,10187937,37192172,Streptothricin F is a bactericidal antibiotic ...,The streptothricin natural product mixture (al...,"[streptothricin, streptothricin]",the streptothricin natural product mixture (al...,"[carbapenem-resistant, guanine, gulosamine, ly...","[-, - f, carbapenem, guanine, gulosamine, hydr...","[carbamoylated gulosamine, guanine, hydrogen, ...","[carbamoylated, carbapenem, guanine, hydrogen,...","[-, - f, carbamoylated, carbamoylated gulosami..."
2775,8068249,33917912,The Nonribosomal Peptide Valinomycin: From Dis...,Valinomycin is a nonribosomal peptide that was...,"[valinomycin, antimicrobial peptide (cyclodode...",valinomycin is a nonribosomal peptide that was...,[valinomycin],[valinomycin],[valinomycin],[valinomycin],[valinomycin]


In [None]:
# For every row: (unique entities, number of models that reported each one).
# The pooled arrays are rebuilt from the per-model columns rather than taken
# from the previous value of `all_res`, so re-running this cell gives the same
# result instead of applying np.unique to its own (entities, counts) tuples.
all_res = [
    np.unique(
        np.concatenate([
            with_pmc[name][i].astype(list)
            for name in ('Raynardj', 'AlvaroAlon', 'AitsLab', 'Pruas')
        ]),
        return_counts=True,
    )
    for i in list(with_pmc.index)
]
all_res

[(array(['aminoglycosides', 'beta', 'beta-lactams', 'cyclic',
         'lipoglycopeptides', 'lipopeptides', 'macrolides',
         'oxazolidinones', 'pleuromutilins', 'polymyxins', 'quinolones',
         'quinolones aminoglycosides macrolides oxazolidinones tetracyclines pleuromutilins beta-lactams lipoglycopeptides polymyxins',
         'r', 'tetracyclines'], dtype=object),
  array([3, 1, 2, 1, 2, 1, 3, 3, 3, 3, 3, 1, 1, 3])),
 (array(['ecath1', 'rifampin'], dtype=object), array([1, 4])),
 (array([], dtype=object), array([], dtype=int64)),
 (array([], dtype=object), array([], dtype=int64)),
 (array(['ca', 'ca(2 +)', 'ca(2+))', 'd-lycosin-i', 'l', 'l-',
         'l-and d-lycosin-i', 'lycosin', 'lycosin-i', 'mg', 'mg(2 +)',
         'salt'], dtype=object),
  array([1, 2, 1, 1, 1, 1, 2, 1, 3, 1, 2, 1])),
 (array(['calcium', 'carbohydrate', 'd', 'd-alanyl -', 'd-alanyl-l-alanine',
         'l-alanine', 'nacl'], dtype=object),
  array([4, 3, 1, 1, 2, 1, 4])),
 (array(['b', 'b-lactams', 'c'

In [None]:
# Keep, per row, the entities reported by at least 2, at least 3, and exactly 4
# of the models; each element of `all_res` is an (entities, counts) pair.
with_pmc['Combined words 2/4'] = [words[counts >= 2] for words, counts in all_res]
with_pmc['Combined words 3/4'] = [words[counts >= 3] for words, counts in all_res]
with_pmc['Combined words 4/4'] = [words[counts == 4] for words, counts in all_res]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  with_pmc['Combined words 2/4'] = [all_res[i][0][all_res[i][1] >= 2] for i in range(len(all_res))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  with_pmc['Combined words 3/4'] = [all_res[i][0][all_res[i][1] >= 3] for i in range(len(all_res))]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  with_pmc[

In [None]:
# Display the final table with all 'Combined words k/4' columns.
with_pmc

Unnamed: 0,PMC_ID,PMID,Title,Abstract,Target,Abstract_preproc,Raynardj,AlvaroAlon,AitsLab,Pruas,Combined words 1/4,Combined words 2/4,Combined words 3/4,Combined words 4/4
1,4790266,27029317,Pipeline of Known Chemical Classes of Antibiot...,Many approaches are used to discover new antib...,"[fy-901, aminoglycoside]",many approaches are used to discover new antib...,"[lipopeptides, quinolones aminoglycosides macr...","[aminoglycosides, beta-lactams, lipoglycopepti...","[aminoglycosides, beta-lactams, macrolides, ox...","[aminoglycosides, beta, cyclic, lipoglycopepti...","[aminoglycosides, beta, beta-lactams, cyclic, ...","[aminoglycosides, beta-lactams, lipoglycopepti...","[aminoglycosides, macrolides, oxazolidinones, ...",[]
31,3318344,22232283,In vitro potential of equine DEFA1 and eCATH1 ...,"Rhodococcus equi, the causal agent of rhodococ...","[ecath1, antimicrobial peptide]","rhodococcus equi, the causal agent of rhodococ...","[ecath1, rifampin]",[rifampin],[rifampin],[rifampin],"[ecath1, rifampin]",[rifampin],[rifampin],[rifampin]
34,4249419,25199778,Effective control of Salmonella infections by ...,We successfully produced two human β-defensins...,"[hbd-1, antimicrobial peptide]",we successfully produced two human b-defensins...,[],[],[],[],[],[],[],[]
35,4249419,25199778,Effective control of Salmonella infections by ...,We successfully produced two human β-defensins...,"[hbd-2, antimicrobial peptide]",we successfully produced two human b-defensins...,[],[],[],[],[],[],[],[]
36,4249444,25199777,In vitro potential of Lycosin-I as an alternat...,The resistance of multidrug-resistant Acinetob...,"[lycosin 1, antimicrobial peptide]",the resistance of multidrug-resistant acinetob...,"[ca(2+)), d-lycosin-i, l-, lycosin-i, salt]","[ca(2 +), l-and d-lycosin-i, lycosin-i, mg(2 +)]","[ca(2 +), l-and d-lycosin-i, lycosin-i, mg(2 +)]","[ca, l, lycosin, mg]","[ca, ca(2 +), ca(2+)), d-lycosin-i, l, l-, l-a...","[ca(2 +), l-and d-lycosin-i, lycosin-i, mg(2 +)]",[lycosin-i],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2716,3667366,23741637,Rhodomycin analogues from Streptomyces purpura...,During a screening program for bioactive natur...,"[rhodomycin, anthracycline]",during a screening program for bioactive natur...,"[a2-rhodomycin ii, aglycones, ethyl acetate, o...","[a2-rhodomycin ii, ethyl acetate, obelmycin, r...","[a2-rhodomycin ii, ethyl acetate, obelmycin, r...","[a2, ethyl, obelmycin, rhodomycin, sugars]","[a2, a2-rhodomycin ii, aglycones, ethyl, ethyl...","[a2-rhodomycin ii, ethyl acetate, obelmycin, r...","[a2-rhodomycin ii, ethyl acetate, obelmycin, r...","[obelmycin, rhodomycin, sugars]"
2725,4102832,23931281,Salinomycin: a novel anti-cancer agent with kn...,"Salinomycin, traditionally used as an anti-coc...","[salinomycin, polyether antibiotic (ionophore)]","salinomycin, traditionally used as an anti-coc...",[salinomycin],[salinomycin],[salinomycin],[salinomycin],[salinomycin],[salinomycin],[salinomycin],[salinomycin]
2746,10187937,37192172,Streptothricin F is a bactericidal antibiotic ...,The streptothricin natural product mixture (al...,"[streptothricin, streptothricin]",the streptothricin natural product mixture (al...,"[carbapenem-resistant, guanine, gulosamine, ly...","[-, - f, carbapenem, guanine, gulosamine, hydr...","[carbamoylated gulosamine, guanine, hydrogen, ...","[carbamoylated, carbapenem, guanine, hydrogen,...","[-, - f, carbamoylated, carbamoylated gulosami...","[carbapenem, guanine, gulosamine, hydrogen, ly...","[guanine, hydrogen, lysine, lysines, nourseoth...","[guanine, streptothricin]"
2775,8068249,33917912,The Nonribosomal Peptide Valinomycin: From Dis...,Valinomycin is a nonribosomal peptide that was...,"[valinomycin, antimicrobial peptide (cyclodode...",valinomycin is a nonribosomal peptide that was...,[valinomycin],[valinomycin],[valinomycin],[valinomycin],[valinomycin],[valinomycin],[valinomycin],[valinomycin]


In [None]:
# Persist the enriched table without the DataFrame index.
# NOTE(review): this writes to the same workbook the notebook loads at the top,
# so a later Restart & Run All starts from already-processed data -- consider
# writing to a separate output file.
# NOTE(review): the entity columns hold NumPy arrays at this point; presumably
# they end up in the sheet as their string representation -- confirm that is
# what the downstream reader expects.
with_pmc.to_excel('DatasetPositivo.xlsx', index=False)