In [316]:
from Bio import Entrez, SeqIO
from pprint import pprint 
import re
Entrez.email = "lotmateohernandezespinosa@gmail.com"


In [317]:
def crear_termino(inpts):

    # Inicializamos la lista donde guardaremos los terminos, donde cada elementp corresponde a un organismo y sus genes correspondientes 
    terminos=[]

    # Hacemos un for para que el formateo se repita para cada organismo, los organismos esan separados por un ; por eso el split
    for inpt in inpts.split(';'):

        # Separamos los genes del organismo y metemos a este en la variable $termino
        inpt = inpt.split(":")
        termino = inpt[0] + "[Orgn] AND ("

        # Separamos cada gen para introducirlos de uno a uno en $termino con el formato adecuado
        i = True
        for gen in inpt[1].replace(" ","").split(','):
            if i:
                termino += gen+'[Gene]'
            else:
                termino += " OR " + gen+'[Gene]' 
            i = False

        termino += ")"

        # Metemos lo de termino al vector
        terminos += [termino]

    return(terminos)


# Se define la funcion para buscar en las bases de datos
def buscar_db(terminos):

    # Inicializamos la lista donde guardaremos los ids de las bases de datos segun el organismo, sera una lista de diccionarios de listas
    DbIDS = []

    # Hacemos este for para repetir todo por cada organismo
    for termino in terminos:

        # Buscamos los datos
        record = Entrez.read(Entrez.egquery(term = termino))

        # Inicializamos la lista que tendra las bases de datos en donde si hay informacion, Count > 1
        # Tambien el diccionario donde guardaremos la lista de IDs de cada base de datos que paso el criterio
        DictDbs = {}
        Dbs=[]

        # Checamos que bases de datos pasan nuestro criterio y las guardamos en $DBs
        for row in record["eGQueryResult"]:
            if (row["Count"] != "Error"):
                if(int(row["Count"]) > 1):
                    Dbs += [row["DbName"]]
        
        # Ahora para cada base de datos guardamos la lista con los IDS
        for Db in Dbs:
            DictDbs[Db] = Entrez.read(Entrez.esearch(db= Db, term=termino))["IdList"]
        
        # Este diccionario lo guardamos en la lista que iniciamos al inicio, esto para que se separen las bases de datos y IDS segun el organismo
        DbIDS+= [DictDbs]

    return(DbIDS)

In [318]:
terminos = crear_termino('Drosophila melanogaster: p53')
Dbs = buscar_db(terminos)

In [332]:
def articulos_tit(Dbs):
    Articulos = {}

    i = 0
    for Db in Dbs:

        Titus = []

        if ('pubmed' in list(Db.keys())):

            for ID in Db['pubmed']:
                Articulo = Entrez.read(Entrez.efetch(db='pubmed', id = ID))
                Titus += [Articulo['PubmedArticle'][0]['MedlineCitation']['Article']['ArticleTitle']]

        Articulos['Titulos'] = Titus
        Articulos['IDS'] = Db['pubmed']
    
    return(Articulos)

In [333]:
Articulos = articulos_tit(Dbs)

In [334]:
Articulos['Titulos']

['Regulation and coordination of the different DNA damage responses in <i>Drosophila</i>.',
 'p53: From Fundamental Biology to Clinical Applications in Cancer.',
 'Prediction of intestinal stem cell regulatory genes from Drosophila gut damage model created using multiple inducers: Differential gene expression-based protein-protein interaction network analysis.',
 'p53 Related Protein Kinase is Required for Arp2/3-Dependent Actin Dynamics of Hemocytes in <i>Drosophila melanogaster</i>.',
 'Effects of cadmium on oxidative stress and cell apoptosis in Drosophila melanogaster larvae.',
 'Enhanced germline stem cell longevity in Drosophila diapause.',
 '<i>Drosophila p53</i> isoforms have overlapping and distinct functions in germline genome integrity and oocyte quality control.',
 'NMNAT promotes glioma growth through regulating post-translational modifications of P53 to inhibit apoptosis.',
 'Toxic potential of botulinum toxin type A on senescence in a <i>Drosophila melanogaster</i> model

In [335]:
results = Entrez.read(Entrez.elink(dbfrom="pubmed", db="pubmed", id=Articulos['IDS'], LinkName='pubmed_pubmed_refs'))

In [336]:
results[0]

{'LinkSetDbHistory': [], 'LinkSetDb': [{'Link': [{'Id': '34824391'}, {'Id': '33855023'}, {'Id': '33154387'}, {'Id': '31864707'}, {'Id': '31425511'}, {'Id': '31391501'}, {'Id': '30904193'}, {'Id': '30892991'}, {'Id': '30824861'}, {'Id': '30735120'}, {'Id': '30462636'}, {'Id': '29028797'}, {'Id': '28622525'}, {'Id': '28306107'}, {'Id': '28218681'}, {'Id': '27584613'}, {'Id': '27332952'}, {'Id': '27075174'}, {'Id': '26679112'}, {'Id': '26573328'}, {'Id': '26324426'}, {'Id': '25959206'}, {'Id': '25941003'}, {'Id': '25924716'}, {'Id': '25312810'}, {'Id': '25211335'}, {'Id': '24683536'}, {'Id': '24675716'}, {'Id': '24618896'}, {'Id': '24556841'}, {'Id': '24551207'}, {'Id': '24380076'}, {'Id': '24349633'}, {'Id': '24286825'}, {'Id': '24284207'}, {'Id': '24240233'}, {'Id': '24214341'}, {'Id': '24003211'}, {'Id': '23184991'}, {'Id': '23144631'}, {'Id': '22666323'}, {'Id': '22467874'}, {'Id': '22413088'}, {'Id': '22294702'}, {'Id': '22056138'}, {'Id': '22036477'}, {'Id': '21965616'}, {'Id': '218

In [341]:
results[15]['LinkSetDb'][0]['Link']

IndexError: list index out of range

In [380]:
ArticuloRefs = []
i= 0
for result in results:
    TituRefs = {}
    Refs = []

    for link in result['LinkSetDb']:

        if len(link) > 0:

            for id in link['Link']:
                TituAbstraRef = {}

                try:
                    Articulo = Entrez.read(Entrez.efetch(db='pubmed', id = id['Id'], api_key = '2c4f67def001c89be6a3d681c1da87fc8409'))

                    TituAbstraRef['Id'] = id['Id']

                    if 'abstract' in list(Articulo['PubmedArticle'][0]['MedlineCitation']['Article'].keys()):
                        TituAbstraRef['Abstract'] = Articulo['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText'][0]
                    else:
                        TituAbstraRef['Abstract'] = 'No abstract'
                        
                    TituAbstraRef['Titulo'] = Articulo['PubmedArticle'][0]['MedlineCitation']['Article']['ArticleTitle']

                    Refs += [TituAbstraRef]
                except:
                    print(id)

    
    TituRefs['Titulo'] = Articulos['Titulos'][i]
    TituRefs['Referencias'] = Refs
    ArticuloRefs += [TituRefs]
    
    i+=1


{'Id': '22056138'}
{'Id': '29755668'}
{'Id': '26980051'}
{'Id': '20516128'}
{'Id': '19682467'}
{'Id': '12034830'}
{'Id': '18707322'}
{'Id': '17600215'}
{'Id': '22898807'}
{'Id': '10481009'}
{'Id': '23385724'}
{'Id': '25882045'}
{'Id': '25268165'}
{'Id': '24876915'}
{'Id': '29284660'}
{'Id': '19562034'}
{'Id': '22036477'}


In [379]:
Articulo = Entrez.read(Entrez.efetch(db='pubmed', id = 24349633))

In [369]:

'abstract' in list(Articulo['PubmedArticle'][0]['MedlineCitation']['Article'].keys())

False