In [191]:
from Bio import Entrez, SeqIO
from pprint import pprint 
import re
Entrez.email = "lotmateohernandezespinosa@gmail.com"


In [8]:
def crear_termino(inpts):
    """Build one Entrez search term per organism from a compact query string.

    The expected input looks like ``"organism: gene1, gene2; organism2: gene3"``:
    organisms are separated by ``;``, each organism is separated from its genes
    by ``:``, and genes are separated by ``,``.

    Args:
        inpts (str): query description in the format above.

    Returns:
        list[str]: one term per organism, of the form
        ``"<organism>[Orgn] AND (<gene>[Gene] OR <gene>[Gene] ...)"``.

    Raises:
        IndexError: if a non-blank segment contains no ``:`` separator.
    """
    terminos = []

    for segmento in inpts.split(';'):
        # A trailing or doubled ';' produces a blank segment; skip it rather
        # than failing on the missing gene list.
        if not segmento.strip():
            continue

        partes = segmento.split(":")

        # Strip the organism so "a: x; b: y" does not yield " b[Orgn]".
        organismo = partes[0].strip()

        # All spaces are removed from the gene list; empty entries (for
        # example from a trailing ',') are dropped so no bare "[Gene]" is emitted.
        genes = [gen for gen in partes[1].replace(" ", "").split(',') if gen]

        clausula = " OR ".join(gen + '[Gene]' for gen in genes)
        terminos.append(organismo + "[Orgn] AND (" + clausula + ")")

    return terminos


# Look up, for every search term, which Entrez databases have hits and their IDs
def buscar_db(terminos):
    """Query Entrez for each term and collect the matching IDs per database.

    Args:
        terminos: iterable of Entrez search terms, one per organism.

    Returns:
        list[dict]: one dictionary per term, in input order, mapping a database
        name to the ``IdList`` returned by ``Entrez.esearch`` for that term.
    """
    ids_por_organismo = []

    for consulta in terminos:
        # Global query: reports a hit count for every Entrez database.
        resumen = Entrez.read(Entrez.egquery(term=consulta))

        # Keep databases whose count is numeric and greater than 1.
        # NOTE(review): this excludes databases with exactly one hit - confirm intended.
        bases_con_datos = [
            fila["DbName"]
            for fila in resumen["eGQueryResult"]
            if fila["Count"] != "Error" and int(fila["Count"]) > 1
        ]

        # One search per retained database, keyed by database name.
        ids_por_base = {
            base: Entrez.read(Entrez.esearch(db=base, term=consulta))["IdList"]
            for base in bases_con_datos
        }

        ids_por_organismo.append(ids_por_base)

    return ids_por_organismo

In [144]:
# Build the search term for a single organism/gene pair, then query Entrez for
# the databases and IDs that match it (buscar_db performs network requests).
terminos = crear_termino('Drosophila melanogaster: p53')
Dbs = buscar_db(terminos)

In [213]:
def _titulo_medline(texto):
    """Return the title (``TI`` field) of a MEDLINE-formatted record, or None if absent."""
    partes = []
    for linea in texto.split('\n'):
        if partes:
            # A wrapped field continues on indented lines; any other line
            # starts the next field, so the title is complete.
            if linea[:1].isspace() and linea.strip():
                partes.append(linea.strip())
            else:
                break
        elif re.match(r"TI\s*-", linea):
            # Split on the first hyphen only so hyphens inside the title survive.
            partes.append(linea.split('-', 1)[1].strip())

    if not partes:
        return None
    return ' '.join(parte for parte in partes if parte)


def articulos_tit(Dbs):
    """Fetch article titles for the PubMed and PMC IDs found for each organism.

    Args:
        Dbs: list of dictionaries (one per organism) mapping a database name to
            a list of IDs, as returned by ``buscar_db``. Only the ``'pubmed'``
            and ``'pmc'`` entries are used; either may be missing.

    Returns:
        dict: ``{'Titulos': [...], 'IDS': [...]}`` where ``'Titulos'`` holds the
        titles of the PubMed articles followed by the PMC articles of each
        organism, and ``'IDS'`` holds the PubMed IDs only. Results are
        accumulated over all organisms instead of keeping only the last one.
        PMC records without a ``TI`` field contribute no title.
    """
    titulos = []
    ids_pubmed = []

    for Db in Dbs:
        # .get() avoids a KeyError when an organism has no 'pubmed' hits.
        pubmed_ids = Db.get('pubmed', [])
        ids_pubmed.extend(pubmed_ids)

        for ID in pubmed_ids:
            handle = Entrez.efetch(db='pubmed', id=ID)
            try:
                Articulo = Entrez.read(handle)
            finally:
                handle.close()
            titulos.append(
                Articulo['PubmedArticle'][0]['MedlineCitation']['Article']['ArticleTitle']
            )

        for ID in Db.get('pmc', []):
            # 'retmode' is the EFetch parameter name; the original 'retype'
            # was a typo and was not a recognised parameter.
            handle = Entrez.efetch(db='pmc', id=ID, rettype='medline', retmode='text')
            try:
                texto = handle.read()
            finally:
                handle.close()

            titulo = _titulo_medline(texto)
            if titulo is not None:
                titulos.append(titulo)

    return {'Titulos': titulos, 'IDS': ids_pubmed}

In [214]:
# Fetch the article titles (and keep the PubMed IDs) for the hits stored in Dbs.
Articulos = articulos_tit(Dbs)

In [217]:
# Display the list of fetched titles.
Articulos['Titulos']

['Regulation and coordination of the different DNA damage responses in <i>Drosophila</i>.',
 'p53: From Fundamental Biology to Clinical Applications in Cancer.',
 'Prediction of intestinal stem cell regulatory genes from Drosophila gut damage model created using multiple inducers: Differential gene expression-based protein-protein interaction network analysis.',
 'p53 Related Protein Kinase is Required for Arp2/3-Dependent Actin Dynamics of Hemocytes in <i>Drosophila melanogaster</i>.',
 'Effects of cadmium on oxidative stress and cell apoptosis in Drosophila melanogaster larvae.',
 'Enhanced germline stem cell longevity in Drosophila diapause.',
 '<i>Drosophila p53</i> isoforms have overlapping and distinct functions in germline genome integrity and oocyte quality control.',
 'NMNAT promotes glioma growth through regulating post-translational modifications of P53 to inhibit apoptosis.',
 'Toxic potential of botulinum toxin type A on senescence in a <i>Drosophila melanogaster</i> model

In [233]:
# Ask ELink for the PubMed records linked to the collected PubMed IDs under the
# 'pubmed_pubmed_refs' link name, and parse the response.
results = Entrez.read(Entrez.elink(dbfrom="pubmed", db="pubmed", id=Articulos['IDS'], LinkName='pubmed_pubmed_refs'))


In [226]:
# Inspect which fields each ELink result carries. Indexing with an empty-string
# key cannot produce the dict_keys listing recorded for this cell; .keys() does.
results[0].keys()

dict_keys(['LinkSetDbHistory', 'LinkSetDb', 'ERROR', 'DbFrom', 'IdList'])

In [280]:
# Inspect the linked IDs in the first link set of the result at index 19.
# NOTE(review): the index is hard-coded; it fails if fewer than 20 results come back.
results[19]['LinkSetDb'][0]['Link']

[{'Id': '32265270'},
 {'Id': '31952783'},
 {'Id': '31952115'},
 {'Id': '31697682'},
 {'Id': '31520351'},
 {'Id': '31308546'},
 {'Id': '30936562'},
 {'Id': '30737415'},
 {'Id': '30376068'},
 {'Id': '30087105'},
 {'Id': '30001204'},
 {'Id': '29420191'},
 {'Id': '29066472'},
 {'Id': '28459978'},
 {'Id': '28305413'},
 {'Id': '28305337'},
 {'Id': '27172211'},
 {'Id': '27075174'},
 {'Id': '26903656'},
 {'Id': '26497146'},
 {'Id': '24451596'},
 {'Id': '24439378'},
 {'Id': '23685284'},
 {'Id': '23071443'},
 {'Id': '22728672'},
 {'Id': '22036568'},
 {'Id': '21655087'},
 {'Id': '21653522'},
 {'Id': '21205799'},
 {'Id': '21203496'},
 {'Id': '21158756'},
 {'Id': '20543124'},
 {'Id': '20147375'},
 {'Id': '19936241'},
 {'Id': '19505943'},
 {'Id': '19451168'},
 {'Id': '19412177'},
 {'Id': '19279324'},
 {'Id': '19056894'},
 {'Id': '18439406'},
 {'Id': '18245850'},
 {'Id': '18202389'},
 {'Id': '17550304'},
 {'Id': '17498648'},
 {'Id': '17246043'},
 {'Id': '17246014'},
 {'Id': '17245914'},
 {'Id': '1654

In [279]:
# Collect, for every ELink result, the IDs of the records it links to.
# ArtiRefs[k] is the list of linked IDs for results[k]; it is empty when the
# result carries no link sets.
ArtiRefs = []
for result in results:
    # The comprehension keeps loop variables out of the notebook namespace
    # (the original loop variable `id` shadowed the builtin) and tolerates a
    # link set without a 'Link' entry.
    Refs = [
        enlace['Id']
        for link in result['LinkSetDb']
        for enlace in link.get('Link', [])
    ]

    # Store the per-article list; previously Refs was rebuilt and discarded on
    # every iteration, leaving ArtiRefs empty.
    ArtiRefs.append(Refs)


34824391
33855023
33154387
31864707
31425511
31391501
30904193
30892991
30824861
30735120
30462636
29028797
28622525
28306107
28218681
27584613
27332952
27075174
26679112
26573328
26324426
25959206
25941003
25924716
25312810
25211335
24683536
24675716
24618896
24556841
24551207
24380076
24349633
24286825
24284207
24240233
24214341
24003211
23184991
23144631
22666323
22467874
22413088
22294702
22056138
22036477
21965616
21886179
21693509
21439026
20965415
20659447
20491544
20368801
20308539
20189388
19847258
19692992
19364807
19100727
19095959
19056894
18940789
18845846
18779571
18719710
18688282
18571155
18550049
18548484
18437164
18410726
18284664
18213359
17380161
17310982
17183370
17141154
16920621
16860746
16785441
16340960
16340959
16198289
16079158
16020777
15879698
15860729
15723794
15510160
15296752
15221856
14729967
14711410
12935877
12917412
12766778
12717439
12672954
12558601
12459723
12399544
12172011
12093734
12050146
11961558
11752278
11728459
11511545
11459832
11298456
1