In [1]:
%pip install biopython


Collecting biopython
  Obtaining dependency information for biopython from https://files.pythonhosted.org/packages/b8/c8/18570a63e8b6fe99a856698d500fb326f0ce247d2b70d8d631d4abcb5c9c/biopython-1.83-cp311-cp311-win_amd64.whl.metadata
  Downloading biopython-1.83-cp311-cp311-win_amd64.whl.metadata (13 kB)
Downloading biopython-1.83-cp311-cp311-win_amd64.whl (2.7 MB)
   ---------------------------------------- 0.0/2.7 MB ? eta -:--:--
   -- ------------------------------------- 0.1/2.7 MB 2.8 MB/s eta 0:00:01
   --------- ------------------------------ 0.7/2.7 MB 6.9 MB/s eta 0:00:01
   --------------- ------------------------ 1.0/2.7 MB 7.3 MB/s eta 0:00:01
   --------------- ------------------------ 1.0/2.7 MB 7.3 MB/s eta 0:00:01
   --------------- ------------------------ 1.1/2.7 MB 4.5 MB/s eta 0:00:01
   ---------------------- ----------------- 1.5/2.7 MB 5.6 MB/s eta 0:00:01
   ---------------------------- ----------- 1.9/2.7 MB 6.2 MB/s eta 0:00:01
   ------------------------------


[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
import os
import time

import pandas as pd
from Bio import Entrez

# NCBI expects a real contact e-mail on every Entrez request.
# Set ENTREZ_EMAIL to override the address below without editing the notebook.
Entrez.email = os.environ.get('ENTREZ_EMAIL', 'nicolas.buitrago.roldan@gmail.com')
# Optional NCBI API key, read from the NCBI_API_KEY environment variable so the
# secret is never saved inside the notebook. An unset or blank variable becomes
# None, which is Biopython's "no key configured" value.
Entrez.api_key = os.environ.get('NCBI_API_KEY') or None
# Tail of the PubMed query. It is intentionally unbalanced on its own:
# search_pubmed() prepends '(((({probiotic}) AND ' which supplies the missing
# opening parentheses.
search_string_probiotics = '(((Caco-2) OR (HT-29)))) AND (cancer)) AND (colon)) AND (((RNA-seq) OR (transcriptome)))'

# Probiotic species to search for; each entry produces its own PubMed query.
# NOTE(review): several 'Lactococcus ...' entries reuse species epithets listed
# under 'Lactobacillus' (reuteri, rhamnosus, casei, acidophilus, plantarum) --
# confirm these genus names are intended.
probiotics = ['Lactobacillus plantarum', 'Lactobacillus paracasei', 'Lactobacillus acidophilus', 'Lactobacillus casei',
              'Lactobacillus rhamnosus', 'Lactobacillus crispatus', 'Lactobacillus gasseri', 'Lactobacillus reuteri',
              'Lactobacillus bulgaricus', 'Propionibacterium jensenii', 'Propionibacterium freudenreichii',
              'Peptostreptococcus productus', 'Bacillus coagulans', 'Bacillus subtilis', 'Bacillus laterosporus',
              'Lactococcus lactis', 'Lactococcus reuteri', 'Lactococcus rhamnosus', 'Lactococcus casei',
              'Lactococcus acidophilus', 'Lactococcus curvatus', 'Lactococcus plantarum', 'Enterococcus faecium',
              'Pediococcus acidilactici', 'Pediococcus pentosaceus', 'Streptococcus sanguis', 'Streptococcus oralis',
              'Streptococcus mitis', 'Streptococcus thermophilus', 'Streptococcus salivarius', 'Bifidobacterium longum',
              'Bifidobacterium catenulatum', 'Bifidobacterium breve', 'Bifidobacterium animalis', 'Bifidobacterium bifidum',
              'Bacteroides uniformis', 'Akkermansia muciniphila', 'Saccharomyces boulardii']

def search_pubmed(probiotic, search_string):
    """Search PubMed for one probiotic combined with a shared query tail.

    Args:
        probiotic: Species name placed as the first term of the query.
        search_string: Remainder of the query. It must close the four
            parentheses that are opened in front of ``probiotic``.

    Returns:
        The ``IdList`` entry of the parsed ESearch response (the matching PMIDs).
    """
    query = f'(((({probiotic}) AND {search_string}'
    print(query)
    # The API key is not forwarded by hand: Biopython attaches the module-level
    # Entrez.api_key to requests itself, and forwarding an empty string would
    # put a blank api_key parameter on the request.
    # NOTE(review): NCBI documents a 10,000-record ceiling for PubMed ESearch;
    # confirm that retmax=50000 behaves as intended for large result sets.
    handle = Entrez.esearch(db='pubmed', term=query, retmax=50000)
    try:
        result = Entrez.read(handle)
    finally:
        # Release the HTTP connection even when parsing the response fails.
        handle.close()
    # Pause between requests so the NCBI server is not overloaded.
    time.sleep(1)
    return result["IdList"]

def fetch_details(id_list):
    """Download the PubMed records for a list of PMIDs.

    Args:
        id_list: Iterable of PMID strings, e.g. the value returned by
            search_pubmed().

    Returns:
        The structure produced by ``Entrez.read`` for the EFetch XML; callers
        read the articles from its ``'PubmedArticle'`` entry. When ``id_list``
        is empty no request is sent and ``{'PubmedArticle': []}`` is returned.
    """
    ids = ','.join(id_list)
    if not ids:
        # Nothing to fetch: skip the network round trip instead of sending a
        # request with a blank id parameter.
        return {'PubmedArticle': []}
    handle = Entrez.efetch(db='pubmed', id=ids, retmode='xml')
    try:
        return Entrez.read(handle)
    finally:
        # Release the HTTP connection even when parsing the response fails.
        handle.close()

def _format_author(author):
    """Return one author entry as a display name, or '' if it has no name fields."""
    name_parts = (author.get(key, '') for key in ('LastName', 'ForeName', 'Initials'))
    # Join only the parts that are present so a missing ForeName does not leave
    # a double space in the middle of the name.
    personal_name = ' '.join(str(part) for part in name_parts if part)
    # Fall back to CollectiveName, the field PubMed uses for group authors.
    return personal_name or str(author.get('CollectiveName', ''))


def _publication_year(pub_date):
    """Return the publication year as a string, or '' when none can be found."""
    year = pub_date.get('Year', '')
    if year:
        return str(year)
    # Without a Year field, look for the first 4-digit token in MedlineDate
    # (values such as '2020 Jan-Feb').
    medline_date = str(pub_date.get('MedlineDate', ''))
    for token in medline_date.replace('-', ' ').split():
        if len(token) == 4 and token.isdigit():
            return token
    return ''


def extract_paper_info(paper):
    """Flatten one parsed PubMed article into a dict of plain strings.

    Args:
        paper: One entry of the ``'PubmedArticle'`` list; it must contain
            ``paper['MedlineCitation']['Article']`` and
            ``paper['MedlineCitation']['PMID']``.

    Returns:
        A dict with the keys ``Title``, ``Authors``, ``Journal``, ``Year``,
        ``PMID`` and ``Abstract``. Fields missing from the record are returned
        as empty strings.

    Raises:
        KeyError: If ``MedlineCitation``, ``Article`` or ``PMID`` is missing.
    """
    citation = paper['MedlineCitation']
    article = citation['Article']
    journal = article.get('Journal', {})

    author_names = [_format_author(author) for author in article.get('AuthorList', [])]
    abstract_parts = article.get('Abstract', {}).get('AbstractText', [])

    return {
        'Title': str(article.get('ArticleTitle', '')),
        # Authors without any usable name are dropped rather than leaving
        # empty ", ," gaps in the joined string.
        'Authors': ', '.join(name for name in author_names if name),
        'Journal': str(journal.get('ISOAbbreviation', '')),
        'Year': _publication_year(journal.get('JournalIssue', {}).get('PubDate', {})),
        'PMID': str(citation['PMID']),
        'Abstract': ' '.join(str(part) for part in abstract_parts),
    }

# NOTE(review): an earlier comment here mentioned OR-ing `regulation_keywords`
# into the query; no such name is defined in the visible code.

# Run one PubMed search per probiotic and gather one record per article found.
all_articles_info = []
for probiotic in probiotics:
    pmids = search_pubmed(probiotic, search_string_probiotics)
    if not pmids:
        # No hits for this probiotic, so there is nothing to download.
        continue
    papers = fetch_details(pmids)
    for paper in papers['PubmedArticle']:
        # Tag each record with the probiotic whose query returned it.
        paper_info = {**extract_paper_info(paper), 'probiotic': probiotic}
        all_articles_info.append(paper_info)

# Turn the collected records into a DataFrame and export it to Excel.
df = pd.DataFrame(all_articles_info)
df.to_excel('pubmed_articles_probiotics.xlsx', index=False)

((((Lactobacillus plantarum) AND (((Caco-2) OR (HT-29)))) AND (cancer)) AND (colon)) AND (((RNA-seq) OR (transcriptome)))
((((Lactobacillus paracasei) AND (((Caco-2) OR (HT-29)))) AND (cancer)) AND (colon)) AND (((RNA-seq) OR (transcriptome)))
((((Lactobacillus acidophilus) AND (((Caco-2) OR (HT-29)))) AND (cancer)) AND (colon)) AND (((RNA-seq) OR (transcriptome)))
((((Lactobacillus casei) AND (((Caco-2) OR (HT-29)))) AND (cancer)) AND (colon)) AND (((RNA-seq) OR (transcriptome)))
((((Lactobacillus rhamnosus) AND (((Caco-2) OR (HT-29)))) AND (cancer)) AND (colon)) AND (((RNA-seq) OR (transcriptome)))
((((Lactobacillus crispatus) AND (((Caco-2) OR (HT-29)))) AND (cancer)) AND (colon)) AND (((RNA-seq) OR (transcriptome)))
((((Lactobacillus gasseri) AND (((Caco-2) OR (HT-29)))) AND (cancer)) AND (colon)) AND (((RNA-seq) OR (transcriptome)))
((((Lactobacillus reuteri) AND (((Caco-2) OR (HT-29)))) AND (cancer)) AND (colon)) AND (((RNA-seq) OR (transcriptome)))
((((Lactobacillus bulgaricus) 