# WebScraper for PubMed

## The following code was run on Google Colab

In [10]:
# RUN THIS CELL FIRST
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import os

def scrape_pubmed(keyword, num_results, results_df):
    """Scrape the top PubMed search hits for ``keyword`` and append them to ``results_df``.

    The PubMed search page is fetched for ``keyword``; for each of the first
    ``num_results`` result entries the title, authors and journal citation are
    read from the listing, and the abstract is read from the article's own
    page (one extra HTTP request per article).

    Args:
        keyword: Search term sent to PubMed as the ``term`` query parameter.
        num_results: Maximum number of search hits to keep.
        results_df: DataFrame of previously collected rows; it is not modified.

    Returns:
        A new DataFrame holding the rows of ``results_df`` followed by one row
        per scraped article, with the columns ``Keyword``, ``Title``,
        ``Authors``, ``Journal`` and ``Abstract``. Fields that cannot be found
        in the HTML are filled with a "... not found" placeholder string.

    Raises:
        requests.HTTPError: If PubMed answers any request with an error status.
        requests.Timeout: If a request does not complete within the timeout.
    """
    base_url = "https://pubmed.ncbi.nlm.nih.gov"
    # Without a timeout a single stalled connection would block the whole run.
    request_timeout = 30  # seconds

    # Pass the term through ``params`` so requests URL-encodes it; interpolating
    # it into the URL breaks on keywords containing '&', '#', '+' and similar.
    response = requests.get(
        f"{base_url}/", params={"term": keyword}, timeout=request_timeout
    )
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    articles = soup.find_all("article", class_="full-docsum")[:num_results]

    rows = []  # One dict per scraped article

    for article in articles:
        # The title anchor carries both the title text and the article link.
        title_elem = article.find("a", class_="docsum-title")
        title = title_elem.text.strip() if title_elem else "Title not found"

        authors_elem = article.find("span", class_="docsum-authors")
        authors = authors_elem.text.strip() if authors_elem else "Authors not found"

        journal_elem = article.find("span", class_="docsum-journal-citation")
        journal = journal_elem.text.strip() if journal_elem else "Journal not found"

        # ``get`` returns None when the anchor has no href; treat that the same
        # as a missing anchor instead of failing on ``base_url + None``.
        href = title_elem.get("href") if title_elem else None
        if href:
            abstract_response = requests.get(base_url + href, timeout=request_timeout)
            abstract_response.raise_for_status()
            abstract_soup = BeautifulSoup(abstract_response.content, 'html.parser')
            abstract_text_elem = abstract_soup.find("div", class_="abstract-content")
            abstract_text = (
                abstract_text_elem.text.strip()
                if abstract_text_elem
                else "Abstract not found"
            )
        else:
            abstract_text = "Abstract URL not found"

        rows.append({
            'Keyword': keyword,
            'Title': title,
            'Authors': authors,
            'Journal': journal,
            'Abstract': abstract_text,
        })

    return pd.concat([results_df, pd.DataFrame(rows)], ignore_index=True)

def curate_compounds(df):
    """Keep only the rows whose abstract looks like a human or animal dosing study.

    A row is kept when its ``Abstract`` text satisfies either rule:

    * it mentions a human-study term (human, participants, women, infants,
      review, ...) and contains a standalone number of three or more digits;
    * it mentions an animal term (animal, mouse, rat, rabbit, ...) and contains
      a number followed by a mass unit (mg, g or µg).

    Rows whose abstract is not a string (for example a missing value) are
    dropped instead of raising.

    Args:
        df: DataFrame with an ``Abstract`` column.

    Returns:
        A new DataFrame with the kept rows in their original order, the same
        columns as ``df`` (also when no row matches), and a fresh 0..n-1 index.
    """
    if df.empty:
        # Nothing to inspect; still hand back a fresh frame with the same columns.
        return df.reset_index(drop=True)

    human_terms = re.compile(
        r'\b(human|humans|participants|females|mothers|babies|children|women|deliveries|infants|review)\b',
        re.IGNORECASE,
    )
    large_number = re.compile(r'\b\d{3,}\b')
    # Union of the species lists that were previously checked in two separate
    # branches sharing the same dose condition.
    animal_terms = re.compile(
        r'\b(animal|animals|mouse|mice|rat|rats|rabbit|rabbits)\b',
        re.IGNORECASE,
    )
    dose = re.compile(r'\b\d+\s*(mg|g|µg)\b', re.IGNORECASE)

    def is_relevant(abstract):
        """Return True when ``abstract`` satisfies the human or the animal rule."""
        if not isinstance(abstract, str):
            return False
        if human_terms.search(abstract) and large_number.search(abstract):
            return True
        return bool(animal_terms.search(abstract) and dose.search(abstract))

    # A positional boolean mask keeps the column schema and dtypes even when
    # no row is selected, and is unaffected by duplicate index labels.
    keep = pd.Series([is_relevant(text) for text in df['Abstract']], dtype=bool).to_numpy()
    return df.loc[keep].reset_index(drop=True)

# Main script
def main():
    """Scrape PubMed for each compound keyword and save raw and curated results.

    Keywords come from the ``Compound_name`` column of the input workbook.
    Scraping stops once 600 keywords have been processed. The raw scrape and
    the curated subset are both written as Excel files to ``./Scrapper_results``.
    """
    # Update this path to match your local Excel file location,
    # e.g., 'C:/Users/YourUser/Documents/keywords_trimester.xlsx'
    input_file = 'keywords_example.xlsx'
    keyword_table = pd.read_excel(input_file)

    # Number of search hits to keep per keyword.
    num_results = 3
    results_df = pd.DataFrame(
        columns=['Keyword', 'Title', 'Authors', 'Journal', 'Abstract']
    )

    for count, keyword in enumerate(keyword_table['Compound_name'], start=1):
        print(f"Scraping results for keyword ({count}): {keyword}")
        results_df = scrape_pubmed(keyword, num_results, results_df)

        if count < 600:
            continue
        print("Processed 600 compounds. Stopping the scraping process.")
        break

    # Make sure the output directory exists and work out both target paths.
    output_dir = './Scrapper_results'
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, 'Scraped_Articles_Trimester.xlsx')
    curated_output_file = os.path.join(output_dir, 'Curated_Compounds_Trimester.xlsx')

    # The raw scrape is written first so it is on disk before curation runs.
    results_df.to_excel(output_file, index=False)
    print(f"Scraped data saved to {output_file}")

    # Filter the scrape down to the curated subset and save that as well.
    curated_df = curate_compounds(results_df)
    curated_df.to_excel(curated_output_file, index=False)
    print(f"Curated data saved to {curated_output_file}")


if __name__ == "__main__":
    main()

Scraping results for keyword (1): captopril pregnancy trimester risk
Scraping results for keyword (2): Naloxone pregnancy trimester risk
Scraping results for keyword (3): Mifepristone pregnancy trimester risk
Scraped data saved to ./Scrapper_results/Scraped_Articles_Trimester.xlsx
Curated data saved to ./Scrapper_results/Curated_Compounds_Trimester.xlsx
