<a href="https://colab.research.google.com/github/maryasad/AI-Prompt-Engineering/blob/main/WebScraping_Knee_X_Ray.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install requests beautifulsoup4 pandas

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Root URL of PubMed. It serves as the search endpoint and as the base
# against which article links found in result pages are resolved.
BASE_URL = "https://pubmed.ncbi.nlm.nih.gov/"


def fetch_articles(query, start_year, end_year, max_results=10):
    """Scrape PubMed search result pages and collect basic article metadata.

    Result pages are requested one after another until ``max_results``
    articles have been collected, a page contains no result entries, a page
    contributes nothing new, or the server answers with an HTTP error.

    Args:
        query: Free-text search term. It is URL-encoded by ``requests``, so
            spaces and reserved characters such as ``&`` are safe.
        start_year: First publication year of the filter (inclusive bound
            as passed to PubMed's ``years.<start>-<end>`` filter).
        end_year: Last publication year of the filter.
        max_results: Maximum number of articles to return. A value of zero
            or less returns an empty list without any network access.

    Returns:
        A list of dicts with the keys ``'Title'``, ``'Authors'`` and
        ``'Link'``. ``'Authors'`` is an empty string when the entry has no
        author element.

    Raises:
        requests.RequestException: On connection problems or when a request
            exceeds the timeout.
    """
    from urllib.parse import urljoin

    articles = []
    seen_links = set()
    page = 1

    while len(articles) < max_results:
        # Let requests build and encode the query string instead of
        # interpolating raw user text into the URL.
        params = {
            "term": query,
            "filter": f"years.{start_year}-{end_year}",
            "page": page,
        }
        # A timeout prevents the scraper from hanging on a stalled connection.
        response = requests.get(BASE_URL, params=params, timeout=30)
        if not response.ok:
            # An error page carries no results; keep what was collected.
            print(f"Stopping at page {page}: HTTP {response.status_code}")
            break

        soup = BeautifulSoup(response.content, 'html.parser')

        # Each search hit is rendered as <article class="full-docsum">.
        article_blocks = soup.find_all('article', class_='full-docsum')
        if not article_blocks:
            break  # no more results

        added_on_page = 0
        for block in article_blocks:
            title_tag = block.find('a', class_='docsum-title')
            href = title_tag.get('href') if title_tag is not None else None
            if not href:
                print("Error parsing article: title link not found")
                continue

            # urljoin copes with hrefs that start with '/', where plain
            # concatenation with BASE_URL would produce a double slash.
            link = urljoin(BASE_URL, href)
            if link in seen_links:
                continue  # same article served again; do not duplicate it
            seen_links.add(link)

            # Authors are optional: a missing element must not drop the hit.
            authors_tag = block.find('span', class_='docsum-authors full-authors')
            authors = authors_tag.text.strip() if authors_tag is not None else ""

            articles.append({
                'Title': title_tag.text.strip(),
                'Authors': authors,
                'Link': link,
            })
            added_on_page += 1

            if len(articles) >= max_results:
                break

        # A page that adds nothing new (unparseable or repeated content)
        # means further paging cannot make progress; stop instead of looping.
        if added_on_page == 0:
            break

        page += 1

    return articles

# Search settings: knee-fracture detection on X-rays, published 2021-2025.
query = "knee fracture detection X-ray"
start_year, end_year = 2021, 2025
max_results = 10  # upper bound on the number of articles to collect

# Run the scraper with the settings above.
articles = fetch_articles(
    query,
    start_year,
    end_year,
    max_results,
)

# Tabulate the scraped records, one row per article.
df = pd.DataFrame(data=articles)

# Keep a CSV copy on disk so the results can be used offline.
df.to_csv(path_or_buf="knee_fracture_articles.csv", index=False)

# Show the table in the cell output.
print(df)



                                               Title  \
0  Improving Radiographic Fracture Recognition Pe...   
1  Artificial intelligence for detection of effus...   
2  Comparison of diagnostic accuracy of point-of-...   
3  The Feature Ambiguity Mitigate Operator model ...   
4             [Knee cartilage injuries in athletes].   
5  Osteoporosis diagnosis in knee X-rays by trans...   
6                       A Young Skier with Leg Pain.   
7  Incidence and characteristics of ligamentous k...   
8  The anteromedial retinaculum in ACL-injured kn...   
9  Artificial intelligence versus radiologist in ...   

                                             Authors  \
0  Guermazi A, Tannoury C, Kompel AJ, Murakami AM...   
1  Cohen I, Sorin V, Lekach R, Raskin D, Segev M,...   
2  Kozaci N, Avci M, Yuksel S, Donertas E, Karaca...   
3  Wu HZ, Yan LF, Liu XQ, Yu YZ, Geng ZJ, Wu WJ, ...   
4                                           Horng A.   
5                                  Wani IM, Aro