In [None]:
!pip install requests
!pip install bs4
!pip install opencv-python
!pip install opencv-contrib-python
!pip install matplotlib
!pip install pandas
!pip install tqdm
!pip install black 
!pip install flakes8 
!pip install isort
!pip install argparse

In [None]:
import logging

# Configure the root logger so that every record from DEBUG upwards is emitted.
logging.basicConfig(level=logging.DEBUG)
# getLogger() without a name returns the root logger; the notebook's functions
# log through this module-level object.
logger = logging.getLogger()

In [None]:
from typing import Union
from pathlib import Path

def create_file_url(subject_id: str, study_id: str, image_id: str) -> str:
    """
    Build the PhysioNet download URL of a single JPG image.

    The remote file is addressed as
    ``p<first two characters of subject_id>/p<subject_id>/s<study_id>/<image_id>.jpg``
    below the dataset's ``files`` directory.
    """
    # NOTE(review): the base URL has a double slash right after the host name —
    # confirm this matches PhysioNet's actual file layout.
    base_url = "https://www.physionet.org//mimic-cxr-jpg/2.0.0/files"
    subject_group = f"p{subject_id[:2]}"
    return f"{base_url}/{subject_group}/p{subject_id}/s{study_id}/{image_id}.jpg"

def create_saved_path(root_dir: Union[str, Path], subject_id: str, study_id: str, image_id: str) -> Path:
    """
    Build the local path under which a downloaded image is stored.

    The layout below ``root_dir`` is
    ``p<first two characters of subject_id>/p<subject_id>/s<study_id>/<image_id>.jpg``.
    """
    subject_group = f"p{subject_id[:2]}"
    relative_path = f"{subject_group}/p{subject_id}/s{study_id}/{image_id}.jpg"
    return Path(root_dir) / relative_path

# Display the URL generated for one sample image.
create_file_url(
    subject_id='10000032',
    study_id='50414267',
    image_id='02aa804e-bde0afdd-112c0b34-7bc16630-4e384014',
)

In [None]:
import os
from typing import List, Tuple
from requests import Session

import cv2
import numpy as np

from tqdm import tqdm

# Directory below which downloaded images are saved; read by download_images.
root_dir = "./cxr_dataset/"

def transform_image_from_bytes(image: bytes, downscale: int = 4) -> np.ndarray:
    """
    Decode an encoded image and shrink it by an integer factor.

    Args:
        image: Raw encoded image bytes.
        downscale: Integer factor by which both width and height are divided.
            Defaults to 4.

    Returns:
        The decoded image (read with ``cv2.IMREAD_COLOR``) resized with
        ``cv2.INTER_AREA`` interpolation, as a NumPy array.

    Raises:
        ValueError: If ``downscale`` is smaller than 1, if ``image`` is empty,
            or if the bytes cannot be decoded as an image.
    """
    if downscale < 1:
        raise ValueError(f"downscale must be >= 1, got {downscale}")

    buffer = np.frombuffer(image, np.uint8)
    if buffer.size == 0:
        raise ValueError("Cannot decode an image from empty bytes")

    # cv2.imdecode reports an undecodable buffer by returning None rather than
    # raising, so the result has to be checked before it is used.
    img = cv2.imdecode(buffer, cv2.IMREAD_COLOR)
    if img is None:
        raise ValueError("Image bytes could not be decoded")

    height, width = img.shape[:2]

    # Clamp to one pixel: an image smaller than the factor would otherwise
    # yield a zero-sized target, which cv2.resize rejects.
    new_width = max(1, width // downscale)
    new_height = max(1, height // downscale)

    return cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_AREA)

def download_images(
    session: Session,
    images_info: List[Tuple[str, str, str]],
    limit: Union[int, None] = 20,
    timeout: float = 60.0,
) -> List[Tuple[str, str, str]]:
    """
    Download, downscale and save a batch of images from PhysioNet.

    Args:
        session: Session used to issue the HTTP requests.
        images_info: ``(subject_id, study_id, image_id)`` entries to fetch.
        limit: Maximum number of leading entries to process. Defaults to 20,
            the cap this function has always applied; pass ``None`` to process
            every entry. Expected to be non-negative.
        timeout: Timeout in seconds passed to each HTTP request.

    Returns:
        The entries that could not be downloaded, decoded or written to disk.
    """
    # Imported locally because this cell only imports `Session` from requests.
    from requests import RequestException

    failed_download: List[Tuple[str, str, str]] = []
    selected = images_info if limit is None else images_info[:limit]

    for image_info in tqdm(selected):
        subject_id, study_id, image_id = image_info
        url = create_file_url(subject_id, study_id, image_id)
        path = create_saved_path(root_dir, subject_id, study_id, image_id)

        logger.info(f"Start downloading {subject_id} - {study_id} - {image_id}\nFrom URL:{url}\nSaved to: {path}")

        # A network error on one image must not abort the rest of the batch.
        try:
            response = session.get(url, timeout=timeout)
        except RequestException as exc:
            logger.warning(f"Failed to download {subject_id} - {study_id} - {image_id}.")
            logger.warning(f"Request error: {exc}")
            failed_download.append(image_info)
            continue

        if response.status_code != 200:
            logger.warning(f"Failed to download {subject_id} - {study_id} - {image_id}.")
            failed_download.append(image_info)
            continue

        logger.info("Downloaded successfully - Transforming image...")
        # An undecodable payload can surface as any of these exception types,
        # depending on where in decoding/resizing the failure happens.
        try:
            img = transform_image_from_bytes(response.content)
        except (ValueError, AttributeError, cv2.error) as exc:
            logger.warning(f"Failed to transform {subject_id} - {study_id} - {image_id}: {exc}")
            failed_download.append(image_info)
            continue

        # Create the target directory only when there is something to write.
        path.parent.mkdir(parents=True, exist_ok=True)
        # cv2.imwrite returns False when the image could not be written.
        if not cv2.imwrite(str(path), img):
            logger.warning(f"Failed to save image to {path}")
            failed_download.append(image_info)
            continue

        logger.info(f"Saved image to {path}")

    return failed_download


In [None]:
import pandas as pd
# Path of the metadata CSV that is passed to get_images_info.
metadata = './mimic-cxr-2.0.0-metadata.csv'

def get_images_info(metadata_path: str):
    """
    Read the metadata CSV and return its image identifiers.

    Every column is loaded as a string. The result is a list holding one
    ``[subject_id, study_id, dicom_id]`` entry per row of the file.
    """
    id_columns = ['subject_id', 'study_id', 'dicom_id']
    metadata_df = pd.read_csv(metadata_path, dtype=str)
    return metadata_df[id_columns].values.tolist()

In [None]:
import getpass
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

# URL of the site to login
site_url = 'https://www.physionet.org/login/'

# Credentials are read from the environment, falling back to an interactive
# prompt, so that they are never stored in the notebook itself.
username = os.environ.get('PHYSIONET_USERNAME') or input('PhysioNet username: ')
password = os.environ.get('PHYSIONET_PASSWORD') or getpass.getpass('PhysioNet password: ')

# Create a session object so cookies persist between the requests below
session = requests.Session()

# Get login CSRF token from the hidden form field of the login page
response = session.get(site_url)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
csrf_input = soup.find('input', dict(name='csrfmiddlewaretoken'))
if csrf_input is None:
    # Fail with a clear message instead of a TypeError from indexing None.
    raise RuntimeError("Could not find the CSRF token on the login page")
csrfmiddlewaretoken = csrf_input['value']

headers = {
    'referer': site_url,
}

# Login
payload = {'username': username, 'password': password, 'csrfmiddlewaretoken': csrfmiddlewaretoken, 'next': '/'}
login_response = session.post(site_url, data=payload, headers=headers)

# Check if login was successful. The payload asks to be sent to '/' afterwards,
# so a final URL that is still the login page is treated as a failure even when
# the status code is 200.
# NOTE(review): assumes a rejected login re-renders the login page with HTTP 200
# — confirm against PhysioNet's behaviour.
left_login_page = '/login' not in login_response.url
if login_response.status_code == 200 and left_login_page:
    logger.debug("Login successful")

    imgs_info = get_images_info(metadata)
    failed_download = download_images(session, imgs_info)
    # Persist the entries that failed so they can be retried later.
    pd.DataFrame(data=failed_download, columns=['subject_id', 'study_id', 'dicom_id']).to_csv('failed_download.csv', index=False)
else:
    logger.warning("Login failed\n" + login_response.text)

In [None]:
# Display one third (integer division) of the number of entries in the metadata file.
len(get_images_info(metadata)) // 3