# FastMRI Knee Dataset

### Input:  
Combination of [FastMRI](https://github.com/facebookresearch/fastMRI) dataset and [FastMRI-plus](https://github.com/microsoft/fastmri-plus) annotations

### Output:
DICOM annotated dataset (.dcm) \
*(Note: only files with annotations from fastmri-plus are included)*

### Steps:
1. Download raw data (.tar.xz)
2. Extract raw data (.h5)
3. Create annotated dataset (.h5)
4. Convert to dicom (.dcm)

### 1. Download raw data
- input: N/A
- output: .tar.xz & .csv

In [None]:
import os

#download annotations from microsoft/fastmri-plus GitHub repository
os.makedirs('fastmri-plus-knee-annotations', exist_ok=True)
!curl -C - "https://github.com/microsoft/fastmri-plus/blob/main/Annotations/knee.csv" --output fastmri-plus-knee-annotations/knee.csv
!curl -C - "https://github.com/microsoft/fastmri-plus/blob/main/Annotations/knee_file_list.csv" --output fastmri-plus-knee-annotations/knee_file_list.csv

'''
WARNING! Downloading the entire dataset will take a long time and use a lot of disk space (~100GB).
Test with val (~20GB) first through entire pipeline before testing with train (~100GB). 
Test_v2 does not have any annotated matches.
'''
os.makedirs('raw_data/', exist_ok=True)
# !curl -C - "https://fastmri-dataset.s3.amazonaws.com/v2.0/knee_singlecoil_train.tar.xz?AWSAccessKeyId=AKIAJM2LEZ67Y2JL3KRA&Signature=kgXO24Kg5BeB8Pil7wAzJ5rP0K8%3D&Expires=1748086931" --output raw_data/knee_singlecoil_train.tar.xz
!curl -C - "https://fastmri-dataset.s3.amazonaws.com/v2.0/knee_singlecoil_val.tar.xz?AWSAccessKeyId=AKIAJM2LEZ67Y2JL3KRA&Signature=3ezP3wU5OrRxiLf5g%2FmS1ifnHno%3D&Expires=1748086931" --output raw_data/knee_singlecoil_val.tar.xz


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1564k    0 1564k    0     0   536k      0 --:--:--  0:00:02 --:--:--  536k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  563k    0  563k    0     0   231k      0 --:--:--  0:00:02 --:--:--  231k


### 2. Extract raw data
- input: .tar.xz
- output: .h5

In [None]:
import tarfile
import os

def extract_archive(archive_path: str, dest_dir: str) -> None:
    """Extract a tar archive into ``dest_dir``, creating the directory if needed.

    Args:
        archive_path: Path to the tar archive; the compression (xz, gz, bz2 or
            none) is auto-detected.
        dest_dir: Directory the archive members are extracted into.

    The number of archive members is printed before extraction starts.
    """
    os.makedirs(dest_dir, exist_ok=True)

    # Extraction filters only exist on newer Python releases.  When available,
    # the 'data' filter rejects members that would land outside dest_dir
    # (absolute paths, '..' components, escaping links) and avoids the
    # DeprecationWarning raised by an unfiltered extractall().
    extract_kwargs = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

    with tarfile.open(archive_path, 'r:*') as tar:  # 'r:*' auto-detects xz, gz, bz2
        members = tar.getmembers()
        print('Extracting', len(members), 'files')
        tar.extractall(path=dest_dir, members=members, **extract_kwargs)
                
# Unpack the validation archive into the download directory; the tar
# extraction creates the train/test/val folders automatically.
raw_h5_path = 'raw_data/'
archive_path = 'raw_data/knee_singlecoil_val.tar.xz'
extract_archive(archive_path=archive_path, dest_dir=raw_h5_path)

Extracting 200 files


  tar.extractall(path=dest_dir)


### 3. Create annotated dataset
- input: .h5
- output: .h5

In [2]:
import numpy as np
import h5py
import pandas as pd
import numpy as np


# Variable-length UTF-8 string type for the text fields of an annotation record.
str_dt = h5py.string_dtype(encoding='utf-8')

# Annotation columns in record order: integer-valued first, then text-valued.
int_fields = ('slice', 'x', 'y', 'width', 'height')
text_fields = ('label', 'study_level')

# Compound record layout of a single annotation row.
ann_dtype = np.dtype(
    [(name, np.int32) for name in int_fields]
    + [(name, str_dt) for name in text_fields]
)

# Placeholders for empty CSV cells: -1 for numeric columns, '' for text columns.
fillna_values = {**dict.fromkeys(int_fields, -1), **dict.fromkeys(text_fields, '')}

ann_df = pd.read_csv('fastmri-plus-knee-annotations/knee.csv').fillna(fillna_values)
filelist = pd.read_csv(
    'fastmri-plus-knee-annotations/knee_file_list.csv',
    header=None,
    names=['Filename'],
)
groups = ann_df.groupby('file')

raw_data_path = 'raw_data/singlecoil_val'   # directory of extracted .h5 files
annotated_data_path = 'annotated/val'       # directory to write annotated files
os.makedirs(annotated_data_path, exist_ok=True)

for fname in filelist['Filename']:
    h5_name = fname + '.h5'
    src_path = os.path.join(raw_data_path, h5_name)
    dst_path = os.path.join(annotated_data_path, h5_name)

    # Listed files that were not extracted are reported and skipped.
    if not os.path.exists(src_path):
        print(f"Warning: {src_path} not found, skipping.")
        continue

    with h5py.File(src_path, 'r') as f_in, h5py.File(dst_path, 'w') as f_out:

        # Carry over the image datasets that exist in the source file.
        for dset_name in ('kspace', 'reconstruction_esc'):
            if dset_name not in f_in:
                continue
            f_in.copy(dset_name, f_out)

        # One record per CSV row of this file; no rows gives an empty table.
        if fname in groups.groups:
            rows = [
                tuple(int(r[col]) for col in int_fields)
                + tuple(str(r[col]) for col in text_fields)
                for _, r in groups.get_group(fname).iterrows()
            ]
        else:
            rows = []
        ann_arr = np.array(rows, dtype=ann_dtype)

        # gzip compression of the annotation table is optional.
        f_out.create_dataset('annotations', data=ann_arr, dtype=ann_dtype, compression='gzip')



### 4. Convert to dicom (.dcm)
- input: .h5
- output: .dcm

Reference: [fastmri_to_dicom.py](https://github.com/microsoft/fastmri-plus/blob/main/ExampleScripts/fastmri_to_dicom.py) (fastmri-plus example script)

In [5]:
import datetime
from pathlib import Path
import h5py
import numpy as np
import pydicom
from pydicom.dataset import Dataset, FileMetaDataset
from pydicom.uid import generate_uid, ExplicitVRLittleEndian

# Private tag definitions.
# Group 0x0011 is odd, i.e. a private group; the creator element 0x0010
# reserves the element block 0x1000-0x10FF that ANNOTATION_TAG lives in.
PRIVATE_CREATOR_TAG = (0x0011, 0x0010)    # Group 0x0011, element 0x0010
ANNOTATION_TAG      = (0x0011, 0x1010)    # Group 0x0011, element 0x1010


def _scale_to_int16(recon):
    """Map intensities to 0..1024 and return them as little-endian int16.

    The 99.9th percentile is mapped to 1024 and the result is clipped to
    [0, 1024].  If that percentile is not a positive finite number (for
    example an all-zero volume) the data is left unscaled rather than being
    divided by zero.  An empty array is returned as an empty int16 array.
    """
    if recon.size == 0:
        return recon.astype('<i2')
    p999 = float(np.percentile(recon, 99.9))
    scale = 1024.0 / p999 if np.isfinite(p999) and p999 > 0 else 1.0
    # '<i2' pins the byte order so the bytes written to PixelData match the
    # little-endian transfer syntax declared in the file meta on any host.
    return np.clip(recon * scale, 0, 1024).astype('<i2')


def _as_text(value):
    """Return ``value`` decoded to str when it is bytes, unchanged otherwise."""
    return value.decode() if isinstance(value, bytes) else value


def _annotations_by_slice(ann_ds):
    """Group annotation records into a ``{slice_index: [annotation_dict, ...]}`` map."""
    by_slice = {}
    if ann_ds is None:
        return by_slice
    for rec in ann_ds:
        by_slice.setdefault(int(rec['slice']), []).append({
            'x': int(rec['x']),
            'y': int(rec['y']),
            'width': int(rec['width']),
            'height': int(rec['height']),
            'label': _as_text(rec['label']),
            'study_level': _as_text(rec['study_level']),
        })
    return by_slice


def convert_h5_to_dicom_with_annotations(
    input_h5_path: Path,
    output_dir: Path,
    reconstruction_name: str = 'reconstruction_esc',
    flip_up_down: bool = False,
    flip_left_right: bool = False
) -> None:
    """Convert a single annotated FastMRI .h5 into a folder of DICOM slices.

    One ``<stem>_<slice>.dcm`` file per slice is written to
    ``output_dir/<stem>/``.  Slices that have annotations get a private
    creator element plus a private text element holding their labels joined
    by a backslash.

    Args:
        input_h5_path: Annotated .h5 file to read.
        output_dir: Parent directory; a sub-folder named after the file stem
            is created inside it.
        reconstruction_name: Name of the image dataset inside the .h5 file.
        flip_up_down: Reverse the second axis of the volume before writing.
        flip_left_right: Reverse the third axis of the volume before writing.

    Raises:
        ValueError: If ``reconstruction_name`` is not present in the file.
    """
    stem = input_h5_path.stem

    # --- Load data ---
    with h5py.File(input_h5_path, 'r') as f:
        if reconstruction_name not in f:
            raise ValueError(f"{input_h5_path.name} has no '{reconstruction_name}' dataset")
        recon = f[reconstruction_name][:]   # indexed below as (slices, H, W)
        ann_ds = f['annotations'][:] if 'annotations' in f else None

    # Create the output folder only after the input has been validated, so a
    # bad input file does not leave an empty folder behind.
    out_folder = output_dir / stem
    out_folder.mkdir(parents=True, exist_ok=True)

    # Optional flips
    if flip_up_down:
        recon = recon[:, ::-1, :]
    if flip_left_right:
        recon = recon[:, :, ::-1]

    recon_scaled = _scale_to_int16(recon)
    num_slices = recon_scaled.shape[0]

    # Only keys in 0..num_slices-1 are looked up below, so annotations stored
    # under any other slice index (such as a -1 placeholder) are not written.
    slice_annotations = _annotations_by_slice(ann_ds)

    # Values shared by every slice of this volume.
    study_uid = generate_uid()
    series_uid = generate_uid()
    now = datetime.datetime.now()
    content_date = now.strftime('%Y%m%d')
    content_time = now.strftime('%H%M%S.%f')

    # --- Write one DICOM per slice ---
    for s in range(num_slices):
        pix = recon_scaled[s]

        # File meta
        fm = FileMetaDataset()
        fm.MediaStorageSOPClassUID = pydicom.uid.MRImageStorage
        fm.MediaStorageSOPInstanceUID = generate_uid()
        fm.TransferSyntaxUID = ExplicitVRLittleEndian
        # Identifies the writing software, so it is a fixed UID rather than a
        # freshly generated one per file.
        fm.ImplementationClassUID = pydicom.uid.PYDICOM_IMPLEMENTATION_UID

        # Main dataset
        ds = Dataset()
        ds.ContentDate = content_date
        ds.ContentTime = content_time
        ds.SOPClassUID = fm.MediaStorageSOPClassUID
        ds.SOPInstanceUID = fm.MediaStorageSOPInstanceUID
        ds.Modality = 'MR'
        ds.PatientName = stem
        ds.PatientID = stem
        ds.StudyInstanceUID = study_uid
        ds.SeriesInstanceUID = series_uid
        ds.InstanceNumber = str(s + 1)
        ds.Rows = pix.shape[0]
        ds.Columns = pix.shape[1]
        ds.PhotometricInterpretation = 'MONOCHROME2'
        ds.SamplesPerPixel = 1
        ds.BitsAllocated = 16
        ds.BitsStored = 12
        ds.HighBit = 11
        ds.PixelRepresentation = 1
        ds.WindowCenter = str(512)
        ds.WindowWidth = str(1024)
        ds.PixelData = pix.tobytes()

        # --- Embed annotation info ---
        anns = slice_annotations.get(s, [])
        ds.SeriesDescription = f"{len(anns)} annotations" if anns else "No annotations"

        if anns:
            # 1) Set the private creator
            ds.add_new(PRIVATE_CREATOR_TAG, 'LO', 'FastMRI_Anno')

            # 2) Add a human-readable summary under a private tag
            # NOTE(review): only the labels are embedded; the box coordinates
            # and study_level collected above are not written to the file.
            labels = [a['label'] for a in anns]
            ds.add_new(ANNOTATION_TAG, 'LT', '\\'.join(labels))

        ds.file_meta = fm
        # NOTE(review): newer pydicom releases appear to deprecate these two
        # attributes and `write_like_original` -- confirm against the
        # installed pydicom version before upgrading.
        ds.is_little_endian = True
        ds.is_implicit_VR = False

        out_path = out_folder / f"{stem}_{s:03}.dcm"
        ds.save_as(str(out_path), write_like_original=False)


# Convert every annotated .h5 file into its own folder of DICOM slices.
annotated_dir = Path(annotated_data_path)
output_dir = Path('dicom/val')
output_dir.mkdir(parents=True, exist_ok=True)

h5_files = sorted(annotated_dir.glob("*.h5"))
for h5file in h5_files:
    print(f"→ Converting {h5file.name}")
    try:
        convert_h5_to_dicom_with_annotations(h5file, output_dir)
    except Exception as e:
        # Report the failure and carry on with the remaining files.
        print(f"[ERROR] {h5file.name}: {e}")

→ Converting file1000000.h5
→ Converting file1000007.h5
→ Converting file1000017.h5
→ Converting file1000026.h5
→ Converting file1000031.h5
→ Converting file1000033.h5
→ Converting file1000041.h5
→ Converting file1000052.h5
→ Converting file1000071.h5
→ Converting file1000073.h5
→ Converting file1000107.h5
→ Converting file1000108.h5
→ Converting file1000114.h5
→ Converting file1000126.h5
→ Converting file1000153.h5
→ Converting file1000178.h5
→ Converting file1000182.h5
→ Converting file1000190.h5
→ Converting file1000196.h5
→ Converting file1000201.h5
→ Converting file1000206.h5
→ Converting file1000229.h5
→ Converting file1000243.h5
→ Converting file1000247.h5
→ Converting file1000254.h5
→ Converting file1000263.h5
→ Converting file1000264.h5
→ Converting file1000267.h5
→ Converting file1000273.h5
→ Converting file1000277.h5
→ Converting file1000280.h5
→ Converting file1000283.h5
→ Converting file1000291.h5
→ Converting file1000292.h5
→ Converting file1000308.h5
→ Converting file100