In [1]:
from pathlib import Path
import torch
import pandas as pd
import stat
import numbers
import pydicom
import numpy as np
from tqdm import tqdm
from collections import Counter
import re
import os
from PIL import Image
from matplotlib import pyplot as plt
import cv2

In [2]:
Path().resolve()

PosixPath('/home/buehlern/Documents/Masterarbeit/notebooks')

# Create Balanced Dataset for Fracture Detection Finetuning

In [3]:
df_loc = Path('../data/df_min.pkl')
df = pd.read_pickle(df_loc)

In [4]:
df.columns

Index(['patientid', 'path', 'pixelarr_shape', 'inverted', 'bodypart',
       'dcm_BodyPartExamined', 'fracture', 'foreignmaterial'],
      dtype='object')

In [5]:
df['fracture'].value_counts()

fracture
NO        441
YES       212
Unsure     20
Name: count, dtype: int64

In [6]:
bool_map = {'YES': True, 'NO': False, 'Unsure': float('NaN')}
df['fracture_bool'] = df['fracture'].map(bool_map)
df['foreignmaterial_bool'] = df['foreignmaterial'].map(bool_map)

In [7]:
df_frac = df.dropna(subset=['fracture_bool'])

In [8]:
df_frac['fracture'].value_counts()

fracture
NO     441
YES    212
Name: count, dtype: int64

In [9]:
df_frac['bodypart'].value_counts()

bodypart
KNIE_NEU          66
ELLENBOGEN_NEU    64
FUSS_NEU          62
HAND_NEU          57
HG_NEU            57
HWS_NEU           57
SCHULTER_NEU      57
BWS_NEU           53
CLAVICULA_NEU     51
DX_RIPPEN         48
SCAPULA_NEU       45
DX_Schädel_Neu    36
Name: count, dtype: int64

In [10]:
df_frac.groupby(['bodypart', 'fracture']).size()

bodypart        fracture
BWS_NEU         NO          28
                YES         25
CLAVICULA_NEU   NO          20
                YES         31
DX_RIPPEN       NO          26
                YES         22
DX_Schädel_Neu  NO          36
ELLENBOGEN_NEU  NO          30
                YES         34
FUSS_NEU        NO          45
                YES         17
HAND_NEU        NO          43
                YES         14
HG_NEU          NO          27
                YES         30
HWS_NEU         NO          54
                YES          3
KNIE_NEU        NO          61
                YES          5
SCAPULA_NEU     NO          30
                YES         15
SCHULTER_NEU    NO          41
                YES         16
dtype: int64

In [11]:
df_frac.groupby(['bodypart', 'fracture']).size().unstack().fillna(0).min(axis=1).sort_values(ascending=False)

bodypart
ELLENBOGEN_NEU    30.0
HG_NEU            27.0
BWS_NEU           25.0
DX_RIPPEN         22.0
CLAVICULA_NEU     20.0
FUSS_NEU          17.0
SCHULTER_NEU      16.0
SCAPULA_NEU       15.0
HAND_NEU          14.0
KNIE_NEU           5.0
HWS_NEU            3.0
DX_Schädel_Neu     0.0
dtype: float64

In [12]:
df_balanced = pd.DataFrame()
bp_list = ['ELLENBOGEN_NEU', 'HG_NEU']
for bp in bp_list:
    df_bp = df_frac[df_frac['bodypart'] == bp]
    num_samples = df_bp['fracture'].value_counts().min()
    df_bp = df_bp.groupby('fracture_bool').apply(lambda x: x.sample(n=num_samples, random_state=1))
    df_balanced = pd.concat([df_balanced, df_bp])
df_balanced = df_balanced.reset_index(drop=True)

  df_bp = df_bp.groupby('fracture_bool').apply(lambda x: x.sample(n=num_samples, random_state=1))
  df_bp = df_bp.groupby('fracture_bool').apply(lambda x: x.sample(n=num_samples, random_state=1))


In [13]:
df_balanced[['bodypart', 'fracture_bool']]

Unnamed: 0,bodypart,fracture_bool
0,ELLENBOGEN_NEU,False
1,ELLENBOGEN_NEU,False
2,ELLENBOGEN_NEU,False
3,ELLENBOGEN_NEU,False
4,ELLENBOGEN_NEU,False
...,...,...
109,HG_NEU,True
110,HG_NEU,True
111,HG_NEU,True
112,HG_NEU,True


In [14]:
df_balanced.groupby(['bodypart', 'fracture']).size()

bodypart        fracture
ELLENBOGEN_NEU  NO          30
                YES         30
HG_NEU          NO          27
                YES         27
dtype: int64

In [15]:
df_ft_balanced_loc = Path('../data/df_min_ft_test_114.pkl')
pd.to_pickle(df_balanced, df_ft_balanced_loc)

# Inspect Data

In [16]:
df_ft_balanced_loc = Path('../data/df_min_ft_test_114.pkl')
df_balanced = pd.read_pickle(df_ft_balanced_loc)

In [17]:
def show_image(image, title=''):
    # image is [H, W, 1]
    assert image.shape[2] == 1
    plt.imshow(image, cmap=plt.cm.bone)
    plt.title(title, fontsize=8)
    plt.axis('off')
    return

In [None]:
bp_list = ['ELLENBOGEN_NEU', 'HG_NEU']
for bp in bp_list:
    plt.rcParams['figure.figsize'] = [40, 10]
    for i, fracture in enumerate([False, True]):
        scan = df_balanced[(df_balanced['bodypart'] == bp) & (df_balanced['fracture_bool'] == fracture)].sample(1)
        scan_id = scan.index[0]
        scan_frac = scan["fracture"].iloc[0]
        scan_path = scan["path"].iloc[0]
        for j, scale in enumerate([1.0, 0.5, 0.25]):
            pixel_values = pydicom.read_file(scan_path).pixel_array
            pixel_values = cv2.resize(pixel_values, (0, 0), fx=scale, fy=scale)
            pixel_values = pixel_values[:, :, np.newaxis]
            
            print(f"{i*3+j+1}: Scan {scan_id}: path={scan_path}, bp={bp}, fracture={scan_frac}, scale={scale}, shape={pixel_values.shape}")
    
            plt.subplot(1, 8, i*3+j+1)
            show_image(pixel_values, f"{bp}, Fracture: {scan_frac}, Scale: {scale}")
            plt.axis('off')
    plt.show()

# Create Dataset

In [19]:
import sys
sys.path.insert(1, '/home/buehlern/Documents/Masterarbeit/models')
from src.data.mri_datamodule import MRIDataModule

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
# Reload import
from importlib import reload
reload(sys.modules['src.data.components.mri_dataset'])
from src.data.mri_datamodule import MRIDataModule

In [23]:
# Load the DataModule
batch_bins = [1152, 1536, 1920, 2304, 2688, 3072]
mri_datamodule = MRIDataModule(df_name='df_min_ft_test_114', batch_size=1, output_channels=1, cache=False,
                               fix_inverted=True, batch_binning='smart', batch_bins=batch_bins,
                               stratification_target='fracture', label='fracture_bool')

initializing MRIDatasetBase ...
reading /home/buehlern/Documents/Masterarbeit/data/df_min_ft_test_114.pkl file ...
PATH /home/buehlern/Documents/Masterarbeit/data/BodyPartExamined_mappings_mergemore.json
/home/buehlern/Documents/Masterarbeit/data/cache-full/df_labelcomparison.pkl does not exit --> no items excluded by it
MRIDatasetBase(len=114) initialized

initializing MRIDataset(mode=train) ...
MRIDataset(mode=train, len=91) initialized

initializing MRIDataset(mode=val) ...
MRIDataset(mode=val, len=7) initialized

initializing MRIDataset(mode=test) ...
WARN: including test data
MRIDataset(mode=test, len=16) initialized


In [24]:
data_sources = [mri_datamodule.data_train, mri_datamodule.data_val, mri_datamodule.data_test]
for data_source in data_sources:
    it = iter(data_source)
    frac = 0
    total = len(data_source)
    for i in range(total):
        item = next(it)
        image = item[0]
        label = item[1]
        frac += label
    print(f"Fractures: {frac}/{total}")

Fractures: 46/91
Fractures: 3/7
Fractures: 8/16
