In [1]:
from pathlib import Path
import torch
import pandas as pd
import stat
import numbers
import pydicom
import numpy as np
from tqdm import tqdm
from collections import Counter
import re
import os
from PIL import Image
from matplotlib import pyplot as plt
import cv2
import json

In [2]:
# Show the absolute working directory; the relative `../data/...` paths used below
# are resolved against it.
Path().resolve()

PosixPath('/home/buehlern/Documents/Masterarbeit/notebooks')

In [3]:
# Load the pickled source DataFrame (path is relative to the working directory).
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df_loc = Path('../data/clean_df_slim_frac.pkl')
df = pd.read_pickle(df_loc)

In [4]:
# Drop the `_x` suffix from the examination id column.
# NOTE(review): an `examinationid_y` column is also present in the column listing;
# presumably both stem from an earlier merge -- confirm which one is authoritative.
df = df.rename(columns={'examinationid_x': 'examinationid'})

In [5]:
# List the available columns (the listing shows 'pixelarr_shape' twice).
df.columns

Index(['patientid', 'bodypart', 'pixelarr_shape', 'dcm_StudyDate',
       'dcm_SeriesDate', 'dcm_ContentDate', 'dcm_StudyTime', 'dcm_SeriesTime',
       'dcm_AcquisitionTime', 'dcm_ContentTime', 'dcm_SeriesDescription',
       'dcm_PatientID', 'dcm_PatientSex', 'dcm_PatientAge', 'dcm_PatientSize',
       'dcm_PatientWeight', 'dcm_PregnancyStatus', 'dcm_BodyPartExamined',
       'dcm_SpatialResolution', 'dcm_ImagerPixelSpacing',
       'dcm_StudyInstanceUID', 'dcm_SeriesInstanceUID', 'dcm_SeriesNumber',
       'dcm_InstanceNumber', 'dcm_PatientOrientation',
       'dcm_PhotometricInterpretation', 'dcm_PresentationLUTShape', 'pathstr',
       'path', 'findingspath', 'examinationid', 'inverted', 'pixelarr_dtype',
       'pixelarr_shape', 'pixelarr_non0count', 'pixelarr_min', 'pixelarr_max',
       'pixelarr_mean', 'pixelarr_std', 'pixelarr_non0min',
       'pixelarr_non0mean', 'pixelarr_non0std', 'scanid', 'examinationid_y',
       'fracture', 'foreignmaterial', 'fracture_bool', 'foreignm

In [7]:
# Number of rows per raw body-part code.
df['bodypart'].value_counts()

bodypart
KNIE_NEU          169595
SCHULTER_NEU      107070
SG_NEU             81577
ELLENBOGEN_NEU     55914
FUSS_NEU           53385
HG_NEU             43246
HAND_NEU           41201
HWS_NEU            35150
CLAVICULA_NEU      20506
BWS_NEU            13995
LWS_NEU            10756
DX_Schädel_Neu      3164
DX_RIPPEN           2668
SCAPULA_NEU         1650
Name: count, dtype: int64

In [28]:
# Number of distinct patient ids within each body-part group.
df.groupby('bodypart')['patientid'].nunique()

bodypart
BWS_NEU            5834
CLAVICULA_NEU      3844
DX_RIPPEN          1261
DX_Schädel_Neu     1122
ELLENBOGEN_NEU    12610
FUSS_NEU          16226
HAND_NEU          15402
HG_NEU            10367
HWS_NEU            9838
KNIE_NEU          39970
LWS_NEU            3272
SCAPULA_NEU         659
SCHULTER_NEU      25314
SG_NEU            23532
Name: patientid, dtype: int64

In [29]:
# Sum of the per-body-part distinct patient counts. A patient id that occurs under
# several body parts is counted once per body part, so this is not the number of
# distinct patients overall (that would be df['patientid'].nunique()).
df.groupby('bodypart')['patientid'].nunique().sum()

np.int64(169251)

In [7]:
# Distinct raw body-part codes; these are the keys the translation dict has to cover.
df['bodypart'].unique()

array(['BWS_NEU', 'CLAVICULA_NEU', 'DX_RIPPEN', 'DX_Schädel_Neu',
       'ELLENBOGEN_NEU', 'FUSS_NEU', 'HAND_NEU', 'HG_NEU', 'HWS_NEU',
       'KNIE_NEU', 'LWS_NEU', 'SCAPULA_NEU', 'SCHULTER_NEU', 'SG_NEU'],
      dtype=object)

In [9]:
# Keep the raw body-part codes before `bodypart` is overwritten with translated names.
# Guarded so that re-running this cell after the translation does not replace the raw
# codes with already-translated labels (which would make the later `.map` yield NaN).
if 'bodypart_old' not in df.columns:
    df['bodypart_old'] = df['bodypart']

In [10]:
# Translation table: raw body-part code -> short English label.
# "DX_RIPPEN" maps to "ribs" (previously misspelled "rips"), matching the 'ribs' label
# used by the BodyPartExamined mapping file.
bodypart_names = {
    "KNIE_NEU": "knee",
    "SCHULTER_NEU": "shoulder",
    "SG_NEU": "ankle",
    "ELLENBOGEN_NEU": "elbow",
    "FUSS_NEU": "foot",
    "HG_NEU": "wrist",
    "HAND_NEU": "hand",
    "HWS_NEU": "cspine",
    "CLAVICULA_NEU": "clavicle",
    "BWS_NEU": "tspine",
    "LWS_NEU": "lspine",
    "DX_Schädel_Neu": "skull",
    "DX_RIPPEN": "ribs",
    "SCAPULA_NEU": "scapula"
}
# Translate the raw codes. Series.map yields NaN for any code that is missing from
# bodypart_names, so fail loudly instead of carrying NaN labels forward.
df['bodypart'] = df['bodypart_old'].map(bodypart_names)
unmapped_codes = df.loc[df['bodypart'].isna(), 'bodypart_old'].unique()
assert len(unmapped_codes) == 0, f"bodypart codes without a translation: {list(unmapped_codes)}"

In [13]:
# Sanity check after the translation: row count, number of distinct labels and the
# most frequent label.
df['bodypart'].describe()

count     639877
unique        14
top         knee
freq      169595
Name: bodypart, dtype: object

In [15]:
# Count rows whose label is missing; `.map` leaves NaN for codes absent from the
# translation dict, so this is expected to be 0.
df['bodypart'].isna().sum()

np.int64(0)

In [16]:
# Frequency of the raw values in the dcm_BodyPartExamined column.
df['dcm_BodyPartExamined'].value_counts()

dcm_BodyPartExamined
KNEE            142342
SHOULDER         74984
HAND             73011
ANKLE            52309
FOOT             49051
                 ...  
PORT L SPINE         2
UNTERKIEFER          2
KOPF                 2
BEIN                 2
H_FTE                1
Name: count, Length: 85, dtype: int64

In [17]:
# All distinct raw values of the column (the output includes NaN).
df['dcm_BodyPartExamined'].unique()

array(['TSPINE', 'T SPINE', 'BWS', 'LSPINE', 'LWS', 'SPINE', 'CSPINE',
       'THORAX', 'CHEST', 'T L SPINE', 'C SPINE', 'THORAX BETT', 'HAND',
       'HWS', 'HIP', 'FOOT', 'PORT L SPINE', 'SHOULDER', 'HUEFTE',
       'L SPINE', 'PELVIS', 'WIRBELSAULE', 'CLAVICLE', 'CLAVICULA',
       'SCHLUESSELBEIN', 'SCHULTER', 'SCHULTERBLATT', 'SCAPULA', 'ELBOW',
       'FOREARM', 'AC JOINT', 'WIRBELSAEULE', 'AC GELENK', 'RIBS',
       'COCCYX', 'RIPPEN', 'SCHAEDEL', 'SKULL', 'SCHADEL',
       'GESICHTSKNOCHEN', 'NNH', 'NASENBEIN', 'ORBITA', 'SINUSES',
       'NASAL BONES', 'ELLENBOGEN', 'UNTERKIEFER', 'LUNGE', 'JAW',
       'ABDOMEN', 'ANKLE', 'ARM', 'OBERARM', 'KNEE', 'HUMERUS',
       'HANDGELENK', 'BECKEN', 'WRIST', 'FUSS', 'KNIE', 'FU_', 'LSG',
       'LEG', 'SPRUNGGELENK', 'FEMUR', 'CALCANEUS', 'TOES',
       'TIBIA FIBULA', 'FERSENBEIN', 'FINGER', 'EXTREMITAT', 'UNTERARM',
       'OBERSCHENKEL', 'THUMB', 'HWS DENS', 'DENS', 'CERVICOTHORACIC',
       'KNIESCHEIBE', 'PATELLA', 'KOPF', nan, 'H_

In [18]:
# Lift the row limit for rendered output. This is a global pandas option and stays
# active for every later cell in this kernel.
pd.options.display.max_rows = None

# Cross-tabulate the raw DICOM tag against the translated body-part label.
pair_cols = ['dcm_BodyPartExamined', 'bodypart']
df[pair_cols].value_counts()

dcm_BodyPartExamined  bodypart
KNEE                  knee        142117
SHOULDER              shoulder     73292
ANKLE                 ankle        51413
ELBOW                 elbow        48769
FOOT                  foot         48494
HAND                  hand         41036
                      wrist        31784
SPRUNGGELENK          ankle        27796
SCHULTER              shoulder     26935
KNIE                  knee         23078
CLAVICLE              clavicle     13618
CSPINE                cspine       12943
HWS                   cspine       11499
C SPINE               cspine        9847
HANDGELENK            wrist         9833
TSPINE                tspine        7667
ELLENBOGEN            elbow         6783
BWS                   tspine        5397
CLAVICULA             clavicle      5078
LWS                   lspine        4529
LSPINE                lspine        4293
SCHULTERBLATT         shoulder      3698
FUSS                  foot          3683
SCAPULA               shou

In [19]:
# Lookup table for the dcm_BodyPartExamined tag: keys are lower-case strings without
# spaces, values are the harmonised label (or None).
# Uses the same relative `../data` location as the other cells instead of a
# user-specific absolute path, and reads the file explicitly as UTF-8.
bodypart_mapping_loc = Path("../data/BodyPartExamined_mappings_mergemore.json")
bodypartexamined_mapping = json.loads(bodypart_mapping_loc.read_text(encoding="utf-8"))
bodypartexamined_mapping

{'': 'shoulder',
 'ankle': 'ankle',
 'arm': None,
 'becken': 'becken',
 'bws': 'tspine',
 'cspine': 'cspine',
 'chest': 'ribs',
 'clavicle': 'clavicle',
 'clavicula': 'clavicle',
 'coccyx': None,
 'elbow': 'elbow',
 'ellenbogen': 'elbow',
 'femur': 'femur',
 'foot': 'foot',
 'fuss': 'foot',
 'hand': 'wrist',
 'handgelenk': 'wrist',
 'hip': 'hip',
 'huefte': 'huefte',
 'hws': 'hws',
 'jaw': 'jaw',
 'knee': 'knee',
 'knie': 'knee',
 'lspine': 'lspine',
 'leg': 'knee',
 'lws': 'lspine',
 'nasenbein': None,
 'nnh': 'nnh',
 'oberarm': 'oberarm',
 'orbita': 'orbita',
 'patella': 'patella',
 'pelvis': 'pelvis',
 'ribs': 'ribs',
 'rippen': 'ribs',
 'sacrum': 'sacrum',
 'scapula': 'shoulder',
 'schaedel': 'skull',
 'schluesselbein': 'clavicle',
 'schulter': 'shoulder',
 'schulterblatt': 'shoulder',
 'shoulder': 'shoulder',
 'skull': 'skull',
 'spine': 'spine',
 'sprunggelenk': 'ankle',
 'tspine': 'tspine',
 'thorax': 'thorax',
 'wrist': 'wrist',
 'tlspine': 'tlspine',
 'thoraxbett': 'thorax',
 

In [20]:
# Normalise the tag in place: missing values become '' and everything is lower-cased.
# `fillna` replaces the former `seq != seq` NaN test, which raised on None / pd.NA.
df['dcm_BodyPartExamined'] = df['dcm_BodyPartExamined'].fillna('').str.lower()

# Mapping keys carry no spaces ('t spine' -> 'tspine'), so strip them before the lookup.
# A plain dict lookup is kept on purpose: an unknown tag raises KeyError instead of
# silently producing a missing label.
# NOTE(review): the mapping file maps '' to 'shoulder', so rows without a tag end up
# labelled 'shoulder' -- confirm this is intended.
df['dcm_BodyPartExamined_mapped'] = [
    bodypartexamined_mapping[lookup_key]
    for lookup_key in df['dcm_BodyPartExamined'].str.replace(' ', '', regex=False)
]

In [21]:
# Frequency of the harmonised labels.
# NOTE(review): the mapping file sends 'hws' to 'hws' rather than 'cspine', so both
# labels appear separately in the counts -- confirm whether they should be merged.
df['dcm_BodyPartExamined_mapped'].value_counts()

dcm_BodyPartExamined_mapped
knee               168658
shoulder           109549
wrist               84449
ankle               80211
elbow               55638
foot                53338
cspine              22947
clavicle            19302
tspine              13912
hws                 11548
lspine              11004
skull                3053
ribs                 2971
fu_                   820
tibiafibula           477
unterschenkel         446
hwsdens               295
thorax                155
pelvis                149
femur                 100
patella                86
huefte                 85
hip                    71
spine                  46
humerus                45
becken                 42
forearm                37
acjoint                37
oberarm                31
oberschenkel           28
finger                 28
abdomen                25
gesichtsknochen        22
orbita                 19
dens                   17
lunge                  15
fersenbein             15
tlspine   

In [23]:
# Columns kept in the minimal frame, each with the reason it is needed.
# An earlier, wider selection also carried examinationid, scanid, dcm_BodyPartExamined,
# fracture_bool and foreignmaterial_bool.
# NOTE(review): dcm_BodyPartExamined was listed as needed for stratification but is
# not part of the selection below -- confirm.
min_cols = [
    'patientid',        # stratification
    'path',             # loading the scan
    'pixelarr_shape',   # CustomBatchSampler
    'inverted',         # fixing inverted images
    'bodypart',         # stratification
    'fracture',         # finetuning
    'foreignmaterial',  # finetuning
]

In [24]:
# Reduce the frame to the columns listed in min_cols. The column listing of `df`
# shows 'pixelarr_shape' twice, so the selection can contain that column twice.
df_min = df[min_cols]

In [25]:
# Keep only the first occurrence of every column name.
repeated_name = df_min.columns.duplicated()
df_min = df_min.loc[:, ~repeated_name]

In [26]:
# Confirm the column set of the minimal frame after de-duplication.
df_min.columns

Index(['patientid', 'path', 'pixelarr_shape', 'inverted', 'bodypart',
       'fracture', 'foreignmaterial'],
      dtype='object')

In [27]:
# Compare sizes
print(f"Memory for slim df: {df.memory_usage(index=True).sum()} bytes")
print(f"Memory for min df: {df_min.memory_usage(index=True).sum()} bytes")

Memory for slim df: 251471789 bytes
Memory for min df: 31354101 bytes


In [28]:
# Per-column non-null counts and dtypes of the full frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 639877 entries, 0 to 639876
Data columns (total 50 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   patientid                      639877 non-null  object 
 1   bodypart                       639877 non-null  object 
 2   pixelarr_shape                 639877 non-null  object 
 3   dcm_StudyDate                  639877 non-null  object 
 4   dcm_SeriesDate                 639848 non-null  object 
 5   dcm_ContentDate                639852 non-null  object 
 6   dcm_StudyTime                  639877 non-null  object 
 7   dcm_SeriesTime                 639848 non-null  object 
 8   dcm_AcquisitionTime            639853 non-null  object 
 9   dcm_ContentTime                639852 non-null  object 
 10  dcm_SeriesDescription          639872 non-null  object 
 11  dcm_PatientID                  639877 non-null  object 
 12  dcm_PatientSex                

In [29]:
# Per-column non-null counts and dtypes of the minimal frame.
df_min.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 639877 entries, 0 to 639876
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   patientid        639877 non-null  object
 1   path             639877 non-null  object
 2   pixelarr_shape   639877 non-null  object
 3   inverted         639877 non-null  bool  
 4   bodypart         639877 non-null  object
 5   fracture         673 non-null     object
 6   foreignmaterial  673 non-null     object
dtypes: bool(1), object(6)
memory usage: 29.9+ MB


In [30]:
# Persist the minimal frame as a pickle in the same ../data directory as the source.
df_min_loc = Path('../data/df_min.pkl')
df_min.to_pickle(df_min_loc)

# Compare Loading Speed

In [31]:
%%time
# Timed reload of the full frame, as the baseline for the loading-speed comparison.
# NOTE(review): this rebinds `df` to the frame as stored on disk; the in-memory edits
# made earlier (column rename, bodypart translation, mapped column) are not part of
# it, so re-run those cells before building df_min from `df` again.
df_loc = Path('../data/clean_df_slim_frac.pkl')
df = pd.read_pickle(df_loc)

CPU times: user 19.8 s, sys: 1.67 s, total: 21.5 s
Wall time: 21.4 s


In [32]:
%%time
# Timed reload of the minimal frame from the pickle written earlier in this notebook.
df_min_loc = Path('../data/df_min.pkl')
df_min = pd.read_pickle(df_min_loc)

CPU times: user 7.16 s, sys: 89.3 ms, total: 7.25 s
Wall time: 7.23 s
