In [1]:
from pathlib import Path
import torch
import pandas as pd
import stat
import numbers
import pydicom
import numpy as np
from tqdm import tqdm
from collections import Counter
import re
import os
from PIL import Image
from matplotlib import pyplot as plt
import cv2

In [2]:
Path().resolve()

PosixPath('/home/buehlern/Documents/Masterarbeit/notebooks')

In [3]:
df_loc = Path('../data/clean_df_slim_frac.pkl')
df = pd.read_pickle(df_loc)

In [4]:
df = df.rename(columns={'examinationid_x': 'examinationid'})

In [5]:
df.columns

Index(['patientid', 'bodypart', 'pixelarr_shape', 'dcm_StudyDate',
       'dcm_SeriesDate', 'dcm_ContentDate', 'dcm_StudyTime', 'dcm_SeriesTime',
       'dcm_AcquisitionTime', 'dcm_ContentTime', 'dcm_SeriesDescription',
       'dcm_PatientID', 'dcm_PatientSex', 'dcm_PatientAge', 'dcm_PatientSize',
       'dcm_PatientWeight', 'dcm_PregnancyStatus', 'dcm_BodyPartExamined',
       'dcm_SpatialResolution', 'dcm_ImagerPixelSpacing',
       'dcm_StudyInstanceUID', 'dcm_SeriesInstanceUID', 'dcm_SeriesNumber',
       'dcm_InstanceNumber', 'dcm_PatientOrientation',
       'dcm_PhotometricInterpretation', 'dcm_PresentationLUTShape', 'pathstr',
       'path', 'findingspath', 'examinationid', 'inverted', 'pixelarr_dtype',
       'pixelarr_shape', 'pixelarr_non0count', 'pixelarr_min', 'pixelarr_max',
       'pixelarr_mean', 'pixelarr_std', 'pixelarr_non0min',
       'pixelarr_non0mean', 'pixelarr_non0std', 'scanid', 'examinationid_y',
       'fracture', 'foreignmaterial', 'fracture_bool', 'foreignm

In [6]:
df['bodypart'].value_counts()

bodypart
KNIE_NEU          169595
SCHULTER_NEU      107070
SG_NEU             81577
ELLENBOGEN_NEU     55914
FUSS_NEU           53385
HG_NEU             43246
HAND_NEU           41201
HWS_NEU            35150
CLAVICULA_NEU      20506
BWS_NEU            13995
LWS_NEU            10756
DX_Schädel_Neu      3164
DX_RIPPEN           2668
SCAPULA_NEU         1650
Name: count, dtype: int64

In [7]:
df['dcm_BodyPartExamined'].value_counts()

dcm_BodyPartExamined
KNEE            142342
SHOULDER         74984
HAND             73011
ANKLE            52309
FOOT             49051
                 ...  
PORT L SPINE         2
UNTERKIEFER          2
KOPF                 2
BEIN                 2
H_FTE                1
Name: count, Length: 85, dtype: int64

In [9]:
#min_cols = ['patientid', 'examinationid', 'scanid', 'path', 'pixelarr_shape', 'inverted', 'bodypart', 'dcm_BodyPartExamined',
#            'fracture', 'fracture_bool', 'foreignmaterial', 'foreignmaterial_bool']]
# Only need
# - patientid for stratification
# - path for loading the scan
# - pixelarr_shape for CustomBatchSampler
# - inverted for fixing inverted images
# - bodypart & dcm_BodyPartExamined for stratification
# - fracture for finetuning
# - foreignmaterial for finetuning
min_cols = ['patientid', 'path', 'pixelarr_shape', 'inverted', 'bodypart', 'dcm_BodyPartExamined', 'fracture', 'foreignmaterial']

In [10]:
df_min = df[min_cols]

In [11]:
df_min = df_min.loc[:, ~df_min.columns.duplicated()]

In [12]:
df_min.columns

Index(['patientid', 'path', 'pixelarr_shape', 'inverted', 'bodypart',
       'dcm_BodyPartExamined', 'fracture', 'foreignmaterial'],
      dtype='object')

In [13]:
# Compare sizes
print(f"Memory for slim df: {df.memory_usage(index=True).sum()} bytes")
print(f"Memory for min df: {df_min.memory_usage(index=True).sum()} bytes")

Memory for slim df: 241233757 bytes
Memory for min df: 36473117 bytes


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 639877 entries, 0 to 639876
Data columns (total 48 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   patientid                      639877 non-null  object 
 1   bodypart                       639877 non-null  object 
 2   pixelarr_shape                 639877 non-null  object 
 3   dcm_StudyDate                  639877 non-null  object 
 4   dcm_SeriesDate                 639848 non-null  object 
 5   dcm_ContentDate                639852 non-null  object 
 6   dcm_StudyTime                  639877 non-null  object 
 7   dcm_SeriesTime                 639848 non-null  object 
 8   dcm_AcquisitionTime            639853 non-null  object 
 9   dcm_ContentTime                639852 non-null  object 
 10  dcm_SeriesDescription          639872 non-null  object 
 11  dcm_PatientID                  639877 non-null  object 
 12  dcm_PatientSex                

In [15]:
df_min.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 639877 entries, 0 to 639876
Data columns (total 8 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   patientid             639877 non-null  object
 1   path                  639877 non-null  object
 2   pixelarr_shape        639877 non-null  object
 3   inverted              639877 non-null  bool  
 4   bodypart              639877 non-null  object
 5   dcm_BodyPartExamined  639867 non-null  object
 6   fracture              673 non-null     object
 7   foreignmaterial       673 non-null     object
dtypes: bool(1), object(7)
memory usage: 34.8+ MB


In [16]:
df_min_loc = Path('../data/df_min.pkl')
pd.to_pickle(df_min, df_min_loc)

# Compare Loading Speed

In [18]:
%%time
df_loc = Path('../data/clean_df_slim_frac.pkl')
df = pd.read_pickle(df_loc)

CPU times: user 19.8 s, sys: 1.62 s, total: 21.5 s
Wall time: 21.4 s


In [19]:
%%time
df_min_loc = Path('../data/df_min.pkl')
df_min = pd.read_pickle(df_min_loc)

CPU times: user 7.17 s, sys: 125 ms, total: 7.3 s
Wall time: 7.27 s
