In [1]:
from pathlib import Path
import torch
import pandas as pd
import stat
import numbers
import pydicom
import numpy as np
from tqdm import tqdm
from collections import Counter
import re
import os
from PIL import Image
from matplotlib import pyplot as plt
from pydicom_PIL import get_PIL_image

In [2]:
Path().resolve()

PosixPath('/home/buehlern/Documents/Masterarbeit/notebooks')

In [3]:
pd.options.display.max_columns = None

# Load Clean Slim Dataframe

In [4]:
df_loc = Path('cache/clean_df_slim.pkl')
df = pd.read_pickle(df_loc)

In [None]:
df

# Load Fracture Data

In [22]:
ann_loc = Path('../data/fractures/smallbackup_2023-03-31T11_53_51.602920.csv')
df_frac = pd.read_csv(ann_loc)
df_frac = df_frac.drop(['Unnamed: 0'], axis=1)

In [None]:
df_frac

## Formatting

In [None]:
df_frac['path'][0]

In [25]:
print(df['path'][3])
print(df['patientid'][3])
print(df['examinationid'][3])

/home/buehlern/neocortex-nas/shared/Skelett/BWS_NEU/-0XIAZI2mBM/9xmAF5tniCk/1/1.2.840.113654.2.70.1.196426120919369109555468638234200693130
-0XIAZI2mBM
9xmAF5tniCk


In [26]:
df_frac['patientid'] = df_frac['path'].apply(lambda path: path.split('/')[6])
df_frac['examinationid'] = df_frac['path'].apply(lambda path: path.split('/')[7])
df_frac['scanid'] = df_frac['path'].apply(lambda path: path.split('/')[-1])

In [None]:
df_frac

In [28]:
df_frac['scanid'].describe()

count                                                 48434
unique                                                48434
top       1.2.840.113654.2.70.1.310053305240369655195912...
freq                                                      1
Name: scanid, dtype: object

In [29]:
df_frac = df_frac.dropna(subset=['fracture', 'foreignmaterial'])

In [None]:
df_frac

In [None]:
df_frac.describe()

In [32]:
df_frac['fracture'].value_counts()

fracture
NO        565
YES       248
Unsure     30
Name: count, dtype: int64

In [33]:
df_frac['foreignmaterial'].value_counts()

foreignmaterial
NO        541
YES       301
Unsure      1
Name: count, dtype: int64

In [34]:
df_frac[['fracture', 'foreignmaterial']].value_counts()

fracture  foreignmaterial
NO        NO                 379
          YES                185
YES       NO                 145
          YES                103
Unsure    NO                  17
          YES                 13
NO        Unsure               1
Name: count, dtype: int64

## Replace with Booleans

In [36]:
df_frac.loc[df_frac.fracture == "Unsure", "fracture_bool"] = None
df_frac.loc[df_frac.fracture == "YES", "fracture_bool"] = True
df_frac.loc[df_frac.fracture == "NO", "fracture_bool"] = False

In [37]:
df_frac.loc[df_frac.foreignmaterial == "Unsure", "foreignmaterial_bool"] = None
df_frac.loc[df_frac.foreignmaterial == "YES", "foreignmaterial_bool"] = True
df_frac.loc[df_frac.foreignmaterial == "NO", "foreignmaterial_bool"] = False

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_frac.loc[df_frac.foreignmaterial == "Unsure", "foreignmaterial_bool"] = None


In [None]:
df_frac

In [39]:
df_frac['fracture_bool'].value_counts()

fracture_bool
False    565
True     248
Name: count, dtype: int64

In [40]:
df_frac['foreignmaterial_bool'].value_counts()

foreignmaterial_bool
False    541
True     301
Name: count, dtype: int64

In [42]:
df_frac_slim = df_frac[['scanid', 'examinationid', 'fracture', 'foreignmaterial', 'fracture_bool', 'foreignmaterial_bool']]
df_frac_slim

Unnamed: 0,scanid,examinationid,fracture,foreignmaterial,fracture_bool,foreignmaterial_bool
0,1.2.840.113654.2.70.1.259820485975504568799334...,PP6hRs400J8,NO,YES,False,True
1,1.2.840.113654.2.70.1.287135815407759307679765...,PP6hRs400J8,NO,YES,False,True
2,1.2.840.113654.2.70.1.100761906177290725786031...,rMDGKfHOySk,YES,YES,True,True
3,1.2.840.113654.2.70.1.570016009747533316311293...,rMDGKfHOySk,YES,YES,True,True
4,1.2.840.113654.2.70.1.293498675522405851986251...,axL5szVPUkI,YES,NO,True,False
...,...,...,...,...,...,...
841,1.2.840.113654.2.70.1.159815236409370546266677...,GyxqcJuApu0,NO,YES,False,True
842,1.2.840.113654.2.70.1.844534313666141052830780...,-LW5i52hxyA,NO,YES,False,True
843,1.2.840.113654.2.70.1.209186589396631626492723...,-LW5i52hxyA,NO,YES,False,True
844,1.2.840.113654.2.70.1.186750878007311183242295...,hDjCW_yVAp4,NO,NO,False,False


In [43]:
df_frac_slim.describe()

Unnamed: 0,scanid,examinationid,fracture,foreignmaterial,fracture_bool,foreignmaterial_bool
count,843,843,843,843,813,842
unique,843,405,3,3,2,2
top,1.2.840.113654.2.70.1.296445431319389318607502...,0FGYfiDm9j8,NO,NO,False,False
freq,1,7,565,541,565,541


## Join into main DataFrame

Join by scanid instead of examinationid, as the fracture/foreign object might not be visible on all scans of the examination where it was found.

In [49]:
df['scanid'] = df['path'].apply(lambda path: str(path).split('/')[-1])

In [None]:
df_full = pd.merge(df, df_frac_slim, left_on='scanid', right_on='scanid', how='left')
df_full

In [56]:
df_full['fracture_bool'].value_counts()

fracture
NO        441
YES       212
Unsure     20
Name: count, dtype: int64

In [57]:
df_full['foreignmaterial'].value_counts()

foreignmaterial
NO        442
YES       230
Unsure      1
Name: count, dtype: int64

# Saving Results

In [55]:
df_full_loc = Path('cache/clean_df_slim_frac.pkl')
pd.to_pickle(df_full, df_full_loc)

# Inspect Distribution across Splits

In [1]:
import sys
sys.path.insert(1, '/home/buehlern/Documents/Masterarbeit/models')
from src.data.mri_datamodule import MRIDataModule

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
mri_datamodule = MRIDataModule(image_size = 3072, square = True, output_channels = 1, cache = False, fix_inverted = True)

initializing MRIDatasetBase ...
reading /home/buehlern/Documents/Masterarbeit/data/clean_df_slim_frac.pkl file ...
PATH /home/buehlern/Documents/Masterarbeit/data/BodyPartExamined_mappings_mergemore.json
/home/buehlern/Documents/Masterarbeit/data/cache-full/df_labelcomparison.pkl does not exit --> no items excluded by it
MRIDatasetBase(len=639877) initialized

initializing MRIDataset(mode=train) ...
MRIDataset(mode=train, len=516402) initialized

initializing MRIDataset(mode=val) ...
MRIDataset(mode=val, len=27518) initialized

initializing MRIDataset(mode=test) ...
WARN: including test data
MRIDataset(mode=test, len=95957) initialized


In [16]:
mri_datamodule.data_train.df['fracture_bool'].describe()

count       500
unique        2
top       False
freq        349
Name: fracture_bool, dtype: object

In [13]:
mri_datamodule.data_val.df['fracture_bool'].describe()

count        26
unique        2
top       False
freq         14
Name: fracture_bool, dtype: object

In [14]:
mri_datamodule.data_test.df['fracture_bool'].describe()

count       127
unique        2
top       False
freq         78
Name: fracture_bool, dtype: object

Thus, the pretraining split (80/5/15) is roughly resembled in this downstream label split (77/4/19).
Using the same split for finetuning prevents problems regarding mixing of training and test data across tasks.