<a href="https://colab.research.google.com/github/mille055/AIPI540_individual_project/blob/main/notebooks/AIPI540_IP_fusion_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<a href='https://ai.meng.duke.edu'> = <img align="left" style="padding-top:10px;" src=https://storage.googleapis.com/aipi_datasets/Duke-AIPI-Logo.png>

In [2]:
COLAB_FLAG = False   # whether running on colab or locally on computer

In [3]:
if COLAB_FLAG:
    !pip install pydicom==2.1.2
    !pip install monai seaborn
    !git clone 'https://github.com/mille055/AIPI540_individual_project.git'

    


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import torchvision
import pydicom
import monai
import pickle
import glob
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, GroupKFold, cross_val_score, cross_validate, GroupShuffleSplit, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score, plot_confusion_matrix, ConfusionMatrixDisplay

import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
import time
import os
import copy
from datetime import datetime
from pprint import pprint


import monai
from monai.data import DataLoader, ImageDataset
from monai.transforms import AddChannel, Compose, RandRotate90, Resize, ScaleIntensity, EnsureType
from pydicom.dataset import Dataset as DcmDataset
from pydicom.tag import BaseTag as DcmTag
from pydicom.multival import MultiValue as DcmMultiValue
import sys
import warnings
warnings.filterwarnings("ignore")

In [110]:
#local imports
if COLAB_FLAG:
    sys.path.append('/content/AIPI540_individual_project/scripts/')

else: # running locally
    sys.path.append('/Users/cmm/Documents/GitHub/AIPI540_individual_project/scripts/')

### local imports ###
from config import file_dict, feats, column_lists
from config import abd_label_dict, val_list, train_val_split_percent, random_seed, data_transforms
from config import sentence_encoder, series_description_column, text_label
from utils import *
from train_pixel_model import train_pix_model, test_pix_model

# from AIPI540_individual_project.scripts.train_pixel_model import train_model
# from AIPI540_individual_project.scripts.train_text_model import load_text_data, train_text_model, list_incorrect_text_predictions
# from AIPI540_individual_project.scripts.utils import *

# Load Datasets


In [6]:
if COLAB_FLAG:
    train_datafile = '/content/AIPI540_individual_project/data/trainfiles.csv'
    test_datafile = '/content/AIPI540_individual_project/data/testfiles.csv'
else:
    # for local
    train_datafile = '../data/trainfiles.csv'
    test_datafile = '../data/testfiles.csv'

In [None]:
# def load_csv_dataset(train_file, test_file, val = True, val_lists = None):
#     train_df = pd.read_csv(train_file)
#     test_df = pd.read_csv(test_file)
#     train_df.drop('Unnamed: 0', axis=1, inplace=True)
#     test_df.drop('Unnamed: 0', axis=1, inplace=True)
#     if val:
#         if val_lists:
#             val_df = train_df[train_df.patientID.isin(val_lists)]
#             train_df = train_df[~train_df.index.isin(val_df.index)] 
#         else:
#             train_set, val_set = next(GroupShuffleSplit(test_size=.20, n_splits=1, random_state = 42).split(train_df, groups=train_df['patientID']))
#             train_df, val_df = train_set, val_set
#         return train_df, val_df, test_df

#     else: 
#         return train_df, test_df


In [7]:
train, val, test = load_csv_dataset(train_datafile, test_datafile, val_lists = val_list)
train = shorten_df(train)
val = shorten_df(val)
test = shorten_df(test)

In [8]:

train


Unnamed: 0,label,patientID,series,file_info
0,8,104,18(35-68),/volumes/cm7/Abdominal_MRI_dataset_split/train/104/exam1/18 (35-68)/0052.dcm
1,8,104,18(1-34),/volumes/cm7/Abdominal_MRI_dataset_split/train/104/exam1/18 (1-34)/0018.dcm
2,16,104,20,/volumes/cm7/Abdominal_MRI_dataset_split/train/104/exam1/20/0037.dcm
3,9,104,9,/volumes/cm7/Abdominal_MRI_dataset_split/train/104/exam1/9/0017.dcm
4,9,104,11,/volumes/cm7/Abdominal_MRI_dataset_split/train/104/exam1/11/0017.dcm
...,...,...,...,...
1407,7,94,3,/volumes/cm7/Abdominal_MRI_dataset_split/train/094/exam1/3/0015.dcm
1408,2,94,12,/volumes/cm7/Abdominal_MRI_dataset_split/train/094/exam1/12/0037.dcm
1409,2,94,13,/volumes/cm7/Abdominal_MRI_dataset_split/train/094/exam1/13/0037.dcm
1410,19,94,5,/volumes/cm7/Abdominal_MRI_dataset_split/train/094/exam1/5/0037.dcm


In [None]:
# import importlib
# import utils

# importlib.reload(utils)

In [9]:
def prepare_df(df):
    df1 = df.copy()
    filenames = df1.file_info.tolist()
    getdicoms = pd.DataFrame.from_dicoms(filenames)
    merged = getdicoms.merge(df1, left_on='fname', right_on='file_info')
    merged.drop(columns=['file_info'], inplace=True)
    
    # Pooling the labels 2-5 together as all arterial phase and difficult to reliably
    # distinguish even for trained person (high interobserver variability)
    artpooled = pool_arterial_labels(merged)
    artpooled['contrast'] = artpooled.apply(detect_contrast, axis=1)
    artpooled['plane'] = artpooled.apply(compute_plane, axis=1)
    # Keep only the rows where the 'label' column value is not in the excluded list
    # which contains research-only series and others with less than 5 instances in the dataset
    df_filtered = artpooled[~artpooled['label'].isin([21,22,26,27,28,29])] 

    return artpooled

In [10]:

train_df = prepare_df(train)
val_df = prepare_df(val)
test_df = prepare_df(test)



In [11]:
train_df

Unnamed: 0,ImageType,SOPClassUID,SeriesDescription,ContrastBolusAgent,BodyPartExamined,ScanningSequence,SequenceVariant,ScanOptions,MRAcquisitionType,AngioFlag,...,PhotometricInterpretation,PixelSpacing,fname,TriggerTime,InversionTime,label,patientID,series,contrast,plane
0,"[DERIVED, PRIMARY, DIFFUSION, TRACEW, DIS2D]",MR Image Storage,ax diff_TRACEW,Multihance,ABDOMEN,EP,"[SK, SP]","[PFP, SFS]",2D,N,...,MONOCHROME2,"[0.9375, 0.9375]",/volumes/cm7/Abdominal_MRI_dataset_split/train/104/exam1/18 (35-68)/0052.dcm,,,8,104,18(35-68),1,ax
1,"[DERIVED, PRIMARY, DIFFUSION, TRACEW, DIS2D]",MR Image Storage,ax diff_TRACEW,Multihance,ABDOMEN,EP,"[SK, SP]","[PFP, SFS]",2D,N,...,MONOCHROME2,"[0.9375, 0.9375]",/volumes/cm7/Abdominal_MRI_dataset_split/train/104/exam1/18 (1-34)/0018.dcm,,,8,104,18(1-34),1,ax
2,"[ORIGINAL, PRIMARY, M, NORM, DIS2D]",MR Image Storage,ax equilibrium,Multihance,ABDOMEN,GR,"[SP, OSP]","[PFP, FS]",3D,N,...,MONOCHROME2,"[0.703125, 0.703125]",/volumes/cm7/Abdominal_MRI_dataset_split/train/104/exam1/20/0037.dcm,,,16,104,20,1,ax
3,"[ORIGINAL, PRIMARY, M, NORM, DIS2D]",MR Image Storage,ax haste bh,,ABDOMEN,SE,"[SK, SP, OSP]","[PFP, SAT2, SFS]",2D,N,...,MONOCHROME2,"[1.5625, 1.5625]",/volumes/cm7/Abdominal_MRI_dataset_split/train/104/exam1/9/0017.dcm,,,9,104,9,0,ax
4,"[ORIGINAL, PRIMARY, M, NORM, DIS2D]",MR Image Storage,ax t2 triggerred,,ABDOMEN,SE,"[SK, SP]","[PFP, SAT2, FS]",2D,N,...,MONOCHROME2,"[1.5625, 1.5625]",/volumes/cm7/Abdominal_MRI_dataset_split/train/104/exam1/11/0017.dcm,,,9,104,11,0,ax
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1407,"[ORIGINAL, PRIMARY, M, NORM, DIS2D]",MR Image Storage,cor haste,,ABDOMEN,SE,"[SK, SP, OSP]",PFP,2D,N,...,MONOCHROME2,"[1.5625, 1.5625]",/volumes/cm7/Abdominal_MRI_dataset_split/train/094/exam1/3/0015.dcm,,,7,94,3,0,cor
1408,"[ORIGINAL, PRIMARY, M, NORM, DIS2D]",MR Image Storage,ax dyn,Multihance,ABDOMEN,GR,"[SP, OSP]","[PFP, FS]",3D,N,...,MONOCHROME2,"[0.78125, 0.78125]",/volumes/cm7/Abdominal_MRI_dataset_split/train/094/exam1/12/0037.dcm,,,2,94,12,1,ax
1409,"[ORIGINAL, PRIMARY, M, NORM, DIS2D]",MR Image Storage,ax dyn,Multihance,ABDOMEN,GR,"[SP, OSP]","[PFP, FS]",3D,N,...,MONOCHROME2,"[0.78125, 0.78125]",/volumes/cm7/Abdominal_MRI_dataset_split/train/094/exam1/13/0037.dcm,,,2,94,13,1,ax
1410,"[ORIGINAL, PRIMARY, M, NORM, DIS2D]",MR Image Storage,ax dixon_opp,,ABDOMEN,GR,SP,"[PFP, SAT2]",3D,N,...,MONOCHROME2,"[1.5625, 1.5625]",/volumes/cm7/Abdominal_MRI_dataset_split/train/094/exam1/5/0037.dcm,,,19,94,5,0,ax


In [12]:
X_train_meta = preprocess(train_df)
X_val_meta = preprocess(val_df)
X_test_meta = preprocess(test_df)
y = train_df.label


Preprocessing metadata for Random Forest classifier.
Have received 1412 entries.
Preprocessing metadata for Random Forest classifier.
Have received 354 entries.
Preprocessing metadata for Random Forest classifier.
Have received 446 entries.


In [24]:
X_test_meta[feats]

Unnamed: 0,MRAcquisitionType,AngioFlag,SliceThickness,RepetitionTime,EchoTime,EchoTrainLength,PixelSpacing,ContrastBolusAgent,InversionTime,DiffusionBValue,...,opt_T2FLAIR_GEMS,opt_TRF_GEMS,opt_VASCTOF_GEMS,opt_VB_GEMS,opt_W,opt_X,opt__,type_ADC,type_DIFFUSION,type_DERIVED
0,0,1,0.057471,0.388305,0.046865,0.239216,0.87500,1,0.0,0,...,0,0,0,0,0,0,0,0,1,1
1,0,1,0.057471,0.388305,0.046865,0.239216,0.87500,1,0.0,0,...,0,0,0,0,0,0,0,0,1,1
2,1,1,0.034483,0.000064,0.000077,0.000000,0.09375,1,0.0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0.034483,0.000064,0.000077,0.000000,0.09375,1,0.0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0.034483,0.000067,0.000000,0.003922,0.56250,0,0.0,0,...,0,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
441,0,1,0.057471,0.351885,0.046865,0.239216,0.81250,1,0.0,0,...,0,0,0,0,0,0,0,0,1,1
442,0,1,0.057471,0.060522,0.066145,0.313725,0.56250,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0
443,1,1,0.034483,0.000064,0.000077,0.000000,0.09375,1,0.0,0,...,0,0,0,0,0,0,0,0,0,0
444,1,1,0.034483,0.000067,0.000949,0.003922,0.56250,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:


feats_to_keep = [col for col in X_train_meta[feats].columns if X_train_meta[col].nunique() > 1]

In [30]:
X_test_meta[feats_to_keep]

Unnamed: 0,MRAcquisitionType,AngioFlag,SliceThickness,RepetitionTime,EchoTime,EchoTrainLength,PixelSpacing,ContrastBolusAgent,InversionTime,seq_E,...,opt_SEQ_GEMS,opt_SP,opt_T,opt_TRF_GEMS,opt_VASCTOF_GEMS,opt_W,opt_X,type_ADC,type_DIFFUSION,type_DERIVED
0,0,1,0.057471,0.388305,0.046865,0.239216,0.87500,1,0.0,1,...,0,0,0,0,0,0,0,0,1,1
1,0,1,0.057471,0.388305,0.046865,0.239216,0.87500,1,0.0,1,...,0,0,0,0,0,0,0,0,1,1
2,1,1,0.034483,0.000064,0.000077,0.000000,0.09375,1,0.0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0.034483,0.000064,0.000077,0.000000,0.09375,1,0.0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0.034483,0.000067,0.000000,0.003922,0.56250,0,0.0,0,...,0,0,0,0,0,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
441,0,1,0.057471,0.351885,0.046865,0.239216,0.81250,1,0.0,1,...,0,0,0,0,0,0,0,0,1,1
442,0,1,0.057471,0.060522,0.066145,0.313725,0.56250,0,0.0,1,...,0,0,0,0,0,0,0,0,0,0
443,1,1,0.034483,0.000064,0.000077,0.000000,0.09375,1,0.0,0,...,0,0,0,0,0,0,0,0,0,0
444,1,1,0.034483,0.000067,0.000949,0.003922,0.56250,0,0.0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
#### train metadata model code to repo...



## Get Inferences from Saved Models

In [14]:
def meta_inference(df, model, feature_list=feats):
    X = preprocess(df)[feature_list]
    y = df.label
    preds = model.predict(X)
    probs = model.predict_proba(X)
    acc = np.sum(preds==y)/len(y)


    return preds, probs, y, acc
    

In [15]:
# Load the pickled model
with open('../models/meta_04062023_optmodel.skl', 'rb') as file:
    meta_model = pickle.load(file)



In [16]:
Tpreds, Tprobs, Ty, Tacc = meta_inference(train_df, meta_model, feats)
Vpreds, Vprobs, Vy, Vacc = meta_inference(val_df, meta_model, feats)
TEpreds, TEprobs, TEy, TEacc = meta_inference(test_df, meta_model, feats)

Preprocessing metadata for Random Forest classifier.
Have received 1412 entries.
Preprocessing metadata for Random Forest classifier.
Have received 354 entries.
Preprocessing metadata for Random Forest classifier.
Have received 446 entries.


In [17]:
def make_meta_results_df(preds, probs, true, df):
    return pd.DataFrame({'preds': preds, 'true': true, 'probs': [row.tolist() for row in probs], 'patientID': df['patientID'], 'series_description': df['SeriesDescription'], 'contrast': df['contrast'], 'plane': df['plane']  })



In [64]:
classes = meta_model.classes_

In [19]:
class_text_labels = [abd_label_dict[str(x)]['short'] for x in meta_model.classes_]
class_text_labels

['adc',
 'other',
 'arterial',
 'dynamic_late',
 't2',
 'dwi',
 't2',
 'dwi',
 'dixon_fat',
 'hepatobiliary',
 'hepatobiliary',
 'in_phase',
 'in_phase',
 'dynamic_equilibrium',
 'loc',
 'mrcp',
 'opposed_phase',
 'opposed_phase',
 'fat_quant',
 'water_fat_quant',
 'portal_venous',
 't1_fat_sat',
 'r_star_2',
 'ssfse',
 'venous_sub']

In [20]:

train_results_df = make_meta_results_df(Tpreds, Tprobs, Ty, train_df)
val_results_df = make_meta_results_df(Vpreds, Vprobs, Vy, val_df)
test_results_df = make_meta_results_df(TEpreds, TEprobs, TEy, test_df)
meta_vector = Tprobs

In [21]:
meta_vector

array([[1.47947497e-01, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 6.45661157e-05],
       [1.47947497e-01, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 6.45661157e-05],
       [0.00000000e+00, 9.51069133e-05, 7.69871361e-02, ...,
        0.00000000e+00, 0.00000000e+00, 6.89068573e-04],
       ...,
       [0.00000000e+00, 4.91932310e-05, 9.26785777e-01, ...,
        0.00000000e+00, 2.61521918e-04, 1.54297216e-04],
       [9.08182726e-05, 0.00000000e+00, 5.35009992e-03, ...,
        8.93046348e-04, 1.80592615e-03, 1.92321048e-04],
       [0.00000000e+00, 4.91932310e-05, 9.26785777e-01, ...,
        0.00000000e+00, 2.61521918e-04, 1.54297216e-04]])

In [22]:
print(classification_report(TEpreds, TEy))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        21
           2       0.59      0.56      0.58        64
           6       0.82      0.22      0.34        65
           7       1.00      0.96      0.98        23
           8       1.00      1.00      1.00        43
           9       1.00      0.98      0.99        43
          11       0.76      0.96      0.85        23
          12       0.00      0.00      0.00         2
          13       0.67      1.00      0.80         2
          14       1.00      1.00      1.00        23
          16       0.09      0.25      0.13         8
          17       1.00      1.00      1.00        21
          18       0.93      1.00      0.96        26
          19       0.65      0.75      0.70        20
          21       1.00      0.36      0.53        14
          22       0.00      0.00      0.00         0
          23       0.00      0.00      0.00         0
          25       0.70    

In [None]:
from datetime import datetime
cm = confusion_matrix(TEy, TEpreds)

# yvals= sorted(y.unique())
    
# labels = yvals
# target_names = [abd_label_dict[str(x)] ['short'] for x in yvals]

plt.figure(figsize=(25, 25))
plt.tight_layout()
ConfusionMatrixDisplay(cm, display_labels=meta_model.classes_).plot(xticks_rotation = 'vertical', cmap='Blues')
plt.savefig("../assets/FigCM_meta"+datetime.today().strftime('%Y%m%d')+".tif",dpi=300, bbox_inches = 'tight')

In [59]:
# ## Get pixel-based model 

# # trying to unpickle

# # Check if GPU is available
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# # Load the pickled model
# with open('../models/pixel_model_single_img_032123.pkl', 'rb') as f:
#     model = pickle.load(f)

# # Move the model to the appropriate device
# model.to(device)



In [102]:


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = models.resnet50(pretrained=True) # Load the ResNet50 model 

# Replace the output layer to match the number of output units in your fine-tuned model
num_finetuned_output_units = 30
num_features = model.fc.in_features
model.fc = torch.nn.Linear(num_features, num_finetuned_output_units)

# Load the saved state_dict
state_dict = torch.load('../models/pixel_model_041123.pth', map_location=device)
model.load_state_dict(state_dict)

# Move the model to the appropriate device
model = model.to(device)  


Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /Users/cmm/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth
100%|██████████| 97.8M/97.8M [00:10<00:00, 10.1MB/s]


In [116]:
def image_to_tensor(filepath):
    # Define the transformations to match the ones used during training/evaluation of the test dataset
#     test_transform = transforms.Compose([
#         transforms.ToPILImage(),
#         transforms.Resize(299),
#         transforms.CenterCrop(299),
#         transforms.Grayscale(num_output_channels=3),
#         transforms.ToTensor(),
#         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # Normalize the image using the same mean and std values as during training
# ])

    # Load an image and apply the transformations
    #image = Image.open(filepath)
    ds = pydicom.dcmread(filepath)
    img = np.array(ds.pixel_array, dtype=np.float32)
    img = img[np.newaxis]
    img = torch.from_numpy(np.asarray(img))
    input_tensor = data_transforms['test'](img)

    # Add a batch dimension to the input tensor
    input_tensor = input_tensor.unsqueeze(0)
    print('changing input_tensor to shape', input_tensor.shape)
    # Move the input tensor to the appropriate device
    input_tensor = input_tensor.to(device)

    return input_tensor




In [123]:
def get_pixel_preds_and_probs(model, filelist, classes=classes):
    model = model.to(device)
    # Turn autograd off
    with torch.no_grad():
        model.eval()

    preds = []
    probs = []
    count = 0
    for file in filelist:
        print('on item ', count, file)
        each_tensor = image_to_tensor(file)
        #visualization of a batch of images
        each_tensor = each_tensor.to(device)
        # Feed inputs through model to get raw scores
        logits = model.forward(each_tensor)
        
        
        prob = torch.softmax(logits, dim=1)
        prob = prob.detach().cpu().numpy()
        #print(labels, probs)
        # Get discrete predictions using argmax
        pred = np.argmax(prob)
        # Add predictions and actuals to lists
        preds.extend(pred)
        probs.extend(prob)
        
       
        # logits = model.forward(each_tensor)
        # # Convert the output to a desired format 
        # output_numpy = output.detach().numpy()
        
        # # Apply the softmax function to convert logits to probabilities
        # probabilities = np.exp(output_numpy) / np.sum(np.exp(output_numpy), axis=1, keepdims=True)

        # # Find the class predictions
        # pred = classes[np.argmax(probabilities, axis=1)]
        # print('predicted class is ', pred)
        # # Store probabilities and predictions
        # probs.append(probabilities)
        # preds.append(pred)

        count+= 1
    # Convert lists to numpy arrays
    predictions_array = np.array(preds).flatten()
    probabilities_array = np.array(probs).squeeze()

    # Print the predictions and probabilities arrays
    print("Predictions array:", predictions_array)
    print("Probabilities array:", probabilities_array)


    return predictions_array, probabilities_array



In [124]:
preds, probs = get_pixel_preds_and_probs(model, test_df.fname.tolist())
preds

on item  0 /volumes/cm7/Abdominal_MRI_dataset_split/test/102/exam1/16 (1-40)/0021.dcm
changing input_tensor to shape torch.Size([1, 3, 299, 299])


TypeError: 'numpy.int64' object is not iterable

NameError: name 'test_loader' is not defined

In [None]:
# old_version_cols = old_version.columns
# new_version_cols = train_df.columns

# #diff = list(set(old_version_cols) ^ set(new_version_cols))
# #[print(x) for x in diff]

# omn = set(old_version_cols)-set(new_version_cols)
# [print(x) for x in omn]


In [131]:
len(test[test['patientID']==24].series.tolist())


29

In [None]:
# fns = train.file_info.tolist()

# # Create an empty list to store the DICOM metadata
# data = []

# # Loop over the DICOM file names and extract the metadata
# for file_name in file_names:
#     # Read in the DICOM file
#     ds = pydicom.dcmread(file_name)
    
#     d = {}
#     for col in dicom_cols:
#         d.key = col
#         d.value = ds['col']
#     # Store the metadata in a dictionary
#     d = {
#         'patient_id': patient_id,
#         'study_date': study_date,
#         # ... add other metadata as needed ...
#     }
    
#     # Append the dictionary to the data list
#     data.append(d)

# # Create a DataFrame from the data list
# df = pd.DataFrame(data)
# This code reads in a list of DICOM file names, then loops over the files and extracts the metadata of interest using pydicom. It stores the metadata in a dictionary, then appends the dictionary to a list of dictionaries (data). Finally, it creates a Pandas DataFrame from the list of dictionaries using pd.DataFrame(data). You can modify this code to extract the specific metadata that you are interested in from your DICOM files.








In [134]:
full_train, full_val, full_test = load_csv_dataset(train_datafile, test_datafile, val_lists = val_list)
example_dicom_full = full_test[full_test.patientID==24]

In [135]:
example_dicom_full

Unnamed: 0,file_info,label,patientID,series,img_num
1046,/volumes/cm7/Abdominal_MRI_dataset_split/test/024/exam1/32/0010.dcm,9,24,32,10
1047,/volumes/cm7/Abdominal_MRI_dataset_split/test/024/exam1/32/0004.dcm,9,24,32,4
1048,/volumes/cm7/Abdominal_MRI_dataset_split/test/024/exam1/32/0005.dcm,9,24,32,5
1049,/volumes/cm7/Abdominal_MRI_dataset_split/test/024/exam1/32/0011.dcm,9,24,32,11
1050,/volumes/cm7/Abdominal_MRI_dataset_split/test/024/exam1/32/0007.dcm,9,24,32,7
...,...,...,...,...,...
2808,/volumes/cm7/Abdominal_MRI_dataset_split/test/024/exam1/25/0034.dcm,25,24,25,34
2809,/volumes/cm7/Abdominal_MRI_dataset_split/test/024/exam1/25/0020.dcm,25,24,25,20
2810,/volumes/cm7/Abdominal_MRI_dataset_split/test/024/exam1/25/0021.dcm,25,24,25,21
2811,/volumes/cm7/Abdominal_MRI_dataset_split/test/024/exam1/25/0035.dcm,25,24,25,35


In [187]:

from pydicom.datadict import dictionary_VR
# Define the path where you want to save the DICOM file
save_path = '../data/modified/'
# Create the directory if it doesn't exist
if not os.path.exists(save_path):
    os.makedirs(save_path)

all_files_list = example_dicom_full.file_info.tolist()
all_labels_list = example_dicom_full.label.tolist()
for dicom_file, taglabel in zip(all_files_list, all_labels_list):
    ds = dcmread(dicom_file, no_pixels=False)
    #dictionary_VR([0x0011, 0x1010])
    
    #creator = pydicom.uid.generate_uid()
    # Define a new private tag for hanging protocol information
    tag = pydicom.tag.Tag(0x0011, 0x1010)
    tag2 = pydicom.tag.Tag(0x0011, 0x1011)
   
    
    value = taglabel
    value2 = 0.8
    data_element = pydicom.DataElement(tag, 'IS', value)
    data_element2 = pydicom.DataElement(tag2, 'DS', value2)
   
   
    #Add the new tags to a DICOM dataset
  
    ds.add(data_element)
    ds.add(data_element2)

    # # Create a new data element with the tag, value, and VR
    # value = taglabel
    # data_element = pydicom.DataElement(tag, value = taglabel, VR = 'IS')

    # Set the private creator identifier for the new tag
    #data_element.private_creator = creator

    #
    


    # Define the file name and path for the new DICOM file
    filefolder = '../data/modified/'+dicom_file.split('/')[-2]
    if not os.path.exists(filefolder):
        os.makedirs(filefolder)
    filename = dicom_file.split('/')[-1]
    filepath = os.path.join(filefolder, filename)

    # Save the dataset to a DICOM file
    ds.save_as(filepath)
    
    
    # tag = pydicom.datadict.tag_for_keyword('ML_type')
    # data_element = 
    # if tag is not None:
    #     # Create a new data element with the tag and value
    #     data_element = pydicom.DataElement(tag, value=str(taglabel))
    #     data_element.VR = 'SH'

    # # Add the new tag to the DICOM dataset
    # ds.add(data_element)

    # # tag2 = pydicom.datadict.tag_for_keyword('ML_certainty')
    # # data_element2 = pydicom.DataElement(tag2, value = 0.5, VR='SH')
   
    # # ds.add(data_element)
    # # ds.add(data_element2)

    # ds.save_as('/content/modifiedset/modified_'+dicom_file+'.dcm')

