In [1]:
import os
import glob
import json
import cv2
import numpy as np                  
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from tqdm.auto import tqdm
import torchvision.transforms as transforms

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# --- Inputs -------------------------------------------------------------------
# CSV file holding the RSNA metadata table.
TRAIN_CSV_RSNA_PATH = "./input/train_rsna.csv"

# Folder prefix used when building the "image_path" column.
FULL_RSNA_IMG_PATH = "./images_rsna/"

# Image folders, one per view (CC / MLO) and split (train / test).
TRAIN_INPUT_IMG_RSNA_CC_TRAIN_PATH = "./images_rsna_cc_train/"
TRAIN_INPUT_IMG_RSNA_CC_TEST_PATH = "./images_rsna_cc_test/"
TRAIN_INPUT_IMG_RSNA_MLO_TRAIN_PATH = "./images_rsna_mlo_train/"
TRAIN_INPUT_IMG_RSNA_MLO_TEST_PATH = "./images_rsna_mlo_test/"

# --- Outputs ------------------------------------------------------------------
# Dataset tag and the folder that "processed_img_path" values point into.
DATASET_NAME = "RSNA_CC"
SAVE_FOLDER = "./output/" + DATASET_NAME

In [13]:
# Read the RSNA metadata CSV into a DataFrame and preview its first five rows.
df_rsna = pd.read_csv(filepath_or_buffer=TRAIN_CSV_RSNA_PATH)
df_rsna.head(n=5)

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case
0,2,10006,462822612,L,CC,61.0,0,0,0,,0,,29,False
1,2,10006,1459541791,L,MLO,61.0,0,0,0,,0,,29,False
2,2,10006,1864590858,R,MLO,61.0,0,0,0,,0,,29,False
3,2,10006,1874946579,R,CC,61.0,0,0,0,,0,,29,False
4,2,10011,220375232,L,CC,55.0,0,0,0,0.0,0,,21,True


In [14]:
# Attach the location of every image to the metadata table. The value is built
# as "<FULL_RSNA_IMG_PATH><patient_id>_<image_id>.png".
df_rsna["image_path"] = (
    FULL_RSNA_IMG_PATH
    + df_rsna["patient_id"].map(str)
    + "_"
    + df_rsna["image_id"].map(str)
    + ".png"
)
df_rsna["image_path"].head()

0     ./images_rsna/10006_462822612.png
1    ./images_rsna/10006_1459541791.png
2    ./images_rsna/10006_1864590858.png
3    ./images_rsna/10006_1874946579.png
4     ./images_rsna/10011_220375232.png
Name: image_path, dtype: object

In [25]:
# Build one metadata row per image file found in the CC train folder.
#
# File names are assumed to start with "<patient_id>_<image_id>", with augmented
# variants adding further "_<suffix>" tokens before the extension (for example
# "10130_1360338805_h_flip.png"). Every file is matched to its original image
# through those first two tokens. This avoids depending on the order in which
# os.listdir() returns entries (that order is arbitrary) and on whether the
# file name happens to end in a digit.

# Positional row number(s) in df_rsna for every original image path.
rows_by_image_path = df_rsna.groupby("image_path", sort=False).indices

train_row_positions = []
train_processed_paths = []
train_unmatched_files = []
# Sorting makes the resulting row order reproducible across machines.
for file in sorted(os.listdir(TRAIN_INPUT_IMG_RSNA_CC_TRAIN_PATH)):
    stem, extension = os.path.splitext(file)
    original_name = "_".join(stem.split("_")[:2]) + extension
    row_positions = rows_by_image_path.get(f"{FULL_RSNA_IMG_PATH}{original_name}")
    if row_positions is None:
        # No metadata row for this file: skip it instead of attaching it to
        # another image's metadata.
        train_unmatched_files.append(file)
        continue
    train_row_positions.extend(row_positions)
    train_processed_paths.extend([f"{SAVE_FOLDER}/{file}"] * len(row_positions))

if train_unmatched_files:
    print(
        f"Skipped {len(train_unmatched_files)} train file(s) with no metadata row, "
        f"e.g. {train_unmatched_files[:5]}"
    )

# .iloc keeps df_rsna's index labels and column order, and yields an empty
# frame (rather than raising) when nothing in the folder matches.
df_rsna_train = df_rsna.iloc[train_row_positions].assign(
    processed_img_path=train_processed_paths,
    train=1,
)
df_rsna_train.head()

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,image_path,processed_img_path,train
89,1,10130,1360338805,L,CC,71.0,1,1,1,0.0,0,B,49,False,./images_rsna/10130_1360338805.png,./output/RSNA_CC/10130_1360338805.png,1
89,1,10130,1360338805,L,CC,71.0,1,1,1,0.0,0,B,49,False,./images_rsna/10130_1360338805.png,./output/RSNA_CC/10130_1360338805_h_flip.png,1
89,1,10130,1360338805,L,CC,71.0,1,1,1,0.0,0,B,49,False,./images_rsna/10130_1360338805.png,./output/RSNA_CC/10130_1360338805_h_flip_rot15...,1
89,1,10130,1360338805,L,CC,71.0,1,1,1,0.0,0,B,49,False,./images_rsna/10130_1360338805.png,./output/RSNA_CC/10130_1360338805_h_flip_rot30...,1
89,1,10130,1360338805,L,CC,71.0,1,1,1,0.0,0,B,49,False,./images_rsna/10130_1360338805.png,./output/RSNA_CC/10130_1360338805_h_flip_rot33...,1


In [26]:
# Number of rows in the train table.
df_rsna_train.shape[0]

8960

In [27]:
# Build one metadata row per image file found in the CC test folder.
#
# File names are assumed to start with "<patient_id>_<image_id>", optionally
# followed by further "_<suffix>" tokens before the extension. Every file is
# matched to its original image through those first two tokens. This avoids
# depending on the order in which os.listdir() returns entries (that order is
# arbitrary) and on whether the file name happens to end in a digit.

# Positional row number(s) in df_rsna for every original image path.
rows_by_image_path = df_rsna.groupby("image_path", sort=False).indices

test_row_positions = []
test_processed_paths = []
test_unmatched_files = []
# Sorting makes the resulting row order reproducible across machines.
for file in sorted(os.listdir(TRAIN_INPUT_IMG_RSNA_CC_TEST_PATH)):
    stem, extension = os.path.splitext(file)
    original_name = "_".join(stem.split("_")[:2]) + extension
    row_positions = rows_by_image_path.get(f"{FULL_RSNA_IMG_PATH}{original_name}")
    if row_positions is None:
        # No metadata row for this file: skip it instead of attaching it to
        # another image's metadata.
        test_unmatched_files.append(file)
        continue
    test_row_positions.extend(row_positions)
    test_processed_paths.extend([f"{SAVE_FOLDER}/{file}"] * len(row_positions))

if test_unmatched_files:
    print(
        f"Skipped {len(test_unmatched_files)} test file(s) with no metadata row, "
        f"e.g. {test_unmatched_files[:5]}"
    )

# .iloc keeps df_rsna's index labels and column order, and yields an empty
# frame (rather than raising) when nothing in the folder matches.
df_rsna_test = df_rsna.iloc[test_row_positions].assign(
    processed_img_path=test_processed_paths,
    train=0,
)
df_rsna_test.head()

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,cancer,biopsy,invasive,BIRADS,implant,density,machine_id,difficult_negative_case,image_path,processed_img_path,train
54,1,10102,1245250349,L,CC,40.0,0,0,0,,0,B,49,False,./images_rsna/10102_1245250349.png,./output/RSNA_CC/10102_1245250349.png,0
57,1,10102,453020471,R,CC,40.0,0,1,0,0.0,0,B,49,True,./images_rsna/10102_453020471.png,./output/RSNA_CC/10102_453020471.png,0
117,1,10151,1827497738,R,CC,55.0,0,0,0,,0,C,49,False,./images_rsna/10151_1827497738.png,./output/RSNA_CC/10151_1827497738.png,0
143,1,10185,1562056186,L,CC,50.0,0,0,0,1.0,0,C,49,False,./images_rsna/10185_1562056186.png,./output/RSNA_CC/10185_1562056186.png,0
142,1,10185,568324157,L,CC,50.0,0,0,0,1.0,0,C,49,False,./images_rsna/10185_568324157.png,./output/RSNA_CC/10185_568324157.png,0


In [28]:
# Number of rows in the test table.
df_rsna_test.shape[0]

2039

In [30]:
# Stack the train rows on top of the test rows, then show the combined row count.
df_rsna_full = pd.concat(objs=[df_rsna_train, df_rsna_test], axis=0)
df_rsna_full.shape[0]

10999

In [None]:
# Persist the combined train/test table. The output folder is created first
# because to_csv() does not create missing parent directories.
full_csv_path = "./output/df_rsna_full_cc.csv"
os.makedirs(os.path.dirname(full_csv_path), exist_ok=True)
df_rsna_full.to_csv(full_csv_path)