<h1 style="text-align: center;">Count Image Types</h1>

Counting the images reveals several issues:
- Only 86 of the 120 patients have images.
- Image names are inconsistent for the lasa and vstrain image types.

In [2]:
import os
import pandas as pd

In [20]:
# Project root. Defaults to the original local checkout; set the
# CARDIO_GLOBAL_PATH environment variable to run the notebook from another
# location without editing this cell.
GLOBAL_PATH = os.environ.get(
    'CARDIO_GLOBAL_PATH',
    '/Users/niksun/Desktop/BCM_TCH_Cardiomyopathy_F23',
)
# Folder holding the raw inputs (the key CSV is read from here).
RAW_DATA_PATH = os.path.join(GLOBAL_PATH, 'raw_data')
# Folder holding the per-patient image folders.
RAW_IMAGE_PATH = os.path.join(RAW_DATA_PATH, 'D2K - no DICOM')
# Folder the CSV summaries are written to.
OUTPUT_PATH = os.path.join(GLOBAL_PATH, 'output')

## Part 1: Detailed Count and Grouped Count

In [21]:
# 'Study Key' is the patient ID; 'Diagnosis' is the patient's diagnosis result.
key_csv_path = os.path.join(RAW_DATA_PATH, 'd2k_key.csv')
df = pd.read_csv(key_csv_path, usecols=['Study Key', 'Diagnosis'])
df.head()

Unnamed: 0,Study Key,Diagnosis
0,DDCM-001,HCM
1,DDCM-002,HCM
2,DDCM-003,HCM
3,DDCM-004,HCM
4,DDCM-005,HCM


In [22]:
# get all image names
def get_distinct_image_names(file_dir):
    """Collect the distinct '.jpg' file names found anywhere under file_dir.

    Sub-directories are searched as well. The extension check ignores case,
    but names are returned exactly as they appear on disk.

    Returns:
        set of file names (not paths), including the extension.
    """
    found = set()
    pending = [file_dir]
    while pending:
        current_dir = pending.pop()
        for entry in os.listdir(current_dir):
            entry_path = os.path.join(current_dir, entry)
            if os.path.isdir(entry_path):
                # Visit the sub-directory later instead of recursing.
                pending.append(entry_path)
            elif entry.lower().endswith('.jpg'):
                found.add(entry)
    return found

# Show how many distinct image names exist, followed by the names themselves.
image_names = get_distinct_image_names(RAW_IMAGE_PATH)
print(len(image_names), image_names, sep='\n')

16
{'lasa2c.jpg', 'vstrain2c.jpg', 'vstrainap2.jpg', 'lasap4.jpg', 'vstrain4c.jpg', 'tdimed.jpg', 'vstraina4c.jpg', 'trjet.jpg', 'pulmvein.jpg', 'lasa4c.jpg', 'vstrainap4.jpg', 'mv.jpg', 'bullet.jpg', 'tdilat.jpg', 'vstrainap3.jpg', 'vstraina2c.jpg'}


In [23]:
# Add one zero-filled indicator column per image type, followed by a
# zero-filled 'total' column.
new_columns = [
    'bullet', 'lasa2c', 'lasa4c', 'lasap4',
    'mv', 'pulmvein', 'tdilat', 'tdimed', 'trjet',
    'vstraina2c', 'vstraina4c',
    'vstrainap2', 'vstrainap3', 'vstrainap4',
    'vstrain2c', 'vstrain4c',
]

for column_name in (*new_columns, 'total'):
    df[column_name] = 0

df.tail()

Unnamed: 0,Study Key,Diagnosis,bullet,lasa2c,lasa4c,lasap4,mv,pulmvein,tdilat,tdimed,trjet,vstraina2c,vstraina4c,vstrainap2,vstrainap3,vstrainap4,vstrain2c,vstrain4c,total
115,DDCM-116,SVLV,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
116,DDCM-117,SVLV,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
117,DDCM-118,SVLV,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
118,DDCM-119,SVLV,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
119,DDCM-120,SVLV,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
# Patient folders are the entries of the image directory whose name starts
# with 'DDCM-', in sorted order.
patient_folders = sorted(
    entry for entry in os.listdir(RAW_IMAGE_PATH) if entry.startswith('DDCM-')
)
print(len(patient_folders)) # 86 patients

# Full path of every patient folder, in the same order.
folder_paths = [os.path.join(RAW_IMAGE_PATH, name) for name in patient_folders]
print(folder_paths[0])

86
/Users/niksun/Desktop/BCM_TCH_Cardiomyopathy_F23/raw_data/D2K - no DICOM/DDCM-001


In [25]:
# For every patient folder, flag the patient as having an image folder
# ('total') and flag each image type found inside it.
study_keys = df['Study Key'].astype(str).str.strip()

for folder_path in folder_paths:
    # The folder name is the patient's Study Key (e.g. 'DDCM-001'). Look the
    # patient up by key rather than turning the last three characters into a
    # row position, which hits the wrong row if the table is reordered or
    # has gaps.
    study_key = os.path.basename(os.path.normpath(folder_path))
    patient_rows = study_keys == study_key
    if not patient_rows.any():
        print(f'Skipping {study_key}: no matching Study Key in the key table')
        continue

    df.loc[patient_rows, 'total'] = 1

    for file_name in os.listdir(folder_path):
        stem, extension = os.path.splitext(file_name)
        if extension.lower() != '.jpg':
            continue

        # The extension test ignores case, so normalize the stem the same way
        # to keep e.g. 'MV.JPG' in the 'mv' column.
        image_type = stem.lower()
        if image_type not in df.columns:
            # Unknown image type: start it at 0 for everyone rather than
            # letting the assignment below create a NaN-filled column.
            df[image_type] = 0
        df.loc[patient_rows, image_type] = 1

In [26]:
# Create the output folder if needed; to_csv does not create missing directories.
os.makedirs(OUTPUT_PATH, exist_ok=True)
df.to_csv(os.path.join(OUTPUT_PATH, 'image_count_detail.csv'), index=False)

In [27]:
# Sum the indicator columns per diagnosis. numeric_only=True keeps the text
# 'Study Key' column out of the sum explicitly; pandas >= 2.0 no longer drops
# it silently and would concatenate the strings instead.
grouped_df = df.groupby('Diagnosis').sum(numeric_only=True)
grouped_df.head()

  grouped_df = df.groupby(df['Diagnosis']).sum()


Unnamed: 0_level_0,bullet,lasa2c,lasa4c,lasap4,mv,pulmvein,tdilat,tdimed,trjet,vstraina2c,vstraina4c,vstrainap2,vstrainap3,vstrainap4,vstrain2c,vstrain4c,total
Diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Control,25,29,28,1,28,29,27,27,29,3,3,0,0,1,0,0,29
HCM,28,30,32,0,32,20,25,25,31,1,2,1,1,1,1,1,32
SVLV,0,3,11,0,11,9,3,3,0,3,12,0,0,0,0,0,12
SVRV,0,2,12,0,13,11,0,0,0,3,13,0,0,0,0,0,13


In [28]:
# Create the output folder if needed; to_csv does not create missing directories.
os.makedirs(OUTPUT_PATH, exist_ok=True)
grouped_df.to_csv(os.path.join(OUTPUT_PATH, 'image_count_group.csv'), index=True)

## Part 2: Standardize Image Names and Count Again

**Needs further cleanup**
- comments
- check `cardio_module > ImageTypes` for consistent image names

In [28]:
# Create a new df to contain the cleaned data. The explicit .copy() makes
# final_df an independent frame, so the column assignments below cannot
# trigger SettingWithCopyWarning or write through to df.
final_columns = ['Study Key', 'Diagnosis', 'lasa2c', 'lasa4c', 'mv', 'pulmvein', 'tdilat',
               'tdimed', 'trjet', 'vstraina2c', 'vstraina4c', 'vstrainap3','total']
final_df = df.loc[:, final_columns].copy()

# Merge the differently named columns that describe the same image type.
# Every source column is a 0/1 indicator, so each sum is clipped to 1: a
# patient who has the same image type under two names must still count once.

# "lasa*c" and "lasap*" represent the same image type if the number that
# replaces the asterisk is the same.
final_df['lasa4c'] = (df['lasa4c'] + df['lasap4']).clip(upper=1)

# "vstraina*c", "vstrainap*" and "vstrain*c" represent the same image type if
# the number that replaces the asterisk is the same.
# Plus, a "bullet" image is a summary of vstraina2c/3c/4c images.
final_df['vstraina2c'] = (
    df['vstraina2c'] + df['bullet'] + df['vstrainap2'] + df['vstrain2c']
).clip(upper=1)
final_df['vstraina4c'] = (
    df['vstraina4c'] + df['bullet'] + df['vstrainap4'] + df['vstrain4c']
).clip(upper=1)
final_df['vstrainap3'] = (df['vstrainap3'] + df['bullet']).clip(upper=1)

# Rename the merged three-chamber column to the standardized name.
final_df = final_df.rename(columns={'vstrainap3': 'vstraina3c'})

In [29]:
# Remove the single-ventricle classes (SVLV and SVRV) in one pass.
sv_classes = ["SVLV", "SVRV"]
final_df = final_df[~final_df['Diagnosis'].isin(sv_classes)]

In [30]:
# Group by patients' Diagnosis, and sum the image name columns for each
# Diagnosis group

# final_grouped = final_df.groupby(final_df['Diagnosis']).sum()
# print(final_grouped)

In [31]:
# Split the patients into two frames:
# - partial_data_df: rows with at least one 0 and at least one 1
# - final_df: rows with no 0 in any column
has_missing = final_df.eq(0).any(axis=1)
has_present = final_df.eq(1).any(axis=1)

partial_data_df = final_df[has_missing & has_present]

final_df = final_df[~has_missing]

In [32]:
# Group by patients' Diagnosis, and sum the image name columns for each
# Diagnosis group. numeric_only=True keeps the text 'Study Key' column out of
# the sum explicitly; pandas >= 2.0 no longer drops it silently and would
# concatenate the strings instead.
final_grouped = final_df.groupby('Diagnosis').sum(numeric_only=True)

print(final_grouped)

           lasa2c  lasa4c  mv  pulmvein  tdilat  tdimed  trjet  vstraina2c  \
Diagnosis                                                                    
Control        23      23  23        23      23      23     23          23   
HCM            13      13  13        13      13      13     13          13   

           vstraina4c  vstraina3c  total  
Diagnosis                                 
Control            23          23     23  
HCM                13          13     13  


  final_grouped = final_df.groupby(final_df['Diagnosis']).sum()


In [34]:
# Create the output folder if needed; to_csv does not create missing directories.
os.makedirs(OUTPUT_PATH, exist_ok=True)
final_grouped.to_csv(os.path.join(OUTPUT_PATH, 'image_count_group_clean.csv'), index=True)