In [18]:
import pandas as pd

# --- Configuration -----------------------------------------------------------
# Every input CSV lives in the same directory; keep the prefix in one place so
# relocating the data means editing a single line. The derived *_PATH values
# are plain strings, identical to the previously hard-coded literals.
DATA_DIR = '/radraid2/dongwoolee/VNet_LungNoduleSeg/richard/data'

UCLA_CSV_PATH = f'{DATA_DIR}/UCLAIDx_Task78_Lesion_info.csv'
NLST_CSV_PATH = f'{DATA_DIR}/NLST_merged_lesion_info.csv'
LIDC_CSV_PATH = f'{DATA_DIR}/LIDC_lesion_info_maybe.csv'

NODULE_LIST_PATH = f'{DATA_DIR}/images_and_segpaths_2024-12-11-checkpoint.csv'

# --- Load --------------------------------------------------------------------
# The UCLA and LIDC files are read with their first column as the index;
# the NLST file and the nodule list are read with the default integer index.
ucla_csv = pd.read_csv(UCLA_CSV_PATH, index_col=0)
nlst_csv = pd.read_csv(NLST_CSV_PATH)
lidc_csv = pd.read_csv(LIDC_CSV_PATH, index_col=0)

nodule_list = pd.read_csv(NODULE_LIST_PATH)

# --- Columns of interest per dataset -----------------------------------------
# NOTE: "dr_diaag_code" is spelled exactly as the column appears in the UCLA
# data (see the value-count output); do not "correct" it or lookups will fail.
ucla_features = ["patient_id", "nodule_id", "dr_diaag_code", "nodule_category", "consistency", 
                 "location", "margins", "suspicion", "diam_long", "diam_small"]
nlst_features = ["pid", "nodule_id", "level_of_suspicion_of_lung_cancer", 
                 "slice_thickness", "kernel", "scanner_manufacturer", "scanner_model",  "axial_location",
                 "longest_axial_diameter_(mm)", "short_diameter_(mm)", "nodule_margin_conspicuity", "nodule_margins",
                 "additional_nodule_margins", "nodule_shape", "nodule_consistency", "cyst-like_spaces",
                 "pleural_attachment", "pleural_retraction", "vascular_convergence", "septal_stretching",
                 "paracicatricial_emphysema", "predominant_nature_of_lung_parenchyma", "fibrosis",
                 "fibrosis_distribution"]
lidc_features = ["pid", "nodule_id", "sphericity", "lobulation", "texture", "margin", "spiculation", "malignancy",
                 "diameter", "internalStructure", "calcification"]
nodule_list_features = ["pid", "texture", "scanner", "study_desc"]

In [19]:
def combine_dice_score(original_df, dice_df, filename_col, map_dict):
    """
    Attach dice scores from dice_df to original_df via a filename -> pid mapping.

    Args:
        original_df (pd.DataFrame): The main DataFrame, must contain a 'pid' column.
        dice_df (pd.DataFrame): DataFrame with dice scores, must contain the column
                          named by filename_col and a 'dice_score' column.
        filename_col (str): The name of the column in dice_df that contains the filenames (the keys of map_dict).
        map_dict (dict): A dictionary mapping values from dice_df[filename_col] to 'pid' values in original_df.
                          Example: {'dice_df_filename_value': 'original_df_pid_value'}

    Returns:
        pd.DataFrame: A new DataFrame holding original_df left-merged with the
        'dice_score' column on 'pid'. Rows of original_df without a matching
        score get NaN. Neither input DataFrame is modified.

    Notes:
        Filenames that cannot be mapped to a pid are reported with a printed
        warning and excluded from the merge.
    """
    # Work on a copy so the caller's dice_df does not gain a 'pid' column.
    dice_with_pid = dice_df.copy()

    # Map the filename to the pid; filenames absent from map_dict become NaN.
    dice_with_pid['pid'] = dice_with_pid[filename_col].map(map_dict)
    missing_maps = dice_with_pid['pid'].isnull().sum()

    # Check if any filenames could not be mapped
    if missing_maps > 0:
        print(f"Warning: {missing_maps} filenames in '{filename_col}' column of dice_df could not be mapped to a 'pid' using map_dict.")

    # Drop unmapped rows before merging: pandas matches null join keys to each
    # other, so a NaN pid here would attach an unrelated dice score to any
    # original_df row whose own pid is missing.
    has_pid = dice_with_pid['pid'].notnull()
    dice_df_to_merge = dice_with_pid.loc[has_pid, ['pid', 'dice_score']]

    # merge() returns a new frame, so original_df itself is left untouched.
    merged_df = original_df.merge(dice_df_to_merge, on='pid', how='left')

    return merged_df

## Value Counts

### UCLA

In [20]:
def _print_value_counts(df, features, skip):
    """
    Compute and print value counts for the selected columns of one dataset.

    Args:
        df (pd.DataFrame): Dataset to summarise.
        features (list[str]): Column names to consider, in print order.
        skip (list[str]): Names from `features` to leave out (identifier and
            diameter columns at the call sites below).

    Returns:
        dict: Maps each summarised column name to its value-count Series.
        Missing values are counted too (dropna=False).
    """
    counts = {}
    for feature in features:
        if feature in skip:
            continue
        counts[feature] = df[feature].value_counts(dropna=False)
        # Print each Series followed by a blank line to separate the tables.
        print(counts[feature])
        print()
    return counts


### UCLA
ucla_value_counts = _print_value_counts(
    ucla_csv, ucla_features,
    skip=["patient_id", "nodule_id", "diam_long", "diam_small"],
)

### NLST
nlst_value_counts = _print_value_counts(
    nlst_csv, nlst_features,
    skip=["pid", "nodule_id", "longest_axial_diameter_(mm)", "short_diameter_(mm)"],
)

### LIDC
lidc_value_counts = _print_value_counts(
    lidc_csv, lidc_features,
    skip=["pid", "nodule_id", "diameter"],
)

### Nodule List
nodule_list_value_counts = _print_value_counts(
    nodule_list, nodule_list_features,
    skip=["pid"],
)

dr_diaag_code
Lung Procedure - Core Needle Biopsy    662
Lung Diagnostic Imaging                394
Lung Screening LDCT                     38
Lung Non-Screening LDCT                 34
Lung Pathology - Biopsy                 15
NaN                                      5
Name: count, dtype: int64

nodule_category
Principal Nodule        1007
NaN                       59
Category 3-4x Nodule      45
Other Nodule              31
Category 2 Nodule          6
Name: count, dtype: int64

consistency
Solid              738
PSN                179
Unknown            101
Pure GGN            83
Cavitary            23
Cyst-associated     10
NaN                 10
Not specified        4
Name: count, dtype: int64

location
RUL = Right Upper Lobe     294
RLL = Right Lower Lobe     269
LUL = Left Upper Lobe      237
LLL = Left Lower Lobe      215
RML = Right Middle Lobe     92
Lingula                     20
NaN                          9
Other                        7
Mediastinum                  3
Ri