In [1]:
import pandas as pd

In [2]:
# Input locations, relative to the notebook's working directory.
# Nodule data CSV: read into `nodule_df`, one row per nodule/radiologist reading.
data_path = '../data/LIDC_20130817_AllFeatures2D_MaxSlice_MattEdited.csv'
# Feature-ratings CSV: read into `inline_df`, holds each radiologist's ratings.
inline_ratings_path = '../data/LIDC_20130817_AllFeatures2D_MaxSlicePerNodule_inLineRatings.csv'

In [3]:
# Load both inputs with default parsing: ratings first, then the nodule table.
inline_df, nodule_df = map(pd.read_csv, (inline_ratings_path, data_path))

In [4]:
# Features that make up a subtype label, in the order they appear in the label.
subtype_features = ["Spiculation", "Malignancy"]

# Label lookup: values[i][rating] is the text used for rating code `rating`
# of subtype_features[i].
values = [
    # Spiculation
    ["unrated", "unmarked", "marked", "marked", "marked", "marked"],
    # Malignancy
    ["unrated", "benign", "benign", "indeterminate", "malignant", "malignant"],
]

In [5]:
# Column-oriented accumulator for the output table: one list per output column,
# with one entry appended for every row of nodule_df.
subtype_data = {'noduleID': [], 'subtype': []}
for feature in subtype_features:
    subtype_data[feature] = []

# Index the ratings by nodule id so a nodule's row can be fetched with .loc.
inline_df.index = inline_df["noduleID"]

# A duplicated nodule id would make .loc return several rows instead of one and
# break the per-feature lookup below, so fail early with a clear message.
if not inline_df.index.is_unique:
    duplicate_ids = inline_df.index[inline_df.index.duplicated()].unique().tolist()
    raise ValueError(
        f"inline_df contains duplicate noduleID values, e.g. {duplicate_ids[:5]}"
    )

# Walk the two needed columns directly rather than calling .at once per cell.
for nodule_id, radiologist_id in zip(nodule_df["noduleID"], nodule_df["RadiologistID"]):
    # Ratings are stored in columns named "<feature>_<radiologist id>".
    rating_columns = [f'{feature}_{radiologist_id}' for feature in subtype_features]
    raw_ratings = inline_df.loc[nodule_id, rating_columns].values

    labels = []
    for i, raw_rating in enumerate(raw_ratings):
        feature_labels = values[i]

        # Pulling one row across columns of different dtypes upcasts the values
        # (e.g. 3 becomes 3.0), and a float cannot index a list. Normalise to a
        # plain int, and reject missing, fractional or out-of-range codes; a
        # negative code would otherwise silently index from the end of the list.
        try:
            rating = int(raw_rating)
        except (TypeError, ValueError):
            rating = None
        if rating is None or rating != raw_rating or not 0 <= rating < len(feature_labels):
            raise ValueError(
                f"Invalid {subtype_features[i]} rating {raw_rating!r} "
                f"for nodule {nodule_id}, radiologist {radiologist_id}"
            )

        subtype_data[subtype_features[i]].append(rating)
        labels.append(feature_labels[rating])

    subtype_data["noduleID"].append(nodule_id)
    # Subtype label is the per-feature labels joined with underscores.
    subtype_data["subtype"].append('_'.join(labels))

In [6]:
# Turn the per-column lists into a table: each key of subtype_data becomes a column.
subtype_df = pd.DataFrame(data=subtype_data)

In [7]:
# Write the subtyped table to disk; index=False leaves the row index out of the file.
subtype_df.to_csv(
    path_or_buf='../data/LIDC_allradiologists_spic_subtyped.csv',
    index=False,
)