In [1]:
import pandas as pd
import numpy as np
import os
import glob
import pydicom
from pandasql import sqldf
import random
import lightgbm as lgb

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from skimage.transform import resize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
def get_img_dict(patient_id,info_dict):
    patient_info = info_dict[str(patient_id)]
    img_dict = {}
    for idx,i in enumerate(patient_info['SeriesInstanceUIDs']):
        img_dict[i] = {'images': [],'description': patient_info['SeriesDescriptions'][idx]}
        images = glob.glob(f"{patient_info['folder_path']}/{patient_info['SeriesInstanceUIDs'][idx]}/*.dcm")
        for j in sorted(images,key=lambda x: int(x.replace('\\','/').split('/')[-1].replace('.dcm',''))):
            img_dict[i]['images'].append({'SOPInstanceUID': j.replace('\\','/').split('/')[-1].replace('.dcm',''),'dicom': pydicom.dcmread(j)})
    return img_dict

# https://www.kaggle.com/code/dschettler8845/isic-detect-skin-cancer-let-s-learn-together
def fill_nulls(df):
    num_cols = df.select_dtypes(include=['int64','float64']).columns
    num_imputer = SimpleImputer(strategy='median')
    df[num_cols] = num_imputer.fit_transform(df[num_cols])
    cat_cols = df.select_dtypes(include=['object']).columns
    cat_imputer = SimpleImputer(strategy='most_frequent')
    df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])
    return df

# https://datascience.stackexchange.com/questions/70298/labelencoding-selected-columns-in-a-dataframe-using-for-loop
def label_encode(df,cols):
    le = LabelEncoder()
    df[cols] = le.fit_transform(df[cols])
    return df

def oh_encode(df,cols):
    encoder = OneHotEncoder(sparse_output=False,handle_unknown='ignore')
    encoded_cols = encoder.fit_transform(df[cols])
    col_names = encoder.get_feature_names_out(cols)
    encoded_df = pd.DataFrame(encoded_cols,columns=col_names,index=df.index)
    combined_df = pd.concat([df.drop(cols,axis=1),encoded_df],axis=1)
    return combined_df

def standardize(df,cols):
    scaler = StandardScaler()
    df[cols] = scaler.fit_transform(df[cols])
    return df

def preprocess_df(df,le_cols,oh_cols,num_cols):
    df = fill_nulls(df)
    df = label_encode(df,le_cols)
    df = oh_encode(df,oh_cols)
    df = standardize(df,num_cols)
    return df

def generate_lgbm_params(random_seed=22):
    random.seed(random_seed)
    params = {}
    params['objective'] = 'multiclass'
    params['num_class'] = 4
    params['random_state'] = random_seed
    params['bagging_freq'] = 1
    params['verbosity'] = -1
    params['max_depth'] = 1
    params['min_child_samples'] = 2
    params['n_estimators'] = random.uniform(1400, 2400)
    params['learning_rate'] = random.uniform(0.001, 0.003)
    params['num_leaves'] = random.uniform(16, 40)
    params['min_data_in_leaf'] = random.uniform(16, 60)
    params['pos_bagging_fraction'] = random.uniform(0.74, 0.78)
    params['neg_bagging_fraction'] = random.uniform(0.04, 0.08)
    params['feature_fraction'] = random.uniform(0.5, 0.78)
    params['lambda_l1'] = random.uniform(0.1, 0.4)
    params['lambda_l2'] = random.uniform(0.7, 3.0)
    return params

def preprocess_img(img,targ_x,targ_y,output_width=64,output_height=64):
    start_x = max(0,int(targ_x - output_width / 2))
    end_x = min(img.shape[1],int(targ_x + output_width / 2))
    start_y = max(0,int(targ_y - output_height / 2))
    end_y = min(img.shape[0],int(targ_y + output_height / 2))
    img = img[start_y:end_y,start_x:end_x]
    lower, upper = np.percentile(img,(1,99))
    img = np.clip(img,lower,upper)
    img = img - np.min(img)
    img = img / np.max(img)
    img = (img*255).astype("uint8")
    img = cv2.resize(img,(output_width,output_height))
    return img

In [3]:
train_df = pd.read_csv('../data/train.csv')
desc_df = pd.read_csv('../data/train_series_descriptions.csv')
coor_df = pd.read_csv('../data/train_label_coordinates.csv')

train_df = fill_nulls(train_df)

In [4]:
id_path_list = [(x,f'../data/train_images/{x}') for x in os.listdir('../data/train_images')]
id_info_dict = {p[0]: {'folder_path': p[1],
                       'SeriesInstanceUIDs': os.listdir(p[1]),
                       'SeriesDescriptions': []}
                for p in id_path_list}

for k in id_info_dict:
    for s in id_info_dict[k]['SeriesInstanceUIDs']:
        id_info_dict[k]['SeriesDescriptions'].append(desc_df[(desc_df['study_id'] == int(k)) &
            (desc_df['series_id'] == int(s))]['series_description'].iloc[0])

display(id_info_dict[list(id_info_dict.keys())[0]])

{'folder_path': '../data/train_images/10728036',
 'SeriesInstanceUIDs': ['142859125', '2073726394', '2399638375', '3491739931'],
 'SeriesDescriptions': ['Axial T2',
  'Axial T2',
  'Sagittal T1',
  'Sagittal T2/STIR']}

In [5]:
# https://www.datacamp.com/tutorial/how-to-use-sql-in-pandas-using-pandasql-queries
# https://medium.com/@davidfagb/using-sql-with-pandas-dataframes-1c36f57ea65d
col_str = str(list(set(list(train_df.columns) + list(desc_df.columns) + list(coor_df.columns)))).replace('[','').replace(']','').replace('\'','')
query_cols = col_str.replace('study_id','t.study_id').replace('series_id','d.series_id')
valid_study_ids = str(list(id_info_dict.keys())).replace('[','(').replace(']',')').replace('\'','')
query = f'''SELECT {query_cols}
            FROM train_df t
            JOIN desc_df d ON t.study_id = d.study_id
            JOIN coor_df c ON d.series_id = c.series_id
            WHERE t.study_id IN {valid_study_ids}'''
joined_df = sqldf(query)
display(joined_df.head(1))
display(joined_df.shape)

Unnamed: 0,right_subarticular_stenosis_l1_l2,spinal_canal_stenosis_l3_l4,right_neural_foraminal_narrowing_l3_l4,right_subarticular_stenosis_l2_l3,left_subarticular_stenosis_l5_s1,left_neural_foraminal_narrowing_l1_l2,left_neural_foraminal_narrowing_l2_l3,series_description,spinal_canal_stenosis_l5_s1,right_subarticular_stenosis_l4_l5,...,right_neural_foraminal_narrowing_l4_l5,spinal_canal_stenosis_l4_l5,left_subarticular_stenosis_l4_l5,condition,left_subarticular_stenosis_l1_l2,left_neural_foraminal_narrowing_l3_l4,left_subarticular_stenosis_l2_l3,left_neural_foraminal_narrowing_l5_s1,right_subarticular_stenosis_l5_s1,spinal_canal_stenosis_l1_l2
0,Normal/Mild,Normal/Mild,Moderate,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Sagittal T2/STIR,Normal/Mild,Normal/Mild,...,Moderate,Normal/Mild,Moderate,Spinal Canal Stenosis,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild,Normal/Mild


(617, 33)

In [6]:
label_dict = {}
counter = 0
for k in id_info_dict:
    d = get_img_dict(k,id_info_dict)
    for i in id_info_dict[str(k)]['SeriesInstanceUIDs']:
        temp_df = joined_df[(joined_df['study_id'] == int(k)) & (joined_df['series_id'] == int(i))]
        instances = list(temp_df['instance_number'])
        for j in d[i]['images']:
            if int(j['SOPInstanceUID']) in instances:
                temp_df_2 = temp_df[temp_df['instance_number'] == int(j['SOPInstanceUID'])]
                for idc, c in temp_df_2.iterrows():
                    plane = c['series_description']
                    condition = c['condition']
                    location = c['level']
                    severity = c[f'{condition.lower().replace(' ','_')}_{location.lower().replace('/','_')}']
                    label_dict[counter] = [severity] + [plane] + [condition] + [location] + list(resize(j['dicom'].pixel_array,(16,16)).flatten())
                    counter = counter + 1

col_names = ['label','plane','condition','location'] + [str(x) for x in list(range(16*16))]
df = pd.DataFrame.from_dict(label_dict,orient='index',columns=col_names)
display(df.head(1))
display(df.shape)

Unnamed: 0,label,plane,condition,location,0,1,2,3,4,5,...,246,247,248,249,250,251,252,253,254,255
0,Normal/Mild,Axial T2,Left Subarticular Stenosis,L1/L2,0.000918,0.001447,0.001483,0.001406,0.0013,0.001198,...,0.00088,0.000856,0.000899,0.000894,0.000892,0.000882,0.000878,0.000869,0.000878,0.000799


(617, 260)

In [7]:
df = preprocess_df(df,list(df.columns)[0],list(df.columns)[1:4],[str(x) for x in list(df.columns)[4:]])
display(df.head(5))
display(df.shape)

Unnamed: 0,label,0,1,2,3,4,5,6,7,8,...,condition_Left Neural Foraminal Narrowing,condition_Left Subarticular Stenosis,condition_Right Neural Foraminal Narrowing,condition_Right Subarticular Stenosis,condition_Spinal Canal Stenosis,location_L1/L2,location_L2/L3,location_L3/L4,location_L4/L5,location_L5/S1
0,1,-0.15438,-0.044282,-0.038475,-0.07203,-0.12055,-0.200906,-0.290914,-0.347303,-0.404682,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1,-0.191468,-0.106119,-0.101068,-0.151066,-0.191663,-0.24371,-0.307365,-0.34706,-0.396173,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1,-0.164684,-0.103773,-0.085485,-0.123764,-0.160831,-0.227055,-0.314144,-0.344194,-0.377653,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1,-0.05223,-0.074691,-0.073493,-0.087907,-0.135527,-0.212421,-0.260433,-0.309829,-0.371832,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1,0.047641,0.079656,0.10697,0.136847,0.106956,-0.039117,-0.161913,-0.143267,-0.18238,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


(617, 270)

In [8]:
targ_col = 'label'
X = df.drop(targ_col,axis=1)
Y = df[targ_col]
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=22)

In [9]:
model = lgb.LGBMClassifier(num_leaves=4,max_depth=2,min_child_samples=4)
model.fit(X_train,Y_train)

preds = model.predict(X_test)
display(preds)

acc = accuracy_score(Y_test,preds)
display(acc)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007352 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 39543
[LightGBM] [Info] Number of data points in the train set: 493, number of used features: 269
[LightGBM] [Info] Start training from score -1.605389
[LightGBM] [Info] Start training from score -0.289713
[LightGBM] [Info] Start training from score -2.981633


array([1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 2,
       1, 0, 1, 1, 1, 1, 1, 2, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0])

0.782258064516129

In [10]:
# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
# https://learn.microsoft.com/en-us/windows/ai/windows-ml/tutorials/pytorch-train-model
# https://medium.com/@golnaz.hosseini/beginner-tutorial-image-classification-using-pytorch-63f30dcc071c

# https://keras.io/examples/
# https://www.analyticsvidhya.com/blog/2021/06/image-processing-using-cnn-a-beginners-guide/

# https://paperswithcode.com/task/medical-image-classification