In [None]:
import pickle
import numpy as np
from sklearn import svm
from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle
from sklearn.model_selection import cross_validate, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, accuracy_score, make_scorer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from pathlib import Path

Load Radiomics Matrix

In [None]:
# Configuration: the timepoint tag used in every file name, and the folder that
# holds the extracted radiomics for it.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# data folder before running elsewhere.
timepoint = "3month"
radiomics_folder = Path(r"C:\Met Recurrence\RadiomicsMRIsFixed\binWidth0p015N4viaOtsuNoResample0percExpansion3month")

# Feature matrix and label vector, both stored as .npy files in that folder.
radiomics_path = radiomics_folder / f"{timepoint}_contour_extractions_mat.npy"
labels_path = radiomics_folder / f"{timepoint}_labels.npy"
radiomics = np.load(radiomics_path)
labels = np.load(labels_path)
print(radiomics.shape)
print(labels.shape)

# Fail fast if the two files are out of sync: every later cell indexes the
# matrix rows and the labels in parallel.
if radiomics.shape[0] != labels.shape[0]:
    raise ValueError(
        f"Row count mismatch: {radiomics.shape[0]} feature rows in {radiomics_path.name} "
        f"vs {labels.shape[0]} labels in {labels_path.name}"
    )

In [None]:
def _read_pickle(file_name):
    """Unpickle `file_name` from `radiomics_folder` and return the stored object."""
    # NOTE(review): pickle.load can execute arbitrary code while loading; only
    # point this at files you produced yourself.
    with open(radiomics_folder / file_name, 'rb') as handle:
        return pickle.load(handle)


# Three companion pickles sit next to the feature matrix.
feature_names = _read_pickle(f"{timepoint}_feature_names.pkl")
contour_paths = _read_pickle(f"{timepoint}_contour_path_list.pkl")
mri_paths = _read_pickle(f"{timepoint}_mri_path_list.pkl")

# Spot check: print the contour path, MRI path and label stored at rows 6 and 7.
for row in (6, 7):
    print(contour_paths[row], mri_paths[row], labels[row], sep="\n")

In [None]:
# Peek at the first ten feature names.
feature_names[:10]

Select Patients

In [None]:
# Optional cohort filter, currently disabled and kept for reference. It would keep
# only the rows whose study ID (the parent folder name of each MRI path) has
# Primary_histology == 1 in the CSV.
# NOTE(review): the disabled code needs `pd`, which the import cell does not provide.
# # Select patients by primary tumor type
# histology_df = pd.read_csv(r"C:\Met Recurrence\MRNvsStudyIDvsPrimaryHistology.csv")
# selected_study_ids = set(histology_df['Study ID'][histology_df['Primary_histology'] == 1])
# feature_mat_study_ids = [int(Path(mri_path).parent.name) for mri_path in mri_paths]
# print(f"There are {len(selected_study_ids.intersection(set(feature_mat_study_ids)))} patients with selected tumor type")
# selected_bool_array = [study_id in selected_study_ids for study_id in feature_mat_study_ids]
# radiomics_tumor_type_selected = radiomics[selected_bool_array, :]
# labels_selected = labels[selected_bool_array]
# labels_selected.shape

# With the filter off, every row passes through unchanged.
radiomics_tumor_type_selected, labels_selected = radiomics, labels

Do Feature Selection

In [None]:
# Univariate feature selection: keep the 10 features with the highest F-score
# against the label. The original author's note: this is ANOVA for numeric input
# and categorical output, and ANOVA with 2 classes is a t-test.
#
# NOTE: the selector is fitted on ALL samples and their labels. Cross-validating
# a model on `radiomics_feature_selected` therefore lets the held-out labels
# influence which features the model sees, which biases CV scores upwards.
# For an unbiased estimate, put a SelectKBest step inside the model pipeline.
fs = SelectKBest(score_func=f_regression, k=10)
radiomics_feature_selected = fs.fit_transform(radiomics_tumor_type_selected, labels_selected)

# A bare expression in the middle of a cell is not displayed, so print the shape.
print(radiomics_feature_selected.shape)

# Names of the features that survived selection (shown as the cell output).
fs.get_feature_names_out(feature_names)

In [None]:
# F-scores of the features kept by the selector.
selected_mask = fs.get_support()
fs.scores_[selected_mask]
# feature_names_np = np.array(feature_names, dtype=object)
# feature_names_np[fs.get_support()]

In [None]:
# Re-run the univariate F-test on the full matrix to get p-values, then show
# the p-values of the features kept by the selector.
f_values, ps = f_regression(radiomics_tumor_type_selected, labels_selected)
selected_mask = fs.get_support()
ps[selected_mask]

Random Forest

In [None]:
# Random Forest
# Step 1: choose hyperparameters with a grid search repeated over several
#         shuffles of the data, averaging the mean CV score per grid point.
# Step 2: estimate performance with repeated, shuffled, stratified 5-fold CV.
#
# Feature selection is a pipeline step here (configured exactly like `fs`) and
# the pipeline is given the full feature matrix, so each CV training fold picks
# its own features. Cross-validating on a matrix that was already reduced using
# every label leaks the held-out labels into training and inflates the scores.
# NOTE(review): hyperparameters are still tuned on the same samples that are
# scored in step 2, so the estimate stays somewhat optimistic; nested CV would
# remove that.
RANDOM_SEED = 0  # fixed so shuffles, CV splits and forests repeat across runs

# first get the best hyperparameters
param_grid = {'randomforestclassifier__n_estimators':[10, 100, 200], 'randomforestclassifier__max_depth':[2, 5, None]}
mean_test_score_list = []
for i in range(5):
    radiomics_shuffled, labels_shuffled = shuffle(
        radiomics_tumor_type_selected, labels_selected, random_state=RANDOM_SEED + i
    )
    pipe = make_pipeline(
        SelectKBest(**fs.get_params()),  # unfitted selector with the same settings as `fs`
        StandardScaler(),
        RandomForestClassifier(random_state=RANDOM_SEED),
    )
    grid = GridSearchCV(pipe, param_grid=param_grid)
    grid.fit(radiomics_shuffled, labels_shuffled)
    mean_test_score_list.append(grid.cv_results_['mean_test_score'])
    print(grid.best_params_)
    print(grid.best_score_)

# Rows = grid points, columns = repeats. The grid order is identical in every
# repeat, so the last `grid` can be used to look up the winning combination.
mean_test_scores = np.stack(mean_test_score_list, axis=1)
best_score_index = np.argmax(np.mean(mean_test_scores, axis=1))
# 'params' holds plain Python values (including None), unlike the masked
# 'param_*' arrays.
best_params = grid.cv_results_['params'][best_score_index]
best_n_estimators = best_params['randomforestclassifier__n_estimators']
best_max_depth = best_params['randomforestclassifier__max_depth']

# Sensitivity = recall of class 1, specificity = recall of class 0.
scorers = {
    'accuracy_score': make_scorer(accuracy_score),
    'sensitivity_score': make_scorer(recall_score),
    'specificity_score': make_scorer(recall_score, pos_label=0),
}
# now repeat a 5-fold CV n times with the best hyperparameters
num_tests = 20
all_scores = []
for i in range(num_tests):
    pipe = make_pipeline(
        SelectKBest(**fs.get_params()),
        StandardScaler(),
        RandomForestClassifier(
            n_estimators=best_n_estimators,
            max_depth=best_max_depth,
            random_state=RANDOM_SEED + i,
        ),
    )
    # A different but reproducible fold assignment for every repeat.
    splitter = StratifiedKFold(shuffle=True, random_state=RANDOM_SEED + i)
    scores = cross_validate(pipe, radiomics_tumor_type_selected, labels_selected, scoring=scorers, cv=splitter)
    all_scores.append(scores)

# Mean and standard deviation over all folds of all repeats.
for score_str in ['accuracy', 'specificity', 'sensitivity']:
    measure = [scores[f'test_{score_str}_score'] for scores in all_scores]
    print(f"{num_tests} x 5-fold CV {score_str} = {np.mean(measure):.3f} +/- {np.std(measure):.3f}")

RBF SVM

In [None]:
# RBF SVM
# Step 1: choose C and gamma with a grid search repeated over several shuffles
#         of the data, averaging the mean CV score per grid point.
# Step 2: estimate performance with repeated, shuffled, stratified 5-fold CV.
#
# Feature selection is a pipeline step here (configured exactly like `fs`) and
# the pipeline is given the full feature matrix, so each CV training fold picks
# its own features. Cross-validating on a matrix that was already reduced using
# every label leaks the held-out labels into training and inflates the scores.
# NOTE(review): hyperparameters are still tuned on the same samples that are
# scored in step 2, so the estimate stays somewhat optimistic.
RANDOM_SEED = 0  # fixed so shuffles and CV splits repeat across runs

# first get the best hyperparameters
param_grid = {'svc__gamma':np.logspace(-10, -1, 10), 'svc__C':np.linspace(0.001, 20, 10)}
mean_test_score_list = []
for i in range(5):
    radiomics_shuffled, labels_shuffled = shuffle(
        radiomics_tumor_type_selected, labels_selected, random_state=RANDOM_SEED + i
    )
    pipe = make_pipeline(
        SelectKBest(**fs.get_params()),  # unfitted selector with the same settings as `fs`
        StandardScaler(),
        svm.SVC(),
    )
    grid = GridSearchCV(pipe, param_grid=param_grid)
    grid.fit(radiomics_shuffled, labels_shuffled)
    mean_test_score_list.append(grid.cv_results_['mean_test_score'])
    print(grid.best_params_)

# Rows = grid points, columns = repeats. The grid order is identical in every
# repeat, so the last `grid` can be used to look up the winning combination.
mean_test_scores = np.stack(mean_test_score_list, axis=1)
best_score_index = np.argmax(np.mean(mean_test_scores, axis=1))
# Read the winner from 'params' (plain values), as the Linear SVM cell does.
best_params = grid.cv_results_['params'][best_score_index]
best_C = best_params['svc__C']
best_gamma = best_params['svc__gamma']

# Sensitivity = recall of class 1, specificity = recall of class 0.
scorers = {
    'accuracy_score': make_scorer(accuracy_score),
    'sensitivity_score': make_scorer(recall_score),
    'specificity_score': make_scorer(recall_score, pos_label=0),
}
# now repeat a 5-fold CV n times with the best hyperparameters
num_tests = 20
all_scores = []
for i in range(num_tests):
    pipe = make_pipeline(
        SelectKBest(**fs.get_params()),
        StandardScaler(),
        svm.SVC(C=best_C, gamma=best_gamma),
    )
    # A different but reproducible fold assignment for every repeat.
    splitter = StratifiedKFold(shuffle=True, random_state=RANDOM_SEED + i)
    scores = cross_validate(pipe, radiomics_tumor_type_selected, labels_selected, scoring=scorers, cv=splitter)
    all_scores.append(scores)

# Mean and standard deviation over all folds of all repeats.
for score_str in ['accuracy', 'specificity', 'sensitivity']:
    measure = [scores[f'test_{score_str}_score'] for scores in all_scores]
    print(f"{num_tests} x 5-fold CV {score_str} = {np.mean(measure):.3f} +/- {np.std(measure):.3f}")

Linear SVM

In [None]:
# Linear SVM
# Step 1: run a grid search over C on several shuffles of the data and keep the
#         C that wins most often.
# Step 2: estimate performance with repeated, shuffled, stratified 5-fold CV.
#
# Feature selection is a pipeline step here (configured exactly like `fs`) and
# the pipeline is given the full feature matrix, so each CV training fold picks
# its own features. Cross-validating on a matrix that was already reduced using
# every label leaks the held-out labels into training and inflates the scores.
# NOTE(review): C is still tuned on the same samples that are scored in step 2,
# so the estimate stays somewhat optimistic.
RANDOM_SEED = 0  # fixed so shuffles and CV splits repeat across runs

# first get the best hyperparameters
best_Cs = []
for i in range(10):
    radiomics_shuffled, labels_shuffled = shuffle(
        radiomics_tumor_type_selected, labels_selected, random_state=RANDOM_SEED + i
    )
    pipe = make_pipeline(
        SelectKBest(**fs.get_params()),  # unfitted selector with the same settings as `fs`
        StandardScaler(),
        svm.SVC(kernel='linear'),
    )
    grid = GridSearchCV(pipe, param_grid={'svc__C':[0.0001, 0.001, 0.0025, 0.005, 0.01, 0.1, 1]})
    grid.fit(radiomics_shuffled, labels_shuffled)
    # best_params_ is the grid point with the highest mean test score.
    best_Cs.append(grid.best_params_['svc__C'])
best_C = max(best_Cs, key=best_Cs.count)  # most frequent best_C

# Sensitivity = recall of class 1, specificity = recall of class 0.
scorers = {
    'accuracy_score': make_scorer(accuracy_score),
    'sensitivity_score': make_scorer(recall_score),
    'specificity_score': make_scorer(recall_score, pos_label=0),
}
# now repeat a 5-fold CV n times with the best hyperparameters
num_tests = 20
all_scores = []
for i in range(num_tests):
    pipe = make_pipeline(
        SelectKBest(**fs.get_params()),
        StandardScaler(),
        svm.SVC(C=best_C, kernel='linear'),
    )
    # A different but reproducible fold assignment for every repeat.
    splitter = StratifiedKFold(shuffle=True, random_state=RANDOM_SEED + i)
    scores = cross_validate(pipe, radiomics_tumor_type_selected, labels_selected, scoring=scorers, cv=splitter)
    all_scores.append(scores)

# Mean and standard deviation over all folds of all repeats.
for score_str in ['accuracy', 'specificity', 'sensitivity']:
    measure = [scores[f'test_{score_str}_score'] for scores in all_scores]
    print(f"{num_tests} x 5-fold CV {score_str} = {np.mean(measure):.3f} +/- {np.std(measure):.3f}")

Multivariate Logistic Regression

In [None]:
# Logistic regression baseline: one 5-fold CV (cross_validate's default) on a
# shuffled copy of the data.
#
# Feature selection is a pipeline step here (configured exactly like `fs`) and
# the pipeline is given the full feature matrix, so each CV training fold picks
# its own features. Cross-validating on a matrix that was already reduced using
# every label leaks the held-out labels into training and inflates the scores.
RANDOM_SEED = 0  # fixed so the shuffle (and hence the folds) repeats across runs

# Sensitivity = recall of class 1, specificity = recall of class 0.
scorers = {
    'accuracy_score': make_scorer(accuracy_score),
    'sensitivity_score': make_scorer(recall_score),
    'specificity_score': make_scorer(recall_score, pos_label=0),
}
pipe = make_pipeline(
    SelectKBest(**fs.get_params()),  # unfitted selector with the same settings as `fs`
    StandardScaler(),
    LogisticRegression(),
)
radiomics_shuffled, labels_shuffled = shuffle(
    radiomics_tumor_type_selected, labels_selected, random_state=RANDOM_SEED
)
scores = cross_validate(pipe, radiomics_shuffled, labels_shuffled, scoring=scorers)

# Report every metric that was computed, not only accuracy.
for score_str in ['accuracy', 'sensitivity', 'specificity']:
    fold_values = scores[f'test_{score_str}_score']
    print(f"{score_str}: {fold_values.mean():.3f} +/- {fold_values.std():.3f}")

AdaBoost

In [None]:
# AdaBoost baseline with default hyperparameters: one shuffled, stratified
# 5-fold CV.
#
# Feature selection is a pipeline step here (configured exactly like `fs`) and
# the pipeline is given the full feature matrix, so each CV training fold picks
# its own features. Cross-validating on a matrix that was already reduced using
# every label leaks the held-out labels into training and inflates the scores.
RANDOM_SEED = 0  # fixed so the model and the fold assignment repeat across runs

# Sensitivity = recall of class 1, specificity = recall of class 0.
scorers = {
    'accuracy_score': make_scorer(accuracy_score),
    'sensitivity_score': make_scorer(recall_score),
    'specificity_score': make_scorer(recall_score, pos_label=0),
}
pipe = make_pipeline(
    SelectKBest(**fs.get_params()),  # unfitted selector with the same settings as `fs`
    StandardScaler(),
    AdaBoostClassifier(random_state=RANDOM_SEED),
)
splitter = StratifiedKFold(shuffle=True, random_state=RANDOM_SEED)
scores = cross_validate(pipe, radiomics_tumor_type_selected, labels_selected, scoring=scorers, cv=splitter)

# Report every metric that was computed, not only accuracy.
for score_str in ['accuracy', 'sensitivity', 'specificity']:
    fold_values = scores[f'test_{score_str}_score']
    print(f"{score_str}: {fold_values.mean():.3f} +/- {fold_values.std():.3f}")

Save Best Linear SVM

In [None]:
# Train and save best linear SVM
#
# NOTE: `best_C` is assigned by both the RBF SVM cell and the Linear SVM cell.
# Make sure the Linear SVM cell was the last of the two to run (true for a
# top-to-bottom Run All), otherwise the RBF value is silently used here.
# NOTE(review): the 10 input columns were chosen by `fs`, which was fitted on
# all rows, including the ones left out below. Refit the selector without them
# if they are meant to be a truly held-out case.
NUM_LEADING_ROWS_EXCLUDED = 8  # presumably the rows of study 0004; TODO confirm against `mri_paths`
RANDOM_SEED = 0  # fixed so the fitted model is repeatable

pipe = make_pipeline(StandardScaler(), svm.SVC(C=best_C, kernel='linear'))
radiomics_no0004 = radiomics_feature_selected[NUM_LEADING_ROWS_EXCLUDED:]
labels_no0004 = labels_selected[NUM_LEADING_ROWS_EXCLUDED:]
radiomics_no0004_shuffled, labels_no0004_shuffled = shuffle(
    radiomics_no0004, labels_no0004, random_state=RANDOM_SEED
)
pipe.fit(radiomics_no0004_shuffled, labels_no0004_shuffled)

# with open(radiomics_folder / "linear_svc_3monthOnly_0004complement.pkl", 'wb') as f:
#     pickle.dump(pipe, f)