In [2]:
import numpy as np
import cv2
import os 
from sklearn.ensemble import IsolationForest
import pandas as pd
from sklearn.metrics import roc_curve
from sklearn import metrics


In [3]:
#create features
def quantify_image(image):
	hist =cv2.calcHist([image], [0], mask=None, histSize=[6], ranges=[0, 256])
	hist = cv2.normalize(hist, hist).flatten()
	return hist

In [4]:
#load dataset
def load_dataset(datasetPath, indexes):
    imagePaths = os.listdir(datasetPath)
    data = []
    string=''
    for i in indexes:
      if len(str(i)) == 1:
          string = '000' + str(i) + '.npy'
      elif len(str(i)) == 2:
          string = '00' + str(i)+ '.npy'
      elif len(str(i)) == 3:
          string = '0' + str(i)+ '.npy'
      elif len(str(i)) == 4:
          string = str(i)+ '.npy'
      image = np.load(datasetPath+ string)
      features = quantify_image(image)
      data.append(features)
    return np.array(data)

In [27]:
#load in metadata and get indexes of training cases
meta = pd.read_csv('/content/gdrive/MyDrive/siamese/Github/metadata.csv') 
indexes = meta.loc[meta['iso_set']==1].index.values

In [7]:
#load the train dataset
train = load_dataset('/content/gdrive/MyDrive/MRNet/data/train/sagittal/', indexes)


In [8]:
#train the model
model = IsolationForest(n_estimators=100, contamination=0.01,
	random_state=42)
model.fit(train)

IsolationForest(behaviour='deprecated', bootstrap=False, contamination=0.01,
                max_features=1.0, max_samples='auto', n_estimators=100,
                n_jobs=None, random_state=42, verbose=0, warm_start=False)

In [29]:
#get index of test set that are present in the official mrnet training data
test_ind = meta.loc[(meta['test']==1) & (meta['mrnet_split'] == 0), 'id']
#create features for test data in the official train set
test_p1 = load_dataset('/content/gdrive/MyDrive/MRNet/data/train/sagittal/', test_ind)
#create features for test data in the official validation set
test_p2 = load_dataset('/content/gdrive/MyDrive/MRNet/data/valid/sagittal/', list(meta.loc[(meta['test']==1) & (meta['mrnet_split'] == 1), 'id']))
#merge the features for the test data
test_all = np.zeros((len(meta.loc[meta['test']==1]), test_p1.shape[1]))
test_all[:test_p1.shape[0],:] = test_p1
test_all[test_p1.shape[0]:,:] = test_p2

In [30]:
#predict no test set
preds=[]
for ind in range(0, len(test_all)):
  preds.append(model.score_samples([test_all[ind]])[0] * -1)
  
#create dataframe with MRI id and probability
p =pd.concat([meta.loc[meta['test']==1, 'id'].reset_index(drop=True), pd.DataFrame(preds)], axis =1)
p.columns=['id','pred']

In [31]:
#get labels 
p=p.merge( meta[['id', 'label']], on='id', how='left')

In [25]:
#calculate AUC 
fpr, tpr, thresholds = roc_curve(p['label'],p['pred'])
metrics.auc(fpr, tpr)

0.8017174082747853