In [None]:
def binner(y, class1):
    '''
    Encode labels as binary values.

    Every element of y that compares equal to class1 becomes 0; every other
    element becomes 1. The result is a new list in the same order as y.
    '''
    return [0 if label == class1 else 1 for label in y]
def plot_roc(y_score, y_truth, output_dir, title):
    '''
    Draw a ROC curve and save the figure.

    y_score is indexed as y_score[:, 1]: only its second column is used as
    the score that is compared against y_truth. The area under the curve is
    shown in the legend, title is used as the plot title, and the figure is
    written to the location passed as output_dir via plt.savefig.
    '''
    # Keep only the column holding the scores for class = 1
    positive_scores = y_score[:, 1]

    # ROC curve points and the area under that curve
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y_truth, positive_scores)
    area_under_curve = metrics.auc(false_pos_rate, true_pos_rate)

    line_width = 2
    plt.figure()

    # The ROC curve itself
    curve_label = 'ROC curve (area = %0.2f)' % area_under_curve
    plt.plot(false_pos_rate, true_pos_rate, color='darkorange',
             lw=line_width, label=curve_label)

    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference
    plt.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    # Axes, labels, title and legend
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")

    plt.savefig(output_dir)
def _clip_outliers_iqr(frame):
    '''
    Replace outliers in every column of frame, modifying frame in place.

    Per column, Q1 and Q3 are computed from that column itself (midpoint
    interpolation). Values above Q3 + 1.5*IQR are set to Q3 and values below
    Q1 - 1.5*IQR are set to Q1.
    '''
    for feature in frame.columns:
        # NOTE(review): newer NumPy releases deprecate `interpolation=` in
        # favour of `method=`; kept as-is to stay compatible with the NumPy
        # version this file was written against.
        Q1 = np.percentile(frame[feature], 25, interpolation = 'midpoint')
        Q3 = np.percentile(frame[feature], 75, interpolation = 'midpoint')
        IQR = Q3 - Q1
        if IQR == 0:
            # With a zero IQR both bounds collapse onto the quartiles, so any
            # value differing from them would be overwritten; skip the column.
            continue
        frame.loc[frame[feature] > (Q3+1.5*IQR), feature] = Q3 # Upper bound
        frame.loc[frame[feature] < (Q1-1.5*IQR), feature] = Q1 # Lower bound


def _nested_cv_best_values(estimator, param_name, candidates, X, y):
    '''
    Tune one hyperparameter inside each fold of a 6-fold stratified split.

    For every outer fold, a GridSearchCV (3-fold stratified, scored with
    'roc_auc') over candidates is fitted on the training part of that fold.
    The value of param_name on the resulting best estimator is collected.

    Returns the list of best values, one per outer fold.
    '''
    outer_cv = StratifiedKFold(n_splits=6)
    best_values = []
    for train_opt_index, _ in outer_cv.split(X, y):
        inner_cv = StratifiedKFold(n_splits=3)
        grid_search = GridSearchCV(estimator, {param_name: candidates},
                                   cv=inner_cv, scoring='roc_auc')
        grid_search.fit(X[train_opt_index], y[train_opt_index])
        best_values.append(getattr(grid_search.best_estimator_, param_name))
    return best_values


def _evaluate_binary_classifier(clf, X_train, y_train, X_test, y_test):
    '''
    Score a fitted classifier on the train and test data.

    The second column of predict_proba is used as the score for the AUC.
    Sensitivity and specificity come from the confusion matrix of the test
    predictions, which is unpacked as (tn, fp, fn, tp) and therefore has to
    be 2x2.

    Returns (auc_train, auc_test, sensitivity, specificity).
    '''
    scores_train = clf.predict_proba(X_train)[:, 1]
    scores_test = clf.predict_proba(X_test)[:, 1]

    auc_train = metrics.roc_auc_score(y_train, scores_train)
    auc_test = metrics.roc_auc_score(y_test, scores_test)

    tn, fp, fn, tp = metrics.confusion_matrix(y_test, clf.predict(X_test)).ravel()
    specificity = tn / (tn+fp)
    sensitivity = tp / (tp+fn)
    return auc_train, auc_test, sensitivity, specificity


def eval_func(features, label):
    '''
    Entire script for evaluation set up.
    Note: Same code as in next sections!

    Pipeline, in order:
      1. Stratified 80/20 train/test split of features and label.
      2. IQR-based outlier clipping of the train and test features.
      3. Min-max scaling fitted on the training features.
      4. Feature selection with a Lasso model (threshold 'median'), fitted
         on the training features against the binarised labels ('T12' -> 0,
         everything else -> 1).
      5. PCA keeping the components that explain 90% of the variance.
      6. A linear SVC and a distance-weighted KNN, each tuned by nested
         cross-validation on the training data, refitted on all training
         data with the mean of the per-fold optima and scored on both sets.

    features must offer .columns and .loc (a pandas DataFrame); label must
    offer .to_numpy() after splitting (a pandas Series).

    Returns the tuple
    (auc_svc_train, auc_svc_test, sensitivity_svc, specificity_svc,
     auc_knn_train, auc_knn_test, sensitivity_knn, specificity_knn).
    '''
    X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=.2, stratify=label)

    # The clipping below assigns in place, so work on explicit copies of the
    # split frames rather than on whatever train_test_split handed back.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Binary labels are only needed for the Lasso-based feature selection.
    # NOTE(review): the classifiers further down are trained and scored on
    # the raw labels, not on these binarised ones. The confusion-matrix
    # unpacking there only works if the raw labels hold exactly two classes.
    y_train_bin = binner(y_train, 'T12')

    # Outlier detection
    # NOTE(review): the test set is clipped with quartiles computed on the
    # test set itself, not with the training quartiles - confirm intended.
    _clip_outliers_iqr(X_train)
    _clip_outliers_iqr(X_test)

    # Scaler
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    X_train_scaled = pd.DataFrame(scaler.transform(X_train), columns = features.columns)
    X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns = features.columns)

    # Feature selection
    selector = SelectFromModel(estimator=Lasso(alpha=10**(-6), tol=0.002248888888888889), threshold='median')
    selector.fit(X_train_scaled, y_train_bin)
    X_train_fs = selector.transform(X_train_scaled)
    X_test_fs = selector.transform(X_test_scaled)

    N_COMP = .9 # Capture components at 90% of the variance
    pca = PCA(n_components=N_COMP)
    pca.fit(X_train_fs)
    X_train_pca = pca.transform(X_train_fs)
    X_test_pca = pca.transform(X_test_fs)

    # Positional indexing inside the CV loops needs a plain array of labels
    y_train_a = y_train.to_numpy()

    # SVC
    best_cs = _nested_cv_best_values(
        SVC(kernel = 'linear', probability=True),
        'C',
        list(np.linspace(0.01,1,100)),
        X_train_pca,
        y_train_a,
    )
    optimal_c = float(np.mean(best_cs))

    # Use the optimal parameters without any tuning to validate the optimal classifier
    clf = SVC(kernel = 'linear', probability=True, C=optimal_c)
    # Fit on the entire training set
    clf.fit(X_train_pca, y_train)

    # Test the classifier on the independent test data
    auc_svc_train, auc_svc_test, sensitivity_svc, specificity_svc = _evaluate_binary_classifier(
        clf, X_train_pca, y_train, X_test_pca, y_test)

    # KNN
    best_ks = _nested_cv_best_values(
        KNeighborsClassifier(weights='distance'),
        'n_neighbors',
        list(range(1,31)),
        X_train_pca,
        y_train_a,
    )
    # int() truncates the mean towards zero
    optimal_n = int(np.mean(best_ks))

    # Use the optimal parameters without any tuning to validate the optimal classifier
    clf = KNeighborsClassifier(weights='distance', n_neighbors=optimal_n)
    # Fit on the entire training set
    clf.fit(X_train_pca, y_train)

    # Test the classifier on the independent test data
    auc_knn_train, auc_knn_test, sensitivity_knn, specificity_knn = _evaluate_binary_classifier(
        clf, X_train_pca, y_train, X_test_pca, y_test)

    return auc_svc_train, auc_svc_test,sensitivity_svc, specificity_svc, auc_knn_train, auc_knn_test, sensitivity_knn, specificity_knn