In [2]:
import math
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
# import seaborn as sns
from joblib import Parallel, delayed
from scipy.integrate import nquad, quad, simps
from scipy.stats import entropy, gamma, multivariate_normal, norm
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import normalize
from sktree.ensemble import HonestForestClassifier
from sktree.stats import build_hyppo_oob_forest
from sktree.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, roc_curve
import time

In [3]:
def sensitivity_at_specificity(y_true, y_score, target_specificity=0.98, pos_label=1):
    n_trees, n_samples, n_classes = y_score.shape

    # Compute nan-averaged y_score along the trees axis
    y_score_avg = np.nanmean(y_score, axis=0)

    # Extract true labels and nan-averaged predicted scores for the positive class
    y_true = y_true.ravel()
    y_score_binary = y_score_avg[:, 1]

    # Identify rows with NaN values in y_score_binary
    nan_rows = np.isnan(y_score_binary)

    # Remove NaN rows from y_score_binary and y_true
    y_score_binary = y_score_binary[~nan_rows]
    y_true = y_true[~nan_rows]

    # Compute ROC curve
    fpr, tpr, thresholds = roc_curve(y_true, y_score_binary, pos_label=pos_label)

    # Find the threshold corresponding to the target specificity
    index = np.argmax(fpr >= (1 - target_specificity))
    threshold_at_specificity = thresholds[index]

    # Compute sensitivity at the chosen specificity
    # sensitivity = tpr[index]
    # return sensitivity

    # Use the threshold to classify predictions
    y_pred_at_specificity = (y_score_binary >= threshold_at_specificity).astype(int)

    # Compute sensitivity at the chosen specificity
    sensitivity = np.sum((y_pred_at_specificity == 1) & (y_true == 1)) / np.sum(
        y_true == 1
    )

    return sensitivity

In [4]:
# FILE_PATH = '/Users/baiyuxin/Desktop/JHU/NDD/Cancer/mendseqs/'
# FILE = ["alufraction.csv.pkl","mendseq.featurematrix.csv.pkl","MendSeqS_Length.featurematrix.csv.pkl","wps.featurematrix.csv.pkl"]
FILE_PATH ='/Users/baiyuxin/Desktop/JHU/NDD/Cancer/MIGHT_TEST_v0.6.1/alus_ratio.csv'
N_ESTIMATORS = 1000
# N_ESTIMATORS[0] = 10
REPS = 5

In [5]:
# data = np.load(FILE_PATH+FILE[1], allow_pickle=True)
data = pd.read_csv(FILE_PATH)
data.to_pickle(FILE_PATH + ".pkl")


data["Cancer Status"] = data["Cancer Status"].replace(["Cancer", "Yes"], 1)
data["Cancer Status"] = data["Cancer Status"].replace(["Healthy", "No", "Normal"], 0)
data["Sample"] = data["Experiment"] + "." + data["Sample"]
print(data)


train = []
with open('/Users/baiyuxin/Desktop/JHU/NDD/Cancer/MIGHT_TEST_v0.6.1/Cohort1.samples.txt', "r") as mydata:
       print(mydata)
       for line in mydata:
        if line.startswith("Experiment"):
            continue
        data_i = [item.strip() for item in line.split(" ")]
        # print(data_i)
        train.append(data_i[0] + "." + data_i[1])
print(train)
data = data[data["Sample"].isin(train)]

data.index = range(len(data))
print(data.shape)



columns_to_remove = [
                "Experiment",
                "Run",
                "Sample",
                "Library",
                "Cancer Status",
                "Tumor type",
                "MAF",
                "Stage",
                "P7",
                "P7 primer",
                "P7 Primer",
                "Library volume",
                "Library Volume",
                "UIDs Used",
                "Avg GC",
                "Library volume (uL)",
                "Total Reads",
                "Total Alu",
            ]

y = data["Cancer Status"].to_numpy()
features = data.loc[:, ~data.columns.isin(columns_to_remove)].columns.tolist()
X = data.loc[:, features].to_numpy()
X = np.nan_to_num(X)
X.shape, y.shape

     Experiment                  Sample  Cancer Status  Tumor type Stage  \
0         S0028   S0028.INDI_918_PLS_1A              1     Stomach    IV   
1         S0028    S0028.INDI_980_PLS_1              1     Stomach    IV   
2         S0034   S0034.INDI_580_PLS_1A              1  Colorectal    IV   
3         S0034   S0034.INDI_730_PLS_1A              1    Pancreas    IV   
4         S0034   S0034.INDI_481_PLS_1A              1       Liver    IV   
...         ...                     ...            ...         ...   ...   
1987      S0294  S0294.INDIA_3493_PLS_1              1     Stomach   IIB   
1988      S0294  S0294.INDIA_3494_PLS_1              1     Stomach  IIIA   
1989      S0294  S0294.INDIA_3496_PLS_1              1     Stomach    IA   
1990      S0294  S0294.INDIA_3500_PLS_1              1     Stomach   IIA   
1991      S0294  S0294.INDIA_3482_PLS_1              1     Stomach    IB   

          AluY      AluS      AluJ  
0     0.120170  0.611256  0.268575  
1     0.12044

  data["Cancer Status"] = data["Cancer Status"].replace(["Healthy", "No", "Normal"], 0)


((352, 3), (352,))

In [7]:
### Try 1k estimators (same parameters and data preprocessing with Sam's code)
est = HonestForestClassifier(n_estimators=1000,
# random_state=seed,
honest_fraction=0.5,
n_jobs=-1,
bootstrap=True,
stratify=True,
max_samples=1.6,
max_features=0.3
# permute_per_tree=True,
)
S98 = []
for i in range(20):
    _, posterior_arr = build_hyppo_oob_forest(
    est,
    X,
    y,
    verbose=False,
    )
    sas98 = sensitivity_at_specificity(
    y, posterior_arr, target_specificity=0.98
    )
    print(sas98)
    S98.append(sas98)

0.38235294117647056
0.37254901960784315
0.38235294117647056
0.38235294117647056
0.37254901960784315
0.38235294117647056
0.38235294117647056
0.38235294117647056
0.38235294117647056
0.37254901960784315
0.38235294117647056
0.37254901960784315
0.37254901960784315
0.37254901960784315
0.38235294117647056
0.37254901960784315
0.38235294117647056
0.37254901960784315
0.37254901960784315
0.38235294117647056


In [7]:
### Try with 100k estimators
est = HonestForestClassifier(n_estimators=100000,
# random_state=seed,
honest_fraction=0.5,
n_jobs=-1,
bootstrap=True,
stratify=True,
max_samples=1.6,
max_features=0.3
# permute_per_tree=True,
)
S98 = []
for i in range(2):
    _, posterior_arr = build_hyppo_oob_forest(
    est,
    X,
    y,
    verbose=False,
    )
    sas98 = sensitivity_at_specificity(
    y, posterior_arr, target_specificity=0.98
    )
    print(sas98)
    S98.append(sas98)

0.37254901960784315
0.37254901960784315


In [8]:
S98

[0.37254901960784315,
 0.37254901960784315,
 0.37254901960784315,
 0.37254901960784315,
 0.37254901960784315]

In [None]:
### Try with 50k trees
### Try with 100k estimators
est = HonestForestClassifier(n_estimators=50000,
# random_state=seed,
honest_fraction=0.5,
n_jobs=-1,
bootstrap=True,
stratify=True,
max_samples=1.6,
max_features=0.3
# permute_per_tree=True,
)
S98 = []
for i in range(2):
    _, posterior_arr = build_hyppo_oob_forest(
    est,
    X,
    y,
    verbose=False,
    )
    sas98 = sensitivity_at_specificity(
    y, posterior_arr, target_specificity=0.98
    )
    print(sas98)
    S98.append(sas98)
print(S98)