# Run MIGHT and other models on the cancer data

In [1]:
# import libraries
import tarfile
import os
import numpy as np
from scipy.stats import multivariate_normal
from scipy.stats import entropy
from sklearn import metrics
from sklearn.metrics import roc_auc_score

# import matplotlib.pyplot as plt
# import seaborn as sns
import pandas as pd
import warnings
import string
warnings.filterwarnings("ignore")

from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

from sktree.stats import build_oob_forest
from sktree.ensemble import HonestForestClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [2]:
# The feature matrices ship as a .tar.gz archive; extract it once with:
# tarfile.open("./ManuscriptFeatureMatrices.tar.gz", "r:gz").extractall()

# Collect the feature-matrix CSV files.
# os.listdir returns entries in arbitrary order, so sort them to make the
# per-file processing order reproducible across machines and runs.
filelist = sorted(
    f for f in os.listdir("./ManuscriptFeatureMatrices") if f.endswith(".csv")
)
n_files = len(filelist)

# Load the sample manifest: space-separated, no header row.
sample_list_file = "./ManuscriptFeatureMatrices/AllSamples.MIGHT.Passed.samples.txt"
sample_list = pd.read_csv(sample_list_file, sep=" ", header=None)
# Assigning the names afterwards raises if the file does not have exactly
# three columns, which guards against a malformed manifest.
sample_list.columns = ["library", "sample_id", "cohort"]

# Sample IDs belonging to each cohort (used to filter rows in get_X_y).
cohort1 = sample_list.loc[sample_list["cohort"] == "Cohort1", "sample_id"]
cohort2 = sample_list.loc[sample_list["cohort"] == "Cohort2", "sample_id"]

In [3]:
# define a function to get X and y given a file

def get_X_y(f, root="./ManuscriptFeatureMatrices/", cohort=cohort1, verbose=False):
    """Load one feature-matrix CSV and return its features and labels.

    Parameters
    ----------
    f : str
        File name of the CSV, resolved relative to ``root``.
    root : str
        Directory that contains the feature-matrix files.
    cohort : collection of sample IDs or None
        When not None, only rows whose normalised sample ID is contained in
        ``cohort`` are kept. The default is bound to the module-level
        ``cohort1`` at the time this function is defined.
    verbose : bool
        When True, print the file name, the number of columns that were still
        entirely NaN after imputation (if any), and the resulting shapes.

    Returns
    -------
    X : pandas.DataFrame
        Feature columns with metadata columns removed, NaNs imputed and
        all-NaN columns dropped.
    y : pandas.Series
        The "Cancer Status" column with "Healthy" replaced by 0 and "Cancer"
        replaced by 1; any other value is left untouched.
    """
    df = pd.read_csv(os.path.join(root, f))
    non_features = ['Run', 'Sample', 'Library', 'Cancer Status', 'Tumor type', 'Stage', 'Library volume (uL)', 'Library Volume', 'UIDs Used', 'Experiment', 'P7', 'P7 Primer', 'MAF']
    target = 'Cancer Status'

    def _normalise_sample_id(sample_id):
        # IDs that contain a "." keep only their second dot-separated field;
        # everything else (including non-string / missing values) is unchanged.
        if isinstance(sample_id, str) and "." in sample_id:
            return sample_id.split(".")[1]
        return sample_id

    # Build a new Series instead of assigning element-wise into a column of
    # df, which avoids chained-assignment issues and label/position mix-ups.
    sample_ids = df["Sample"].map(_normalise_sample_id)

    # Convert the labels to 0 and 1.
    y = df[target]
    y = y.replace("Healthy", 0)
    y = y.replace("Cancer", 1)

    # Remove the metadata (non-feature) columns that are present in this file.
    df = df.drop(columns=[col for col in non_features if col in df.columns])

    # Drop the columns that are NaN for every row of the file.
    nan_cols = df.isnull().all(axis=0).to_numpy()
    df = df.loc[:, ~nan_cols]

    # Optionally restrict the rows to the requested cohort.
    if cohort is not None:
        in_cohort = sample_ids.isin(cohort)
        X = df.loc[in_cohort]
        y = y.loc[in_cohort]
    else:
        X = df

    if "Wise" in f:
        # For files with "Wise" in their name, missing values become zero.
        X = X.fillna(0)
    # Impute any remaining NaNs with the mean of their column.
    X = X.fillna(X.mean(axis=0))

    # A column that is entirely NaN within the selected rows has a NaN mean
    # and therefore survives imputation; drop such columns.
    nan_cols = X.isnull().all(axis=0)
    X = X.loc[:, ~nan_cols]

    if verbose:
        print(f)
        if nan_cols.sum() > 0:
            print(f"nan_cols: {nan_cols.sum()}")
        print(f"X shape: {X.shape}, y shape: {y.shape}")

    return X, y


In [4]:
# Report the Cohort2 feature-matrix shape for every file.
for i, fname in enumerate(filelist):
    X, y = get_X_y(fname, cohort=cohort2, verbose=False)
    print(fname, X.shape)
    # To inspect the class balance as well:
    # print(y.value_counts(normalize=False))


Delfi.LengthRatio.csv (1447, 577)
MotifAnalysis.OutsideTrimer.csv (1447, 65)
MotifAnalysis.OutsideDimer.csv (1447, 17)
DyadAccessibility.LINEs.csv (1447, 300)
LociFraction.cCREFraction.csv (1447, 5)
MotifAnalysis.InsideMonomer.csv (1447, 5)
LengthAnalysis.Length20.csv (1447, 22)
MotifAnalysis.InsideDimer.csv (1447, 17)
DyadAccessibility.cCRE_CTCF.csv (1447, 300)
LociFraction.CompartmentFraction.csv (1447, 5)
LociFraction.LINEsFraction.csv (1447, 3)
DyadAccessibility.cCRE_DistalEnhancers.csv (1447, 300)
MotifAnalysis.CongruentTetramer.csv (1447, 257)
LengthAnalysis.Length5.csv (1447, 86)
MotifAnalysis.OutsideTetramer.csv (1447, 257)
WiseCondorX.Wise-5.csv (1447, 510)
MotifAnalysis.InsidePentamer.csv (1447, 1025)
MotifAnalysis.CongruentHexamer.csv (1447, 4097)
DyadAccessibility.AluS.csv (1447, 300)
WiseCondorX.Wise-1.csv (1447, 2523)
MotifAnalysis.OutsideMonomer.csv (1447, 5)
LengthAnalysis.Length1.csv (1447, 430)
WiseCondorX.Wise-10.csv (1447, 284)
MotifAnalysis.InsideHexamer.csv (1447,