In [1]:
#NOTE: use paimg1 env, the retccl one has package issue with torchvision
import sys
import os
import numpy as np
import openslide
import matplotlib.pyplot as plt

import matplotlib
matplotlib.use('Agg')
import pandas as pd
import warnings
import torch
import torch.nn as nn

from sklearn.model_selection import KFold, train_test_split
from torch.utils.data import DataLoader
import torch.optim as optim
from pathlib import Path
from scipy.spatial.distance import cdist
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

sys.path.insert(0, '../Utils/')
from Utils import create_dir_if_not_exists
from Utils import generate_deepzoom_tiles, extract_tile_start_end_coords, get_map_startend
from Utils import get_downsample_factor
from Utils import minmax_normalize, count_label
from Utils import log_message, set_seed
from Utils import simple_line_plot
from cluster_utils import get_cluster_data, get_cluster_label, get_updated_feature, get_pca_components
from Eval import compute_performance, plot_LOSS, compute_performance_each_label, get_attention_and_tileinfo
from train_utils import pull_tiles, get_feature_label_array_dynamic
from train_utils import ModelReadyData_diffdim, convert_to_dict, prediction
from Model import Mutation_MIL_MT
warnings.filterwarnings("ignore")
%matplotlib inline

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score,average_precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [2]:
####################################
######      USERINPUT       ########
####################################
# Target label columns; they are selected from the label tables further down.
SELECTED_LABEL = [
    "AR",
    "MMR (MSH2, MSH6, PMS2, MLH1, MSH3, MLH3, EPCAM)2",
    "PTEN",
    "RB1",
    "TP53",
    "TMB_HIGHorINTERMEDITATE",
    "MSI_POS",
]
TUMOR_FRAC_THRES = 0
TRAIN_SAMPLE_SIZE = "ALLTUMORTILES"
TRAIN_OVERLAP = 100
TEST_OVERLAP = 0
SELECTED_FOLD = 0
CLUSTER_ALG = 'KMEAN'
N_CLUSTERS = 4
CLUSTER_DIST = 'L2'
feature_extraction_method = 'retccl'
# One feature column per cluster: C_0 ... C_{N_CLUSTERS - 1}
SELECTED_FEATURE = [f"C_{cluster_idx}" for cluster_idx in range(N_CLUSTERS)]

##################
###### DIR  ######
##################
proj_dir = '/fh/fast/etzioni_r/Lucas/mh_proj/mutation_pred/'
# The run settings above are encoded in the directory name of the model-ready data.
data_dir = (
    f"{proj_dir}intermediate_data/model_ready_data/feature_{feature_extraction_method}"
    f"/MAXSS{TRAIN_SAMPLE_SIZE}_TrainOL{TRAIN_OVERLAP}_TestOL{TEST_OVERLAP}"
    f"_TFT{TUMOR_FRAC_THRES}/split_fold{SELECTED_FOLD}/"
)
cluster_dir = os.path.join(data_dir, "clusters", CLUSTER_ALG)
feature_path = os.path.join(cluster_dir, "ML_Updated_Features_OnlyClusterPerc")
label_path = os.path.join(cluster_dir, "ClusterInfo")
# Suffix shared by every feature / cluster-info file name for this configuration.
save_name = f"_NCLUSTER_{N_CLUSTERS}_DISTMETRIC_{CLUSTER_DIST}"

################################################
#Create output dir
################################################
#outdir3 =   os.path.join(data_dir, "clusters", CLUSTER_ALG, "ML_Updated_Features_OnlyClusterPerc")
#create_dir_if_not_exists(outdir3)

##################
#Select GPU
##################
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if has_cuda else torch.device("cpu")
print(device)
set_seed(0)

cuda:0


In [3]:
############################################################################################################
#Get features and labels
############################################################################################################
# Feature tables: the ID column is renamed to SAMPLE_ID so it matches the key used by the label tables.
id_to_sample_id = {'ID': 'SAMPLE_ID'}
train_ml_df = torch.load(
    f"{feature_path}/updated_train_feature{save_name}cluster_perc.pth"
).rename(columns=id_to_sample_id)
test_ml_df = torch.load(
    f"{feature_path}/updated_test_feature{save_name}cluster_perc.pth"
).rename(columns=id_to_sample_id)
val_ml_df = torch.load(
    f"{feature_path}/updated_val_feature{save_name}cluster_perc.pth"
).rename(columns=id_to_sample_id)


# Label tables: keep only the first row per SAMPLE_ID.
# Note the validation file uses the prefix 'valid_' here, while the feature file above uses 'val_'.
sample_key = ['SAMPLE_ID']
train_info_df = pd.read_csv(
    f"{label_path}/train_cluster_info{save_name}.csv"
).drop_duplicates(subset=sample_key)
test_info_df = pd.read_csv(
    f"{label_path}/test_cluster_info{save_name}.csv"
).drop_duplicates(subset=sample_key)
val_info_df = pd.read_csv(
    f"{label_path}/valid_cluster_info{save_name}.csv"
).drop_duplicates(subset=sample_key)


In [4]:
# Attach the selected label columns to each split's features, matching rows on SAMPLE_ID
# (pandas' default inner join: samples missing from either table are dropped).
label_columns = ['SAMPLE_ID'] + SELECTED_LABEL
train_df = pd.merge(train_ml_df, train_info_df[label_columns], on='SAMPLE_ID')
test_df = pd.merge(test_ml_df, test_info_df[label_columns], on='SAMPLE_ID')
val_df = pd.merge(val_ml_df, val_info_df[label_columns], on='SAMPLE_ID')

In [5]:
#Correlation test
#train_df[SELECTED_LABEL].corr()
from sklearn.utils import resample

In [20]:
selected_methods = ['LR','RF','XGBoost','SVM']
THRES = 0.5  # probability cut-off used to turn predicted scores into class labels
# Numeric metric columns averaged across labels for each method.
METRIC_COLS = ['AUC', 'ACC', 'F1', 'F2', 'F3', 'Recall', 'Precision', 'Specificity', 'PR_AUC']


def build_classifier(method):
    """Return a fresh, unfitted classifier for the given method name.

    Parameters
    ----------
    method : str
        One of 'LR', 'RF', 'XGBoost' or 'SVM'.

    Raises
    ------
    ValueError
        If `method` is not a known name. Failing here prevents a stale
        `model` from an earlier iteration (or another cell) being refitted
        under the wrong method name.
    """
    if method == 'LR':
        return LogisticRegression()
    if method == 'RF':
        return RandomForestClassifier(n_estimators=500, random_state=42)
    if method == 'XGBoost':
        return XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    if method == 'SVM':
        return SVC(probability=True)
    raise ValueError(f"Unknown method: {method!r}")


all_perf_list = []
for method in selected_methods:
    perf_list = []
    for label in SELECTED_LABEL:
        X_train , y_train = train_df[SELECTED_FEATURE], train_df[label]
        X_test , y_test = test_df[SELECTED_FEATURE], test_df[label]
        X_val , y_val = val_df[SELECTED_FEATURE], val_df[label]  # not used by this benchmark

        # Separate the two classes (label 0 is treated as the majority class)
        X_train_majority = X_train[y_train == 0]
        y_train_majority = y_train[y_train == 0]
        X_train_minority = X_train[y_train == 1]
        y_train_minority = y_train[y_train == 1]

        # Upsample the label-1 rows until both classes have the same size
        X_train_minority_upsampled, y_train_minority_upsampled = resample(
            X_train_minority, y_train_minority,
            replace=True,  # Sample with replacement
            n_samples=len(X_train_majority),  # Match number of majority class samples
            random_state=42  # Reproducible results
        )

        # Combine the majority class with the upsampled minority class.
        # np.vstack / np.hstack return plain ndarrays, so column names are dropped here.
        X_train_upsampled = np.vstack((X_train_majority, X_train_minority_upsampled))
        y_train_upsampled = np.hstack((y_train_majority, y_train_minority_upsampled))

        # Train the model
        model = build_classifier(method)
        model.fit(X_train_upsampled, y_train_upsampled)

        # Make predictions. The model was fitted on an ndarray, so score on an ndarray too
        # instead of a DataFrame (avoids the feature-name mismatch between fit and predict).
        y_pred = model.predict_proba(X_test.to_numpy())[:,1]
        y_pred_c = [float(t > THRES) for t in y_pred]

        # Evaluate the model
        cur_perf_df = compute_performance(y_test,y_pred,y_pred_c,"")
        cur_perf_df['Label'] =  label
        perf_list.append(cur_perf_df)

    # One row per label, plus a 'mean' row holding the average of every metric column
    perf_df = pd.concat(perf_list)
    mean_values = perf_df[METRIC_COLS].mean()
    perf_df.loc['mean'] = mean_values
    perf_df['Method'] = method
    perf_df.loc['mean','Method'] = method + '_AVG'
    perf_df.reset_index(drop = True, inplace = True)
    all_perf_list.append(perf_df)

all_perf_df = pd.concat(all_perf_list)
print(all_perf_df.loc[all_perf_df['Method'].str.contains('AVG')])

        AUC       ACC        F1        F2        F3    Recall  Precision  \
7  0.678571  0.605714  0.327143  0.454286  0.540000  0.700000   0.244286   
7  0.587143  0.794286  0.172857  0.161429  0.158571  0.155714   0.201429   
7  0.560000  0.774286  0.177143  0.181429  0.185714  0.194286   0.198571   
7  0.610000  0.678571  0.297143  0.357143  0.388571  0.432857   0.242857   

   Specificity    PR_AUC Label       Method  
7     0.612857  0.316512   NaN       LR_AVG  
7     0.891429  0.261478   NaN       RF_AVG  
7     0.877143  0.252612   NaN  XGBoost_AVG  
7     0.717143  0.297809   NaN      SVM_AVG  


In [21]:
# Show every row whose Method contains 'LR' (the per-label rows and the LR_AVG row)
is_lr_row = all_perf_df['Method'].str.contains('LR')
all_perf_df[is_lr_row]

Unnamed: 0,AUC,ACC,F1,F2,F3,Recall,Precision,Specificity,PR_AUC,Label,Method
0,0.61,0.62,0.21,0.29,0.34,0.4,0.14,0.66,0.176533,AR,LR
1,0.5,0.45,0.21,0.33,0.39,0.5,0.14,0.44,0.299074,"MMR (MSH2, MSH6, PMS2, MLH1, MSH3, MLH3, EPCAM)2",LR
2,0.47,0.68,0.38,0.42,0.43,0.44,0.33,0.74,0.247396,PTEN,LR
3,0.8,0.7,0.4,0.62,0.77,1.0,0.25,0.67,0.305556,RB1,LR
4,0.74,0.62,0.55,0.56,0.56,0.56,0.53,0.67,0.671258,TP53,LR
5,0.82,0.65,0.3,0.52,0.68,1.0,0.18,0.62,0.27549,TMB_HIGHorINTERMEDITATE,LR
6,0.81,0.52,0.24,0.44,0.61,1.0,0.14,0.49,0.240278,MSI_POS,LR
7,0.678571,0.605714,0.327143,0.454286,0.54,0.7,0.244286,0.612857,0.316512,,LR_AVG


In [55]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset

BATCH_SIZE = 8
# Label modelled by the PyTorch experiment. This cell used to read the loop variable
# `label` left over from the benchmark loop, i.e. the last entry of SELECTED_LABEL;
# naming it explicitly keeps the same target without depending on that cell having run.
TORCH_LABEL = SELECTED_LABEL[-1]

# Example data
X_train , y_train = train_df[SELECTED_FEATURE], train_df[TORCH_LABEL]
X_test , y_test = test_df[SELECTED_FEATURE], test_df[TORCH_LABEL]
X_val , y_val = val_df[SELECTED_FEATURE], val_df[TORCH_LABEL]

# Separate the two classes (label 0 is treated as the majority class)
X_train_majority = X_train[y_train == 0]
y_train_majority = y_train[y_train == 0]
X_train_minority = X_train[y_train == 1]
y_train_minority = y_train[y_train == 1]

# Upsample the label-1 rows until both classes have the same size
X_train_minority_upsampled, y_train_minority_upsampled = resample(
    X_train_minority, y_train_minority,
    replace=True,  # Sample with replacement
    n_samples=len(X_train_majority),  # Match number of majority class samples
    random_state=42  # Reproducible results
)

# Combine the majority class with the upsampled minority class.
# The result is a pair of plain ndarrays ordered as all label-0 rows, then all label-1 rows.
X_train_upsampled = np.vstack((X_train_majority, X_train_minority_upsampled))
y_train_upsampled = np.hstack((y_train_majority, y_train_minority_upsampled))


class ModelReadyData_MT_V2(Dataset):
    """Dataset pairing each feature row with its label as float32 tensors.

    Parameters
    ----------
    feature_df : array-like
        Feature rows. May be a NumPy array, a pandas DataFrame, or a nested
        sequence of numbers; it is converted to a float32 tensor.
    label_df : array-like
        One label per feature row. May be a NumPy array, a pandas Series, or
        a sequence of numbers; it is converted to a float32 tensor.

    Raises
    ------
    ValueError
        If the number of feature rows differs from the number of labels.
    """

    def __init__(self,
                 feature_df,
                 label_df,
                ):
        self.x = self._to_float_tensor(feature_df)

        # Get the Y labels
        self.y = self._to_float_tensor(label_df)

        # __len__ is driven by x only, so a length mismatch would otherwise surface
        # later as an IndexError (or silently ignore trailing labels).
        if len(self.x) != len(self.y):
            raise ValueError(
                f"feature/label length mismatch: {len(self.x)} feature rows "
                f"vs {len(self.y)} labels"
            )

    @staticmethod
    def _to_float_tensor(data):
        """Copy array-like `data` into a new float32 tensor."""
        # np.asarray handles DataFrames/Series as well as arrays and lists;
        # torch.tensor always copies, so the dataset never aliases the caller's buffer.
        return torch.tensor(np.asarray(data), dtype=torch.float32)

    def __len__(self):
        return len(self.x)

    def __getitem__(self,index):
        # Given an index, return a tuple of an X with its associated Y
        x = self.x[index]
        y = self.y[index]

        return x, y

train_data = ModelReadyData_MT_V2(X_train_upsampled, y_train_upsampled)
test_data = ModelReadyData_MT_V2(X_test.to_numpy(), y_test.to_numpy())

# The upsampled training arrays are stacked class by class (all label-0 rows, then all
# label-1 rows). Without shuffling, every mini-batch would hold a single class and SGD
# would see the classes in one fixed block order, so shuffle the training set.
# A seeded generator keeps the batch order reproducible across runs.
train_loader = DataLoader(
    dataset=train_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(0),
)
# Evaluation order does not matter; keep it deterministic and aligned with test_df.
test_loader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE, shuffle=False)

# Define the model
class LogisticModel(nn.Module):
    """Logistic regression: a single linear layer followed by a sigmoid.

    Parameters
    ----------
    in_features : int, default 4
        Number of input features per sample. The default keeps the original
        hard-coded width; pass the actual feature count when it differs.
    """

    def __init__(self, in_features=4):
        super().__init__()
        self.linear = nn.Linear(in_features, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map inputs of shape (..., in_features) to probabilities of shape (..., 1)."""
        return self.sigmoid(self.linear(x))

# Instantiate the logistic-regression network
model = LogisticModel()

# Training objective and optimiser:
# binary cross-entropy on the sigmoid outputs, minimised with plain SGD.
LEARNING_RATE = 0.01
criterion = nn.BCELoss()
optimizer = optim.SGD(params=model.parameters(), lr=LEARNING_RATE)

In [49]:
# Train the model
N_EPOCHS = 100
for epoch in range(N_EPOCHS):
    model.train()

    for x, y in train_loader:
        # Clear gradients before every batch. Resetting only once per epoch lets
        # gradients from all earlier batches accumulate into each optimizer step.
        optimizer.zero_grad()
        outputs = model(x)
        # squeeze(-1) drops only the trailing output dimension, so a final batch of
        # size 1 keeps shape (1,) and still matches the target's shape.
        loss = criterion(outputs.squeeze(-1), y)
        loss.backward()
        optimizer.step()

# Evaluation on the held-out test set (no gradient tracking needed)
model.eval()
with torch.no_grad():
    y_hat  = model(test_data.x)
# Print the learned parameters
#print(f'Learned parameters: {model.linear.weight.item()}, {model.linear.bias.item()}')