In [1]:
import numpy as np 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_selection import RFE
rf = RandomForestClassifier()
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from scipy.stats import randint

In [2]:
data = pd.read_csv('~/Desktop/project_data_new/embedding_768_TCGA_COAD_90percent_sample.csv', index_col='PatientID')
data_target = pd.read_csv('~/Desktop/project_data_new/target_768_avg_expanded.csv', index_col=0)
# only keep the columns with category in the name
data_target = data_target.loc[:, data_target.columns.str.contains('category')]
data = data[data.index.isin(data_target.index)]
print(data.shape, data_target.shape)

(407, 768) (449, 241)


In [3]:
data_test = pd.read_csv('~/Desktop/project_data_new/embedding_768_TCGA_COAD_10percent_sample.csv', index_col='PatientID')
data_test = data_test[data_test.index.isin(data_target.index)]
data_test.shape

(42, 768)

In [4]:
# Align the targets for training and unseen test sets
data_target_train = data_target.loc[data.index]
data_target_unseen = data_target.loc[data_test.index]
data_target_train.shape

(407, 241)

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, precision_score, recall_score, accuracy_score
from scipy.stats import randint
import os

# ===========================
# Configuration and Data Setup
# ===========================
# data: 407 x 768 features (90% of samples)
# data_target_train: 407 x 241 target variables
# data_test: Matching features for the test set
# data_target_unseen: Matching targets for the test set

X = data.values  # All 407 samples with 768 features
X_test = data_test.values

targets = data_target_train.columns
n_targets = len(targets) 
chunk_size = 40 # process 40 targets at a time (to avoid memory issues)

# Directory for saving outputs
output_dir = "/home/qiuaodon/Desktop/CRC_image/Best_100_features_Randomforest_90percents_version2"
os.makedirs(output_dir, exist_ok=True)

# ===========================
# Parameter Distributions for Hyperparameter Tuning
# ===========================
param_dist = {
    'clf__n_estimators': randint(100, 200),
    'clf__max_depth': randint(20, 40),
    'clf__min_samples_split': randint(10, 20),
    'clf__min_samples_leaf': randint(1, 5),
    'clf__max_features': ['sqrt', 'log2']
}

# ===========================
# Pipeline Setup
# ===========================
# The pipeline first performs RFE (for feature selection) using an internal RF model,
# then trains a Random Forest Classifier with the selected features.
# 'rfe__n_features_to_select': Fixed at 100 for top feature selection.
# The base estimator inside RFE is a simple RF with fixed parameters, 
# since RFE is done as part of the pipeline and will be repeated during hyperparameter tuning.
base_rf_for_rfe = RandomForestClassifier(random_state=42)
pipeline = Pipeline([
    ('rfe', RFE(estimator=base_rf_for_rfe, n_features_to_select=100, step=1)),
    ('clf', RandomForestClassifier(random_state=42))
])

# ===========================
# Main Loop over Target Variables in Chunks
# ===========================
for chunk_start in range(0, n_targets, chunk_size):
    chunk_end = min(chunk_start + chunk_size, n_targets)
    target_chunk = targets[chunk_start:chunk_end]

    # Prepare results storage
    chunk_results = []

    # Classification reports file for this chunk
    report_file_path = os.path.join(
        output_dir, 
        f"Classification_Reports_top100_features_part_{chunk_start}_{chunk_end}.txt"
    )

    with open(report_file_path, "w") as report_file:
        for target_col in target_chunk:
            # Extract the target variable for training/validation
            Y = data_target_train[target_col].values

            # Split into training and validation
            X_train, X_val, Y_train, Y_val = train_test_split(
                X, Y, test_size=0.2, random_state=42, stratify=Y
            )

            # Hyperparameter tuning using RandomizedSearchCV
            random_search = RandomizedSearchCV(
                estimator=pipeline,
                param_distributions=param_dist,
                n_iter=60,
                cv=5,
                scoring='f1_weighted',
                n_jobs=-1,
                verbose=1,
                random_state=42,
                refit=True
            )
            random_search.fit(X_train, Y_train)

            # Best model after hyperparameter search
            best_model = random_search.best_estimator_
            best_params = random_search.best_params_

            # Evaluate on Training Set
            Y_train_pred = best_model.predict(X_train)
            train_report_str = classification_report(Y_train, Y_train_pred)

            # Evaluate on Validation Set
            Y_val_pred = best_model.predict(X_val)
            val_report = classification_report(Y_val, Y_val_pred, output_dict=True)
            val_report_str = classification_report(Y_val, Y_val_pred)

            # Evaluate on Test Set (Unseen Data)
            # Transform test data using the pipeline (it will apply the same RFE)
            Y_test = data_target_unseen[target_col].values
            Y_test_pred = best_model.predict(X_test)
            test_report = classification_report(Y_test, Y_test_pred, output_dict=True)
            test_report_str = classification_report(Y_test, Y_test_pred)

            # Write reports to the file
            report_file.write(f"Target Variable: {target_col}\n")
            report_file.write("=== Training Set Report ===\n")
            report_file.write(train_report_str + "\n\n")
            report_file.write("=== Validation Set Report ===\n")
            report_file.write(val_report_str + "\n\n")
            report_file.write("=== Test Set Report ===\n")
            report_file.write(test_report_str + "\n")
            report_file.write("="*80 + "\n\n")

            # Compute metrics for Validation Set
            val_precision = precision_score(Y_val, Y_val_pred, average='weighted')
            val_recall = recall_score(Y_val, Y_val_pred, average='weighted')
            val_accuracy = accuracy_score(Y_val, Y_val_pred)

            # Extract Class 1 metrics from Validation (if class '1' exists)
            class_1_val_metrics = val_report.get('1', {"precision": None, "recall": None, "f1-score": None})

            # Compute metrics for Test Set
            test_precision = precision_score(Y_test, Y_test_pred, average='weighted')
            test_recall = recall_score(Y_test, Y_test_pred, average='weighted')
            test_accuracy = accuracy_score(Y_test, Y_test_pred)

            # Extract Class 1 metrics from Test (if class '1' exists)
            class_1_test_metrics = test_report.get('1', {"precision": None, "recall": None, "f1-score": None})

            # Append metrics to chunk_results
            chunk_results.append({
                "Target Variable": target_col,
                "Validation Precision": val_precision,
                "Validation Recall": val_recall,
                "Validation Accuracy": val_accuracy,
                "Class 1 Precision (Validation)": class_1_val_metrics["precision"],
                "Class 1 Recall (Validation)": class_1_val_metrics["recall"],
                "Class 1 F1-Score (Validation)": class_1_val_metrics["f1-score"],
                "Test Precision": test_precision,
                "Test Recall": test_recall,
                "Test Accuracy": test_accuracy,
                "Class 1 Precision (Test)": class_1_test_metrics["precision"],
                "Class 1 Recall (Test)": class_1_test_metrics["recall"],
                "Class 1 F1-Score (Test)": class_1_test_metrics["f1-score"],
                "Best Hyperparameters": best_params
            })

    # Save the metrics for the chunk to an Excel file
    results_df = pd.DataFrame(chunk_results)
    results_file_path = os.path.join(
        output_dir,
        f"Metrics_top100_features_part_{chunk_start}_{chunk_end}.xlsx"
    )
    results_df.to_excel(results_file_path, index=False)


Fitting 5 folds for each of 60 candidates, totalling 300 fits


# combine the results

In [None]:
import pandas as pd
import glob

# Specify the directory where your files are located
file_path_pattern = '/home/qiuaodon/Desktop/CRC_image/Best_100_features_Randomforest_90percents_version2/Precision_Recall_Accuracy_100features_part_*.xlsx'

# Use glob to find all matching files
all_files = glob.glob(file_path_pattern)

# Create an empty list to store DataFrames
data_frames = []

# Loop through each file and append its DataFrame to the list
for file in all_files:
    df = pd.read_excel(file)
    data_frames.append(df)

# Concatenate all DataFrames
combined_df = pd.concat(data_frames, ignore_index=True)

# Save the combined DataFrame to a new Excel file
combined_df.to_excel('/home/qiuaodon/Desktop/CRC_image/Best_100_features_Randomforest_90percents_version2/top_100features_Randomforest_241_targets_90train.xlsx', index=False)
combined_df


Unnamed: 0,Target Variable,Validation Precision,Validation Recall,Validation Accuracy,Class 1 Precision (Validation),Class 1 Recall (Validation),Class 1 F1-Score (Validation),Unseen Test Precision,Unseen Test Recall,Unseen Test Accuracy,Class 1 Precision (Unseen Test),Class 1 Recall (Unseen Test),Class 1 F1-Score (Unseen Test),Best Hyperparameters
0,category_tnk_1,0.476098,0.487805,0.487805,0.535714,0.555556,0.545455,0.371849,0.357143,0.357143,0.294118,0.357143,0.322581,"{'max_depth': 25, 'max_features': 'log2', 'min..."
1,category_tnk_2,0.514563,0.512195,0.512195,0.483871,0.555556,0.517241,0.543515,0.404762,0.404762,0.375000,0.400000,0.387097,"{'max_depth': 23, 'max_features': 'sqrt', 'min..."
2,category_tnk_3,0.483284,0.487805,0.487805,0.448276,0.464286,0.456140,0.180934,0.166667,0.166667,0.125000,0.181818,0.148148,"{'max_depth': 37, 'max_features': 'log2', 'min..."
3,category_tnk_4,0.560694,0.548780,0.548780,0.650000,0.500000,0.565217,0.588409,0.547619,0.547619,0.733333,0.523810,0.611111,"{'max_depth': 28, 'max_features': 'sqrt', 'min..."
4,category_tnk_5,0.546459,0.548780,0.548780,0.560000,0.518519,0.538462,0.353672,0.309524,0.309524,0.266667,0.285714,0.275862,"{'max_depth': 36, 'max_features': 'sqrt', 'min..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,category_stromal_10,0.456924,0.451220,0.451220,0.451613,0.500000,0.474576,0.480866,0.428571,0.428571,0.315789,0.600000,0.413793,"{'max_depth': 30, 'max_features': 'log2', 'min..."
237,category_stromal_11,0.501768,0.500000,0.500000,0.523810,0.407407,0.458333,0.367565,0.333333,0.333333,0.363636,0.266667,0.307692,"{'max_depth': 39, 'max_features': 'sqrt', 'min..."
238,category_stromal_12,0.426071,0.426829,0.426829,0.379310,0.407407,0.392857,0.379085,0.404762,0.404762,0.500000,0.500000,0.500000,"{'max_depth': 31, 'max_features': 'log2', 'min..."
239,category_stromal_13,0.565932,0.560976,0.560976,0.636364,0.500000,0.560000,0.391484,0.357143,0.357143,0.250000,0.363636,0.296296,"{'max_depth': 22, 'max_features': 'sqrt', 'min..."
