In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [12]:
# Read embedding_768_TCGA_COAD.csv and label each row with its PatientID value
# (the PatientID column itself is kept at this point).
# NOTE(review): the next cell reassigns `data` from a different CSV, so on a
# top-to-bottom run this frame is replaced before it is used.
data = pd.read_csv(
    '~/Desktop/project_data_new/embedding_768_TCGA_COAD.csv'
).set_index('PatientID', drop=False)
# Discard the two trailing columns; they are chosen by position, not by name.
data = data.drop(columns=data.columns[-2:])

In [19]:
# Read the 90%-sample embedding CSV and move PatientID out of the columns and
# into the row index, so that only the remaining columns are left as data.
data = pd.read_csv(
    '~/Desktop/project_data_new/embedding_768_TCGA_COAD_90percent_sample.csv'
).set_index('PatientID')

In [20]:
# Read the target table; its row labels are still in the 'Unnamed: 0' column
# at this point and are moved into the index by the following cell.
data_target = pd.read_csv(
    filepath_or_buffer='~/Desktop/project_data_new/target_768_avg_expanded.csv',
)

In [21]:
# Use the 'Unnamed: 0' column as the row index (removing it from the columns),
# then keep only the columns whose name contains 'category'.
# NOTE(review): not idempotent - a second run raises KeyError because
# 'Unnamed: 0' is no longer a column.
data_target = (
    data_target
    .set_index('Unnamed: 0')
    .pipe(lambda frame: frame.loc[:, frame.columns.str.contains('category')])
)

In [22]:
# Keep only the embedding rows whose index label also occurs in data_target.
# This filters by membership only; it does not reorder rows to match data_target.
data = data.loc[data.index.isin(data_target.index)]

In [23]:
# Keep only the target rows whose index label also occurs in data.
# This filters by membership only; it does not reorder rows to match data.
data_target = data_target.loc[data_target.index.isin(data.index)]

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, precision_score, recall_score, accuracy_score
from scipy.stats import randint

# Base estimator: used by RFE for feature ranking and, later, as the estimator
# handed to the hyperparameter search.
rf = RandomForestClassifier(random_state=42)

# Specify the single target variable
target_col = "category_stromal_34"

# Initialize a list to store results
results = []

# Set the path for the classification report file
# report_file_path = f"/home/qiuaodon/Desktop/CRC_image/Best_100_features_Randomforest_90percents/Classification_Report_100features_{target_col}.txt"

# Restrict both tables to the samples they share.
features = data[data.index.isin(data_target.index)]
data_target = data_target[data_target.index.isin(features.index)]

# X and Y are paired purely by row position below, so the two tables must list
# the same samples in the same order. Membership filtering alone does not
# guarantee that, so reorder the targets when it is unambiguous and refuse to
# continue (rather than silently mislabel samples) when it is not.
if not features.index.equals(data_target.index):
    if features.index.is_unique and data_target.index.is_unique:
        data_target = data_target.loc[features.index]
    else:
        raise ValueError(
            "data and data_target rows are not in the same order and their "
            "indexes contain duplicates, so features and labels cannot be "
            "aligned unambiguously."
        )

# Feature matrix and label vector, now row-aligned.
X = features.values
Y = data_target[target_col].values

# Split BEFORE feature selection so the validation rows never influence which
# features are kept. The row partition depends only on the number of rows, Y
# and random_state, so it is the same partition as splitting after selection.
X_train_full, X_val_full, Y_train, Y_val = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

# Select the top 100 features with RFE, fitted on the training rows only, and
# apply the same column selection to the validation rows.
rfe = RFE(rf, n_features_to_select=100)  # Choose top 100 features
X_train = rfe.fit_transform(X_train_full, Y_train)
X_val = rfe.transform(X_val_full)

# All rows reduced to the selected columns; kept so that any later code that
# reads X_selected still finds it.
X_selected = rfe.transform(X)

# Define the hyperparameter distribution.
# scipy.stats.randint(low, high) draws integers from low up to high - 1 (the
# upper bound is exclusive), so each range below covers exactly the values
# named in its trailing comment.
param_dist = {
    'n_estimators': randint(251, 253),     # 251 or 252
    'max_depth': randint(33, 35),          # 33 or 34
    'min_samples_split': randint(10, 12),  # 10 or 11
    'min_samples_leaf': randint(1, 2),     # always 1
    'max_features': ['sqrt', 'log2']
}
# Parameter set pasted in from an earlier run, kept for reference only. It used
# to sit here as a bare dict literal that was evaluated and thrown away.
# {'max_depth': 34,
#  'max_features': 'sqrt',
#  'min_samples_leaf': 1,
#  'min_samples_split': 11,
#  'n_estimators': 252}

# Perform Randomized Search with 5-fold cross-validation, ranking candidates by
# weighted F1.
# NOTE(review): with param_dist as currently defined there are only
# 2 * 2 * 2 * 1 * 2 = 16 distinct parameter combinations, so n_iter=200 mostly
# re-evaluates duplicates - consider a smaller n_iter or an exhaustive grid.
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=200,
                                   cv=5, scoring='f1_weighted', n_jobs=-1, verbose=1, random_state=42)
random_search.fit(X_train, Y_train)

# Get the best model from Randomized Search.
# With the default refit=True the search has already refitted best_estimator_
# on all of X_train / Y_train using the best parameters, so no further fit call
# is needed before evaluation.
best_rf = random_search.best_estimator_
best_params = random_search.best_params_

# Predict on both partitions with the tuned model.
Y_train_pred = best_rf.predict(X_train)
Y_val_pred = best_rf.predict(X_val)

# Text report for the training rows, dict-form report for the validation rows.
train_report = classification_report(Y_train, Y_train_pred)
val_report = classification_report(Y_val, Y_val_pred, output_dict=True)

# Support-weighted summary metrics on the validation rows.
precision = precision_score(Y_val, Y_val_pred, average='weighted')
recall = recall_score(Y_val, Y_val_pred, average='weighted')
accuracy = accuracy_score(Y_val, Y_val_pred)

# Per-class metrics for the class keyed '1' in the report; fall back to
# placeholders when the report has no such key.
# NOTE(review): assumes the positive label is rendered as '1' - confirm.
if '1' in val_report:
    class_1_metrics = val_report['1']
else:
    class_1_metrics = {"precision": None, "recall": None, "f1-score": None}

# One summary row for this target variable.
summary_row = {
    "Target Variable": target_col,
    "Validation Precision": precision,
    "Validation Recall": recall,
    "Validation Accuracy": accuracy,
    "Class 1 Precision (Validation)": class_1_metrics["precision"],
    "Class 1 Recall (Validation)": class_1_metrics["recall"],
    "Class 1 F1-Score (Validation)": class_1_metrics["f1-score"],
    "Best Hyperparameters": best_params
}
results.append(summary_row)

# Collect the rows into a DataFrame and display it (nothing is written to disk).
results_df = pd.DataFrame(results)
results_df

In [26]:
# Display the best hyperparameters found by the randomized search.
# NOTE(review): the output stored under this cell (max_depth=46, n_estimators=204)
# lies outside the ranges in the current param_dist, so it appears to come from
# a run with a different search space - re-run to refresh.
best_params

{'max_depth': 46,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 11,
 'n_estimators': 204}

In [24]:
# Display the best hyperparameters found by the randomized search.
# NOTE(review): duplicate of the inspection cell above, and its execution
# count (24) is lower than that cell's (26), i.e. the cells were run out of
# document order - presumably one of the two can be removed.
best_params

{'max_depth': 34,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 11,
 'n_estimators': 252}