In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv('~/Desktop/project_data_new/embedding_768_TCGA_COAD.csv')
data.index = data['PatientID']
# drop last 2 columns
data = data.drop(data.columns[-2:], axis=1)

In [3]:
data_target = pd.read_csv('~/Desktop/project_data_new/target_768_avg_expanded.csv')

In [4]:
data_target.index = data_target['Unnamed: 0']
data_target = data_target.drop(['Unnamed: 0'], axis = 1)
# only keep the columns with category in the name
data_target = data_target.loc[:, data_target.columns.str.contains('category')]

In [5]:
data = data[data.index.isin(data_target.index)]
data.shape

(449, 768)

In [6]:
X = data.values
# Extract the target column for category_b_12
Y = data_target['category_stromal_34'].values

In [7]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rfe = RFE(rf, n_features_to_select=100)  # Choose top 100 features
X = rfe.fit_transform(X, Y)


In [16]:
# save the selected features to csv 
np.savetxt("/home/qiuaodon/Desktop/CRC_image/Best_features_REF/Top_100_features_REF_stromal_34.csv", X, delimiter=",")

In [8]:
# read the selected features from csv
X = np.loadtxt("/home/qiuaodon/Desktop/CRC_image/Best_features_RandomForest_results/Top_100_features_REF_stromal_34.csv", delimiter=",")

In [9]:
# leave one out cross validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score, LeaveOneOut
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
from scipy.stats import randint

# Extract the target column for category_stromal_34
Y = data_target['category_stromal_34'].values

# Split the data into training and testing sets (20% test set)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Define the hyperparameter distribution (use distributions for random search)
param_dist = {
    'n_estimators': randint(100, 200),            # Fewer trees may generalize better
    'max_depth': randint(20, 40),                 # Lower depth can prevent overfitting
    'min_samples_split': randint(10, 20),         # Increase split to limit tree growth
    'min_samples_leaf': randint(1, 5),            # Require more samples per leaf
    'max_features': ['sqrt', 'log2']
}

# Perform Randomized Search with Leave-One-Out cross-validation
loo = LeaveOneOut()
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=60,
    cv=loo,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=1,
    random_state=42
)
random_search.fit(X_train, Y_train)

# Get the best model from Randomized Search
best_rf = random_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Cross-validation on the training set with the best model using LOOCV
cv_scores = cross_val_score(best_rf, X_train, Y_train, cv=loo, scoring='f1_weighted', n_jobs=-1)
print("Cross-validation F1 scores on training set:", cv_scores)
print("Average cross-validation F1 score on training set:", np.mean(cv_scores))

# Fit the model on the training data for the final evaluation on test and train sets
best_rf.fit(X_train, Y_train)

# Evaluate on the training set
Y_train_pred = best_rf.predict(X_train)
print("Classification Report for stromal_34 (Train Set):")
print(classification_report(Y_train, Y_train_pred))

# Evaluate on the test set
Y_test_pred = best_rf.predict(X_test)
print("Classification Report for stromal_34 (Test Set):")
print(classification_report(Y_test, Y_test_pred))


Fitting 359 folds for each of 60 candidates, totalling 21540 fits
Best Hyperparameters: {'max_depth': 31, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 11, 'n_estimators': 163}
Cross-validation F1 scores on training set: [0. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 0. 1. 1.
 1. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 1.
 1. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 0.
 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0. 1. 1. 1. 0. 0. 1. 0. 1. 1.
 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 1. 1. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1.
 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 1. 0.
 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 1. 1.
 1. 0. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1. 1. 0. 0.
 0. 1. 1. 1. 0. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 0. 1.
 0. 1. 0. 1. 1. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1.
 0. 1. 1. 1. 1. 0. 0. 0. 0.

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
from scipy.stats import randint


# Extract the target column for category_b_12
Y = data_target['category_stromal_34'].values

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.05, random_state=42)

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Define the hyperparameter distribution (use distributions for random search)
param_dist = {
    'n_estimators': randint(100, 200),            # Fewer trees may generalize better
    'max_depth': randint(20, 40),                 # Lower depth can prevent overfitting
    'min_samples_split': randint(10, 20),         # Increase split to limit tree growth
    'min_samples_leaf': randint(1, 5),            # Require more samples per leaf
    'max_features': ['sqrt', 'log2']
}


# Perform Randomized Search with cross-validation
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=60, 
                                   cv=5, scoring='f1_weighted', n_jobs=-1, verbose=1, random_state=42)
random_search.fit(X_train, Y_train)

# Get the best model from Randomized Search
best_rf = random_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Cross-validation on the training set with the best model
cv_scores = cross_val_score(best_rf, X_train, Y_train, cv=5, scoring='f1_weighted')
print("Cross-validation F1 scores on training set:", cv_scores)
print("Average cross-validation F1 score on training set:", np.mean(cv_scores))

# Fit the model on the training data for the final evaluation on test and train sets
best_rf.fit(X_train, Y_train)

# Evaluate on the training set
Y_train_pred = best_rf.predict(X_train)
print("Classification Report for stromal_34 (Train Set):")
print(classification_report(Y_train, Y_train_pred))

# Evaluate on the test set
Y_test_pred = best_rf.predict(X_test)
print("Classification Report for stromal_34 (Test Set):")
print(classification_report(Y_test, Y_test_pred))


Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best Hyperparameters: {'max_depth': 31, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 11, 'n_estimators': 163}
Cross-validation F1 scores on training set: [0.57375959 0.54551249 0.5131099  0.50294538 0.42389621]
Average cross-validation F1 score on training set: 0.5118447140391206
Classification Report for stromal_34 (Train Set):
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       122
           0       0.98      1.00      0.99       124
           1       1.00      0.98      0.99       113

    accuracy                           0.99       359
   macro avg       0.99      0.99      0.99       359
weighted avg       0.99      0.99      0.99       359

Classification Report for stromal_34 (Test Set):
              precision    recall  f1-score   support

          -1       0.59      0.71      0.65        28
           0       0.58      0.60      0.59  

In [21]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
X = data.values
rf = RandomForestClassifier()
rfe = RFE(rf, n_features_to_select=150)  # Choose top 150 features
X = rfe.fit_transform(X, Y)


In [22]:
# save X as Top_100_features_REF
np.savetxt("Top_150_features_REF_stromal_34.csv", X, delimiter=",")

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
from scipy.stats import randint

# X = data.values

# Extract the target column for category_b_12
Y = data_target['category_stromal_34'].values

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Define the hyperparameter distribution (use distributions for random search)
param_dist = {
    'n_estimators': randint(100, 200),            # Fewer trees may generalize better
    'max_depth': randint(20, 40),                 # Lower depth can prevent overfitting
    'min_samples_split': randint(10, 20),         # Increase split to limit tree growth
    'min_samples_leaf': randint(1, 5),            # Require more samples per leaf
    'max_features': ['sqrt', 'log2']
}


# Perform Randomized Search with cross-validation
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=50, 
                                   cv=5, scoring='f1_weighted', n_jobs=-1, verbose=1, random_state=42)
random_search.fit(X_train, Y_train)

# Get the best model from Randomized Search
best_rf = random_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Cross-validation on the training set with the best model
cv_scores = cross_val_score(best_rf, X_train, Y_train, cv=5, scoring='f1_weighted')
print("Cross-validation F1 scores on training set:", cv_scores)
print("Average cross-validation F1 score on training set:", np.mean(cv_scores))

# Fit the model on the training data for the final evaluation on test and train sets
best_rf.fit(X_train, Y_train)

# Evaluate on the training set
Y_train_pred = best_rf.predict(X_train)
print("Classification Report for category_stromal_34 (Train Set):")
print(classification_report(Y_train, Y_train_pred))

# Evaluate on the test set
Y_test_pred = best_rf.predict(X_test)
print("Classification Report for category_stromal_34 (Test Set):")
print(classification_report(Y_test, Y_test_pred))


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Hyperparameters: {'max_depth': 27, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 14, 'n_estimators': 196}
Cross-validation F1 scores on training set: [0.54395277 0.54526985 0.47210822 0.48580817 0.46648453]
Average cross-validation F1 score on training set: 0.502724707792862
Classification Report for category_stromal_34 (Train Set):
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00       122
           0       0.98      1.00      0.99       124
           1       1.00      0.98      0.99       113

    accuracy                           0.99       359
   macro avg       0.99      0.99      0.99       359
weighted avg       0.99      0.99      0.99       359

Classification Report for category_stromal_34 (Test Set):
              precision    recall  f1-score   support

          -1       0.55      0.64      0.59        28
           0       0.52     

In [37]:
X = pd.read_csv('Top_150_features_REF.csv', header=None)


In [24]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
X = data.values
rf = RandomForestClassifier()
rfe = RFE(rf, n_features_to_select=50)  # Choose top 100 features
X = rfe.fit_transform(X, Y)


In [28]:
# save X as Top_100_features_REF
np.savetxt("Top_50_features_REF_stromal_34.csv", X, delimiter=",")

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
from scipy.stats import randint

# X = data.values
X 


# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Define the hyperparameter distribution (use distributions for random search)
param_dist = {
    'n_estimators': randint(100, 150),            # Fewer trees may generalize better
    'max_depth': randint(8, 15),                 # Lower depth can prevent overfitting
    'min_samples_split': randint(20, 25),         # Increase split to limit tree growth
    'min_samples_leaf': randint(10, 15),            # Require more samples per leaf
    'max_features': ['sqrt', 'log2']
}


# Perform Randomized Search with cross-validation
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist, n_iter=80, 
                                   cv=5, scoring='f1_weighted', n_jobs=-1, verbose=1, random_state=42)
random_search.fit(X_train, Y_train)

# Get the best model from Randomized Search
best_rf = random_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Cross-validation on the training set with the best model
cv_scores = cross_val_score(best_rf, X_train, Y_train, cv=5, scoring='f1_weighted')
print("Cross-validation F1 scores on training set:", cv_scores)
print("Average cross-validation F1 score on training set:", np.mean(cv_scores))

# Fit the model on the training data for the final evaluation on test and train sets
best_rf.fit(X_train, Y_train)

# Evaluate on the training set
Y_train_pred = best_rf.predict(X_train)
print("Classification Report with 50 features (Train Set):")
print(classification_report(Y_train, Y_train_pred))

# Evaluate on the test set
Y_test_pred = best_rf.predict(X_test)
print("Classification Report 50 features (Test Set):")
print(classification_report(Y_test, Y_test_pred))


Fitting 5 folds for each of 80 candidates, totalling 400 fits


Best Hyperparameters: {'max_depth': 14, 'max_features': 'sqrt', 'min_samples_leaf': 13, 'min_samples_split': 21, 'n_estimators': 132}
Cross-validation F1 scores on training set: [0.54241049 0.55413355 0.45965992 0.54186761 0.45534618]
Average cross-validation F1 score on training set: 0.510683551468428
Classification Report with 50 features (Train Set):
              precision    recall  f1-score   support

          -1       0.77      0.74      0.75       122
           0       0.67      0.74      0.70       124
           1       0.72      0.66      0.69       113

    accuracy                           0.72       359
   macro avg       0.72      0.71      0.72       359
weighted avg       0.72      0.72      0.72       359

Classification Report 50 features (Test Set):
              precision    recall  f1-score   support

          -1       0.55      0.61      0.58        28
           0       0.58      0.60      0.59        25
           1       0.70      0.62      0.66        37


In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd

# Assuming X and Y_category_b_12 are already defined
# X = data.values
# Y_category_b_12 = data_target['category_b_12'].values

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Define the hyperparameter grid (specific values for each hyperparameter)
param_grid = {
    'n_estimators': list(range(120, 141, 2)),        # Every integer between 100 and 150 with step 10
    'max_depth': list(range(8, 16)),                  # Every integer between 8 and 15
    'min_samples_split': list(range(20, 26)),         # Every integer between 20 and 25
    'min_samples_leaf': list(range(10, 16)),          # Every integer between 10 and 15
    'max_features': ['sqrt', 'log2']                  # Options for max features
}

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='f1_weighted', n_jobs=-1, verbose=1)
grid_search.fit(X_train, Y_train)

# Get the best model from Grid Search
best_rf = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

# Cross-validation on the training set with the best model
cv_scores = cross_val_score(best_rf, X_train, Y_train, cv=5, scoring='f1_weighted')
print("Cross-validation F1 scores on training set:", cv_scores)
print("Average cross-validation F1 score on training set:", np.mean(cv_scores))

# Fit the model on the training data for the final evaluation on test and train sets
best_rf.fit(X_train, Y_train)

# Evaluate on the training set
Y_train_pred = best_rf.predict(X_train)
print("Classification Report with 50 features (Train Set):")
print(classification_report(Y_train, Y_train_pred))

# Evaluate on the test set
Y_test_pred = best_rf.predict(X_test)
print("Classification Report 50 features (Test Set):")
print(classification_report(Y_test, Y_test_pred))


Fitting 5 folds for each of 6336 candidates, totalling 31680 fits
Best Hyperparameters: {'max_depth': 8, 'max_features': 'sqrt', 'min_samples_leaf': 10, 'min_samples_split': 25, 'n_estimators': 140}
Cross-validation F1 scores on training set: [0.54265873 0.5317864  0.5014315  0.54341182 0.45401102]
Average cross-validation F1 score on training set: 0.5146598945070727
Classification Report with 50 features (Train Set):
              precision    recall  f1-score   support

          -1       0.79      0.76      0.78       122
           0       0.71      0.74      0.73       124
           1       0.75      0.75      0.75       113

    accuracy                           0.75       359
   macro avg       0.75      0.75      0.75       359
weighted avg       0.75      0.75      0.75       359

Classification Report 50 features (Test Set):
              precision    recall  f1-score   support

          -1       0.59      0.61      0.60        28
           0       0.52      0.52      0.5