In [1]:
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
from xgboost import XGBClassifier

In [2]:
data = pd.read_csv('~/Desktop/project_data_new/embedding_768_TCGA_COAD_90percent_sample.csv')
data.index = data['PatientID']
# drop the 'PatientID' column
data = data.drop('PatientID', axis=1)

In [3]:
data_target = pd.read_csv('~/Desktop/project_data_new/target_768_avg_expanded.csv')
data_target.index = data_target['Unnamed: 0']
data_target = data_target.drop(['Unnamed: 0'], axis = 1)
# only keep the columns with category in the name
data_target = data_target.loc[:, data_target.columns.str.contains('category')]

In [4]:
data_test = pd.read_csv('~/Desktop/project_data_new/embedding_768_TCGA_COAD_10percent_sample.csv')
data_test.index = data_test['PatientID']
# drop the 'PatientID' column
data_test = data_test.drop('PatientID', axis=1)
data_test = data_test[data_test.index.isin(data_target.index)]

In [5]:
data = data[data.index.isin(data_target.index)]
data.shape

(407, 768)

In [6]:
# Align the targets for training and unseen test sets
data_target_train = data_target.loc[data.index]
data_target_unseen = data_target.loc[data_test.index]
data_target_train.shape

(407, 241)

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from scipy.stats import randint, uniform

# Initialize the XGBoost Classifier
xgb_clf = xgb.XGBClassifier(objective='multi:softmax', random_state=42)

# Extract the feature values from data
X = data.values
X_test_unseen = data_test.values
# Set the chunk size (number of target variables per part)
chunk_size = 40

# Loop through target columns in chunks
for chunk_start in range(200, 241, chunk_size):
    # Get the current chunk of target columns
    chunk_end = min(chunk_start + chunk_size, 241)
    target_chunk = data_target.columns[chunk_start:chunk_end]
    
    # Initialize list to store results for the current chunk
    chunk_results = []

    # Open a file to write classification reports for the current chunk
    report_file_path = f"/home/qiuaodon/Desktop/CRC_image/Best_100_features_XGboost_90percents/Classification_Reports_100features_XGB_part_{chunk_start}_{chunk_end}.txt"
    with open(report_file_path, "w") as report_file:
        # Loop through each target column in the current chunk
        for target_col in target_chunk:
            # Extract the target column for the current category
            Y = data_target_train[target_col].values

            # Encode the labels if necessary
            label_encoder = LabelEncoder()
            Y = label_encoder.fit_transform(Y)
            
            # Fit XGBoost to calculate feature importance
            xgb_clf.fit(X, Y)

            # Get feature importances and select the top 100 indices
            importances = xgb_clf.feature_importances_
            top_100_indices = np.argsort(importances)[-100:]

            # Select top 100 features
            X_selected = X[:, top_100_indices]
            X_test_unseen_selected = X_test_unseen[:, top_100_indices]

            # Split the data into training and testing sets
            X_train, X_test, Y_train, Y_test = train_test_split(
                X_selected, Y, test_size=0.2, random_state=42, stratify=Y
            )
            
            # Define the hyperparameter distribution for RandomizedSearchCV
            param_dist = {
                'max_depth': randint(6, 13),
                'min_child_weight': randint(1, 10),
                'subsample': uniform(0.9, 0.1),
                'colsample_bytree': uniform(0.9, 0.1),
                'learning_rate': uniform(0.01, 0.05),
                'n_estimators': randint(1000, 1500),
                'gamma': uniform(2, 3),
            }
            
            # Perform Randomized Search with cross-validation
            random_search = RandomizedSearchCV(
                estimator=xgb_clf,
                param_distributions=param_dist,
                n_iter=160, 
                cv=5,
                scoring='f1_weighted',
                n_jobs=-1,
                verbose=1,
                random_state=42
            )
            random_search.fit(X_train, Y_train)
            
            # Get the best model from Randomized Search
            best_xgb = random_search.best_estimator_
            best_params = random_search.best_params_
            
            # Evaluate on the training set
            Y_train_pred = best_xgb.predict(X_train)
            train_report_dict = classification_report(Y_train, Y_train_pred, output_dict=True)
            train_report = classification_report(Y_train, Y_train_pred)
            
            # Evaluate on the test set
            Y_test_pred = best_xgb.predict(X_test)
            test_report_dict = classification_report(Y_test, Y_test_pred, output_dict=True)
            test_report = classification_report(Y_test, Y_test_pred)
            
            # Evaluate on the unseen test data
            Y_test_unseen_col = data_target_unseen[target_col].values
            Y_test_unseen_col = label_encoder.transform(Y_test_unseen_col)
            Y_test_unseen_pred = best_xgb.predict(X_test_unseen_selected)
            unseen_test_report_dict = classification_report(Y_test_unseen_col, Y_test_unseen_pred, output_dict=True)
            unseen_test_report = classification_report(Y_test_unseen_col, Y_test_unseen_pred)

            # Write classification reports to the file
            report_file.write(f"Classification Report for {target_col} (Train Set):\n")
            report_file.write(train_report)
            report_file.write("\n")
            report_file.write(f"Classification Report for {target_col} (Test Set):\n")
            report_file.write(test_report)
            report_file.write("\n")
            report_file.write(f"Classification Report for {target_col} (Unseen Test Set):\n")
            report_file.write(unseen_test_report)
            report_file.write("\n\n" + "="*80 + "\n\n")
            
            # Calculate overall test metrics
            test_precision = precision_score(Y_test, Y_test_pred, average='weighted')
            test_recall = recall_score(Y_test, Y_test_pred, average='weighted')
            test_accuracy = accuracy_score(Y_test, Y_test_pred)
            
            # Calculate overall unseen test metrics
            unseen_test_precision = precision_score(Y_test_unseen_col, Y_test_unseen_pred, average='weighted')
            unseen_test_recall = recall_score(Y_test_unseen_col, Y_test_unseen_pred, average='weighted')
            unseen_test_accuracy = accuracy_score(Y_test_unseen_col, Y_test_unseen_pred)

            # Extract test Class 1 metrics
            class_1_metrics = test_report_dict.get('1', {"precision": None, "recall": None, "f1-score": None})

            # Extract unseen test Class 1 metrics
            class_1_unseen_metrics = unseen_test_report_dict.get('1', {"precision": None, "recall": None, "f1-score": None})

            # Append results to chunk_results
            chunk_results.append({
                "Target Variable": target_col,
                "Test Precision": test_precision,
                "Test Recall": test_recall,
                "Test Accuracy": test_accuracy,
                "Class 1 Precision (Test)": class_1_metrics["precision"],
                "Class 1 Recall (Test)": class_1_metrics["recall"],
                "Class 1 F1-Score (Test)": class_1_metrics["f1-score"],
                "Unseen Test Precision": unseen_test_precision,
                "Unseen Test Recall": unseen_test_recall,
                "Unseen Test Accuracy": unseen_test_accuracy,
                "Class 1 Precision (Unseen Test)": class_1_unseen_metrics["precision"],
                "Class 1 Recall (Unseen Test)": class_1_unseen_metrics["recall"],
                "Class 1 F1-Score (Unseen Test)": class_1_unseen_metrics["f1-score"],
                "Best Hyperparameters": best_params
            })
    
    # Save the results for the current chunk to an Excel file
    results_df = pd.DataFrame(chunk_results)
    results_df.to_excel(
        f"/home/qiuaodon/Desktop/CRC_image/Best_100_features_XGboost_90percents/Precision_Recall_Accuracy_100features_XGB_part_{chunk_start}_{chunk_end}.xlsx",
        index=False
    )


Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 800 fits
Fitting 5 folds for each of 160 candidates, totalling 8

In [None]:

print(f"Shape of X: {X.shape}")
print(f"Length of Y: {len(Y)}")



Shape of X: (407, 768)
Length of Y: 449


In [5]:
import pandas as pd
import glob

# Specify the directory where your files are located
file_path_pattern = '/home/qiuaodon/Desktop/CRC_image/Best_100_features_XGboost_90percents/Precision_Recall_Accuracy_100features_XGB_part_*.xlsx'

# Use glob to find all matching files
all_files = glob.glob(file_path_pattern)

# Create an empty list to store DataFrames
data_frames = []

# Loop through each file and append its DataFrame to the list
for file in all_files:
    df = pd.read_excel(file)
    data_frames.append(df)

# Concatenate all DataFrames
combined_df = pd.concat(data_frames, ignore_index=True)

# Save the combined DataFrame to a new Excel file
combined_df.to_excel('/home/qiuaodon/Desktop/CRC_image/Best_100_features_XGboost_90percents/Combined_Precision_Recall_Accuracy_100features_XGB_241_targets.xlsx', index=False)
combined_df


Unnamed: 0,Target Variable,Test Precision,Test Recall,Test Accuracy,Class 1 Precision (Test),Class 1 Recall (Test),Class 1 F1-Score (Test),Unseen Test Precision,Unseen Test Recall,Unseen Test Accuracy,Class 1 Precision (Unseen Test),Class 1 Recall (Unseen Test),Class 1 F1-Score (Unseen Test),Best Hyperparameters
0,category_b_3,0.513086,0.512195,0.512195,0.448276,0.464286,0.456140,0.397756,0.357143,0.357143,0.315789,0.666667,0.428571,"{'colsample_bytree': 0.940895294441427, 'gamma..."
1,category_b_4,0.474930,0.475610,0.475610,0.461538,0.444444,0.452830,0.379630,0.380952,0.380952,0.375000,0.214286,0.272727,"{'colsample_bytree': 0.9922499381177297, 'gamm..."
2,category_b_5,0.390111,0.390244,0.390244,0.416667,0.357143,0.384615,0.357937,0.357143,0.357143,0.111111,0.100000,0.105263,"{'colsample_bytree': 0.9936729988736734, 'gamm..."
3,category_b_6,0.434490,0.439024,0.439024,0.421053,0.285714,0.340426,0.304762,0.285714,0.285714,0.000000,0.000000,0.000000,"{'colsample_bytree': 0.9790175540531206, 'gamm..."
4,category_b_7,0.531728,0.524390,0.524390,0.526316,0.357143,0.425532,0.373152,0.380952,0.380952,0.222222,0.181818,0.200000,"{'colsample_bytree': 0.9015456616528867, 'gamm..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236,category_tnk_57,0.354274,0.353659,0.353659,0.343750,0.407407,0.372881,0.479252,0.452381,0.452381,0.461538,0.461538,0.461538,"{'colsample_bytree': 0.9380890856631021, 'gamm..."
237,category_tnk_66,0.524390,0.524390,0.524390,0.500000,0.500000,0.500000,0.530423,0.404762,0.404762,0.222222,0.500000,0.307692,"{'colsample_bytree': 0.9663501769108056, 'gamm..."
238,category_tnk_67,0.424082,0.426829,0.426829,0.321429,0.321429,0.321429,0.232426,0.238095,0.238095,0.000000,0.000000,0.000000,"{'colsample_bytree': 0.9929375989127586, 'gamm..."
239,category_b_1,0.451296,0.463415,0.463415,0.347826,0.275862,0.307692,0.353619,0.285714,0.285714,0.066667,0.125000,0.086957,"{'colsample_bytree': 0.9737501248109749, 'gamm..."
