In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [6]:
# Load data from cancer stages
expression_matrix = pd.read_csv("../data/cancer_stage/fpkm_matrix.csv", index_col=0)
significant_genes = pd.read_csv("../data/cancer_stage/significant_genes.csv", index_col=0)

# Separate phenotype labels
phenotype = expression_matrix["phenotype"]
expression_matrix = expression_matrix.drop(columns=["phenotype"])

# Select significant genes
significant_gene_names = significant_genes.index
sig_exp_matrix = expression_matrix[significant_gene_names.intersection(expression_matrix.columns)]

top_var_genes_data = sig_exp_matrix
gene_dict = {i: col_name for i, col_name in enumerate(top_var_genes_data.columns)}
top_var_genes_data["phenotype"] = phenotype.values

stage1_top_var_genes_data = top_var_genes_data[top_var_genes_data['phenotype'] == 'Stage1']
stage2_top_var_genes_data = top_var_genes_data[top_var_genes_data['phenotype'] == 'Stage2']
stage3_top_var_genes_data = top_var_genes_data[top_var_genes_data['phenotype'] == 'Stage3']
stage4_top_var_genes_data = top_var_genes_data[top_var_genes_data['phenotype'] == 'Stage4']

stage1_top_var_genes_data = stage1_top_var_genes_data.drop(columns=["phenotype"])
stage2_top_var_genes_data = stage2_top_var_genes_data.drop(columns=["phenotype"])
stage3_top_var_genes_data = stage3_top_var_genes_data.drop(columns=["phenotype"])
stage4_top_var_genes_data = stage4_top_var_genes_data.drop(columns=["phenotype"])


# Split the data into training and testing sets for stage 1
stage1_train, stage1_test = train_test_split(
    stage1_top_var_genes_data, test_size=0.2, random_state=42
)

# Split the data into training and testing sets for stage 2
stage2_train, stage2_test = train_test_split(
    stage2_top_var_genes_data, test_size=0.2, random_state=42
)

# Split the data into training and testing sets for stage 3
stage3_train, stage3_test = train_test_split(
    stage3_top_var_genes_data, test_size=0.2, random_state=42
)

# Split the data into training and testing sets for stage 4
stage4_train, stage4_test = train_test_split(
    stage4_top_var_genes_data, test_size=0.2, random_state=42
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_var_genes_data["phenotype"] = phenotype.values


In [7]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, log_loss
from sklearn.model_selection import GridSearchCV

train_features = np.vstack([stage1_train, stage2_train, stage3_train, stage4_train])
train_labels = np.concatenate([
    np.full(stage1_train.shape[0], 1),  # Label 0 for SARC
    np.full(stage2_train.shape[0], 2),  # Label 1 for ESCA
    np.full(stage3_train.shape[0], 3),   # Label 2 for PCPG
    np.full(stage4_train.shape[0], 4)
])

test_features = np.vstack([stage1_test, stage2_test, stage3_test, stage4_test])
test_labels = np.concatenate([
    np.full(stage1_test.shape[0], 1),  # Label 0 for SARC
    np.full(stage2_test.shape[0], 2),  # Label 1 for ESCA
    np.full(stage3_test.shape[0], 3),
    np.full(stage4_test.shape[0], 4)   # Label 2 for PCPG
])


# Define parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],         # Number of trees in the forest
    'max_depth': [None, 10, 20],            # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],        # Minimum samples to split an internal node
    'min_samples_leaf': [1, 2, 4], 
    'oob_score': [True]         # Minimum samples at a leaf node
}

# Perform Grid Search with Cross-Validation for Hyperparameter Tuning
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy'
)
grid_search.fit(train_features, train_labels)

# Get the best model from Grid Search
best_rf_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

# Train the best model on training data
best_rf_model.fit(train_features, train_labels)

# Evaluate the best model
train_predictions = best_rf_model.predict(train_features)
test_predictions = best_rf_model.predict(test_features)
test_probabilities = best_rf_model.predict_proba(test_features)

train_accuracy = accuracy_score(train_labels, train_predictions)
test_accuracy = accuracy_score(test_labels, test_predictions)
classification_rep = classification_report(test_labels, test_predictions)
conf_matrix = confusion_matrix(test_labels, test_predictions)

# Calculate metrics
mse = mean_squared_error(test_labels, np.argmax(test_probabilities, axis=1))
logloss = log_loss(test_labels, test_probabilities)

# Display results
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Log Loss: {logloss:.4f}")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)

# Display OOB score
if hasattr(best_rf_model, 'oob_score_') and best_rf_model.oob_score_:
    print(f"OOB Score: {best_rf_model.oob_score_:.4f}")

Best parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50, 'oob_score': True}
Training Accuracy: 0.9587
Test Accuracy: 0.4949
Mean Squared Error (MSE): 1.6768
Log Loss: 1.1777

Classification Report:
               precision    recall  f1-score   support

           1       1.00      0.14      0.25         7
           2       0.49      0.57      0.52        30
           3       0.50      0.26      0.34        27
           4       0.49      0.69      0.57        35

    accuracy                           0.49        99
   macro avg       0.62      0.41      0.42        99
weighted avg       0.53      0.49      0.47        99


Confusion Matrix:
 [[ 1  5  0  1]
 [ 0 17  2 11]
 [ 0  7  7 13]
 [ 0  6  5 24]]
OOB Score: 0.3721


In [8]:
# Extract cross-validation results
cv_results = grid_search.cv_results_

# Convert results to a DataFrame for easier inspection
cv_results_df = pd.DataFrame(cv_results)

# Select and display important columns
columns_to_display = [
    'param_n_estimators',
    'param_max_depth',
    'param_min_samples_split',
    'param_min_samples_leaf',
    'mean_test_score',  # Average CV accuracy score for each parameter combination
    'std_test_score',   # Standard deviation of the CV scores
    'split0_test_score', 
    'split1_test_score', 
    'split2_test_score', 
    'split3_test_score', 
    'split4_test_score',
     'rank_test_score'
]
# Filter the results and sort by rank_test_score
cv_results_summary = cv_results_df[columns_to_display]
cv_results_summary_sorted = cv_results_summary.sort_values(by='rank_test_score')

# Save all results to a CSV file
cv_results_summary_sorted.to_csv('results/no_tda_cv_results.csv', index=False)

top_10_results = cv_results_summary_sorted.head(10)

# Display the top 10 results in this notebook environment
print("Top 10 CV Results:")
print(top_10_results)

Top 10 CV Results:
    param_n_estimators param_max_depth  param_min_samples_split  \
21                  50            None                        5   
72                  50              20                        2   
18                  50            None                        2   
75                  50              20                        5   
65                 200              20                        2   
11                 200            None                        2   
31                 100              10                        5   
30                  50              10                        5   
38                 200              10                        2   
53                 200              10                       10   

    param_min_samples_leaf  mean_test_score  std_test_score  \
21                       4         0.407959        0.060407   
72                       4         0.407959        0.060407   
18                       4         0.407959        0.0