In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Load data from cancer stages
expression_matrix = pd.read_csv("../data/treatment_response/fpkm_matrix.csv", index_col=0)
significant_genes = pd.read_csv("../data/treatment_response/significant_genes.csv", index_col=0)

# Separate phenotype labels
phenotype = expression_matrix["phenotype"]
expression_matrix = expression_matrix.drop(columns=["phenotype"])

# Select significant genes
significant_gene_names = significant_genes.index
sig_exp_matrix = expression_matrix[significant_gene_names.intersection(expression_matrix.columns)]

top_var_genes_data = sig_exp_matrix
gene_dict = {i: col_name for i, col_name in enumerate(top_var_genes_data.columns)}
top_var_genes_data["phenotype"] = phenotype.values

resistant_top_var_genes_data = top_var_genes_data[top_var_genes_data['phenotype'] == 'Resistance']
sensitive_top_var_genes_data = top_var_genes_data[top_var_genes_data['phenotype'] == 'Sensitive']

resistant_top_var_genes_data = resistant_top_var_genes_data.drop(columns=["phenotype"])
sensitive_top_var_genes_data = sensitive_top_var_genes_data.drop(columns=["phenotype"])

# Split the data into training and testing sets for stage 1
resistant_train, resistant_test = train_test_split(
    resistant_top_var_genes_data, test_size=0.2, random_state=42
)

# Split the data into training and testing sets for stage 2
sensitive_train, sensitive_test = train_test_split(
    sensitive_top_var_genes_data, test_size=0.2, random_state=42
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_var_genes_data["phenotype"] = phenotype.values


In [3]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, log_loss
from sklearn.model_selection import GridSearchCV

train_features = np.vstack([resistant_train, sensitive_train])
train_labels = np.concatenate([
    np.full(resistant_train.shape[0], 1),  # Label 0 for resistent
    np.full(sensitive_train.shape[0], 2)  # Label 1 for sensitive
])

test_features = np.vstack([resistant_test, sensitive_test])
test_labels = np.concatenate([
    np.full(resistant_test.shape[0], 1),  # Label 0 for resistent
    np.full(sensitive_test.shape[0], 2)  # Label 2 for sensitive
])


# Define parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],         # Number of trees in the forest
    'max_depth': [None, 10, 20],            # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],        # Minimum samples to split an internal node
    'min_samples_leaf': [1, 2, 4], 
    'oob_score': [True]         # Minimum samples at a leaf node
}

# Perform Grid Search with Cross-Validation for Hyperparameter Tuning
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy'
)
grid_search.fit(train_features, train_labels)

# Get the best model from Grid Search
best_rf_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

# Train the best model on training data
best_rf_model.fit(train_features, train_labels)

# Evaluate the best model
train_predictions = best_rf_model.predict(train_features)
test_predictions = best_rf_model.predict(test_features)
test_probabilities = best_rf_model.predict_proba(test_features)

train_accuracy = accuracy_score(train_labels, train_predictions)
test_accuracy = accuracy_score(test_labels, test_predictions)
classification_rep = classification_report(test_labels, test_predictions)
conf_matrix = confusion_matrix(test_labels, test_predictions)

# Calculate metrics
mse = mean_squared_error(test_labels, np.argmax(test_probabilities, axis=1))
logloss = log_loss(test_labels, test_probabilities)

# Display results
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Log Loss: {logloss:.4f}")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)

# Display OOB score
if hasattr(best_rf_model, 'oob_score_') and best_rf_model.oob_score_:
    print(f"OOB Score: {best_rf_model.oob_score_:.4f}")

Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200, 'oob_score': True}
Training Accuracy: 0.9778
Test Accuracy: 0.7500
Mean Squared Error (MSE): 0.9674
Log Loss: 0.4696

Classification Report:
               precision    recall  f1-score   support

           1       0.44      0.18      0.26        22
           2       0.78      0.93      0.85        70

    accuracy                           0.75        92
   macro avg       0.61      0.56      0.55        92
weighted avg       0.70      0.75      0.71        92


Confusion Matrix:
 [[ 4 18]
 [ 5 65]]
OOB Score: 0.7895


In [4]:
# Extract cross-validation results
cv_results = grid_search.cv_results_

# Convert results to a DataFrame for easier inspection
cv_results_df = pd.DataFrame(cv_results)

# Select and display important columns
columns_to_display = [
    'param_n_estimators',
    'param_max_depth',
    'param_min_samples_split',
    'param_min_samples_leaf',
    'mean_test_score',  # Average CV accuracy score for each parameter combination
    'std_test_score',   # Standard deviation of the CV scores
    'split0_test_score', 
    'split1_test_score', 
    'split2_test_score', 
    'split3_test_score', 
    'split4_test_score',
     'rank_test_score'
]
# Filter the results and sort by rank_test_score
cv_results_summary = cv_results_df[columns_to_display]
cv_results_summary_sorted = cv_results_summary.sort_values(by='rank_test_score')

# Save all results to a CSV file
cv_results_summary_sorted.to_csv('results/no_tda_cv_results.csv', index=False)

top_10_results = cv_results_summary_sorted.head(10)

# Display the top 10 results in this notebook environment
print("Top 10 CV Results:")
print(top_10_results)

Top 10 CV Results:
    param_n_estimators param_max_depth  param_min_samples_split  \
62                 200              20                       10   
8                  200            None                       10   
56                 200              20                        2   
2                  200            None                        2   
70                 100              20                       10   
16                 100            None                       10   
32                 200              10                        5   
38                 200              10                        2   
11                 200            None                        2   
65                 200              20                        2   

    param_min_samples_leaf  mean_test_score  std_test_score  \
62                       1         0.828158        0.036150   
8                        1         0.828158        0.036150   
56                       1         0.825457        0.0