In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os 
import sys

home_dir = "../"
src_path = os.path.join(home_dir, "src")

# Add the `src` folder to the Python path
sys.path.append(src_path)



In [2]:
# Load data
expression_matrix = pd.read_csv("../data/cancer_type/fpkm_matrix.csv", index_col=0)
significant_genes = pd.read_csv("../data/cancer_type/significant_genes.csv", index_col=0)

# Separate phenotype labels
phenotype = expression_matrix["phenotype"]
expression_matrix = expression_matrix.drop(columns=["phenotype"])

# Select significant genes
significant_gene_names = significant_genes.index
sig_exp_matrix = expression_matrix[significant_gene_names.intersection(expression_matrix.columns)]

top_var_genes_data = sig_exp_matrix
gene_dict = {i: col_name for i, col_name in enumerate(top_var_genes_data.columns)}
top_var_genes_data["phenotype"] = phenotype.values

sarc_top_var_genes_data = top_var_genes_data[top_var_genes_data['phenotype'] == 'TCGA-SARC']
esca_top_var_genes_data = top_var_genes_data[top_var_genes_data['phenotype'] == 'TCGA-ESCA']
pcpg_top_var_genes_data = top_var_genes_data[top_var_genes_data['phenotype'] == 'TCGA-PCPG']

sarc_top_var_genes_data = sarc_top_var_genes_data.drop(columns=["phenotype"])
esca_top_var_genes_data = esca_top_var_genes_data.drop(columns=["phenotype"])
pcpg_top_var_genes_data = pcpg_top_var_genes_data.drop(columns=["phenotype"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_var_genes_data["phenotype"] = phenotype.values


In [3]:
# Split the data into training and testing sets for SARC
sarc_train, sarc_test = train_test_split(
    sarc_top_var_genes_data, test_size=0.2, random_state=42
)

# Split the data into training and testing sets for ESCA
esca_train, esca_test = train_test_split(
    esca_top_var_genes_data, test_size=0.2, random_state=42
)

# Split the data into training and testing sets for PCPG
pcpg_train, pcpg_test = train_test_split(
    pcpg_top_var_genes_data, test_size=0.2, random_state=42
)

In [4]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, log_loss
from sklearn.model_selection import GridSearchCV

train_features = np.vstack([sarc_train, esca_train, pcpg_train])
train_labels = np.concatenate([
    np.full(sarc_train.shape[0], 0),  # Label 0 for SARC
    np.full(esca_train.shape[0], 1),  # Label 1 for ESCA
    np.full(pcpg_train.shape[0], 2)   # Label 2 for PCPG
])

test_features = np.vstack([sarc_test, esca_test, pcpg_test])
test_labels = np.concatenate([
    np.full(sarc_test.shape[0], 0),  # Label 0 for SARC
    np.full(esca_test.shape[0], 1),  # Label 1 for ESCA
    np.full(pcpg_test.shape[0], 2)   # Label 2 for PCPG
])


# Define parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100, 200],         # Number of trees in the forest
    'max_depth': [None, 10, 20],            # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],        # Minimum samples to split an internal node
    'min_samples_leaf': [1, 2, 4], 
    'oob_score': [True]         # Minimum samples at a leaf node
}

# Perform Grid Search with Cross-Validation for Hyperparameter Tuning
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy'
)
grid_search.fit(train_features, train_labels)

# Get the best model from Grid Search
best_rf_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)

# Train the best model on training data
best_rf_model.fit(train_features, train_labels)

# Evaluate the best model
train_predictions = best_rf_model.predict(train_features)
test_predictions = best_rf_model.predict(test_features)
test_probabilities = best_rf_model.predict_proba(test_features)

train_accuracy = accuracy_score(train_labels, train_predictions)
test_accuracy = accuracy_score(test_labels, test_predictions)
classification_rep = classification_report(test_labels, test_predictions)
conf_matrix = confusion_matrix(test_labels, test_predictions)

# Calculate metrics
mse = mean_squared_error(test_labels, np.argmax(test_probabilities, axis=1))
logloss = log_loss(test_labels, test_probabilities)

# Display results
print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Log Loss: {logloss:.4f}")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)

# Display OOB score
if hasattr(best_rf_model, 'oob_score_') and best_rf_model.oob_score_:
    print(f"OOB Score: {best_rf_model.oob_score_:.4f}")

KeyboardInterrupt: 