In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os
import sys
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
import sklearn


home_dir = "../"
src_path = os.path.join(home_dir, "src")

# Add the `src` folder to the Python path
sys.path.append(src_path)
from models import train_and_evaluate_svm, train_and_evaluate_xgboost, train_and_evaluate_nn, train_and_evaluate_lightgbm

In [2]:
# Load data
expression_matrix = pd.read_csv("../data/cancer_type/fpkm_matrix.csv", index_col=0)
significant_genes = pd.read_csv("../data/cancer_type/significant_genes.csv", index_col=0)

# Separate phenotype labels
phenotype = expression_matrix["phenotype"]
expression_matrix = expression_matrix.drop(columns=["phenotype"])

# Select significant genes
significant_gene_names = significant_genes.index
sig_exp_matrix = expression_matrix[significant_gene_names.intersection(expression_matrix.columns)]

top_var_genes_data = sig_exp_matrix
gene_dict = {i: col_name for i, col_name in enumerate(top_var_genes_data.columns)}
top_var_genes_data["phenotype"] = phenotype.values

sarc_top_var_genes_data = top_var_genes_data[top_var_genes_data['phenotype'] == 'TCGA-SARC']
esca_top_var_genes_data = top_var_genes_data[top_var_genes_data['phenotype'] == 'TCGA-ESCA']
pcpg_top_var_genes_data = top_var_genes_data[top_var_genes_data['phenotype'] == 'TCGA-PCPG']

sarc_top_var_genes_data = sarc_top_var_genes_data.drop(columns=["phenotype"])
esca_top_var_genes_data = esca_top_var_genes_data.drop(columns=["phenotype"])
pcpg_top_var_genes_data = pcpg_top_var_genes_data.drop(columns=["phenotype"])

# Split the data into training and testing sets for SARC
sarc_train, sarc_test = train_test_split(
    sarc_top_var_genes_data, test_size=0.2, random_state=42
)

# Split the data into training and testing sets for ESCA
esca_train, esca_test = train_test_split(
    esca_top_var_genes_data, test_size=0.2, random_state=42
)

# Split the data into training and testing sets for PCPG
pcpg_train, pcpg_test = train_test_split(
    pcpg_top_var_genes_data, test_size=0.2, random_state=42
)


train_features = np.vstack([sarc_train, esca_train, pcpg_train])
train_labels = np.concatenate([
    np.full(sarc_train.shape[0], 0),  # Label 0 for SARC
    np.full(esca_train.shape[0], 1),  # Label 1 for ESCA
    np.full(pcpg_train.shape[0], 2)   # Label 2 for PCPG
])

test_features = np.vstack([sarc_test, esca_test, pcpg_test])
test_labels = np.concatenate([
    np.full(sarc_test.shape[0], 0),  # Label 0 for SARC
    np.full(esca_test.shape[0], 1),  # Label 1 for ESCA
    np.full(pcpg_test.shape[0], 2)   # Label 2 for PCPG
])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_var_genes_data["phenotype"] = phenotype.values


In [3]:
# SVM 
best_svm, metrics_svm = train_and_evaluate_svm(train_features, train_labels, test_features, test_labels)

print(best_svm)
print(metrics_svm)


Best parameters: {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}
Training Accuracy: 0.9920
Test Accuracy: 0.9760
Mean Squared Error: 0.1040
Log Loss: 0.0923

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97        52
           1       1.00      1.00      1.00        37
           2       0.97      0.94      0.96        36

    accuracy                           0.98       125
   macro avg       0.98      0.98      0.98       125
weighted avg       0.98      0.98      0.98       125


Confusion Matrix:
 [[51  0  1]
 [ 0 37  0]
 [ 2  0 34]]
SVC(C=100, probability=True, random_state=42)
{'accuracy': 0.976, 'mse': 0.104, 'logloss': 0.09232110561104115}


In [4]:
# XGBoost
best_lgb, metrics_lgb = train_and_evaluate_lightgbm(train_features, train_labels, test_features, test_labels)

print(best_lgb)
print(metrics_lgb)




Best parameters: {'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 200, 'num_leaves': 31}
Training Accuracy: 1.0000
Test Accuracy: 0.9760
Mean Squared Error: 0.0960
Log Loss: 0.1320

Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97        52
           1       1.00      1.00      1.00        37
           2       1.00      0.92      0.96        36

    accuracy                           0.98       125
   macro avg       0.98      0.97      0.98       125
weighted avg       0.98      0.98      0.98       125


Confusion Matrix:
 [[52  0  0]
 [ 0 37  0]
 [ 3  0 33]]
LGBMClassifier(learning_rate=0.01, n_estimators=200, random_state=42,
               verbosity=-1)
{'accuracy': 0.976, 'classification_report': '              precision    recall  f1-score   support\n\n           0       0.95      1.00      0.97        52\n           1       1.00      1.00      1.00        37\n           2       1.00      0.92   



In [5]:
best_nn, metrics_nn = train_and_evaluate_nn(train_features, train_labels, test_features, test_labels)

Best parameters: {'activation': 'logistic', 'alpha': 0.0001, 'hidden_layer_sizes': (50,), 'learning_rate': 'constant'}
Training Accuracy: 1.0000
Test Accuracy: 0.9920
Mean Squared Error: 0.0320
Log Loss: 0.0285

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99        52
           1       1.00      1.00      1.00        37
           2       1.00      0.97      0.99        36

    accuracy                           0.99       125
   macro avg       0.99      0.99      0.99       125
weighted avg       0.99      0.99      0.99       125


Confusion Matrix:
 [[52  0  0]
 [ 0 37  0]
 [ 1  0 35]]


In [6]:
sklearn.__version__

'1.6.0'