In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import os
import sys
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


home_dir = "../"
src_path = os.path.join(home_dir, "src")

# Add the `src` folder to the Python path
sys.path.append(src_path)
from models import train_and_evaluate_svm, train_and_evaluate_xgboost, train_and_evaluate_nn, train_and_evaluate_lightgbm

In [2]:
# Load data from cancer stages
expression_matrix = pd.read_csv("../data/cancer_stage/fpkm_matrix.csv", index_col=0)
significant_genes = pd.read_csv("../data/cancer_stage/significant_genes.csv", index_col=0)

# Separate phenotype labels
phenotype = expression_matrix["phenotype"]
expression_matrix = expression_matrix.drop(columns=["phenotype"])

# Select significant genes
significant_gene_names = significant_genes.index
sig_exp_matrix = expression_matrix[significant_gene_names.intersection(expression_matrix.columns)]

top_var_genes_data = sig_exp_matrix
gene_dict = {i: col_name for i, col_name in enumerate(top_var_genes_data.columns)}
top_var_genes_data["phenotype"] = phenotype.values

stage1_top_var_genes_data = top_var_genes_data[top_var_genes_data['phenotype'] == 'Stage1']
stage2_top_var_genes_data = top_var_genes_data[top_var_genes_data['phenotype'] == 'Stage2']
stage3_top_var_genes_data = top_var_genes_data[top_var_genes_data['phenotype'] == 'Stage3']
stage4_top_var_genes_data = top_var_genes_data[top_var_genes_data['phenotype'] == 'Stage4']

stage1_top_var_genes_data = stage1_top_var_genes_data.drop(columns=["phenotype"])
stage2_top_var_genes_data = stage2_top_var_genes_data.drop(columns=["phenotype"])
stage3_top_var_genes_data = stage3_top_var_genes_data.drop(columns=["phenotype"])
stage4_top_var_genes_data = stage4_top_var_genes_data.drop(columns=["phenotype"])


# Split the data into training and testing sets for stage 1
stage1_train, stage1_test = train_test_split(
    stage1_top_var_genes_data, test_size=0.2, random_state=42
)

# Split the data into training and testing sets for stage 2
stage2_train, stage2_test = train_test_split(
    stage2_top_var_genes_data, test_size=0.2, random_state=42
)

# Split the data into training and testing sets for stage 3
stage3_train, stage3_test = train_test_split(
    stage3_top_var_genes_data, test_size=0.2, random_state=42
)

# Split the data into training and testing sets for stage 4
stage4_train, stage4_test = train_test_split(
    stage4_top_var_genes_data, test_size=0.2, random_state=42
)

train_features = np.vstack([stage1_train, stage2_train, stage3_train, stage4_train])
train_labels = np.concatenate([
    np.full(stage1_train.shape[0], 1),  # Label 0 for SARC
    np.full(stage2_train.shape[0], 2),  # Label 1 for ESCA
    np.full(stage3_train.shape[0], 3),   # Label 2 for PCPG
    np.full(stage4_train.shape[0], 4)
])

test_features = np.vstack([stage1_test, stage2_test, stage3_test, stage4_test])
test_labels = np.concatenate([
    np.full(stage1_test.shape[0], 1),  # Label 0 for SARC
    np.full(stage2_test.shape[0], 2),  # Label 1 for ESCA
    np.full(stage3_test.shape[0], 3),
    np.full(stage4_test.shape[0], 4)   # Label 2 for PCPG
])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_var_genes_data["phenotype"] = phenotype.values


In [3]:
# SVM 
best_svm, metrics_svm = train_and_evaluate_svm(train_features, train_labels, test_features, test_labels, param_grid = {
            'C': [0.1],
            'kernel': ['linear', 'rbf', 'poly'],
            'gamma': ['scale']
        })

print(best_svm)
print(metrics_svm)


Best parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
Training Accuracy: 0.3566
Test Accuracy: 0.3535
Mean Squared Error: 1.1010
Log Loss: 1.2602

Classification Report:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00         7
           2       0.00      0.00      0.00        30
           3       0.00      0.00      0.00        27
           4       0.35      1.00      0.52        35

    accuracy                           0.35        99
   macro avg       0.09      0.25      0.13        99
weighted avg       0.12      0.35      0.18        99


Confusion Matrix:
 [[ 0  0  0  7]
 [ 0  0  0 30]
 [ 0  0  0 27]
 [ 0  0  0 35]]
SVC(C=0.1, probability=True, random_state=42)
{'accuracy': 0.35353535353535354, 'mse': 1.101010101010101, 'logloss': 1.2601773933866747}


In [4]:
# XGBoost
best_lgb, metrics_lgb = train_and_evaluate_lightgbm(train_features, train_labels, test_features, test_labels)

print(best_lgb)
print(metrics_lgb)




Best parameters: {'learning_rate': 0.01, 'max_depth': -1, 'n_estimators': 100, 'num_leaves': 31}
Training Accuracy: 0.9690
Test Accuracy: 0.3434
Mean Squared Error: 2.3737
Log Loss: 1.2696

Classification Report:
               precision    recall  f1-score   support

           1       1.00      0.14      0.25         7
           2       0.30      0.37      0.33        30
           3       0.26      0.22      0.24        27
           4       0.42      0.46      0.44        35

    accuracy                           0.34        99
   macro avg       0.49      0.30      0.31        99
weighted avg       0.38      0.34      0.34        99


Confusion Matrix:
 [[ 1  3  0  3]
 [ 0 11  9 10]
 [ 0 12  6  9]
 [ 0 11  8 16]]
LGBMClassifier(learning_rate=0.01, random_state=42, verbosity=-1)
{'accuracy': 0.3434343434343434, 'classification_report': '              precision    recall  f1-score   support\n\n           1       1.00      0.14      0.25         7\n           2       0.30      0.37



In [5]:
best_nn, metrics_nn= train_and_evaluate_nn(train_features, train_labels, test_features, test_labels)

Best parameters: {'activation': 'logistic', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}
Training Accuracy: 0.9793
Test Accuracy: 0.3737
Mean Squared Error: 2.3333
Log Loss: 2.0514

Classification Report:
               precision    recall  f1-score   support

           1       0.29      0.29      0.29         7
           2       0.43      0.33      0.38        30
           3       0.31      0.33      0.32        27
           4       0.40      0.46      0.43        35

    accuracy                           0.37        99
   macro avg       0.36      0.35      0.35        99
weighted avg       0.38      0.37      0.37        99


Confusion Matrix:
 [[ 2  1  1  3]
 [ 2 10  9  9]
 [ 1  5  9 12]
 [ 2  7 10 16]]
