In [14]:
import os

import numpy as np
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt
import keras
from sklearn.linear_model import LogisticRegression

In [26]:
# --- Load the training table and split it into train / CV / test -------------
#
# Location of the training CSV. The default is the original machine-specific
# path; set the TRAIN_DATA_CSV environment variable to point somewhere else
# without editing the notebook.
data_path = os.environ.get(
    "TRAIN_DATA_CSV",
    r"C:\Users\Win\Documents\Github Workspace\Machine_Learning\Project\Train Data.csv",
)
data = pd.read_csv(data_path, delimiter=",")

display(data)

# Separate features and label
X = data.iloc[:, :-1]  # All columns except the last
y = data.iloc[:, -1]   # The last column is assumed to be the label

# Split configuration, named once so both splits share the same seed and the
# fractions are easy to tune.
RANDOM_STATE = 42
TEMP_FRACTION = 0.5          # share of all rows held out of training
TEST_FRACTION_OF_TEMP = 0.4  # share of the held-out rows used as the test set

# Step 1: Split into 50% train and 50% temp with stratification
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=TEMP_FRACTION, random_state=RANDOM_STATE, stratify=y)

# Step 2: Split temp into 30% CV and 20% test (from 50% temp = 60/40 split)
X_cv, X_test, y_cv, y_test = train_test_split(
    X_temp, y_temp, test_size=TEST_FRACTION_OF_TEMP,
    random_state=RANDOM_STATE, stratify=y_temp)

# Sanity check: the three splits together must account for every row.
assert len(X_train) + len(X_cv) + len(X_test) == len(X), \
    "train/CV/test splits do not add up to the full dataset"

# Confirm split sizes
print(f"Train size: {len(X_train)} samples")
print(f"CV size: {len(X_cv)} samples")
print(f"Test size: {len(X_test)} samples")

# Cell output: number of training samples per integer class label
# (position i holds the count of label i).
np.bincount(y_train)

Unnamed: 0,201506_at,205486_at,216638_s_at,221619_s_at,221672_s_at,35148_at,Characteristics..Relapse..Metastasis.
0,-1.194502,-1.368737,-0.717225,-1.531878,0.389693,0.000420,0
1,0.693175,-0.590901,-0.546195,0.037206,-0.128537,0.357764,0
2,0.333743,-0.739208,0.209520,0.644257,0.195022,0.387819,1
3,1.000242,-0.090453,1.383431,0.564629,0.031483,-0.001509,0
4,0.610597,-0.321874,-0.335347,-0.455115,-0.401555,-0.026012,1
...,...,...,...,...,...,...,...
522,-1.250697,-0.169552,-0.780680,-1.270128,-2.083823,-1.229551,0
523,0.797979,-0.064454,0.313360,-0.489885,-1.616572,0.942426,0
524,-0.338362,2.031796,-0.930988,0.944626,-1.323058,0.286487,0
525,0.050661,0.274132,2.433399,-0.403539,-1.791106,-0.147379,0


Train size: 263 samples
CV size: 158 samples
Test size: 106 samples


array([204,  59])

In [33]:
import pandas as pd
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, recall_score, precision_score,
    f1_score, accuracy_score
)

# --- Exhaustive feature-subset search with logistic regression ---------------
#
# For every non-empty combination of the genes below, fit a LogisticRegression
# on the training split, score it on the train / CV / test splits, and finally
# export, for each metric, the subset that achieved the highest value.

# Define gene names
gene_names = ['201506_at', '205486_at', '216638_s_at', '221619_s_at', '221672_s_at', '35148_at']

# Rebuild each split as a DataFrame whose columns are exactly `gene_names`,
# in that order, so subsets can be selected by column name below.
X_train = pd.DataFrame(X_train, columns=gene_names)
X_cv = pd.DataFrame(X_cv, columns=gene_names)
X_test = pd.DataFrame(X_test, columns=gene_names)

# Store all results (one dict per evaluated feature subset)
all_results = []

# Detect classification type: more than two distinct labels => multi-class AUC
n_classes = len(set(y_train))

# NOTE(review): AUC / recall / precision / F1 are computed on the *test* split
# and the summary below picks the "best" subset by those same numbers. Choosing
# a model on test-set scores makes the reported test performance optimistic;
# consider selecting on the CV split and reporting the test score only for the
# chosen subset.

# Loop through all feature combinations (subset sizes 1 .. len(gene_names))
for k in range(1, len(gene_names) + 1):
    for comb in combinations(gene_names, k):
        features = list(comb)

        # Select features
        X_train_sel = X_train[features]
        X_cv_sel = X_cv[features]
        X_test_sel = X_test[features]

        # Train model
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train_sel, y_train)

        # Predict
        y_train_pred = model.predict(X_train_sel)
        y_cv_pred = model.predict(X_cv_sel)
        y_test_pred = model.predict(X_test_sel)
        y_test_proba = model.predict_proba(X_test_sel)

        # AUC handling
        try:
            if n_classes > 2:
                auc = roc_auc_score(y_test, y_test_proba, multi_class='ovr', average='macro')
            else:
                # Binary case: score with the probability of the second class.
                auc = roc_auc_score(y_test, y_test_proba[:, 1])
        except ValueError:
            # Only the "AUC cannot be computed for this input" failure is
            # tolerated; it is recorded as -inf so this subset can never win
            # "Highest AUC". Any other exception (interrupts, programming
            # errors) now propagates instead of being silently swallowed.
            auc = float('-inf')

        # Other metrics
        recall = recall_score(y_test, y_test_pred, average='macro', zero_division=0)
        precision = precision_score(y_test, y_test_pred, average='macro', zero_division=0)
        f1 = f1_score(y_test, y_test_pred, average='macro', zero_division=0)
        train_acc = accuracy_score(y_train, y_train_pred)
        cv_acc = accuracy_score(y_cv, y_cv_pred)
        test_acc = accuracy_score(y_test, y_test_pred)

        all_results.append({
            "genes": ' + '.join(features),
            "auc": auc,
            "recall": recall,
            "precision": precision,
            "f1": f1,
            "train_acc": train_acc,
            "cv_acc": cv_acc,
            "test_acc": test_acc
        })

# Create results DataFrame (one row per feature subset)
results_df = pd.DataFrame(all_results)

# Get best model for each metric.
# Maps the human-readable summary label to the results_df column it ranks by.
summary_rows = []
metrics = {
    "Highest AUC": "auc",
    "Highest Recall": "recall",
    "Highest Precision": "precision",
    "Highest F1-score": "f1",
    "Highest Train Accuracy": "train_acc",
    "Highest CV Accuracy": "cv_acc",
    "Highest Test Accuracy": "test_acc"
}

for label, metric in metrics.items():
    # idxmax returns the first row attaining the maximum, so ties resolve to
    # the earliest-evaluated (smallest) subset.
    best_row = results_df.loc[results_df[metric].idxmax()]
    summary_rows.append({
        "Metric": label,
        "Genes": best_row["genes"],
        "AUC": best_row["auc"],
        "Recall": best_row["recall"],
        "Precision": best_row["precision"],
        "F1-score": best_row["f1"],
        "Train Accuracy": best_row["train_acc"],
        "CV Accuracy": best_row["cv_acc"],
        "Test Accuracy": best_row["test_acc"]
    })

# Save summary to CSV (written to the current working directory)
summary_df = pd.DataFrame(summary_rows)
summary_df.to_csv("best_models_summary.csv", index=False)

print("✅ Summary saved to 'best_models_summary.csv'")


✅ Summary saved to 'best_models_summary.csv'
