In [None]:
# PrimaryDiseaseDetector.ipynb

# Importing necessary libraries

# Standard utilities
import os
import gdown
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Conv2D, Flatten, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Configuration
# RETRAIN_MODEL selects the notebook mode: when True the datasets are downloaded
# and a new model is trained and saved; when False the saved model is loaded.
RETRAIN_MODEL = False  # Set to True to train a new model, False to load an existing one
# Path the trained model is written to after training and read from otherwise.
model_file = "model/PrimaryDiseaseDetectorModel.keras"

# Download a file that is shared through a Google Drive link
def download_from_google_drive(url, output_path):
    """Fetch a Google Drive file to ``output_path`` using gdown.

    The file id is taken to be the path segment that directly follows
    ``/d/`` in ``url``. A URL without that marker raises ``IndexError``
    before any download is attempted.
    """
    after_marker = url.split('/d/')[1]
    file_id = after_marker.split('/')[0]
    direct_url = f"https://drive.google.com/uc?id={file_id}"
    gdown.download(direct_url, output_path, quiet=False)

# Google Drive share links: expression matrices first, phenotype tables second
tcga_dataset_log2_url = (
    "https://drive.google.com/file/d/1-6OA1Q0TqFeooVHmURcZ_F9YjRh9D2cK/view?usp=drive_link"
)
met500_dataset_log2_url = (
    "https://drive.google.com/file/d/1nBzGFuq-ExWw0KC0dtagJqAOFjji8bQc/view?usp=drive_link"
)
phenotype_tcga_url = (
    "https://drive.google.com/file/d/1wNXgjZMQUDqNosG_q8qZNIIq0za-ghF0/view?usp=drive_link"
)
phenotype_met500_url = (
    "https://drive.google.com/file/d/1-7yVlLwIo2aD_eojIysUllnRXb3j-b7e/view?usp=drive_link"
)

# Make sure the output folders exist before anything is written into them
for _folder in ("data", "model"):
    os.makedirs(_folder, exist_ok=True)

# Prepare the inputs: download and preprocess the real datasets when retraining,
# otherwise fall back to random placeholder arrays.
if RETRAIN_MODEL:
    # Local destinations of the four downloaded files
    tcga_csv_path = "data/tcga_gene_expression_log2_common_genes.csv"
    met500_csv_path = "data/met500_gene_expression_common_genes.csv"
    tcga_phenotype_path = "data/TCGA_phenotype_denseDataOnlyDownload.tsv.gz"
    met500_phenotype_path = "data/MET500_metadata.txt"

    # (progress message, Google Drive link, destination), fetched in this order
    download_plan = [
        ("Downloading TCGA data...", tcga_dataset_log2_url, tcga_csv_path),
        ("Downloading MET500 data...", met500_dataset_log2_url, met500_csv_path),
        ("Downloading TCGA phenotypes...", phenotype_tcga_url, tcga_phenotype_path),
        ("Downloading MET500 phenotypes...", phenotype_met500_url, met500_phenotype_path),
    ]
    for notice, drive_link, destination in download_plan:
        print(notice)
        download_from_google_drive(drive_link, destination)

    # Read the expression matrices (first CSV column becomes the index) and
    # the tab-separated phenotype tables (indexed by their sample-id column)
    tcga_df_log2 = pd.read_csv(tcga_csv_path, index_col=0)
    met500_df = pd.read_csv(met500_csv_path, index_col=0)
    phenotype_tcga = pd.read_csv(tcga_phenotype_path, sep="\t")
    phenotype_tcga = phenotype_tcga.set_index("sample")
    phenotype_met500 = pd.read_csv(met500_phenotype_path, sep="\t")
    phenotype_met500 = phenotype_met500.set_index("Sample_id")

    # Report the shape of everything that was loaded
    print(f"TCGA dimensions: {tcga_df_log2.shape}")
    print(f"MET500 dimensions: {met500_df.shape}")
    print(f"TCGA phenotypes dimensions: {phenotype_tcga.shape}")
    print(f"MET500 phenotypes dimensions: {phenotype_met500.shape}")

    # Min-max scaling is fitted on TCGA only and then reused for MET500.
    # Both frames are transposed first, so each scaled row holds one column of
    # the CSV and the scaled columns are the genes counted by num_genes below.
    scaler = MinMaxScaler()
    tcga_scaled = scaler.fit_transform(tcga_df_log2.T)
    met500_scaled = scaler.transform(met500_df.T)

    # Smallest square that can hold every gene; the remainder is zero padding
    num_genes = tcga_scaled.shape[1]
    image_size = int(np.ceil(np.sqrt(num_genes)))
    padding = image_size**2 - num_genes

    def _as_square_images(rows):
        """Zero-pad every row to image_size**2 values, reshape it to a square
        and append a trailing channel axis."""
        squares = [
            np.pad(row, (0, padding), mode='constant').reshape(image_size, image_size)
            for row in rows
        ]
        return np.array(squares)[..., np.newaxis]

    tcga_images = _as_square_images(tcga_scaled)
    met500_images = _as_square_images(met500_scaled)

    # NOTE: these labels are random 0/1 placeholders; the phenotype tables
    # loaded above are not used yet. Replace with actual labels.
    labels_tcga = np.random.randint(0, 2, tcga_images.shape[0])
    labels_met500 = np.random.randint(0, 2, met500_images.shape[0])

    # Hold out 20% of the TCGA images for validation (fixed seed)
    X_train, X_val, y_train, y_val = train_test_split(
        tcga_images, labels_tcga, test_size=0.2, random_state=42
    )
else:
    print("Loading preprocessed data for evaluation...")
    # Placeholder test set: random images and random 0/1 labels stand in for
    # the arrays that preprocessing with RETRAIN_MODEL=True would produce.
    image_size = 224  # Adjust this value based on your image size
    placeholder_count = 100
    met500_images = np.random.rand(placeholder_count, image_size, image_size, 1)
    labels_met500 = np.random.randint(0, 2, placeholder_count)

# Obtain the model: build and train a new one, or load the saved one
if RETRAIN_MODEL:
    # Architecture: one strided convolution, flattened, dropout, then a single
    # sigmoid unit for the binary output
    input_layer = Input(shape=(image_size, image_size, 1))
    hidden = Conv2D(32, (3, 3), activation='relu', strides=(5, 5))(input_layer)
    hidden = Flatten()(hidden)
    hidden = Dropout(0.5)(hidden)
    output_layer = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(
        optimizer=Adam(learning_rate=1e-3),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    # Both callbacks watch the validation loss: one stops training and restores
    # the best weights, the other halves the learning rate on a plateau
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=10,
        restore_best_weights=True,
    )
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=5,
        min_lr=1e-8,
    )
    training_callbacks = [early_stopping, reduce_lr]

    # Fit on the training split and validate on the held-out split
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=20,
        batch_size=32,
        callbacks=training_callbacks,
        verbose=1,
    )

    # Persist the trained model so later runs can load it instead of retraining
    model.save(model_file)
    print(f"Model saved to: {model_file}")
else:
    # Reuse the model stored by a previous training run
    model = load_model(model_file)
    print(f"Model loaded from: {model_file}")

# Evaluate on MET500
# Threshold the model outputs at 0.5 and flatten the predictions to 1-D so
# they have the same shape as the label vector.
y_pred = (model.predict(met500_images) > 0.5).astype(int).ravel()

# Results report
accuracy = accuracy_score(labels_met500, y_pred)
print(f"\nAccuracy on MET500: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(labels_met500, y_pred))

# Confusion matrix
# Pin the class order explicitly: without `labels`, scikit-learn only includes
# classes that occur in the data, so a run where one class never appears would
# yield a smaller matrix than the two tick labels below describe.
class_ids = [0, 1]
class_names = ['Class 0', 'Class 1']
conf_matrix = confusion_matrix(labels_met500, y_pred, labels=class_ids)

plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()

## Preprocessing data
Run this cell only if you want to preprocess the data from the raw datasets.

In [None]:
# Preprocessing Data

# Importing necessary libraries
import os
import requests
import gzip
import pandas as pd
from io import BytesIO

# Where the raw, gzip-compressed datasets are downloaded from
tcga_url = (
    "https://toil-xena-hub.s3.us-east-1.amazonaws.com/download/tcga_RSEM_gene_fpkm.gz"
)
met500_url = (
    "https://ucsc-public-main-xena-hub.s3.us-east-1.amazonaws.com/download/MET500%2FgeneExpression%2FM.mx.log2.txt.gz"
)

# Where the processed (common-gene) datasets are written
tcga_file_path = "data/tcga_gene_expression_log2_common_genes.csv"
met500_file_path = "data/met500_gene_expression_common_genes.csv"

# Create the output folder up front so the CSV writes cannot fail on it
data_dir = "data"
os.makedirs(data_dir, exist_ok=True)

# Function to download and load the compressed file
def download_and_load_gzip(url, timeout=(10, 300)):
    """Download a gzip-compressed, tab-separated table and load it as a DataFrame.

    Args:
        url: HTTP(S) location of the gzip file.
        timeout: ``(connect, read)`` timeout in seconds handed to
            ``requests.get`` so that a stalled server cannot hang the
            notebook indefinitely.

    Returns:
        A pandas DataFrame indexed by the first column of the file.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
    """
    # The context manager releases the connection even when an error occurs.
    with requests.get(url, timeout=timeout) as response:
        # Fail here on HTTP errors rather than handing an error page to gzip.
        response.raise_for_status()
        payload = BytesIO(response.content)
    with gzip.open(payload, 'rt') as f:
        df = pd.read_csv(f, sep='\t', index_col=0)
    return df

# Check if files already exist; the download and filtering only run when at
# least one of the two processed files is missing.
if os.path.exists(tcga_file_path) and os.path.exists(met500_file_path):
    print("Processed files already exist:")
    print(f"- TCGA: {tcga_file_path}")
    print(f"- MET500: {met500_file_path}")
    print("\nIf you want to preprocess the data again, delete the existing files and re-run this cell.")
else:
    # Process the TCGA dataset
    print("Downloading and processing TCGA data...")
    tcga_df = download_and_load_gzip(tcga_url)

    # Process the MET500 dataset
    print("Downloading and processing MET500 data...")
    met500_df = download_and_load_gzip(met500_url)

    # Intersect common genes between TCGA and MET500 (matched on the row index)
    common_genes = tcga_df.index.intersection(met500_df.index)

    # Stop before writing anything if the datasets share no row identifiers:
    # empty CSVs would otherwise be saved, and the existence check above would
    # then skip preprocessing on every later run.
    if len(common_genes) == 0:
        raise ValueError(
            "No common genes found between the TCGA and MET500 datasets; "
            "no files were written."
        )

    # Filter both datasets for common genes (same rows, same order in both)
    tcga_df_log2 = tcga_df.loc[common_genes]
    met500_df_log2 = met500_df.loc[common_genes]

    # Check dimensions after filtering
    print(f"Number of common genes: {len(common_genes)}")
    print(f"Dimensions of TCGA dataset after filtering: {tcga_df_log2.shape}")
    print(f"Dimensions of MET500 dataset after filtering: {met500_df_log2.shape}")

    # Save the processed datasets to local files
    tcga_df_log2.to_csv(tcga_file_path)
    met500_df_log2.to_csv(met500_file_path)

    print(f"Processed TCGA dataset saved to: {tcga_file_path}")
    print(f"Processed MET500 dataset saved to: {met500_file_path}")