In [None]:
# PrimaryDiseaseDetector.ipynb

# Importación de librerías necesarias

# Utilidades estándar
import os
import gdown
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Conv2D, Flatten, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Configuration
# When True, the datasets are downloaded and preprocessed, and the model is
# trained and saved to `model_file`. When False, the model is loaded from
# `model_file` and only evaluated.
RETRAIN_MODEL = False
# Path the Keras model is saved to after training and loaded from otherwise.
model_file = "model/PrimaryDiseaseDetectorModel.keras"

# Helper to download files from Google Drive
def download_from_google_drive(url, output_path):
    """Download a Google Drive file to ``output_path`` given its share link.

    Both ``.../file/d/<FILE_ID>/...`` links and links that carry the id in
    the query string (``...?id=<FILE_ID>``) are accepted.

    Args:
        url: Google Drive share URL of the file.
        output_path: Local path the downloaded file is written to.

    Raises:
        ValueError: If no file id can be extracted from ``url``.
        RuntimeError: If ``gdown.download`` returns ``None``.
    """
    # Function-scope import keeps this helper self-contained.
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(url)
    # Look only at the path component so that a link without a trailing
    # segment (".../d/<FILE_ID>?usp=sharing") does not leak the query string
    # into the extracted id.
    _, marker, tail = parsed.path.partition('/d/')
    file_id = tail.split('/')[0] if marker else ''
    if not file_id:
        # Fall back to the "?id=<FILE_ID>" link style.
        file_id = parse_qs(parsed.query).get('id', [''])[0]
    if not file_id:
        raise ValueError(
            f"No se pudo extraer el ID de archivo de Google Drive de la URL: {url!r}"
        )

    downloaded = gdown.download(f"https://drive.google.com/uc?id={file_id}", output_path, quiet=False)
    # NOTE(review): some gdown releases report a failed download by returning
    # None instead of raising; treat that as an error so the later
    # pd.read_csv does not fail on a missing file. Confirm against the
    # installed gdown version.
    if downloaded is None:
        raise RuntimeError(
            f"La descarga desde Google Drive falló para el archivo {file_id!r} "
            f"(destino: {output_path!r})"
        )

# Google Drive share links for the input files
# TCGA expression matrix, saved locally as data/tcga_gene_expression_log2_common_genes.csv
tcga_dataset_log2_url = "https://drive.google.com/file/d/1-6OA1Q0TqFeooVHmURcZ_F9YjRh9D2cK/view?usp=drive_link"
# MET500 expression matrix, saved locally as data/met500_gene_expression_common_genes.csv
met500_dataset_log2_url = "https://drive.google.com/file/d/1nBzGFuq-ExWw0KC0dtagJqAOFjji8bQc/view?usp=drive_link"
# TCGA phenotype table, saved locally as data/TCGA_phenotype_denseDataOnlyDownload.tsv.gz
phenotype_tcga_url = "https://drive.google.com/file/d/1wNXgjZMQUDqNosG_q8qZNIIq0za-ghF0/view?usp=drive_link"
# MET500 metadata table, saved locally as data/MET500_metadata.txt
phenotype_met500_url = "https://drive.google.com/file/d/1-7yVlLwIo2aD_eojIysUllnRXb3j-b7e/view?usp=drive_link"

# Make sure the working directories for downloaded data and the saved model exist
for _directory in ("data", "model"):
    os.makedirs(_directory, exist_ok=True)

# Download and preprocess the data, or load the preprocessed evaluation data,
# depending on RETRAIN_MODEL.

# Cache written by the retraining branch and read back by the evaluation
# branch, so evaluation can run on the preprocessed MET500 samples instead of
# fabricated data.
preprocessed_met500_file = "data/met500_preprocessed.npz"


def _expression_to_images(scaled, side):
    """Turn a 2-D (samples x genes) array into single-channel square images.

    Each row is zero-padded at the end up to ``side**2`` values and reshaped
    to ``(side, side, 1)``.

    Args:
        scaled: 2-D NumPy array with one sample per row.
        side: Edge length of the square image; ``side**2`` must be at least
            the number of columns of ``scaled``.

    Returns:
        Array of shape ``(n_samples, side, side, 1)``.
    """
    pad_width = side ** 2 - scaled.shape[1]
    padded = np.pad(scaled, ((0, 0), (0, pad_width)), mode='constant')
    return padded.reshape(scaled.shape[0], side, side, 1)


if RETRAIN_MODEL:
    print("Descargando datos de TCGA...")
    download_from_google_drive(tcga_dataset_log2_url, "data/tcga_gene_expression_log2_common_genes.csv")

    print("Descargando datos de MET500...")
    download_from_google_drive(met500_dataset_log2_url, "data/met500_gene_expression_common_genes.csv")

    print("Descargando fenotipos de TCGA...")
    download_from_google_drive(phenotype_tcga_url, "data/TCGA_phenotype_denseDataOnlyDownload.tsv.gz")

    print("Descargando fenotipos de MET500...")
    download_from_google_drive(phenotype_met500_url, "data/MET500_metadata.txt")

    # Load the datasets (expression matrices are transposed below, so their
    # rows are genes and their columns are samples).
    tcga_df_log2 = pd.read_csv("data/tcga_gene_expression_log2_common_genes.csv", index_col=0)
    met500_df = pd.read_csv("data/met500_gene_expression_common_genes.csv", index_col=0)
    phenotype_tcga = pd.read_csv("data/TCGA_phenotype_denseDataOnlyDownload.tsv.gz", sep="\t").set_index("sample")
    phenotype_met500 = pd.read_csv("data/MET500_metadata.txt", sep="\t").set_index("Sample_id")

    # Report the dimensions of the loaded data
    print(f"Dimensiones de TCGA: {tcga_df_log2.shape}")
    print(f"Dimensiones de MET500: {met500_df.shape}")
    print(f"Dimensiones del fenotipo TCGA: {phenotype_tcga.shape}")
    print(f"Dimensiones del fenotipo MET500: {phenotype_met500.shape}")

    # The scaler is fitted per gene on TCGA, so MET500 must list exactly the
    # same genes in the same order before it is transformed. Genes that only
    # exist in MET500 are dropped by the reindex.
    if not met500_df.index.equals(tcga_df_log2.index):
        missing_genes = tcga_df_log2.index.difference(met500_df.index)
        if len(missing_genes) > 0:
            raise ValueError(
                f"MET500 no contiene {len(missing_genes)} genes presentes en TCGA "
                f"(p. ej. {list(missing_genes[:5])})"
            )
        met500_df = met500_df.reindex(tcga_df_log2.index)

    # Scale every gene to the [0, 1] range observed in TCGA and apply the
    # same transformation to MET500.
    scaler = MinMaxScaler()
    tcga_scaled = scaler.fit_transform(tcga_df_log2.T)
    met500_scaled = scaler.transform(met500_df.T)

    # Convert each sample into the smallest square image that holds all genes
    num_genes = tcga_scaled.shape[1]
    image_size = int(np.ceil(np.sqrt(num_genes)))
    padding = image_size**2 - num_genes

    tcga_images = _expression_to_images(tcga_scaled, image_size)
    met500_images = _expression_to_images(met500_scaled, image_size)

    # Placeholder labels: random 0/1 values.
    # TODO: replace with the real labels from the phenotype tables.
    labels_tcga = np.random.randint(0, 2, tcga_images.shape[0])
    labels_met500 = np.random.randint(0, 2, met500_images.shape[0])

    # Persist the evaluation set so a later run with RETRAIN_MODEL=False can
    # evaluate on it.
    np.savez_compressed(preprocessed_met500_file, images=met500_images, labels=labels_met500)
    print(f"Datos preprocesados de MET500 guardados en: {preprocessed_met500_file}")

    # Split TCGA into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(tcga_images, labels_tcga, test_size=0.2, random_state=42)
else:
    print("Cargando datos preprocesados para evaluación...")
    if os.path.exists(preprocessed_met500_file):
        # Reuse the evaluation set produced by the last retraining run.
        with np.load(preprocessed_met500_file) as cached:
            met500_images = cached["images"]
            labels_met500 = cached["labels"]
        image_size = met500_images.shape[1]
    else:
        # No cache available: fall back to random placeholder data.
        # NOTE(review): 224 is a placeholder; predictions will only work if
        # it matches the input size of the saved model.
        print(f"Aviso: no se encontró {preprocessed_met500_file}; se usan datos aleatorios de marcador de posición.")
        image_size = 224
        met500_images = np.random.rand(100, image_size, image_size, 1)
        labels_met500 = np.random.randint(0, 2, 100)

# Obtain the model: load the saved one, or build, train and save a new one.
def _build_detector(side):
    """Build and compile the binary classifier for ``(side, side, 1)`` inputs.

    The network is one strided 3x3 convolution with 32 filters, followed by
    flatten, 50% dropout and a single sigmoid output unit. It is compiled
    with Adam (learning rate 1e-3), binary cross-entropy and accuracy.
    """
    inputs = Input(shape=(side, side, 1))
    features = Conv2D(32, (3, 3), activation='relu', strides=(5, 5))(inputs)
    features = Flatten()(features)
    features = Dropout(0.5)(features)
    prediction = Dense(1, activation='sigmoid')(features)

    network = Model(inputs=inputs, outputs=prediction)
    network.compile(optimizer=Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])
    return network


if not RETRAIN_MODEL:
    # Reuse the previously saved model
    model = load_model(model_file)
    print(f"Modelo cargado desde: {model_file}")
else:
    model = _build_detector(image_size)

    # Stop when the validation loss stalls for 10 epochs (restoring the best
    # weights) and halve the learning rate after 5 stalled epochs.
    training_callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-8),
    ]

    # Training
    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=20,
        batch_size=32,
        callbacks=training_callbacks,
        verbose=1,
    )

    # Save the trained model
    model.save(model_file)
    print(f"Modelo guardado en: {model_file}")

# Evaluation on MET500
# Threshold the predicted probabilities at 0.5 and flatten the result so the
# predictions are 1-D like the labels (the network built in this file ends in
# a single output unit, so predict() yields one column per sample).
y_pred = (model.predict(met500_images) > 0.5).astype(int).ravel()

# Pin both classes explicitly so the report and the confusion matrix always
# cover class 0 and class 1, even when one of them never occurs.
class_ids = [0, 1]
class_names = ['Class 0', 'Class 1']

# Results report
accuracy = accuracy_score(labels_met500, y_pred)
print(f"\nAccuracy on MET500: {accuracy:.4f}")

print("\nClassification Report:")
# zero_division=0 reports 0 (without a warning) for a class that is never
# predicted.
print(classification_report(labels_met500, y_pred, labels=class_ids, zero_division=0))

# Confusion matrix: always 2x2, so it matches the two tick labels below
conf_matrix = confusion_matrix(labels_met500, y_pred, labels=class_ids)

plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()