In [5]:
%pip install scanpy

Collecting scanpy
  Downloading scanpy-1.10.4-py3-none-any.whl.metadata (9.3 kB)
Collecting anndata>=0.8 (from scanpy)
  Downloading anndata-0.11.1-py3-none-any.whl.metadata (8.2 kB)
Collecting legacy-api-wrap>=1.4 (from scanpy)
  Downloading legacy_api_wrap-1.4.1-py3-none-any.whl.metadata (2.1 kB)
Collecting pynndescent>=0.5 (from scanpy)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting session-info (from scanpy)
  Downloading session_info-1.0.0.tar.gz (24 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting umap-learn!=0.5.0,>=0.5 (from scanpy)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting array-api-compat!=1.5,>1.4 (from anndata>=0.8->scanpy)
  Downloading array_api_compat-1.9.1-py3-none-any.whl.metadata (1.6 kB)
Collecting stdlib_list (from session-info->scanpy)
  Downloading stdlib_list-0.11.0-py3-none-any.whl.metadata (3.3 kB)
Downloading scanpy-1.10.4-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━

In [1]:
from google.colab import drive

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [62]:
root_dir = '/content/drive/MyDrive/CS7643/DL_project/'

In [63]:
!cp '/content/drive/MyDrive/CS7643/DL_project/mehak/DL_Project/model.py' .

cp: cannot stat 'model_path': No such file or directory


In [6]:
import numpy as np
import pandas as pd
import os
import torch
from sklearn.model_selection import train_test_split
import scanpy as sc
from model import *
import torch.optim as optim
from sklearn.metrics import mean_squared_error
from skimage.metrics import structural_similarity as ssim
import warnings
warnings.filterwarnings("ignore")

In [12]:
# data_dir = 'spatial_datasets/GSE213264_RAW/'
# results_dir = 'results_baseline_v2/'

# for colab
data_dir = root_dir + 'data/spatial_datasets/GSE213264_RAW/'
results_dir = root_dir + 'mehak/DL_Project/results_neigh_loss/hyperparam_tuning/'

In [56]:
# Hyperparameters to tune
n_neighbors = 20
lr = 1e-3
latent_dim = 32
n_epochs = 100
N = 2000

In [57]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

tissues= ['humanGBM']#, 'humanskin', 'humanthymus', 'humanspleen', 'humantonsil', 'mousekidney', 'mouseintestine', 'mousecolon', 'mousespleen']

for tissue in tissues:
    rna_data = None
    protein_data = None

    for filename in os.listdir(data_dir):
        file_path = os.path.join(data_dir, filename)
        if tissue in filename and filename.endswith("RNA.tsv.gz"):
            rna_data = pd.read_csv(file_path, sep="\t")
        elif tissue in filename and filename.endswith("protein.tsv.gz"):
            protein_data = pd.read_csv(file_path, sep="\t")

    rna_data.columns = rna_data.columns.astype(str)
    protein_data.columns = protein_data.columns.astype(str)

    rna_data = rna_data.sort_values(by='X')
    protein_data = protein_data.sort_values(by='X')

    rna_data = rna_data.reset_index(drop=True)
    protein_data = protein_data.reset_index(drop=True)
    rna_data.index = rna_data.index.astype(str)
    protein_data.index = protein_data.index.astype(str)

    rna_data[['X', 'Y']] = rna_data['X'].str.split('x', expand=True)
    rna_data['X'] = pd.to_numeric(rna_data['X'], errors='coerce')
    rna_data['Y'] = pd.to_numeric(rna_data['Y'], errors='coerce')
    spatial = rna_data[['X', 'Y']].copy()
    rna_data.drop(['X', 'Y'], axis=1, inplace=True)
    protein_data.drop(['X'], axis=1, inplace=True)

    rna_train, rna_test = train_test_split(rna_data, test_size=0.2, random_state=42)
    protein_train = protein_data.loc[rna_train.index]
    protein_test = protein_data.loc[rna_test.index]

    adata_rna_train = sc.AnnData(rna_train)
    sc.pp.normalize_total(adata_rna_train, target_sum=1e4)
    sc.pp.log1p(adata_rna_train)
    sc.pp.highly_variable_genes(adata_rna_train, n_top_genes=N, flavor='seurat', subset=True)
    counts_norm = adata_rna_train.X
    rna_counts_norm = torch.FloatTensor(counts_norm).to(device)

    adata_protein_train = sc.AnnData(protein_train)
    sc.pp.normalize_total(adata_protein_train, target_sum=1e4)
    sc.pp.log1p(adata_protein_train)
    counts_norm = adata_protein_train.X
    protein_counts_norm = torch.FloatTensor(counts_norm).to(device)

    spatial_train = torch.tensor(spatial.loc[rna_train.index].values, dtype=torch.float32).to(device)
    combined_data = torch.cat([rna_counts_norm, protein_counts_norm], dim=1).to(device)

    distances = torch.cdist(spatial_train, spatial_train, p=2)
    number_neighbors = n_neighbors
    closest_neighbors = {}
    furthest_neighbors = {}

    for i in range(distances.shape[0]):
        sorted_indices = torch.argsort(distances[i])
        closest_neighbors[i] = sorted_indices[1:number_neighbors+1].cpu().numpy()
        furthest_neighbors[i] = sorted_indices[-number_neighbors:].cpu().numpy()

    input_dim = combined_data.shape[1]
    latent_dim = latent_dim
    model = VAE(input_dim, latent_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    num_epochs = n_epochs

    latent_means_all = [None] * combined_data.size(0)

    for epoch in range(num_epochs):
        model.train()
        reconstructed_data, mean, logvar = model(combined_data)

        loss = vae_loss2(reconstructed_data, combined_data, mean, logvar, closest_neighbors, furthest_neighbors,lambda_kl=0.0001, lambda_nl=1)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss / len(combined_data):.4f}')


    model.eval()
    with torch.no_grad():
        combined_data = torch.cat([rna_counts_norm, torch.zeros(rna_counts_norm.shape[0], protein_counts_norm.shape[1]).to(device)], dim=1)
        reconstructed_data, mean, logvar = model(combined_data)
        reconstructed_protein_counts = reconstructed_data[:, rna_counts_norm.shape[1]:]

        rmse = np.sqrt(mean_squared_error(protein_counts_norm.cpu().numpy(), reconstructed_protein_counts.cpu().numpy()))
        pcc = pd.DataFrame(protein_counts_norm.cpu().numpy()).corrwith(pd.DataFrame(reconstructed_protein_counts.cpu().numpy()), axis=1, method='pearson')
        avg_corr_pearson = pcc.mean()
        ssim_val = ssim(protein_counts_norm.cpu().numpy(), reconstructed_protein_counts.cpu().numpy(), data_range=reconstructed_protein_counts.cpu().numpy().max() - reconstructed_protein_counts.cpu().numpy().min())

        results_df = pd.DataFrame({
            'RMSE': [rmse],
            'Pearson Correlation': [avg_corr_pearson],
            'SSIM':ssim_val
        })

        results_file_path = os.path.join(results_dir, f"{tissue}_training_results.csv")
        results_df.to_csv(results_file_path, index=False)

        adata_rna_test = sc.AnnData(rna_test)
        sc.pp.normalize_total(adata_rna_test, target_sum=1e4)
        sc.pp.log1p(adata_rna_test)
        counts_norm = adata_rna_test[:,  adata_rna_train.var_names].X
        rna_counts_norm = torch.FloatTensor(counts_norm).to(device)

        adata_protein_test = sc.AnnData(protein_test)
        sc.pp.normalize_total(adata_protein_test, target_sum=1e4)
        sc.pp.log1p(adata_protein_test)
        counts_norm = adata_protein_test.X
        protein_counts_norm = torch.FloatTensor(counts_norm).to(device)

        combined_data = torch.cat([rna_counts_norm, torch.zeros(rna_counts_norm.shape[0], protein_counts_norm.shape[1]).to(device)], dim=1)
        reconstructed_data, mean, logvar = model(combined_data)
        reconstructed_protein_counts = reconstructed_data[:, rna_counts_norm.shape[1]:]

        rmse = np.sqrt(mean_squared_error(protein_counts_norm.cpu().numpy(), reconstructed_protein_counts.cpu().numpy()))
        pcc = pd.DataFrame(protein_counts_norm.cpu().numpy()).corrwith(pd.DataFrame(reconstructed_protein_counts.cpu().numpy()), axis=1, method='pearson')
        avg_corr_pearson = pcc.mean()
        ssim_val = ssim(protein_counts_norm.cpu().numpy(), reconstructed_protein_counts.cpu().numpy(), data_range=reconstructed_protein_counts.cpu().numpy().max() - reconstructed_protein_counts.cpu().numpy().min())

        results_df = pd.DataFrame({
            'RMSE': [rmse],
            'Pearson Correlation': [avg_corr_pearson],
            'SSIM':ssim_val
        })

    # results_file_path = os.path.join(results_dir, f"{tissue}_results.csv")
    # results_df.to_csv(results_file_path, index=False)

    print('\n', results_df)

    print(f"Processed {tissue} successfully.")

Using device: cuda
Epoch [1/100], Loss: 4139.8960
Epoch [2/100], Loss: 3562.3538
Epoch [3/100], Loss: 3160.1560
Epoch [4/100], Loss: 2802.2759
Epoch [5/100], Loss: 2481.0256
Epoch [6/100], Loss: 2223.3176
Epoch [7/100], Loss: 2021.9985
Epoch [8/100], Loss: 1848.4301
Epoch [9/100], Loss: 1713.8485
Epoch [10/100], Loss: 1598.6970
Epoch [11/100], Loss: 1503.3260
Epoch [12/100], Loss: 1420.1234
Epoch [13/100], Loss: 1349.6205
Epoch [14/100], Loss: 1286.7097
Epoch [15/100], Loss: 1227.3407
Epoch [16/100], Loss: 1168.7416
Epoch [17/100], Loss: 1122.8019
Epoch [18/100], Loss: 1070.4974
Epoch [19/100], Loss: 1027.8147
Epoch [20/100], Loss: 979.5984
Epoch [21/100], Loss: 944.5267
Epoch [22/100], Loss: 901.7663
Epoch [23/100], Loss: 872.5259
Epoch [24/100], Loss: 842.3904
Epoch [25/100], Loss: 815.0005
Epoch [26/100], Loss: 789.3470
Epoch [27/100], Loss: 768.2477
Epoch [28/100], Loss: 751.8855
Epoch [29/100], Loss: 733.0735
Epoch [30/100], Loss: 721.8217
Epoch [31/100], Loss: 708.4349
Epoch [32/

In [55]:
print(results_df)

       RMSE  Pearson Correlation      SSIM
0  1.068244             0.868592  0.648809


In [64]:
results_file_path = os.path.join(results_dir, f"{tissue}_results.csv")
results_df.to_csv(results_file_path, index=False)

In [None]:
RMSE  Pearson Correlation      SSIM
0  1.068244             0.868592  0.648809