In [2]:
from typing import *
import matplotlib
import matplotlib.pyplot as plt
from IPython.display import Image, display, clear_output
import numpy as np
import seaborn as sns
import pandas as pd
sns.set_style("whitegrid")
import requests

import math
import torch
from torch import nn, Tensor
from torch.nn.functional import softplus
from torch.distributions import Distribution


from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor
from functools import reduce
from torch.distributions.bernoulli import Bernoulli

import gzip
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Helper functions

In [3]:
def load_data_chunk(filename, chunk_size=1000):
    """
    Open a gzipped, tab-separated file for chunked reading.

    Because `chunksize` is handed to `pd.read_csv`, the value returned is an
    iterator that yields DataFrames of at most `chunk_size` rows each, rather
    than a single DataFrame.
    """
    reader_options = dict(sep='\t', compression='gzip', chunksize=chunk_size)
    return pd.read_csv(filename, **reader_options)

def separate_ids_and_data(data):
    """Split a table into its first column (the ids) and all remaining columns."""
    id_column, value_columns = data.iloc[:, 0], data.iloc[:, 1:]
    return id_column, value_columns

class ReparameterizedDiagonalGaussian(Distribution):
    """
    A distribution `N(y | mu, sigma I)` compatible with the reparameterization trick given `epsilon ~ N(0, 1)`.

    The distribution is parameterized by the mean `mu` and the logarithm of the
    standard deviation `log_sigma`; both must have the same shape. Every
    operation is element-wise, so `log_prob` returns one value per element and
    does not reduce over any dimension.

    Note: `Distribution.__init__` is not called by this constructor.
    """
    def __init__(self, mu: Tensor, log_sigma: Tensor):
        assert mu.shape == log_sigma.shape, f"Tensors `mu` : {mu.shape} and ` log_sigma` : {log_sigma.shape} must be of the same shape"
        self.mu = mu
        # Keep the log-scale around so log_prob does not have to recover it via log(exp(.)).
        self.log_sigma = log_sigma
        self.sigma = log_sigma.exp()

    def sample_epsilon(self) -> Tensor:
        """Draw `eps ~ N(0, I)` with the same shape, dtype and device as `mu`."""
        return torch.empty_like(self.mu).normal_()

    def sample(self) -> Tensor:
        """sample `z ~ N(z | mu, sigma)` (without gradients)"""
        with torch.no_grad():
            return self.rsample()

    def rsample(self) -> Tensor:
        """sample `z ~ N(z | mu, sigma)` (with the reparameterization trick) """
        # z = mu + sigma * epsilon
        return self.mu + self.sigma * self.sample_epsilon()

    def log_prob(self, z: Tensor) -> Tensor:
        """return the element-wise log probability: log `p(z)`"""
        # log p(z) = -1/2 * [log(2*pi) + 2*log(sigma) + ((z - mu)/sigma)^2]
        # The residual is divided by sigma *before* squaring: sigma^2 can
        # underflow to zero for small sigma even when sigma itself is still
        # representable, which would turn the result into NaN/inf.
        standardized = (z - self.mu) / self.sigma
        return -0.5 * (math.log(2 * math.pi) + 2 * self.log_sigma + standardized ** 2)


def count_csv_rows(filename):
    """
    Count the lines of a text file, transparently handling gzip compression.

    Files whose name ends in '.gz' are read through `gzip.open` in text mode;
    anything else is opened as UTF-8 text. Every line is counted, including a
    header line if the file has one.
    """
    if str(filename).endswith('.gz'):
        csvfile = gzip.open(filename, 'rt', newline='')
    else:
        # Change the encoding here if the file is not UTF-8.
        csvfile = open(filename, 'r', newline='', encoding='utf-8')
    with csvfile:
        return sum(1 for _ in csvfile)

## Load Data

In [4]:
# Define the file paths
# Every input is a gzip-compressed, tab-separated file (".tsv.gz") addressed by
# an absolute path, so cells that read them need that directory to exist locally.
archs4_path = "/dtu-compute/datasets/iso_02456/archs4_gene_expression_norm_transposed.tsv.gz"
gtex_gene_path = "/dtu-compute/datasets/iso_02456/gtex_gene_expression_norm_transposed.tsv.gz"
gtex_isoform_path = "/dtu-compute/datasets/iso_02456/gtex_isoform_expression_norm_transposed.tsv.gz"
# NOTE(review): "annoation" is kept exactly as written; presumably it matches the file name on disk — verify.
gtex_anno_path = "/dtu-compute/datasets/iso_02456/gtex_gene_isoform_annoation.tsv.gz"
gtex_tissue_path = "/dtu-compute/datasets/iso_02456/gtex_annot.tsv.gz"

# Hard-coded dimensions; presumably the number of gene / isoform columns in the
# expression files above — TODO confirm against the data.
num_genes = 18965
num_isoforms = 156958

num_genes, num_isoforms

(18965, 156958)

In [5]:
# The row count is hard-coded; the call that would compute it is left commented
# out (presumably because scanning the whole file is slow — TODO confirm).
# count_csv_rows(gtex_gene_path)
gtex_gene_num_rows = 17357
print(f"gtex_gene_path num rows: {gtex_gene_num_rows}")
# Expected number of rows when loading a fraction of 0.0005 of the file.
percentage_calc = gtex_gene_num_rows * 0.0005
percentage_calc

gtex_gene_path num rows: 17357


8.6785

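Using percentage 0.001 should load about 17 samples, and 0.01 about 173. A percentage of 0.005 gives about 86 samples; note that the cell above evaluates 0.0005, which corresponds to about 8 samples.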

### Data Loader

- Label Encoding: This converts each unique label into a unique integer. If you have a large number of classes, be aware of memory usage.
- Inverse Transformation: If you need to get back the original labels from the encoded ones, you can use self.label_encoder.inverse_transform().
- Data Types: Labels are converted to torch.long since they are now integers, which is a common practice for categorical labels in classification tasks.

In [6]:
class GeneExpressionDataset(Dataset):
    """
    Dataset pairing each row of an expression table with an integer-encoded label.

    `data` is indexed positionally with `.iloc` and `labels` is passed to a
    `LabelEncoder`, so `data` is expected to be a pandas DataFrame and `labels`
    a sequence with one entry per row of `data`.

    The encoder is fitted on the labels given to this instance only, so the
    integer codes of two separately constructed datasets (e.g. a train and a
    test split) are not comparable with each other.
    """

    def __init__(self, data, labels):
        self.labels = labels
        self.genes = data
        self.label_encoder = LabelEncoder()
        self.encoded_labels = self.label_encoder.fit_transform(self.labels)

    def __len__(self):
        """Number of rows in the expression table."""
        return len(self.genes)

    def __getitem__(self, idx):
        """Return `(features, label)` for position `idx` as float32 / long tensors."""
        # Tensor indices are converted to plain Python values before indexing.
        if torch.is_tensor(idx):
            idx = idx.tolist()

        label = self.encoded_labels[idx]
        genes = self.genes.iloc[idx]
        return torch.tensor(genes.values, dtype=torch.float32), torch.tensor(label, dtype=torch.long)

    def get_original_labels(self, encoded_labels):
        """Map integer codes produced by this dataset's encoder back to the original labels."""
        return self.label_encoder.inverse_transform(encoded_labels)

    # Next function is slow because it counts all the rows to calculate the percentage, that is why we added load rows
    @staticmethod
    def load_data_percentage(filepath, percentage=0.1, random_state=None):
        """
        Load a random subset of the data rows of a gzipped TSV file.

        Args:
            filepath: path to the gzip-compressed, tab-separated file; its
                first line is treated as the header.
            percentage: fraction in [0, 1] of the data rows to keep; the
                number of rows kept is `int(n_data_rows * percentage)`.
            random_state: optional seed for a reproducible selection. When
                None, NumPy's global random state is used.

        Returns:
            A DataFrame with the selected rows in their original file order.

        Raises:
            ValueError: if `percentage` is outside [0, 1].
        """
        if not 0 <= percentage <= 1:
            raise ValueError(f"percentage must be between 0 and 1, got {percentage}")

        # Count lines of the *decompressed* stream; counting newline bytes in
        # the raw compressed file does not give the number of rows.
        with gzip.open(filepath, 'rb') as handle:
            n_total_rows = sum(1 for _ in handle)

        # The header line is not a data row and must never be skipped.
        n_data_rows = max(n_total_rows - 1, 0)
        n_rows_to_load = int(n_data_rows * percentage)
        n_rows_to_skip = n_data_rows - n_rows_to_load

        if n_rows_to_skip > 0:
            rng = np.random if random_state is None else np.random.RandomState(random_state)
            # Data rows occupy file lines 1..n_data_rows (line 0 is the header).
            skip_rows = rng.choice(np.arange(1, n_data_rows + 1), size=n_rows_to_skip, replace=False).tolist()
        else:
            skip_rows = []
        return pd.read_csv(filepath, sep='\t', compression='gzip', skiprows=skip_rows)

    @staticmethod
    def load_rows(filepath, num_rows):
        """
        Load a specific number of rows from the file.

        Only the first `num_rows` data rows are read, which avoids scanning
        the whole file.
        """
        return pd.read_csv(filepath, sep='\t', compression='gzip', nrows=num_rows)

In [7]:
num_rows_to_load = 100  # how many rows to read from the top of the file

print("Loading data...")
# Only the first `num_rows_to_load` rows are read, not the whole file.
full_data = GeneExpressionDataset.load_rows(gtex_gene_path, num_rows=num_rows_to_load)
print("Data loaded.")

# 80/20 train/test split with a fixed seed so the split is repeatable.
print("Splitting data into training and testing sets...")
train_data, test_data = train_test_split(full_data, test_size=0.2, random_state=42)
print("Data split.")

# First column -> labels, remaining columns -> features, for each split.
print("Creating training and testing datasets...")
train_sample_ids, train_expression = separate_ids_and_data(train_data)
test_sample_ids, test_expression = separate_ids_and_data(test_data)
gene_train_dataset = GeneExpressionDataset(train_expression, train_sample_ids)
gene_test_dataset = GeneExpressionDataset(test_expression, test_sample_ids)
print("Datasets created.")

# Training batches are shuffled; test batches keep their order.
gene_train_loader = DataLoader(gene_train_dataset, batch_size=64, shuffle=True)
gene_test_loader = DataLoader(gene_test_dataset, batch_size=64, shuffle=False)
print("Data loaders created.")

Loading data...
Data loaded.
Splitting data into training and testing sets...
Data split.
Creating training and testing datasets...
Datasets created.
Data loaders created.


In [None]:
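# Alternative to the cell above: loads a random percentage of the rows (instead of the first N) and prints no progress messages.
# It is wrapped in a string literal so that it is not executed.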
"""
load_percentage = 0.0005

# Load the entire dataset
full_data = GeneExpressionDataset.load_data_percentage(gtex_gene_path, percentage=load_percentage)

# Split the data into training and testing sets
train_data, test_data = train_test_split(full_data, test_size=0.2, random_state=42)

# Create dataset instances for training and testing
gene_train_dataset = GeneExpressionDataset(train_data.iloc[:, 1:], train_data.iloc[:, 0])
gene_test_dataset = GeneExpressionDataset(test_data.iloc[:, 1:], test_data.iloc[:, 0])

gene_train_loader = DataLoader(gene_train_dataset, batch_size=64, shuffle=True)
gene_test_loader = DataLoader(gene_test_dataset, batch_size=64, shuffle=False)  # Usually, shuffling is not needed for testing
"""


In [8]:
full_data

Unnamed: 0,sample_id,TMEM38B,SLC24A3,AXDND1,DUXA,ZCCHC13,FGF18,INPP5D,MAP2K4,BCAR1,...,MARCH10,EVL,CYP1A2,ZNF782,LIMCH1,WDR24,ANGPTL4,UGT2B7,PIPOX,CD1B
0,GTEX-1117F-0226-SM-5GZZ7,3.327687,2.339137,0.084064,0.000000,0.0,0.765535,4.675816,3.768714,5.701826,...,0.545968,6.146289,0.084064,1.778209,4.233428,3.715893,7.896635,0.214125,2.130931,0.000000
1,GTEX-1117F-0426-SM-5EGHI,4.295723,4.206331,0.000000,0.070389,0.0,0.250962,2.765535,3.214125,5.223423,...,0.378512,4.870365,0.042644,0.669027,4.821200,3.181103,2.931683,0.137504,0.933573,0.000000
2,GTEX-1117F-0526-SM-5EGHJ,3.860963,2.350497,0.189034,0.000000,0.0,0.871844,4.097611,3.648465,5.529196,...,0.985500,5.103498,0.028569,1.516015,4.176323,3.478972,6.069745,0.056584,2.049631,0.000000
3,GTEX-1117F-0626-SM-5N9CS,2.957915,2.935460,0.000000,0.084064,0.0,2.049631,4.121015,3.705978,5.959538,...,0.526069,6.604071,0.056584,2.077243,4.639811,3.238787,7.274634,0.097611,1.035624,0.000000
4,GTEX-1117F-0726-SM-5GIEN,2.555816,0.505891,0.056584,0.000000,0.0,0.443607,3.841973,2.541019,4.856488,...,0.321928,6.026800,0.000000,0.739848,4.388878,2.111031,5.911931,0.000000,0.321928,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,GTEX-1122O-1026-SM-9WYSF,2.100978,2.817623,0.111031,0.000000,0.0,0.176323,3.089159,4.010780,2.347666,...,0.111031,5.928370,0.286881,1.580145,2.929791,3.447579,3.478972,0.000000,0.918386,0.176323
96,GTEX-1122O-1126-SM-5NQ8X,2.211012,4.875780,0.176323,0.903038,0.0,0.807355,1.981853,3.671293,6.242031,...,0.028569,5.590362,0.000000,1.250962,6.229972,2.807355,4.778734,0.000000,0.933573,0.000000
97,GTEX-1122O-1226-SM-5H113,2.313246,2.611172,0.411426,0.056584,0.0,0.970854,3.742006,3.605257,4.925050,...,0.000000,6.585413,0.056584,1.438293,4.397803,3.061776,7.072320,0.286881,0.555816,0.084064
98,GTEX-1122O-1326-SM-5H11F,2.229588,1.835924,0.790772,0.000000,0.0,0.495695,3.462052,3.010780,6.156235,...,0.014355,5.779785,0.000000,1.541019,2.121015,2.761285,3.572890,3.775051,0.903038,0.000000


In [9]:
# Pull one batch from the training loader to inspect its contents.
# The loader was created with shuffle=True, so the samples shown can differ between runs.
genes, labels = next(iter(gene_train_loader))
genes, labels

(tensor([[1.8156, 2.4673, 0.4222,  ..., 0.6508, 0.5261, 0.0000],
         [3.4581, 1.9449, 0.7824,  ..., 0.0426, 0.9411, 0.0000],
         [3.0126, 0.7991, 0.3334,  ..., 0.0000, 0.3561, 0.0000],
         ...,
         [2.9203, 5.9378, 0.0000,  ..., 0.1763, 1.0841, 0.0000],
         [4.7479, 1.3103, 5.1010,  ..., 0.1890, 1.6508, 0.0000],
         [2.1276, 0.5558, 0.0000,  ..., 0.0000, 0.1243, 0.0000]]),
 tensor([33, 52, 28, 26, 14, 11, 17, 72,  6, 48, 66, 19, 30, 51, 71,  0, 74, 59,
         57,  2, 69, 41, 31, 42, 64, 62, 12, 49, 55, 32,  3, 25, 10,  5, 60, 16,
         15, 45, 27, 40, 61, 67, 43, 63,  9, 56, 29, 77, 39,  8, 34, 78, 79, 23,
         53, 58, 37, 18, 54, 46, 75,  4, 21, 36]))

In [10]:
# Pull the first batch from the test loader (created with shuffle=False) for inspection.
test_genes, test_labels = next(iter(gene_test_loader))
test_genes, test_labels

(tensor([[3.3060, 0.2141, 0.0976,  ..., 0.3674, 0.9260, 0.0000],
         [2.4033, 3.5981, 0.1110,  ..., 0.1243, 1.8679, 0.0000],
         [1.0426, 0.2265, 0.2630,  ..., 1.3896, 0.3103, 0.0000],
         ...,
         [2.5945, 2.5534, 0.4436,  ..., 0.5059, 0.7312, 0.1890],
         [2.5509, 1.1890, 0.3785,  ..., 0.0286, 0.6508, 0.0000],
         [1.5656, 2.7887, 0.2750,  ..., 0.0286, 0.7908, 0.2869]]),
 tensor([18, 12, 13, 11, 10,  9,  5, 17,  2,  0,  4,  6, 14,  8, 19,  1, 15, 16,
          3,  7]))

Get original labels

In [11]:
# Convert encoded labels back to original string labels.
# `labels` is the batch drawn from the training loader, so the training
# dataset's encoder is the one that must be used to decode it.
original_labels = gene_train_dataset.get_original_labels(labels.numpy())
original_labels

array(['GTEX-111FC-2526-SM-5GZXU', 'GTEX-111YS-0726-SM-5GZY8',
       'GTEX-111FC-0826-SM-5GZWO', 'GTEX-111FC-0526-SM-5GZZ8',
       'GTEX-111CU-0826-SM-5EGIJ', 'GTEX-111CU-0426-SM-5GZY1',
       'GTEX-111CU-1226-SM-5EGIN', 'GTEX-1122O-0726-SM-5GIEV',
       'GTEX-1117F-2826-SM-5GZXL', 'GTEX-111YS-0326-SM-5GZZ3',
       'GTEX-1122O-0008-SM-5QGR2', 'GTEX-111CU-1426-SM-5GZYP',
       'GTEX-111FC-1326-SM-5N9D9', 'GTEX-111YS-0626-SM-5GZXV',
       'GTEX-1122O-0626-SM-5N9B9', 'GTEX-1117F-0426-SM-5EGHI',
       'GTEX-1122O-0926-SM-9YFLC', 'GTEX-111YS-1626-SM-5GZZ9',
       'GTEX-111YS-1326-SM-5EGGK', 'GTEX-1117F-0626-SM-5N9CS',
       'GTEX-1122O-0326-SM-5H124', 'GTEX-111VG-2326-SM-5N9BK',
       'GTEX-111FC-1926-SM-5GZYC', 'GTEX-111VG-2426-SM-5GZXD',
       'GTEX-111YS-2426-SM-5GZZQ', 'GTEX-111YS-2126-SM-5EGGM',
       'GTEX-111CU-0526-SM-5EGHK', 'GTEX-111YS-0426-SM-5987O',
       'GTEX-111YS-1026-SM-5GZYE', 'GTEX-111FC-2026-SM-5GZYO',
       'GTEX-1117F-1326-SM-5EGHH', 'GTEX-111FC-0426-SM-

# TODO: Add code to load the VAE, or the latent features output by the VAE, for gtex_gene_path

## Building the model
When defining the model, the latent layer must act as an information bottleneck, so that the model is forced to learn a strong internal representation. We initialize the VAE with one hidden layer in both the encoder and the decoder, using ReLU units as the non-linearity.

## Training and Evaluation

### Initialize the model, evaluator and optimizer

In [12]:
# Show the PyTorch version and the CUDA version reported by torch.
print(torch.__version__)
print(torch.version.cuda)

# Use the first GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f">> Using device: {device}")

1.10.2+cu102
10.2
>> Using device: cpu


In [13]:
# Expose the gene-expression loaders under generic names
# (presumably the names later training/evaluation cells expect — not visible here).
train_loader = gene_train_loader
test_loader = gene_test_loader

## Evaluate model