In [1]:
import os
import gc
import torch
import numpy as np
import pandas as pd
import random 

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from timm.models.vision_transformer import vit_base_patch16_224

from scipy.stats import uniform, randint

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score

from models.simple_cnn import SimpleCNN
from models.cnn_v2 import CNNV2
from helpers.helpers import SequenceDataset
from helpers.plots import plot_losses, plot_predictions_vs_labels, plot_predictions_vs_labels_by_species, plot_boxplot_predictions_vs_labels
from helpers.early_stopping import EarlyStopping

from torch.utils.data import DataLoader
from ray import tune
from ray.tune.schedulers import ASHAScheduler

%load_ext autoreload
%autoreload 2


  from .autonotebook import tqdm as notebook_tqdm
2024-06-25 14:27:03,069	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-06-25 14:27:03,129	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
# import pandas as pd
# import numpy as np
# import os
# import gc
# from sklearn.preprocessing import MultiLabelBinarizer

# def load_dataframe():
#     df = pd.read_csv(f'{os.getcwd()}/data/combined_data.csv')

#     # Extract columns containing "tpm"
#     tpm_columns = [col for col in df.columns if 'tpm' in col]

#     # Create a new DataFrame with only tpm columns
#     df_tpm = df[tpm_columns]

#     # Extract prefixes
#     prefixes = set(col.rsplit('_', 2)[0] for col in tpm_columns)

#     # Calculate the mean of the columns with the same prefix
#     mean_columns = {}
#     for prefix in prefixes:
#         mean_columns[prefix + '_mean'] = df_tpm.filter(like=prefix).mean(axis=1)

#     # Create a DataFrame for the mean values
#     df_means = pd.DataFrame(mean_columns)
#     df.drop(columns=tpm_columns, inplace=True)
#     df = pd.concat([df, df_means], axis=1)

#     del df_means
#     gc.collect()

#     # Identify _mean columns
#     mean_columns = [col for col in df.columns if '_mean' in col]

#     # Drop rows where any of the _mean columns are equal to 0
#     df = df[~(df[mean_columns] == 0).any(axis=1)]

#     # Drop rows with missing upstream200 sequences
#     df = df.dropna(subset=['upstream200'])

#     # Drop rows with upstream200 sequences that contain anything but A, T, C, G
#     df = df[df['upstream200'].apply(lambda x: set(x).issubset({'A', 'T', 'C', 'G'}))]

#     mlb = MultiLabelBinarizer()
#     # Map each species_id to a one-hot encoding
#     df['species_id'] = df['species_id'].apply(lambda x: [x])
#     df['species_id'] = mlb.fit_transform(df['species_id']).tolist()

#     # Map each base to one-hot encoding
#     base_encodings = {'A': [1, 0, 0, 0], 'T': [0, 1, 0, 0], 'C': [0, 0, 1, 0], 'G': [0, 0, 0, 1]}
#     longest_sequence = max(df['upstream200'].apply(lambda x: len(x)))
#     df['upstream200'] = df['upstream200'].apply(lambda x: [base_encodings[base] for base in x] + [[0, 0, 0, 0]] * (longest_sequence - len(x)))

#     df[mean_columns] = df[mean_columns].apply(np.log1p)
#     print("The number of stress conditions is: ", len(mean_columns))
#     # Create a new column 'stress' with a list of dictionaries for each stress condition
#     df['stress'] = df.apply(lambda row: [{prefix: row[f"{prefix}_mean"]} for prefix in prefixes], axis=1)

#     # Drop original mean columns
#     df = df.drop(columns=mean_columns)

#     # Explode the 'stress' column
#     df = df.explode('stress').reset_index(drop=True)

#     # Extract stress names and values
#     df['stress_name'] = df['stress'].apply(lambda x: list(x.keys())[0])
#     df['stress'] = df['stress'].apply(lambda x: list(x.values())[0])

#     # One-hot encode stress names
#     df['stress_name'] = df['stress_name'].apply(lambda x: [x])
#     stress_one_hot_encoded = mlb.fit_transform(df['stress_name'])
#     stress_one_hot_encoded_df = pd.DataFrame(stress_one_hot_encoded, columns=mlb.classes_)

#     # Combine the one-hot encoded columns with the original DataFrame
#     df = pd.concat([df, stress_one_hot_encoded_df], axis=1)

#     return df

In [3]:
# df = load_dataframe()

In [4]:
# def create_dataset(df, val_split=0.2, test_split=0.1):
#     # split the data into training, validation, and testing
#     X_train, X_temp, y_train, y_temp = train_test_split(
#         df[['species_id', 'stress_name', 'upstream200']], df['stress'], test_size=(val_split + test_split))
#     X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=test_split/(val_split + test_split))

 
#     # Create datasets
#     train_dataset = SequenceDataset(X_train, torch.tensor(y_train.values).float())
#     val_dataset = SequenceDataset(X_val, torch.tensor(y_val.values).float())
#     test_dataset = SequenceDataset(X_test, torch.tensor(y_test.values).float())

#     return train_dataset, val_dataset, test_dataset


In [5]:
import torch
import torch.nn as nn

class VisionTransformer(nn.Module):
    def __init__(self, seq_len=203, num_features=4, dim=64, depth=6, heads=8, mlp_dim=128, dropout=0.1):
        super(VisionTransformer, self).__init__()
        self.seq_len = seq_len
        self.num_features = num_features
        self.dim = dim
        
        # Linear transformation to embed input sequence
        self.embedding = nn.Linear(num_features, dim)
        
        # Positional embedding
        self.pos_embedding = nn.Parameter(torch.randn(1, seq_len, dim))
        
        # Transformer Encoder Layers
        self.transformer = nn.Transformer(
            d_model=dim, 
            nhead=heads, 
            num_encoder_layers=depth, 
            dim_feedforward=mlp_dim, 
            dropout=dropout
        )
        
        # Regression head
        self.regression_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, dim)  # Output a feature vector
        )
        
    def forward(self, x):
        # x: (batch_size, seq_len, num_features)
        
        # Embed the input sequence
        x = self.embedding(x)  # (batch_size, seq_len, dim)
        
        # Add positional encoding
        x += self.pos_embedding
        
        # Transformer expects input of shape (seq_len, batch_size, dim)
        x = x.squeeze(1).permute(1, 0, 2)  # (seq_len, batch_size, dim)
        
        # Pass through the transformer
        x = self.transformer(x)  # (seq_len, batch_size, dim)
        
        # Take the mean of the output sequence (pooling)
        x = x.mean(dim=0)  # (batch_size, dim)
        
        # Pass through the regression head
        x = self.regression_head(x)  # (batch_size, dim)
        
        return x

class SimpleViTModel(nn.Module):
    def __init__(self, *kwargs) -> None:
        super().__init__()

        # Hyperparameters
        species_size = kwargs['species_size'] if 'species_size' in kwargs else 30
        stress_condition_size = kwargs['stress_condition_size'] if 'stress_condition_size' in kwargs else 12
        hidden_size = kwargs['hidden_size'] if 'hidden_size' in kwargs else 64
        cnn_filers = kwargs['cnn_filers'] if 'cnn_filers' in kwargs else hidden_size

        # Activation functions
        self.activation = kwargs['activation'] if 'activation' in kwargs else nn.ReLU()

        # Input layers for species and stress condition
        self.input_species = nn.Linear(species_size, hidden_size)
        self.input_stress_condition = nn.Linear(stress_condition_size, hidden_size)

        # Vision Transformer for DNA sequence
        self.vit = VisionTransformer(seq_len=203, num_features=4, dim=hidden_size, depth=6, heads=8, mlp_dim=128, dropout=0.1)

        # Hidden layer
        self.hidden = nn.Linear(hidden_size * 3, hidden_size)

        # Output layer
        self.output = nn.Linear(hidden_size, 1)

    def forward(self, input):
        x_species, x_stress_condition, x_base = input

        # Process species and stress condition
        x_species = self.input_species(x_species)
        x_stress_condition = self.input_stress_condition(x_stress_condition)

        # Process DNA sequence with Vision Transformer
        x_base = self.vit(x_base)

        # Concatenate all features
        x = torch.cat((x_species, x_stress_condition, x_base), dim=1)

        # Hidden layer
        x = self.hidden(x)
        x = self.activation(x)

        # Output layer
        x = self.output(x)
        x = x.squeeze()

        return x

# Example usage
species_size = 32
stress_condition_size = 12
hidden_size = 64
batch_size = 1024

config = {
    'lr': 0.001,
    'batch_size': 1024,
    'epochs': 100,
    'species_id': -1,
    'test_size': 20000,
    'hidden_size': 64,
    'cnn_filters': 100,
    'model_version': 1,
}


model = SimpleViTModel(config)



In [6]:
# Load the data
def load_dataframe(data_df=None):
    if data_df is not None:
        return data_df
    data_df = pd.read_csv(f'{os.getcwd()}/data/combined_data.csv')
    # get mean of each stress condition
    averages_df = data_df.copy()
    stress_conditions = set([name.split('_')[0] for name in data_df.columns if 'TPM' in name])
    for stress in stress_conditions:
        stress_columns = [name for name in data_df.columns if stress+'_' in name]
        averages_df[f'{stress}'] = np.mean([data_df[stress_columns[0]], data_df[stress_columns[1]], data_df[stress_columns[2]]], axis=0)

    # Drop the columns that are not needed
    averages_df = averages_df.drop(columns=[name for name in averages_df.columns if 'TPM' in name] + ['Chromosome','Region','Species', 'Unnamed: 0'])
    # drop rows with missing upstream200 sequences
    averages_df = averages_df.dropna(subset=['upstream200'])
    # drop rows with upstream200 sequences that contain anything but A, T, C, G
    averages_df = averages_df[averages_df['upstream200'].apply(lambda x: set(x).issubset({'A', 'T', 'C', 'G'}))]


    mlb = MultiLabelBinarizer()
    # map each species id to a one hot encoding
    averages_df['Species ID'] = averages_df['Species ID'].apply(lambda x: [x])
    averages_df['Species ID'] = mlb.fit_transform(averages_df['Species ID']).tolist()

    # map each base to one hot encoding
    base_encodings = {'A': [1,0,0,0], 'T': [0,1,0,0], 'C': [0,0,1,0], 'G': [0,0,0,1]}
    longest_sequence = max(averages_df['upstream200'].apply(lambda x: len(x)))
    averages_df['upstream200'] = averages_df['upstream200'].apply(lambda x: [base_encodings[base] for base in x] + [[0,0,0,0]]*(longest_sequence-len(x)))

    # explode dataset to have one row per stress condition
    averages_df['Stress'] = averages_df.apply(lambda row: [{stress:row[stress]} for stress in stress_conditions], axis=1)
    averages_df = averages_df.drop(columns=[name for name in averages_df.columns if name in stress_conditions])

    averages_df = averages_df.explode('Stress')
    averages_df['Stress_name'] = averages_df['Stress'].apply(lambda x: list(x.keys())[0])
    averages_df['Stress'] = averages_df['Stress'].apply(lambda x: list(x.values())[0])

    # one hot encode stress names
    averages_df['Stress_name'] = averages_df['Stress_name'].apply(lambda x: [x])
    averages_df['Stress_name'] = mlb.fit_transform(averages_df['Stress_name']).tolist()

    # drop rows with 0 stress
    averages_df = averages_df[averages_df['Stress'] > 0]

    # log values of stress conditions
    averages_df['Stress'] = averages_df['Stress'].apply(lambda x: np.log(x+1))

    data_df = averages_df
    return averages_df


In [7]:
def load_data(species_id = -1, size = -1 ,val_split = 0.2 , test_split = 0.1, data_df=None):
    data_df = load_dataframe(data_df)
    
    if species_id != -1:
        data_df = data_df[data_df['Species ID'].apply(lambda x: x[species_id] == 1)]
    if size != -1:
        size = int(size * 1.39)
        data_df = data_df.sample(size)

    # data_df["upstream200"] = data_df["upstream200"].apply(lambda x: torch.cat([torch.tensor(x).float(), torch.zeros(224-len(x), 4)]))
    
    # split the data into training and testing
    X_train, X_test, y_train, y_test = train_test_split(data_df[['Species ID', 'Stress_name', 'upstream200']], data_df['Stress'], test_size=test_split)

    
    # create a dataset
    train_dataset = SequenceDataset(X_train, y_train)
    test_dataset = SequenceDataset(X_test, y_test)

    return train_dataset, test_dataset


In [8]:
data_df = load_dataframe()

In [9]:
train_dataset, test_dataset = load_data(data_df=data_df, species_id=config['species_id'], size=config['test_size'])
print(f"Training on {len(train_dataset)} samples, testing on {len(test_dataset)} samples")

train_dataset, val_dataset = train_test_split(train_dataset, test_size=0.2)

train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=True)

Training on 25019 samples, testing on 2780 samples


In [10]:
def train(model, dataloader, criterion, optimizer, epochs=10):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, targets in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(dataloader)}")

def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for inputs, targets in dataloader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
    return total_loss / len(dataloader)



criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Train the model
train(model, train_loader, criterion, optimizer, epochs=10)

# Evaluate the model
val_loss = evaluate(model, val_loader, criterion)
print(f"Validation Loss: {val_loss}")


TypeError: forward() missing 1 required positional argument: 'tgt'