In [24]:
import os
import sys
import pandas as pd
import numpy as np
import torch
import pyspark.pandas as ps
import torch.nn as nn
import torch.optim as optim
import xgboost as xgb

from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# The notebook is expected to run from a sub-directory of the project, so the
# project root is taken to be the parent of the current working directory.
notebooks_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebooks_dir, os.pardir))
processed_data_path = f"{project_root}/data/processed_data_lmer.pkl"

# Make the project's packages importable from this notebook.
if project_root not in sys.path:
    sys.path.append(project_root)

# Project-local imports. A failure is reported but not re-raised, so later
# cells will hit a NameError if any of these names are missing.
try:
    from utils.model_utils import train, predict, evaluate, inverse_normalize
    from utils.process_data import get_processed_data, prepare_datasets, transform_dataframe
    from models.combined_model import SimpleCNN, CombinedModel
    from datasets.combined_dataset import CombinedDataset
    from utils.parameter_tuning import random_search
    from utils.plots import (
        plot_losses,
        plot_predictions_vs_labels,
        plot_predictions_vs_labels_by_species,
        plot_boxplot_predictions_vs_labels,
        plot_hexbin_predictions_vs_labels,
        plot_histogram,
    )
except ImportError as e:
    print(f"Error importing module: {e}")
else:
    print("Import successful")



Import successful


In [None]:
# Load the pickled frame when it already exists on disk; otherwise fall back
# to get_processed_data(project_root).
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling --
# only load files from a trusted source.
data_df = (
    pd.read_pickle(processed_data_path)
    if os.path.exists(processed_data_path)
    else get_processed_data(project_root)
)

In [6]:
# NOTE(review): this cell overwrites data_df with its own transformed version,
# so re-running it applies transform_dataframe to already-transformed data --
# confirm that is safe, or reload data_df first.
column_renames = {"stress_name": 'condition', 'stress': 'tpm'}

data_df = transform_dataframe(data_df, False)
# Rename in place to avoid copying the (large) frame.
data_df.rename(columns=column_renames, inplace=True)

# Preview the first rows.
data_df.head()

Unnamed: 0,species,upstream200,CCT,CTT,TTC,TCC,CCA,CAA,AAG,AGC,...,CAK,AKT,KTC,AAN,ANG,TTR,TRG,RGA,tpm,condition
0,0,"[[0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0,...",3.0,7.0,9.0,7.0,8.0,6.0,5.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.026784,0
1,0,"[[0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0,...",3.0,7.0,9.0,7.0,8.0,6.0,5.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.617587,6
2,0,"[[0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0,...",3.0,7.0,9.0,7.0,8.0,6.0,5.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.445845,5
3,0,"[[0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0,...",3.0,7.0,9.0,7.0,8.0,6.0,5.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.922333,8
4,0,"[[0, 0, 1, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0,...",3.0,7.0,9.0,7.0,8.0,6.0,5.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.013922,1


In [18]:
# Separate features and target
X = data_df.drop(columns=['tpm'])
y = data_df['tpm']

# Split the data into training and test sets (test_size=0.2), then split the
# remaining training portion again (test_size=0.2) to obtain a validation set.
# Both splits use a fixed random_state so they are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Reset the indices of the DataFrames so every split is indexed 0..n-1
X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

y_train = y_train.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Split X into tabular and sequence data
X_train_tabular = X_train.drop(columns=['upstream200'])
X_val_tabular = X_val.drop(columns=['upstream200'])
X_test_tabular = X_test.drop(columns=['upstream200'])

X_train_sequence = np.array(X_train['upstream200'].to_list())
# Fix: the validation sequences must come from X_val. They were previously
# built from X_test, which paired validation rows/labels with test sequences.
X_val_sequence = np.array(X_val['upstream200'].to_list())
X_test_sequence = np.array(X_test['upstream200'].to_list())

# Convert to PyTorch tensors
X_train_sequence = torch.tensor(X_train_sequence, dtype=torch.float32)
X_val_sequence = torch.tensor(X_val_sequence, dtype=torch.float32)
X_test_sequence = torch.tensor(X_test_sequence, dtype=torch.float32)

# Targets become column vectors of shape (n, 1)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

# Sanity check: within each split the tabular rows, sequences and labels must
# line up one-to-one.
assert len(X_train_tabular) == len(X_train_sequence) == len(y_train_tensor), "train split is misaligned"
assert len(X_val_tabular) == len(X_val_sequence) == len(y_val_tensor), "validation split is misaligned"
assert len(X_test_tabular) == len(X_test_sequence) == len(y_test_tensor), "test split is misaligned"


In [14]:
# Display the tabular training features (X_train without the 'upstream200' column).
X_train_tabular

Unnamed: 0,species,CCT,CTT,TTC,TCC,CCA,CAA,AAG,AGC,GCT,...,KCA,CAK,AKT,KTC,AAN,ANG,TTR,TRG,RGA,condition
0,9,1.0,4.0,1.0,0.0,1.0,4.0,4.0,4.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
1,10,1.0,4.0,2.0,1.0,2.0,4.0,3.0,3.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2,4,2.0,2.0,3.0,1.0,1.0,1.0,5.0,4.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
3,13,3.0,5.0,7.0,5.0,3.0,3.0,1.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
4,4,0.0,2.0,5.0,4.0,2.0,2.0,2.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
777654,7,3.0,4.0,3.0,2.0,5.0,4.0,3.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
777655,28,1.0,0.0,4.0,2.0,0.0,2.0,3.0,1.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8
777656,3,1.0,3.0,6.0,3.0,1.0,5.0,3.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
777657,21,7.0,0.0,2.0,1.0,2.0,2.0,0.0,6.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8


In [20]:
# Mini-batch size shared by all three loaders.
batch_size = 32

# Training split: the only loader that shuffles its samples.
train_combined_dataset = CombinedDataset(X_train_tabular, X_train_sequence, y_train_tensor)
train_combined_loader = DataLoader(train_combined_dataset, shuffle=True, batch_size=batch_size)

# Test split: iterated in dataset order.
test_combined_dataset = CombinedDataset(X_test_tabular, X_test_sequence, y_test_tensor)
test_combined_loader = DataLoader(test_combined_dataset, shuffle=False, batch_size=batch_size)

# Validation split: iterated in dataset order.
val_combined_dataset = CombinedDataset(X_val_tabular, X_val_sequence, y_val_tensor)
val_combined_loader = DataLoader(val_combined_dataset, shuffle=False, batch_size=batch_size)


In [21]:
# Hyperparameters of the sequence (CNN) branch.
# NOTE(review): the sequence column is called 'upstream200' but the length
# used here is 203 -- confirm this matches the encoded sequences.
seq_length = 203   # Sequence length
num_features = 4   # Number of features per one-hot vector (channels for Conv1d)
hidden_size = 64
cnn_filters = 16

# Build the CNN that will be plugged into the combined model.
cnn_model = SimpleCNN(num_features, cnn_filters, hidden_size, seq_length)


In [27]:
# Names of the tabular feature columns, handed to the combined model.
column_names = X_train_tabular.columns.tolist()

# Fit an XGBoost regressor with default hyperparameters on the tabular
# training features (fit() returns the estimator itself).
xgb_model = xgb.XGBRegressor().fit(X_train_tabular, y_train)

# Combine the fitted XGBoost model with the CNN.
combined_model = CombinedModel(xgb_model, cnn_model, hidden_size, column_names)
combined_model.to('cpu')  # Change to 'cuda' if you are using GPU

# Adam optimiser over the combined model's parameters, with an MSE objective.
optimizer = optim.Adam(combined_model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

In [29]:
# Baseline: score the stand-alone XGBoost regressor on the test split.
y_pred_xgboost_vanilla = xgb_model.predict(X_test_tabular)

r2 = r2_score(y_test, y_pred_xgboost_vanilla)
mse = mean_squared_error(y_test, y_pred_xgboost_vanilla)

# Report both metrics rounded to four decimals.
print(f"mse: {mse:.4f}")
print(f"r2: {r2:.4f}")


mse: 0.6580
r2: 0.1493


In [None]:
# Train the combined model and, after every epoch, report the mean per-batch
# loss on both the training and the validation loader.
num_epochs = 5
device = 'cpu'  # Change to 'cuda' if you are using GPU

for epoch in range(num_epochs):
    # ---- optimisation pass over the training batches ----
    combined_model.train()
    running_loss = 0.0
    for batch in train_combined_loader:
        # Each batch holds (tabular features, sequences, labels).
        x_tabular, x_sequence, labels = (part.to(device) for part in batch)

        optimizer.zero_grad()
        outputs = combined_model(x_tabular, x_sequence)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    train_loss = running_loss / len(train_combined_loader)

    # ---- evaluation pass over the validation batches (no gradients) ----
    combined_model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_combined_loader:
            x_tabular, x_sequence, labels = (part.to(device) for part in batch)

            outputs = combined_model(x_tabular, x_sequence)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
    val_loss /= len(val_combined_loader)

    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss}, Validation Loss: {val_loss}")

print('Finished Training')