In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os
import sys
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Notebooks have no __file__, so the project root is located relative to the
# kernel's working directory instead.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))  # one level above the working directory

# Put the parent directory first on the import path so modules that live there
# can be imported. Any earlier copy is removed first: re-running this cell would
# otherwise push one more duplicate entry onto sys.path every time.
while parent_dir in sys.path:
    sys.path.remove(parent_dir)
sys.path.insert(0, parent_dir)

from preprocessing import *

In [2]:
# Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [3]:
# Absolute path of the directory one level above the current working directory;
# the data file paths are built relative to it.
data_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

In [7]:
# Paths of the feature-matrix CSV and of the target CSV, both under data_dir.
# NOTE(review): X and Y hold file *paths* here; X is rebound to a feature array
# in a later cell, so re-running this cell afterwards changes what X means.
X = os.path.join(data_dir, "data/X_matrix_YPD_doublingtime.csv")
Y = os.path.join(data_dir, "data/y_YPD_doublingtime.csv")

print("Loading the data...")

# Read only the header row as strings to obtain the column names.
column_names = np.genfromtxt(X, delimiter=',', max_rows=1, dtype=str)[1:]  # [1:] drops the header cell above the row-name column

# Count the comma-separated fields of the header line; the count is used as the
# exclusive upper bound of the column range loaded below.
with open(X, 'r') as f:
    first_line = f.readline()
    end_column_index = len(first_line.split(','))  # total number of fields, row-name column included

# Two separate loads of the same file, both skipping the header row:
# columns 1..end as numbers, then column 0 as strings (the row labels).
# NOTE(review): the file is handed to np.loadtxt twice, so it is presumably
# parsed twice in full — consider a single-pass loader if this cell is slow.
data = np.loadtxt(X, delimiter=',', skiprows=1, usecols=range(1, end_column_index))
row_names = np.loadtxt(X, delimiter=',', skiprows=1, usecols=0, dtype=str)

# Assemble the labelled feature matrix, then read the target file.
X_data = pd.DataFrame(data, index=row_names, columns=column_names)
y_file = pd.read_csv(Y)

# Drop the identifier column so only the target column(s) remain.
# NOTE(review): nothing here checks that the Yeast_ID order of the target file
# matches row_names — presumably both files share the same row order; verify.
y_data = y_file.drop(columns=["Yeast_ID"])
print(f"Dimensions de X : {X_data.shape}")
print(f"Dimensions de Y : {y_data.shape}")

Loading the data...
Dimensions de X : (792, 347952)
Dimensions de Y : (792, 1)


In [8]:
# Run the project's preprocessing helper on the raw features and targets.
# NOTE(review): in the saved output this call fails with
# "ValueError: Input X contains NaN" raised by PCA, so X_data_f / y_data_f are
# never created and the later cell that reads X_data_f ends in a NameError.
# The NaNs presumably appear inside preprocessed_data (its second check reports
# them) — TODO confirm, and impute or drop them before the PCA step.
X_data_f, y_data_f = preprocessed_data(X_data, y_data)

The DataFrame does not contain any NaN values.
1
The DataFrame contains NaN values.
2


ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [21]:
from torch.utils.data import Dataset

class BertDataset(Dataset):
    """Map-style dataset that tokenizes one sample at a time for a BERT classifier.

    Args:
        texts: Sized, indexable collection of samples. Each item is converted
            with ``str()`` before it is tokenized.
        labels: Sized, indexable collection of labels aligned with ``texts``.
            Each label is returned as a ``torch.long`` tensor.
        tokenizer: Callable Hugging Face tokenizer used to encode each sample.
        max_len: Fixed sequence length; inputs are padded or truncated to it.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # A length mismatch would otherwise surface as an IndexError in the
        # middle of an epoch, or silently leave trailing labels unused.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize sample ``item``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``labels`` tensor of dtype ``torch.long``.
        """
        text = str(self.texts[item])
        label = self.labels[item]

        # Call the tokenizer directly: ``encode_plus`` is deprecated in favour
        # of ``__call__``, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis of size 1; flatten it
        # away so the DataLoader can stack samples into (batch, max_len).
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [22]:
from transformers import BertForSequenceClassification
import torch
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader
from torch.optim import AdamW

def _batch_to_device(batch, device):
    """Return (input_ids, attention_mask, labels) from `batch`, moved to `device`.

    Token tensors are reshaped to (batch, seq_len), so both (batch, seq_len) and
    (batch, 1, seq_len) inputs are accepted. Unlike `.squeeze(1)`, this never
    collapses the sequence axis when seq_len happens to be 1.
    """
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    input_ids = input_ids.reshape(input_ids.shape[0], -1).to(device)
    attention_mask = attention_mask.reshape(attention_mask.shape[0], -1).to(device)
    labels = batch['labels'].to(device)
    return input_ids, attention_mask, labels


def train_bert_on_fold(train_loader, val_loader, model, optimizer, num_epochs=3):
    """Train `model` on one fold, then score it on the validation batches.

    Args:
        train_loader: Iterable of batches (dicts with 'input_ids',
            'attention_mask' and 'labels' tensors) used for training.
        val_loader: Iterable of batches in the same format used for evaluation.
        model: Model called with `input_ids`, `attention_mask` and (during
            training) `labels`; its output must expose `.loss` and `.logits`.
        optimizer: Optimizer over the model's parameters.
        num_epochs: Number of passes over `train_loader`.

    Returns:
        The validation accuracy as computed by `accuracy_score`.
    """
    # Run every batch on whatever device the model lives on; feeding CPU
    # tensors to a model that was moved to the GPU raises a device mismatch.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    for _ in range(num_epochs):
        for batch in train_loader:
            input_ids, attention_mask, labels = _batch_to_device(batch, device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            outputs.loss.backward()
            optimizer.step()

    # Evaluation: no gradients, predicted class = arg-max over the logits.
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = _batch_to_device(batch, device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            preds = torch.argmax(logits, dim=-1)

            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    return accuracy_score(true_labels, predictions)


In [23]:
from sklearn.model_selection import KFold
import shap
import numpy as np
import pandas as pd
from transformers import BertTokenizer
from sklearn.metrics import accuracy_score

def cross_validate_bert(X, y, tokenizer, max_len, model, k=5, batch_size=8, lr=1e-5):
    """Estimate classification accuracy with shuffled k-fold cross-validation.

    Args:
        X: Samples (array-like); converted with `np.asarray` so folds can be
            selected by integer index arrays.
        y: Labels aligned with `X` (array-like).
        tokenizer: Tokenizer handed to `BertDataset`.
        max_len: Sequence length handed to `BertDataset`.
        model: Untrained starting model. Each fold trains its own deep copy, so
            the object passed in is never modified. Pass `None` to load
            'bert-base-uncased' with two labels for every fold instead.
        k: Number of folds.
        batch_size: Batch size of the training and validation loaders.
        lr: Learning rate of the per-fold AdamW optimizer.

    Returns:
        Mean validation accuracy over the `k` folds.

    Raises:
        ValueError: If `X` and `y` differ in length.
    """
    import copy  # stdlib; imported here so the cell does not depend on another cell

    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(f"X and y must have the same length (got {len(X)} and {len(y)})")

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        train_dataset = BertDataset(X_train, y_train, tokenizer, max_len)
        val_dataset = BertDataset(X_val, y_val, tokenizer, max_len)

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        # Every fold starts from the same untrained weights. Training the
        # caller's model directly would leak what was learned on one fold into
        # the next; the `model` argument used to be ignored altogether.
        if model is None:
            fold_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
        else:
            fold_model = copy.deepcopy(model)
        optimizer = AdamW(fold_model.parameters(), lr=lr)

        accuracy = train_bert_on_fold(train_loader, val_loader, fold_model, optimizer)
        accuracies.append(accuracy)
        print(f'Fold Accuracy: {accuracy:.4f}')

    mean_accuracy = np.mean(accuracies)
    print(f'Mean Accuracy across {k} folds: {mean_accuracy:.4f}')

    return mean_accuracy


ModuleNotFoundError: No module named 'shap'

In [24]:
def shap_analysis_for_bert(X_val, model, tokenizer):
    """Explain the classifier's predictions on `X_val` with SHAP token attributions.

    SHAP's text explainer calls the model with batches of (masked) raw strings,
    which a bare sequence-classification model cannot consume. The model is
    therefore wrapped in a function that tokenizes the strings and returns class
    probabilities, and the tokenizer is passed as the masker.

    Args:
        X_val: Iterable of samples; each one is converted with `str()`.
        model: Sequence-classification model whose output exposes `.logits`.
        tokenizer: Callable Hugging Face tokenizer matching `model`.

    Returns:
        The SHAP explanation object produced by the explainer.
    """
    texts = [str(sample) for sample in X_val]

    # Tokenized inputs have to live on the same device as the model weights.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    model.eval()

    def predict_proba(batch_texts):
        """Map a batch of strings to an (n_samples, n_classes) probability array."""
        encoded = tokenizer(
            [str(text) for text in batch_texts],
            padding=True,
            truncation=True,
            return_tensors='pt'
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        with torch.no_grad():
            logits = model(**encoded).logits
        return torch.softmax(logits, dim=-1).cpu().numpy()

    explainer = shap.Explainer(predict_proba, tokenizer)
    shap_values = explainer(texts)

    # Text explanations have one attribution per token and a different token
    # count per sample, so they are rendered with the text plot rather than
    # with the tabular summary plot.
    shap.plots.text(shap_values)
    return shap_values


In [25]:
# Prepare the data
# The first row is skipped in BOTH the features and the targets. Slicing only X
# (as before) left X one row shorter than y and shifted every label by one sample.
# NOTE(review): it is unclear why the first row is skipped at all — confirm it is
# not a real sample; if it is, drop both `[1:]` slices.
# NOTE(review): "YPD_doublingtime" looks like a continuous measurement, yet it is
# used as a class label (cast to integers, scored with accuracy) — confirm it
# has been binarised upstream.
X = X_data_f[1:].values
y = y_data_f["YPD_doublingtime"].iloc[1:].values
if len(X) != len(y):
    raise ValueError(f"X and y must have the same number of rows (got {len(X)} and {len(y)})")

# Tokenizer and Model Setup
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_len = 128  # Max token length for BERT

# Load the pretrained checkpoint once and reuse it below instead of
# instantiating the same weights twice.
base_model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# Cross-Validation for BERT Model
mean_accuracy = cross_validate_bert(X, y, tokenizer, max_len, base_model)
print(f'Mean Accuracy across all folds: {mean_accuracy:.4f}')

# SHAP analysis (Optional)
# NOTE(review): cross_validate_bert returns only the mean accuracy, so the model
# explained here is the pretrained checkpoint, not a model fine-tuned on a fold.
shap_analysis_for_bert(X, base_model, tokenizer)

NameError: name 'X_data_f' is not defined