In [None]:
import numpy as np
import sys
import os
import torch
from torch import Tensor
import pandas as pd
import torch.nn as nn
from torch.utils.data import random_split, DataLoader, TensorDataset
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr

# Notebooks have no __file__, so the working directory stands in for the
# notebook's own location.
current_dir = os.getcwd()

# Lexical parent of the working directory.
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))

# Put the parent directory first on the import path so modules located there
# can be imported by the statements that follow.
sys.path.insert(0, parent_dir)

from preprocessing import *

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# The data folder is resolved relative to the parent of the working directory.
data_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Absolute paths of the feature matrix (X) and the target matrix (Y).
X_file, Y_file = (
    os.path.join(data_dir, relative_path)
    for relative_path in ("data/X_matrix.csv", "data/Y_matrix.csv")
)

print("Loading the data...")
x_df, y_df = pd.read_csv(X_file), pd.read_csv(Y_file)

# `preprocessed_data` comes from the star-import of the `preprocessing` module.
x_data_f, y_data_f = preprocessed_data(x_df, y_df)

Loading the data...


ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Convert the input and output data to tensors and wrap both in a single
# TensorDataset object.

# Data preparation
print("Préparation des données...")
x = x_data_f.drop(columns=["Yeast_ID"]).fillna(0)  # Replace missing feature values with 0
y = y_data_f["YPD_doublingtime"].fillna(y_data_f["YPD_doublingtime"].mean())  # Replace missing targets with the column mean

# Force the feature matrix to a float32 array
x = x.values.astype('float32')

# Keep the target as float32. It is imputed with a mean and later trained with
# MSELoss after a `.float()` cast, so casting it to int64 here would silently
# truncate the fractional part of every target (including the imputed mean).
y = y.values.flatten().astype('float32')

# NOTE: `input` shadows the Python builtin; the name is kept because later
# cells of this notebook read it.
input = torch.tensor(x)      # type torch.float32
print('\nInput format: ', input.shape, input.dtype)
output = torch.tensor(y)     # type torch.float32
print('Output format: ', output.shape, output.dtype)
data = TensorDataset(input, output)    # Pairs each feature row with its target for the loaders

Préparation des données...

Input format:  torch.Size([792, 341957]) torch.float32
Output format:  torch.Size([792]) torch.int64


In [None]:
# Split into train, validation and test sets using random_split.
# A seeded generator makes the partition reproducible across kernel restarts,
# so the saved test CSVs always refer to the same rows.
SPLIT_SEED = 42
train_batch_size = 8
number_rows = len(input)
test_split = int(number_rows * 0.3)       # 30 % of the rows for testing
validate_split = int(number_rows * 0.2)   # 20 % of the rows for validation
train_split = number_rows - test_split - validate_split  # remainder for training
train_set, validate_set, test_set = random_split(
    data,
    [train_split, validate_split, test_split],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [None]:
# Batch iterators over the three subsets. Only the training loader shuffles and
# uses the configured batch size; validation and test yield one sample at a time.
train_loader = DataLoader(dataset=train_set, batch_size=train_batch_size, shuffle=True)
validate_loader = DataLoader(dataset=validate_set, batch_size=1)
test_loader = DataLoader(dataset=test_set, batch_size=1)

In [None]:
# Model hyper-parameters
input_size = input.shape[1]  # number of columns of the input tensor
# NOTE(review): `learning_rate` is not read by the optimizer cell visible in
# this notebook, which passes its own `lr` value — confirm which one is intended.
learning_rate = 0.025
output_size = 1

print(input_size, output_size)

# Define neural network

class Network(nn.Module):
    """Fully connected regression network built from a list of layer widths.

    For ``layer_sizes = [n0, n1, ..., nk]`` the model stacks ``k`` Linear
    layers (``n0 -> n1``, ..., ``n(k-1) -> nk``). Every Linear layer except the
    last one is followed by a ReLU activation and a ``Dropout(0.5)`` module.
    The last Linear layer is left without activation so the network can emit
    any real value.

    Args:
        layer_sizes: Sequence of integers; the first entry is the input width
            and the last entry is the output width.
    """

    def __init__(self, layer_sizes):
        super(Network, self).__init__()
        self.layers = nn.ModuleList()
        for i in range(len(layer_sizes) - 1):
            self.layers.append(nn.Linear(layer_sizes[i], layer_sizes[i + 1]))
            if i < len(layer_sizes) - 2:  # Dropout only after intermediate layers
                self.layers.append(nn.Dropout(0.5))

    def forward(self, x):
        """Run ``x`` through the stack and return the raw (unactivated) output."""
        # By construction the final entry of `self.layers` is the output Linear.
        last_index = len(self.layers) - 1
        for index, layer in enumerate(self.layers):
            x = layer(x)
            # ReLU only after hidden Linear layers. Activating the output layer
            # would clamp predictions to >= 0 and block gradients whenever the
            # pre-activation is negative.
            if isinstance(layer, nn.Linear) and index != last_index:
                x = F.relu(x)
        return x


'''
class Network(nn.Module):
    def __init__(self, input_size, output_size):
        super(Network, self).__init__()
        # Define layers with Linear and Dropout
        self.layer1 = nn.Linear(input_size, 7500)
        self.dropout1 = nn.Dropout(0.5)  # Dropout after layer1

        self.layer2 = nn.Linear(7500, 7000)
        self.dropout2 = nn.Dropout(0.5)  # Dropout after layer2

        self.layer3 = nn.Linear(7000, 6000)
        self.dropout3 = nn.Dropout(0.5)  # Dropout after layer3

        self.layer4 = nn.Linear(6000, 5000)
        self.dropout4 = nn.Dropout(0.5)  # Dropout after layer4

        self.final_layer = nn.Linear(5000, output_size)  # Final output layer


    def forward(self, x):

        # Pass through layer1
        x = F.relu(self.layer1(x))
        x = self.dropout1(x)

        # Pass through layer2
        x = F.relu(self.layer2(x))
        x = self.dropout2(x)

        # Pass through layer3
        x = F.relu(self.layer3(x))
        x = self.dropout3(x)

        # Pass through layer4
        x = F.relu(self.layer4(x))
        x = self.dropout4(x)

        # Final layer (no activation here for regression)
        x = self.final_layer(x)



'''

# Widths of the four hidden layers, framed by the input and output widths.
hidden_sizes = [7500, 7000, 6000, 5000]
layer_sizes = [input_size, *hidden_sizes, output_size]
model = Network(layer_sizes)

# Pick the execution device: first CUDA GPU when available, CPU otherwise.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("The model will be running on", device, "device\n")
model.to(device)

# Function to save the model
def saveModel():
    """Write the state dict of the global `model` to ./NN_regmat_PA.pth."""
    weights = model.state_dict()
    torch.save(weights, "./NN_regmat_PA.pth")

341957 1
The model will be running on cpu device



In [None]:
# Mean-squared-error loss, optimised with Adam (with weight decay) over all
# parameters of the model.
loss_fn = nn.MSELoss()
optimizer = Adam(
    params=model.parameters(),
    lr=0.0005,
    weight_decay=0.1,
)

In [None]:
from tqdm import tqdm

# Training Function
def train(num_epochs):
    """Train the global `model` and checkpoint it on validation improvement.

    Uses the notebook-level `model`, `device`, `optimizer`, `loss_fn`,
    `train_loader`, `validate_loader` and `saveModel`.

    Every `val_epoch` epochs the model is evaluated on the whole validation
    loader. The reported "accuracy" is 100 x the Pearson correlation between
    all validation targets and all validation predictions; the model is saved
    whenever this value improves.

    Args:
        num_epochs: Number of passes over the training loader.

    Returns:
        Tuple ``(predicted_outputs, outputs, running_accuracy)``: predictions
        and targets of the last processed batch (``None`` if no batch was
        processed) and the Pearson correlation measured in the last epoch
        (``0.0`` when that epoch did not run a validation, ``nan`` when the
        correlation is undefined).
    """
    # Start below any possible correlation so that zero or negative values can
    # still produce a first checkpoint.
    best_accuracy = float("-inf")
    val_epoch = 5
    predicted_outputs, outputs = None, None
    running_accuracy = 0.0
    print("Begin training...")
    for epoch in tqdm(range(1, num_epochs + 1)):
        running_train_loss = 0.0
        running_accuracy = 0.0
        running_vall_loss = 0.0

        # Training Loop
        model.train()  # re-enable dropout (validation switches it off)
        for inputs, outputs in train_loader:
            inputs = inputs.float().to(device)
            # Column shape so the target matches the (batch, 1) predictions.
            outputs = outputs.float().to(device).view(-1, 1)
            optimizer.zero_grad()  # zero the parameter gradients
            predicted_outputs = model(inputs)  # predict output from the model
            train_loss = loss_fn(predicted_outputs, outputs)
            train_loss.backward()  # backpropagate the loss
            optimizer.step()  # adjust parameters based on the calculated gradients
            running_train_loss += train_loss.item()  # track the loss value

        # Calculate training loss value (guard against an empty loader)
        train_loss_value = running_train_loss / max(len(train_loader), 1)

        if epoch % val_epoch == 0:
            # Validation Loop
            val_targets = []
            val_predictions = []
            model.eval()  # disable dropout for evaluation
            with torch.no_grad():
                for inputs, outputs in validate_loader:
                    inputs = inputs.float().to(device)
                    outputs = outputs.float().to(device).view(-1, 1)
                    predicted_outputs = model(inputs)
                    val_loss = loss_fn(predicted_outputs, outputs)
                    running_vall_loss += val_loss.item()
                    val_targets.append(outputs.cpu())
                    val_predictions.append(predicted_outputs.cpu())

            # Calculate validation loss value (guard against an empty loader)
            val_loss_value = running_vall_loss / max(len(validate_loader), 1)

            # Pearson correlation needs at least two points, so it is computed
            # once over the whole validation set instead of per sample.
            running_accuracy = float("nan")
            if val_targets:
                targets_np = torch.cat(val_targets).numpy().ravel()
                predictions_np = torch.cat(val_predictions).numpy().ravel()
                if targets_np.size >= 2:
                    running_accuracy = float(pearsonr(targets_np, predictions_np)[0])
            accuracy = 100 * running_accuracy

            # Save the model if the accuracy is the best (NaN never qualifies)
            if not np.isnan(accuracy) and accuracy > best_accuracy:
                saveModel()
                best_accuracy = accuracy

            # Print the statistics of the epoch; %.2f also renders NaN, which
            # an integer format cannot.
            print('Completed training batch', epoch, 'Training Loss is: %4f' % train_loss_value,
                  'Validation Loss is: %.4f' % val_loss_value, 'Accuracy is %.2f %%' % accuracy)

    return predicted_outputs, outputs, running_accuracy

In [None]:
# Function to test the model
def test():
    """Evaluate the saved checkpoint on the test loader.

    Rebuilds the network from the notebook-level `layer_sizes`, loads the
    weights stored in ./NN_regmat_PA.pth, runs every batch of `test_loader`
    through it in evaluation mode and prints 100 x the Pearson correlation
    between all targets and all predictions.

    Returns:
        Tuple ``(output_matrix, predicted_output_matrix)``: two lists holding,
        per test batch, the target tensor and the prediction tensor (both on
        the CPU).
    """
    # `Network` takes one list of layer sizes, the same one used for training.
    model = Network(layer_sizes)
    path = "./NN_regmat_PA.pth"
    # map_location lets a checkpoint written on a GPU load on a CPU-only host.
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()  # disable dropout so predictions are deterministic
    predicted_output_matrix = []
    output_matrix = []

    with torch.no_grad():
        for inputs, outputs in test_loader:
            inputs = inputs.float().to(device)
            outputs = outputs.float()
            predicted_outputs = model(inputs).cpu()
            output_matrix.append(outputs)
            predicted_output_matrix.append(predicted_outputs)

    # Pearson correlation needs at least two points, so it is computed once
    # over the whole test set instead of per sample.
    running_accuracy = float("nan")
    if output_matrix:
        targets_np = torch.cat(output_matrix).numpy().ravel()
        predictions_np = torch.cat(predicted_output_matrix).numpy().ravel()
        if targets_np.size >= 2:
            running_accuracy = float(pearsonr(targets_np, predictions_np)[0])

    # %.2f also renders NaN, which an integer format cannot.
    print('Accuracy of the model based on the test set of', test_split,
          'inputs is: %.2f %%' % (100 * running_accuracy))

    return output_matrix, predicted_output_matrix

In [None]:
# Launch training; requires the loader, model and optimizer cells to have been
# run first.
num_epochs = 5
train_output = train(num_epochs=num_epochs)
print('Finished Training\n')

Begin training...


  0%|          | 0/5 [00:00<?, ?it/s]




NameError: name 'train_loader' is not defined

In [None]:
test_output = test()

# Replace, in place, every per-batch tensor by the first entry of its
# list form (true values in test_output[0], predictions in test_output[1]).
true_values, predicted_values = test_output
for k, (true_tensor, predicted_tensor) in enumerate(zip(true_values, predicted_values)):
    true_values[k] = true_tensor.tolist()[0]
    predicted_values[k] = predicted_tensor.tolist()[0]

In [None]:
# Arrange true and predicted values as (output_size, test_split) matrices.
expected_shape = (test_split, output_size)
test_matrix = np.asarray(test_output[0]).reshape(expected_shape).T
test_matrix_predicted = np.asarray(test_output[1]).reshape(expected_shape).T

# Persist both matrices as CSV files in the working directory.
pd.DataFrame(test_matrix).to_csv('Test_matrix_true_using_PA.csv')
pd.DataFrame(test_matrix_predicted).to_csv('Test_matrix_predicted_using_PA.csv')

In [None]:
def predict_on_full_data(model, x_data, device):
    """Run `model` on every row of `x_data` in a single forward pass.

    Args:
        model: Module to evaluate; it is switched to evaluation mode.
        x_data: Object exposing `.values` (e.g. a pandas DataFrame) that is
            converted to a float32 tensor.
        device: Device the input tensor is created on.

    Returns:
        The model output as a NumPy array.
    """
    model.eval()  # evaluation mode for the forward pass
    with torch.no_grad():  # no gradient bookkeeping needed for inference
        features = torch.tensor(x_data.values, dtype=torch.float32, device=device)
        batch_predictions = model(features)
    return batch_predictions.cpu().numpy()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(predictions, targets):
    """Return ``(mse, r2)`` of `predictions` measured against `targets`.

    Both metrics are computed with `targets` as the ground truth and
    `predictions` as the estimate.
    """
    return mean_squared_error(targets, predictions), r2_score(targets, predictions)

# Compute metrics
# NOTE(review): neither `predictions` nor `y_train` is assigned anywhere in the
# visible notebook — confirm where they come from before running this cell on a
# fresh kernel.
mse, r2 = evaluate_model(predictions=predictions, targets=y_train.values)
print(f"MSE: {mse}, R²: {r2}")