In [8]:
import pandas as pd
import numpy as np
import os
import sys
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, r2_score
import os.path
from skorch import NeuralNetRegressor
from torch import nn, optim
from skorch.callbacks import EarlyStopping, EpochScoring
import matplotlib.pyplot as plt
import pickle
from sklearn.ensemble import RandomForestRegressor

# A notebook has no __file__, so paths are resolved from the working directory.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))  # one level above current_dir

# Put the parent directory first on the module search path.
sys.path.insert(0, parent_dir)

from preprocessing import *

In [9]:
# Base directory for the input files: one level above the current working directory.
data_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# 1. Choose the phenotype

In [10]:
# Phenotype to analyse; its name is interpolated into the X-matrix and y file names.
phenotype = "YPD_doublingtime"
# Alternative phenotype (uncomment to analyse it instead of the one above):
#phenotype = "YPDCUSO410MM_40h"

# 2. Preprocess the data

In [None]:
# Input files for the selected phenotype; both live under <data_dir>/data.
# (X_file previously lacked the "data/" segment and was never used; it is now the single
# source of truth for the feature-matrix path.)
X_file = os.path.join(data_dir, f"data/X_matrix_restricted_{phenotype}.pkl")
Y_file = os.path.join(data_dir, f"data/y_{phenotype}.csv")

# NOTE(review): unpickling can execute arbitrary code stored in the file.
# Only load pickles produced by a trusted pipeline.
x2_df = pd.read_pickle(X_file)

print("moving to y")
y2_df = pd.read_csv(Y_file)

# Drop the first column of each frame.
# NOTE(review): presumably an identifier/index column rather than a feature/target — confirm.
x_data_f = x2_df.drop(x2_df.columns[0], axis=1)
y_data_f = y2_df.drop(y2_df.columns[0], axis=1)

# shuffle_dataset comes from the star-imported `preprocessing` module.
# NOTE(review): presumably shuffles the rows of both frames in unison — verify, and check
# whether it is seeded, since the result feeds an unshuffled KFold split later on.
x_data_f, y_data_f = shuffle_dataset(x_data_f, y_data_f)

# 3. Run the model

In [None]:
# Width of the feature matrix; used as the input size of the network's first layer.
n_input_features = len(x_data_f.columns)

# Enhanced Neural Network with More Layers and Neurons
class EnhancedRegressionNet(nn.Module):
    """Fully connected regression network.

    Three hidden stages, each ``Linear -> ReLU -> Dropout``, followed by a
    linear head with a single output unit.

    Args:
        n_input_features: Number of inputs of the first linear layer.
        dropout_rate: Probability passed to each of the three ``nn.Dropout`` layers.
        n_neurons_1: Output width of the first hidden layer.
        n_neurons_2: Output width of the second hidden layer.
        n_neurons_3: Output width of the third hidden layer.
        n_neurons_4: Accepted for interface compatibility; no layer uses it.
    """

    def __init__(self, n_input_features, dropout_rate, n_neurons_1=1024, n_neurons_2=512, n_neurons_3=256, n_neurons_4=256):
        super().__init__()

        # Hidden stage 1
        self.fc1 = nn.Linear(n_input_features, n_neurons_1)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(dropout_rate)

        # Hidden stage 2
        self.fc2 = nn.Linear(n_neurons_1, n_neurons_2)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(dropout_rate)

        # Hidden stage 3
        self.fc3 = nn.Linear(n_neurons_2, n_neurons_3)
        self.relu3 = nn.ReLU()
        self.dropout3 = nn.Dropout(dropout_rate)

        # Output head (named fc5; there is no fc4 layer)
        self.fc5 = nn.Linear(n_neurons_3, 1)

    def forward(self, x):
        """Run ``x`` through the three hidden stages and return the output of the linear head."""
        hidden_stages = (
            (self.fc1, self.relu1, self.dropout1),
            (self.fc2, self.relu2, self.dropout2),
            (self.fc3, self.relu3, self.dropout3),
        )
        for linear, activation, dropout in hidden_stages:
            x = dropout(activation(linear(x)))
        return self.fc5(x)

# Cast features and target to float32 before they are handed to the network.
x_data_f = x_data_f.astype(np.float32)
y_data_f = y_data_f.astype(np.float32)

# Extra per-epoch scores: negated MSE on the training and validation data (higher is better).
# They are recorded under their own history keys. skorch's default callbacks already store
# the criterion loss under 'train_loss' / 'valid_loss', so reusing those names would
# overwrite the real (positive) losses in the history and the printed log with negated values.
# The Python variable names are kept because later cells pass them to `callbacks=[...]`.
train_loss = EpochScoring(scoring='neg_mean_squared_error', on_train=True, name='train_neg_mse', lower_is_better=False)
valid_loss = EpochScoring(scoring='neg_mean_squared_error', name='valid_neg_mse', lower_is_better=False)

# Base estimator for the hyper-parameter search.
# dropout_rate, lr and max_epochs are left unset here; the grid below supplies them.
net = NeuralNetRegressor(
    module=EnhancedRegressionNet,
    module__n_input_features=n_input_features,
    criterion=nn.MSELoss,
    # optimiser: Adam with a small weight decay (L2 regularization)
    optimizer=optim.Adam,
    optimizer__weight_decay=1e-5,
    # training iterator: shuffled mini-batches of 32 samples
    iterator_train__batch_size=32,
    iterator_train__shuffle=True,
    callbacks=[EarlyStopping(patience=5)],
    verbose=1,
)

# Candidate values explored by the search (3 x 2 x 2 combinations).
param_grid = dict(
    module__dropout_rate=[0.01, 0.1, 0.2],
    lr=[1e-4, 1e-5],
    max_epochs=[100, 150],
)

# Exhaustive search scored by negated MSE over 5 folds, using 2 parallel jobs.
grid_search = GridSearchCV(
    net,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=KFold(n_splits=5),
    n_jobs=2,
)

In [None]:
# Convert the target to a NumPy array. np.asarray accepts a DataFrame as well as an
# ndarray, so re-running this cell is harmless (a second `.to_numpy()` call would raise
# AttributeError because ndarrays have no such method).
y_data_f = np.asarray(y_data_f)
print(y_data_f.shape)

In [None]:
# The search is expensive and normally does NOT need to run: the hyper-parameters used for
# the final model are hard-coded further down. It is therefore guarded by an explicit flag,
# so "Restart & Run All" skips it unless requested.
RUN_GRID_SEARCH = False  # set to True to repeat the hyper-parameter search

if RUN_GRID_SEARCH:
    # Fit the grid search
    grid_search.fit(x_data_f.values, y_data_f)

    # Get the best parameters from the grid search
    best_params = grid_search.best_params_
    print("Best Parameters:", best_params)

In [30]:
# Hyper-parameters used to train the final model.
# NOTE(review): a dropout rate of 0.3 is not among the values listed in param_grid —
# confirm where this value came from.
best_params = dict(
    module__dropout_rate=0.3,  # dropout probability
    lr=1e-4,                   # learning rate
    max_epochs=150,            # upper bound on training epochs
)

# 4. Add L2-regularization

In [None]:
# Final model: explicit hidden-layer widths (2048 / 1024 / 512), the hyper-parameters from
# best_params, and weight decay as L2 regularization.
best_net2 = NeuralNetRegressor(
    # architecture
    module=EnhancedRegressionNet,
    module__n_input_features=n_input_features,
    module__n_neurons_1=2048,
    module__n_neurons_2=1024,
    module__n_neurons_3=512,
    module__dropout_rate=best_params['module__dropout_rate'],
    # optimisation
    criterion=nn.MSELoss,
    optimizer=optim.Adam,
    optimizer__weight_decay=5e-4,
    lr=best_params['lr'],
    max_epochs=best_params['max_epochs'],
    # training loop
    iterator_train__shuffle=True,
    callbacks=[EarlyStopping(patience=5), train_loss, valid_loss],
    verbose=1,
)

# Fit on the full feature matrix.
best_net2.fit(x_data_f.values, y_data_f)

# Predict on the same features the model was trained on, as a single column.
Y_pred = best_net2.predict(x_data_f.values).reshape(-1, 1)

In [None]:
trained_nn = best_net2

# Target for the random forest: the network's predictions computed in the previous cell.
y_pred = Y_pred

# Random forest trained on the original input features to reproduce the network's predictions.
rf = RandomForestRegressor(
    max_depth=40,
    max_features='sqrt',
    min_samples_leaf=4,
    n_estimators=500,
    random_state=42,
)
# Y_pred is an (n, 1) column vector; scikit-learn expects a 1-D target for single-output
# regression and emits a DataConversionWarning otherwise, so pass a flattened view.
rf.fit(x_data_f, np.ravel(y_pred))

# Display the importance the forest assigns to every input feature.
importances = rf.feature_importances_
for feature_name, importance in zip(x_data_f.columns, importances):
    print(f"Feature: {feature_name}, Importance: {importance}")


In [None]:
# Table of feature importances, most important feature first.
importances_df = pd.DataFrame({
    'Feature': x_data_f.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Create the results directory if it is missing; to_csv does not create parent
# directories and would otherwise raise an OSError.
importance_csv_path = '../results/mutations_NNrestricted_importance.csv'
os.makedirs(os.path.dirname(importance_csv_path), exist_ok=True)

# NOTE(review): the file name does not include the phenotype, so runs for different
# phenotypes overwrite each other's output.
importances_df.to_csv(importance_csv_path, index=False)

print("Feature importances saved in descending order of importance.")


In [None]:
# Pair every feature name with the importance the random forest assigned to it.
feature_importances = rf.feature_importances_
feature_names = x_data_f.columns
features = list(zip(feature_names, feature_importances))

# Rank the pairs from most to least important and keep the first 20.
sorted_features = sorted(features, key=lambda pair: pair[1], reverse=True)
top_20_features = sorted_features[:20]

# Unzip into parallel tuples of names and values for plotting.
top_20_names, top_20_values = zip(*top_20_features)

# Horizontal bar chart of the 20 largest importances.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_20_names, top_20_values, color='skyblue')
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
ax.set_title('Top 20 Most Important Features (Random Forest)')
ax.invert_yaxis()  # most important feature at the top
fig.tight_layout()
plt.show()
