<a href="https://colab.research.google.com/github/nirb28/nn_catalyst/blob/main/src/different_approaches.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os, sys

IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    print("Running in Colab!")
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)
    from google.colab import userdata
else:
    print("Not running in Colab.")

def resolve_path_gdrive(relativePath):
    if os.path.exists('/content/drive'):
        return '/content/drive/MyDrive/work/gdrive-workspaces/git/nn_catalyst/' + relativePath
    else:
        from utils import get_project_root
        return get_project_root() + "/" + relativePath

Running in Colab!
Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim import AdamW

# Load the data
descriptors_path = 'descriptors.csv'
targets_path = 'compiled_data.csv'

In [3]:
descriptors_df = pd.read_csv(resolve_path_gdrive(descriptors_path))
targets_df = pd.read_csv(resolve_path_gdrive(targets_path))

  descriptors_df = pd.read_csv(resolve_path_gdrive(descriptors_path))


In [4]:
# Show sample rows
print("\nSample Rows from Descriptors DataFrame:")
print(descriptors_df.head())
print("\nSample Rows from Targets DataFrame:")
print(targets_df.head())


Sample Rows from Descriptors DataFrame:
   Label        ABC     ABCGG  nAcid  nBase             SpAbs_A  \
0   9268   4.719397  5.004088      0      0   6.720566232730447   
1  10488  10.334062  9.836417      0      0  16.752497538971177   
2  25579   5.875634  5.566041      0      0    9.43114762028933   
3   8952   6.611250  6.890735      1      0   10.68725972618713   
4  23681   7.249407  6.976306      0      0  11.945821561028193   

              SpMax_A           SpDiam_A              SpAD_A  \
0  2.1010029896154583  4.202005979230917   6.720566232730447   
1  2.3623398328574394  4.724679665714879  16.752497538971177   
2  2.1753277471610764  4.350655494322151    9.43114762028933   
3    2.28774942353935  4.425414875225794   10.68725972618713   
4  2.2671838628844996     4.534367725769  11.945821561028193   

              SpMAD_A  ...     SRW10     TSRW10          MW        AMW WPath  \
0  0.9600808903900638  ...  8.123558  33.343946  136.047505   6.802375    46   
1  1.196606

In [5]:
# selected column
selected_cols=[5, 14, 15, 23, 24, 25]
number_of_target_cols = len(selected_cols)
selected_cols.insert(0, 0)
targets_df = targets_df.iloc[:, selected_cols]
print(targets_df)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

       mol_num  dipole_n  max_spin_o  dipole_o  max_charge_neg_r  max_spin_r  \
0            1   4.63917    0.349349   7.63914         -0.665933    0.933582   
1            4   0.00005    0.280722   0.00000         -0.449216    0.472025   
2           11   3.31424    0.897425   2.75691         -0.883380    0.395451   
3           12   5.84638    0.838052   4.65300         -0.833701    0.258320   
4           13   4.96208    0.307085  10.87778         -0.848564    0.383838   
...        ...       ...         ...       ...               ...         ...   
26228    34242   4.16353    0.189008   5.82675         -0.630283    0.216071   
26229    34243   5.55982    0.512668   9.34790         -0.401293    0.402216   
26230    34244   6.29952    0.460641   8.96955         -0.435574    0.241265   
26231    34245   3.52367    0.187718   8.74031         -0.656433    0.146653   
26232    34246   5.81051    0.249099   6.71696         -0.470728    0.254265   

       dipole_r  
0       8.80921  
1  

In [6]:
# Keep only numeric columns
descriptors_numeric = descriptors_df.select_dtypes(include=['number'])
targets_numeric = targets_df.select_dtypes(include=['number'])

# Merge the numeric dataframes on the common label column
numeric_data = pd.merge(descriptors_numeric, targets_numeric, left_on='Label', right_on='mol_num')
numeric_data = numeric_data.drop(columns=['Label', 'mol_num'])

# Separate features and targets
X = numeric_data.iloc[:, :-number_of_target_cols]  # Assuming the last 30 columns are targets
y = numeric_data.iloc[:, -number_of_target_cols:]

In [7]:
# Apply variance threshold
selector = VarianceThreshold()
X_high_variance = selector.fit_transform(X)

# Convert to numpy arrays
X = X_high_variance
y = y.values

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Standardize the data
scaler_X = StandardScaler().fit(X_train)
scaler_y = StandardScaler().fit(y_train)

X_train = scaler_X.transform(X_train)
X_val = scaler_X.transform(X_val)
X_test = scaler_X.transform(X_test)

y_train = scaler_y.transform(y_train)
y_val = scaler_y.transform(y_val)
y_test = scaler_y.transform(y_test)

# Convert the data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32, device=device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32, device=device)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32, device=device)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32, device=device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32, device=device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32, device=device)

In [8]:
# Create DataLoader for batch processing
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define the individual model class
class SingleTargetNet(nn.Module):
    def __init__(self, input_size, dropout_rate=0.5):
        super(SingleTargetNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 1024)
        self.bn1 = nn.BatchNorm1d(1024)
        self.fc2 = nn.Linear(1024, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.fc3 = nn.Linear(512, 1)
        self.fc_skip = nn.Linear(1024, 512)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        x1 = F.relu(self.bn1(self.fc1(x)))
        x1 = self.dropout(x1)

        x2 = F.relu(self.bn2(self.fc2(x1)))
        x2 = self.dropout(x2)

        # Skip connection
        x2 += self.fc_skip(x1)

        x3 = self.fc3(x2)
        return x3

def get_target5model():
    # Define the model
    model = nn.Sequential(
        nn.Linear(X_train.shape[1], 1024),
        nn.LeakyReLU(),
        nn.Linear(1024, 512),
        nn.LeakyReLU(),
        nn.Linear(512, 256),
        nn.LeakyReLU(),
        nn.Linear(256, 1)
    )
    return model

class RegressionNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(RegressionNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.relu = nn.LeakyReLU()
        self.dropout = nn.Dropout(0.1)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        out = self.fc1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out

In [9]:
# Function to train and evaluate individual models
def train_and_evaluate(target_index, model):
    model.to(device=device)
    criterion = nn.MSELoss()
    optimizer = AdamW(model.parameters(), lr=0.001)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=True)

    best_val_loss = np.inf
    patience_counter = 0
    num_epochs = 150

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, targets[:, target_index])
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Target {target_index} - Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}')

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, targets in val_loader:
                outputs = model(inputs).squeeze()
                loss = criterion(outputs, targets[:, target_index])
                val_loss += loss.item()
        val_loss /= len(val_loader)
        print(f'Target {target_index} - Validation Loss: {val_loss}')

        scheduler.step(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), resolve_path_gdrive(f'{selected_cols[target_index+1]}_{type(model).__name__}_model.pth'))
        else:
            patience_counter += 1
            if patience_counter >= 15:
                print(f'Target {target_index} - Early stopping triggered')
                break

    model.load_state_dict(torch.load(resolve_path_gdrive(f'{selected_cols[target_index+1]}_{type(model).__name__}_model.pth')))

    model.eval()
    test_loss = 0.0
    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, targets[:, target_index])
            test_loss += loss.item()
    test_loss /= len(test_loader)
    print(f'Target {target_index} - Test Loss: {test_loss}')

    return model, test_loss

In [None]:
# Train and evaluate individual models for each target
test_losses = []
models = []
try_models = [SingleTargetNet(X_train.shape[1]), RegressionNetwork(X_train.shape[1], 512, 1), get_target5model()]

for target_index in range(y_train.shape[1]):
    for a_model in try_models:
        model, test_loss = train_and_evaluate(target_index, a_model)
        models.append(model)
        test_losses.append(test_loss)



Target 0 - Epoch 1/150, Loss: 0.8701442719959631
Target 0 - Validation Loss: 0.6447236897741876
Target 0 - Epoch 2/150, Loss: 0.7112080505708369
Target 0 - Validation Loss: 0.6294021166679336
Target 0 - Epoch 3/150, Loss: 0.6976079432430063
Target 0 - Validation Loss: 0.6570531660463752
Target 0 - Epoch 4/150, Loss: 0.6815267054409515
Target 0 - Validation Loss: 0.6103816773833298
Target 0 - Epoch 5/150, Loss: 0.6734436593677212
Target 0 - Validation Loss: 0.6747167470251642
Target 0 - Epoch 6/150, Loss: 0.6652287465361196
Target 0 - Validation Loss: 0.6065102179602879
Target 0 - Epoch 7/150, Loss: 0.6555322981325955
Target 0 - Validation Loss: 0.5915482643900848
Target 0 - Epoch 8/150, Loss: 0.6480946974692549
Target 0 - Validation Loss: 0.5952836405940172
Target 0 - Epoch 9/150, Loss: 0.6403323078028312
Target 0 - Validation Loss: 0.6106407649633361
Target 0 - Epoch 10/150, Loss: 0.6295545377382418
Target 0 - Validation Loss: 0.6198546933691677
Target 0 - Epoch 11/150, Loss: 0.616356

In [None]:
models

[SingleTargetNet(
   (fc1): Linear(in_features=780, out_features=1024, bias=True)
   (bn1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (fc2): Linear(in_features=1024, out_features=512, bias=True)
   (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (fc3): Linear(in_features=512, out_features=1, bias=True)
   (fc_skip): Linear(in_features=1024, out_features=512, bias=True)
   (dropout): Dropout(p=0.5, inplace=False)
 ),
 RegressionNetwork(
   (fc1): Linear(in_features=780, out_features=512, bias=True)
   (bn1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (relu): LeakyReLU(negative_slope=0.01)
   (dropout): Dropout(p=0.1, inplace=False)
   (fc2): Linear(in_features=512, out_features=512, bias=True)
   (bn2): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (fc3): Linear(in_features=512, out_features=1, bias=True)
 ),
 Sequential(
  

In [None]:
# Prepare DataFrames for train, validation, and test predictions
train_df = pd.DataFrame()
val_df = pd.DataFrame()
test_df = pd.DataFrame()

r2_scores, rmse_scores, mae_scores = [], [], []

def compute_stats(target_index, df):
    print(df)
    observed_col = f'Observed_{target_index}'
    predicted_col = f'Predicted_{target_index}'

    # Calculate metrics
    observed = df[observed_col]
    predicted = df[predicted_col]
    r2 = r2_score(observed, predicted)
    rmse = mean_squared_error(observed, predicted, squared=False)
    mae = mean_absolute_error(observed, predicted)
    return r2, rmse, mae

In [None]:
def evaluate(target_index, model):
    # Make predictions on the train, validation, and test sets
    model.cpu().eval()
    with torch.no_grad():
        y_train_pred = model(X_train_tensor.cpu()).numpy()
        y_val_pred = model(X_val_tensor.cpu()).numpy()
        y_test_pred = model(X_test_tensor.cpu()).numpy()

    # Inverse transform the predictions and targets to their original scale
    y_train_pred_orig = scaler_y.inverse_transform(np.concatenate([np.zeros((y_train_pred.shape[0], target_index)), y_train_pred, np.zeros((y_train_pred.shape[0], y_train.shape[1] - target_index - 1))], axis=1))[:, target_index]
    y_val_pred_orig = scaler_y.inverse_transform(np.concatenate([np.zeros((y_val_pred.shape[0], target_index)), y_val_pred, np.zeros((y_val_pred.shape[0], y_val.shape[1] - target_index - 1))], axis=1))[:, target_index]
    y_test_pred_orig = scaler_y.inverse_transform(np.concatenate([np.zeros((y_test_pred.shape[0], target_index)), y_test_pred, np.zeros((y_test_pred.shape[0], y_test.shape[1] - target_index - 1))], axis=1))[:, target_index]

    y_train_orig = scaler_y.inverse_transform(y_train)[:, target_index]
    y_val_orig = scaler_y.inverse_transform(y_val)[:, target_index]
    y_test_orig = scaler_y.inverse_transform(y_test)[:, target_index]

    # Create dataframes for the predictions and actual values
    train_df[f'Observed_{target_index}'] = y_train_orig
    train_df[f'Predicted_{target_index}'] = y_train_pred_orig

    val_df[f'Observed_{target_index}'] = y_val_orig
    val_df[f'Predicted_{target_index}'] = y_val_pred_orig

    test_df[f'Observed_{target_index}'] = y_test_orig
    test_df[f'Predicted_{target_index}'] = y_test_pred_orig

    # Create and insert parity plots for train, validation, and test sets
    r2, rmse, mae = compute_stats(target_index, train_df)
    r2_scores.append(r2)
    rmse_scores.append(rmse)
    mae_scores.append(mae)

for target_index in range(y_train.shape[1]):
    for model_type in try_models:
        model = model_type
        model.load_state_dict(torch.load(resolve_path_gdrive(f'{selected_cols[target_index+1]}_{type(model_type).__name__}_model.pth')))
        evaluate(target_index, model)


  model.load_state_dict(torch.load(f'{selected_cols[target_index+1]}_{type(model_type).__name__}_model.pth'))


       Observed_0  Predicted_0
0         3.03390     3.456505
1         2.21883     4.470612
2         3.42129     2.142875
3         6.54626     5.350793
4         2.70455     2.034520
...           ...          ...
20981     5.82050     5.254159
20982     2.90117     1.936822
20983     3.06542     3.045043
20984     2.26019     2.698958
20985     1.91199     2.564694

[20986 rows x 2 columns]


  model.load_state_dict(torch.load(f'{selected_cols[target_index+1]}_{type(model_type).__name__}_model.pth'))


       Observed_0  Predicted_0
0         3.03390     3.538626
1         2.21883     4.486116
2         3.42129     1.889467
3         6.54626     4.863782
4         2.70455     2.309811
...           ...          ...
20981     5.82050     5.852612
20982     2.90117     2.274118
20983     3.06542     2.909958
20984     2.26019     2.358000
20985     1.91199     2.619898

[20986 rows x 2 columns]


  model.load_state_dict(torch.load(f'{selected_cols[target_index+1]}_{type(model_type).__name__}_model.pth'))


       Observed_0  Predicted_0
0         3.03390     3.916124
1         2.21883     4.377955
2         3.42129     2.498786
3         6.54626     5.610939
4         2.70455     1.833646
...           ...          ...
20981     5.82050     5.346223
20982     2.90117     1.533721
20983     3.06542     3.034153
20984     2.26019     2.620820
20985     1.91199     2.391123

[20986 rows x 2 columns]


  model.load_state_dict(torch.load(f'{selected_cols[target_index+1]}_{type(model_type).__name__}_model.pth'))


       Observed_0  Predicted_0  Observed_1  Predicted_1
0         3.03390     3.916124    0.498762     0.462864
1         2.21883     4.377955    0.162431     0.207503
2         3.42129     2.498786    0.547589     0.507337
3         6.54626     5.610939    0.308542     0.278597
4         2.70455     1.833646    0.572807     0.556839
...           ...          ...         ...          ...
20981     5.82050     5.346223    0.342817     0.361397
20982     2.90117     1.533721    0.485546     0.483713
20983     3.06542     3.034153    0.210382     0.189409
20984     2.26019     2.620820    0.336572     0.332911
20985     1.91199     2.391123    0.211747     0.228168

[20986 rows x 4 columns]


  model.load_state_dict(torch.load(f'{selected_cols[target_index+1]}_{type(model_type).__name__}_model.pth'))


       Observed_0  Predicted_0  Observed_1  Predicted_1
0         3.03390     3.916124    0.498762     0.495368
1         2.21883     4.377955    0.162431     0.153368
2         3.42129     2.498786    0.547589     0.534062
3         6.54626     5.610939    0.308542     0.316824
4         2.70455     1.833646    0.572807     0.554161
...           ...          ...         ...          ...
20981     5.82050     5.346223    0.342817     0.355920
20982     2.90117     1.533721    0.485546     0.465189
20983     3.06542     3.034153    0.210382     0.187322
20984     2.26019     2.620820    0.336572     0.303927
20985     1.91199     2.391123    0.211747     0.210101

[20986 rows x 4 columns]


  model.load_state_dict(torch.load(f'{selected_cols[target_index+1]}_{type(model_type).__name__}_model.pth'))


       Observed_0  Predicted_0  Observed_1  Predicted_1
0         3.03390     3.916124    0.498762     0.342581
1         2.21883     4.377955    0.162431     0.201065
2         3.42129     2.498786    0.547589     0.486283
3         6.54626     5.610939    0.308542     0.289868
4         2.70455     1.833646    0.572807     0.567159
...           ...          ...         ...          ...
20981     5.82050     5.346223    0.342817     0.375852
20982     2.90117     1.533721    0.485546     0.447304
20983     3.06542     3.034153    0.210382     0.191891
20984     2.26019     2.620820    0.336572     0.322709
20985     1.91199     2.391123    0.211747     0.235239

[20986 rows x 4 columns]


  model.load_state_dict(torch.load(f'{selected_cols[target_index+1]}_{type(model_type).__name__}_model.pth'))


       Observed_0  Predicted_0  Observed_1  Predicted_1  Observed_2  \
0         3.03390     3.916124    0.498762     0.342581     3.89562   
1         2.21883     4.377955    0.162431     0.201065     7.08420   
2         3.42129     2.498786    0.547589     0.486283     6.47881   
3         6.54626     5.610939    0.308542     0.289868     5.39737   
4         2.70455     1.833646    0.572807     0.567159    11.06172   
...           ...          ...         ...          ...         ...   
20981     5.82050     5.346223    0.342817     0.375852     6.03192   
20982     2.90117     1.533721    0.485546     0.447304     3.37806   
20983     3.06542     3.034153    0.210382     0.191891    11.38933   
20984     2.26019     2.620820    0.336572     0.322709     1.31823   
20985     1.91199     2.391123    0.211747     0.235239     2.01098   

       Predicted_2  
0         3.154392  
1         5.779972  
2         5.212570  
3         9.927725  
4        10.053726  
...            ...  


  model.load_state_dict(torch.load(f'{selected_cols[target_index+1]}_{type(model_type).__name__}_model.pth'))


FileNotFoundError: [Errno 2] No such file or directory: '15_RegressionNetwork_model.pth'

In [None]:
r2_scores, rmse_scores, mae_scores

([0.602973221905207,
  0.5993249085928642,
  0.6117650030599211,
  0.8808057793434411,
  0.9528224754377352,
  0.8022839179780921,
  0.5932465964354097],
 [1.2525446674336702,
  1.2582863778270408,
  1.2385988244759507,
  0.04086779424231646,
  0.02571113055942406,
  0.052634996317358684,
  1.9412134364527274],
 [0.9528549375792478,
  0.9561028144634287,
  0.9415593225120522,
  0.03026810860835579,
  0.01827938131057719,
  0.03904015856283628,
  1.4662363570828618])