In [1]:
import wandb, os, sys

IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    print("Running in Colab!")
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)
    from google.colab import userdata
    WANDB_KEY = userdata.get('WANDB_KEY')
    wandb.login(key=WANDB_KEY)
else:
    print("Not running in Colab.")

def resolve_path_gdrive(relativePath):
    if os.path.exists('/content/drive'):
        return '/content/drive/MyDrive/work/gdrive-workspaces/git/nn_catalyst/' + relativePath
    else:
        from utils import get_project_root
        return get_project_root() + "/" + relativePath

wandb.init(project="nn_catalyst")

Not running in Colab.


wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: nirbaanm (nirb_ds). Use `wandb login --relogin` to force relogin


In [2]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.optim import AdamW

# Load the data
descriptors_path = 'descriptors.csv'
targets_path = 'compiled_data.csv'

In [3]:
descriptors_df = pd.read_csv(resolve_path_gdrive(descriptors_path))
targets_df = pd.read_csv(resolve_path_gdrive(targets_path))

  descriptors_df = pd.read_csv(resolve_path_gdrive(descriptors_path))


In [4]:
# Show sample rows
print("\nSample Rows from Descriptors DataFrame:")
print(descriptors_df.head())
print("\nSample Rows from Targets DataFrame:")
print(targets_df.head())


Sample Rows from Descriptors DataFrame:
   Label        ABC     ABCGG  nAcid  nBase             SpAbs_A  \
0   9268   4.719397  5.004088      0      0   6.720566232730447   
1  10488  10.334062  9.836417      0      0  16.752497538971177   
2  25579   5.875634  5.566041      0      0    9.43114762028933   
3   8952   6.611250  6.890735      1      0   10.68725972618713   
4  23681   7.249407  6.976306      0      0  11.945821561028193   

              SpMax_A           SpDiam_A              SpAD_A  \
0  2.1010029896154583  4.202005979230917   6.720566232730447   
1  2.3623398328574394  4.724679665714879  16.752497538971177   
2  2.1753277471610764  4.350655494322151    9.43114762028933   
3    2.28774942353935  4.425414875225794   10.68725972618713   
4  2.2671838628844996     4.534367725769  11.945821561028193   

              SpMAD_A  ...     SRW10     TSRW10          MW        AMW WPath  \
0  0.9600808903900638  ...  8.123558  33.343946  136.047505   6.802375    46   
1  1.196606

In [5]:
# selected column
selected_cols=[5, 7]
number_of_target_cols = len(selected_cols)
selected_cols.insert(0, 0)
targets_df = targets_df.iloc[:, selected_cols]
print(targets_df)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
wandb.log({"target cols": selected_cols})

       mol_num  dipole_n    elec_en_n
0            1   4.63917 -1300.941844
1            4   0.00005 -1744.181119
2           11   3.31424 -2467.453515
3           12   5.84638 -2899.436209
4           13   4.96208 -2750.344206
...        ...       ...          ...
26228    34242   4.16353 -1379.200764
26229    34243   5.55982  -901.076720
26230    34244   6.29952  -901.092143
26231    34245   3.52367  -817.351471
26232    34246   5.81051  -761.984410

[26233 rows x 3 columns]
cuda


In [6]:
# Keep only numeric columns
descriptors_numeric = descriptors_df.select_dtypes(include=['number'])
targets_numeric = targets_df.select_dtypes(include=['number'])

# Merge the numeric dataframes on the common label column
numeric_data = pd.merge(descriptors_numeric, targets_numeric, left_on='Label', right_on='mol_num')
numeric_data = numeric_data.drop(columns=['Label', 'mol_num'])

# Separate features and targets
X = numeric_data.iloc[:, :-number_of_target_cols]  # Assuming the last 30 columns are targets
y = numeric_data.iloc[:, -number_of_target_cols:]

In [7]:
# Apply variance threshold
selector = VarianceThreshold()
X_high_variance = selector.fit_transform(X)

# Convert to numpy arrays
X = X_high_variance
y = y.values

# Split the data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Standardize the data
scaler_X = StandardScaler().fit(X_train)
scaler_y = StandardScaler().fit(y_train)

X_train = scaler_X.transform(X_train)
X_val = scaler_X.transform(X_val)
X_test = scaler_X.transform(X_test)

y_train = scaler_y.transform(y_train)
y_val = scaler_y.transform(y_val)
y_test = scaler_y.transform(y_test)

# Convert the data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32, device=device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32, device=device)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32, device=device)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32, device=device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32, device=device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32, device=device)

In [8]:
# Create DataLoader for batch processing
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define the individual model class
class SingleTargetNet(nn.Module):
    def __init__(self, input_size, dropout_rate=0.5):
        super(SingleTargetNet, self).__init__()
        self.fc1 = nn.Linear(input_size, 1024)
        self.bn1 = nn.BatchNorm1d(1024)
        self.fc2 = nn.Linear(1024, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.fc3 = nn.Linear(512, 1)
        self.fc_skip = nn.Linear(1024, 512)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        x1 = F.relu(self.bn1(self.fc1(x)))
        x1 = self.dropout(x1)

        x2 = F.relu(self.bn2(self.fc2(x1)))
        x2 = self.dropout(x2)

        # Skip connection
        x2 += self.fc_skip(x1)

        x3 = self.fc3(x2)
        return x3

def get_target5model():
    # Define the model
    model = nn.Sequential(
        nn.Linear(X_train.shape[1], 1024),
        nn.LeakyReLU(),
        nn.Linear(1024, 512),
        nn.LeakyReLU(),
        nn.Linear(512, 256),
        nn.LeakyReLU(),
        nn.Linear(256, 1)
    )
    return model

class RegressionNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(RegressionNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.relu = nn.LeakyReLU()
        self.dropout = nn.Dropout(0.1)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.bn2 = nn.BatchNorm1d(hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        out = self.fc1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out

In [9]:
# Function to train and evaluate individual models
def train_and_evaluate(target_index, model):
    model.to(device=device)
    criterion = nn.MSELoss()
    optimizer = AdamW(model.parameters(), lr=0.001)
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=True)

    best_val_loss = np.inf
    patience_counter = 0
    num_epochs = 150

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, targets[:, target_index])
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f'Target {target_index} - Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}')

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, targets in val_loader:
                outputs = model(inputs).squeeze()
                loss = criterion(outputs, targets[:, target_index])
                val_loss += loss.item()
        val_loss /= len(val_loader)
        print(f'Target {target_index} - Validation Loss: {val_loss}')

        scheduler.step(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), f'saved_models/{selected_cols[target_index+1]}_{type(model).__name__}_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= 15:
                print(f'Target {target_index} - Early stopping triggered')
                break

    model.load_state_dict(torch.load(f'saved_models/{selected_cols[target_index+1]}_{type(model).__name__}_model.pth'))

    model.eval()
    test_loss = 0.0
    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, targets[:, target_index])
            test_loss += loss.item()
    test_loss /= len(test_loader)
    print(f'Target {target_index} - Test Loss: {test_loss}')
    
    return model, test_loss

In [10]:
# Train and evaluate individual models for each target
test_losses = []
models = []
try_models = [SingleTargetNet(X_train.shape[1]), RegressionNetwork(X_train.shape[1], 512, 1), get_target5model()]

for target_index in range(y_train.shape[1]):
    for a_model in try_models:
        model, test_loss = train_and_evaluate(target_index, a_model)
        models.append(model)
        test_losses.append(test_loss)



Target 0 - Epoch 1/150, Loss: 0.8821764745090793
Target 0 - Validation Loss: 0.7255999791186031
Target 0 - Epoch 2/150, Loss: 0.7133631842528901
Target 0 - Validation Loss: 0.6440560901310386
Target 0 - Epoch 3/150, Loss: 0.6900039881740402
Target 0 - Validation Loss: 0.6864639480666417
Target 0 - Epoch 4/150, Loss: 0.6833070060238242
Target 0 - Validation Loss: 0.5836994996884974
Target 0 - Epoch 5/150, Loss: 0.6714070749173804
Target 0 - Validation Loss: 0.6238774381032804
Target 0 - Epoch 6/150, Loss: 0.6636665392121891
Target 0 - Validation Loss: 0.613833940610653
Target 0 - Epoch 7/150, Loss: 0.6513672450602781
Target 0 - Validation Loss: 0.5813338414924901
Target 0 - Epoch 8/150, Loss: 0.6444684184815098
Target 0 - Validation Loss: 0.6024143201548878
Target 0 - Epoch 9/150, Loss: 0.638673263147655
Target 0 - Validation Loss: 0.6006857180377332
Target 0 - Epoch 10/150, Loss: 0.6321541483445865
Target 0 - Validation Loss: 0.6134585680031195
Target 0 - Epoch 11/150, Loss: 0.62195393

  model.load_state_dict(torch.load(f'saved_models/{selected_cols[target_index+1]}_{type(model).__name__}_model.pth'))


Target 0 - Test Loss: 0.5306489605729173
Target 0 - Epoch 1/150, Loss: 0.7417657053461526
Target 0 - Validation Loss: 0.6199385311181952
Target 0 - Epoch 2/150, Loss: 0.6546583236081571
Target 0 - Validation Loss: 0.6047693999802194
Target 0 - Epoch 3/150, Loss: 0.6139706583784484
Target 0 - Validation Loss: 0.5779262301398487
Target 0 - Epoch 4/150, Loss: 0.5963613324594207
Target 0 - Validation Loss: 0.559201775527582
Target 0 - Epoch 5/150, Loss: 0.5848870467758034
Target 0 - Validation Loss: 0.5636098624729529
Target 0 - Epoch 6/150, Loss: 0.5677061638136099
Target 0 - Validation Loss: 0.5555103123188019
Target 0 - Epoch 7/150, Loss: 0.5578996888475447
Target 0 - Validation Loss: 0.6001360997921084
Target 0 - Epoch 8/150, Loss: 0.5452391363271489
Target 0 - Validation Loss: 0.549716302534429
Target 0 - Epoch 9/150, Loss: 0.5344898663461208
Target 0 - Validation Loss: 0.5414270446067904
Target 0 - Epoch 10/150, Loss: 0.5216061820889392
Target 0 - Validation Loss: 0.5564445420736219


In [33]:
# Prepare DataFrames for train, validation, and test predictions
train_df = pd.DataFrame()
val_df = pd.DataFrame()
test_df = pd.DataFrame()

r2_scores, rmse_scores, mae_scores = [], [], []

def compute_stats(target_index, df):
    print(df)
    observed_col = f'Observed_{target_index}'
    predicted_col = f'Predicted_{target_index}'

    # Calculate metrics
    observed = df[observed_col]
    predicted = df[predicted_col]
    r2 = r2_score(observed, predicted)
    rmse = mean_squared_error(observed, predicted, squared=False)
    mae = mean_absolute_error(observed, predicted)
    return r2, rmse, mae

In [34]:
def evaluate(target_index, model):
    # Make predictions on the train, validation, and test sets
    model.cpu().eval()
    with torch.no_grad():
        y_train_pred = model(X_train_tensor.cpu()).numpy()
        y_val_pred = model(X_val_tensor.cpu()).numpy()
        y_test_pred = model(X_test_tensor.cpu()).numpy()

    # Inverse transform the predictions and targets to their original scale
    y_train_pred_orig = scaler_y.inverse_transform(np.concatenate([np.zeros((y_train_pred.shape[0], target_index)), y_train_pred, np.zeros((y_train_pred.shape[0], y_train.shape[1] - target_index - 1))], axis=1))[:, target_index]
    y_val_pred_orig = scaler_y.inverse_transform(np.concatenate([np.zeros((y_val_pred.shape[0], target_index)), y_val_pred, np.zeros((y_val_pred.shape[0], y_val.shape[1] - target_index - 1))], axis=1))[:, target_index]
    y_test_pred_orig = scaler_y.inverse_transform(np.concatenate([np.zeros((y_test_pred.shape[0], target_index)), y_test_pred, np.zeros((y_test_pred.shape[0], y_test.shape[1] - target_index - 1))], axis=1))[:, target_index]

    y_train_orig = scaler_y.inverse_transform(y_train)[:, target_index]
    y_val_orig = scaler_y.inverse_transform(y_val)[:, target_index]
    y_test_orig = scaler_y.inverse_transform(y_test)[:, target_index]

    # Create dataframes for the predictions and actual values
    train_df[f'Observed_{target_index}'] = y_train_orig
    train_df[f'Predicted_{target_index}'] = y_train_pred_orig

    val_df[f'Observed_{target_index}'] = y_val_orig
    val_df[f'Predicted_{target_index}'] = y_val_pred_orig

    test_df[f'Observed_{target_index}'] = y_test_orig
    test_df[f'Predicted_{target_index}'] = y_test_pred_orig

    # Create and insert parity plots for train, validation, and test sets
    r2, rmse, mae = compute_stats(target_index, train_df)
    r2_scores.append(r2)
    rmse_scores.append(rmse)
    mae_scores.append(mae)
    
for target_index in range(y_train.shape[1]):
    for model_type in try_models:
        model = model_type
        model.load_state_dict(torch.load(f'saved_models/{selected_cols[target_index+1]}_{type(model_type).__name__}_model.pth'))
        evaluate(target_index, model)


  model.load_state_dict(torch.load(f'saved_models/{selected_cols[target_index+1]}_{type(model_type).__name__}_model.pth'))


       Observed_0  Predicted_0
0         3.03390     3.082591
1         2.21883     4.235621
2         3.42129     2.422284
3         6.54626     5.230917
4         2.70455     1.754217
...           ...          ...
20981     5.82050     5.689668
20982     2.90117     1.622211
20983     3.06542     2.767232
20984     2.26019     3.166793
20985     1.91199     2.596315

[20986 rows x 2 columns]


  model.load_state_dict(torch.load(f'saved_models/{selected_cols[target_index+1]}_{type(model_type).__name__}_model.pth'))


       Observed_0  Predicted_0
0         3.03390     2.975591
1         2.21883     4.135766
2         3.42129     2.188055
3         6.54626     5.103771
4         2.70455     2.369534
...           ...          ...
20981     5.82050     5.755203
20982     2.90117     2.495946
20983     3.06542     2.669476
20984     2.26019     3.083359
20985     1.91199     3.444121

[20986 rows x 2 columns]


  model.load_state_dict(torch.load(f'saved_models/{selected_cols[target_index+1]}_{type(model_type).__name__}_model.pth'))


       Observed_0  Predicted_0
0         3.03390     3.771074
1         2.21883     4.556002
2         3.42129     2.045143
3         6.54626     4.579337
4         2.70455     2.149337
...           ...          ...
20981     5.82050     5.412588
20982     2.90117     1.299581
20983     3.06542     3.274291
20984     2.26019     2.644899
20985     1.91199     2.654082

[20986 rows x 2 columns]


  model.load_state_dict(torch.load(f'saved_models/{selected_cols[target_index+1]}_{type(model_type).__name__}_model.pth'))


       Observed_0  Predicted_0   Observed_1  Predicted_1
0         3.03390     3.771074  -790.209081  -718.851011
1         2.21883     4.556002 -1743.254888 -1664.259431
2         3.42129     2.045143  -773.595452  -726.539389
3         6.54626     4.579337 -1858.483568 -1689.650977
4         2.70455     2.149337 -1007.672410 -1000.322250
...           ...          ...          ...          ...
20981     5.82050     5.412588  -731.087227  -704.388694
20982     2.90117     1.299581  -789.237794  -678.949675
20983     3.06542     3.274291 -1249.796038 -1182.092716
20984     2.26019     2.644899 -4151.577552 -4070.493783
20985     1.91199     2.654082 -1156.391383 -1063.690205

[20986 rows x 4 columns]


  model.load_state_dict(torch.load(f'saved_models/{selected_cols[target_index+1]}_{type(model_type).__name__}_model.pth'))


       Observed_0  Predicted_0   Observed_1  Predicted_1
0         3.03390     3.771074  -790.209081  -794.182216
1         2.21883     4.556002 -1743.254888 -1739.980583
2         3.42129     2.045143  -773.595452  -769.195854
3         6.54626     4.579337 -1858.483568 -1804.832486
4         2.70455     2.149337 -1007.672410 -1032.352433
...           ...          ...          ...          ...
20981     5.82050     5.412588  -731.087227  -733.800019
20982     2.90117     1.299581  -789.237794  -709.479636
20983     3.06542     3.274291 -1249.796038 -1250.247578
20984     2.26019     2.644899 -4151.577552 -4170.933592
20985     1.91199     2.654082 -1156.391383 -1182.890387

[20986 rows x 4 columns]


  model.load_state_dict(torch.load(f'saved_models/{selected_cols[target_index+1]}_{type(model_type).__name__}_model.pth'))


       Observed_0  Predicted_0   Observed_1  Predicted_1
0         3.03390     3.771074  -790.209081  -781.549119
1         2.21883     4.556002 -1743.254888 -1713.002194
2         3.42129     2.045143  -773.595452  -761.440171
3         6.54626     4.579337 -1858.483568 -1851.134158
4         2.70455     2.149337 -1007.672410  -983.590770
...           ...          ...          ...          ...
20981     5.82050     5.412588  -731.087227  -705.291108
20982     2.90117     1.299581  -789.237794  -757.857467
20983     3.06542     3.274291 -1249.796038 -1245.888090
20984     2.26019     2.644899 -4151.577552 -4189.060914
20985     1.91199     2.654082 -1156.391383 -1195.527941

[20986 rows x 4 columns]




In [32]:
r2_scores, rmse_scores, mae_scores

([0.6108800244422459,
  0.5940334074944069,
  0.5631737512319985,
  0.983502063575177,
  0.9961070319157642,
  0.9983349217729433],
 [1.2400097090409756,
  1.2665678819259982,
  1.3138254980827275,
  148.91221789992912,
  72.33624170273482,
  47.30780959253169],
 [0.9443831553990804,
  0.9582889612514778,
  0.9963417113187762,
  75.67355126224288,
  28.59861938719397,
  24.27464854855651])