In [2]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from psmiles import PolymerSmiles as PS
from sklearn.metrics  import  mean_squared_error
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F 
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
from optuna.trial import TrialState 

random_seed = 123
torch.manual_seed(random_seed)
np.random.seed(random_seed)

DEVICE = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

In [4]:
df = pd.read_pickle("../data/updated_polymers.pth")

In [5]:
scalar = MinMaxScaler()
data = df["fingerprint_polyBERT"]
target = df["Egc"]

#data = data.values.reshape(-1, 1)  # Reshape data

data_train, data_test, target_train, target_test = train_test_split(data, target, test_size=0.2, random_state=123)

# Scaling target variable
target_train = scalar.fit_transform(target_train.values.reshape(-1, 1))
target_test = scalar.transform(target_test.values.reshape(-1, 1))

# Creating tensors from data

#Training Data
data_train_tensor = torch.tensor(data_train.reset_index(drop = True), dtype=torch.float32)
target_train_tensor = torch.tensor(target_train, dtype=torch.float32)

train_dataset = TensorDataset(data_train_tensor, target_train_tensor)
train_loader = DataLoader(train_dataset, batch_size= 32, shuffle= True)

#Testing Data

data_test_tensor = torch.tensor(data_test.reset_index(drop= True), dtype=torch.float32)
target_test_tensor = torch.tensor(target_test, dtype=torch.float32)

test_dataset = TensorDataset(data_test_tensor, target_test_tensor)
test_loader = DataLoader(test_dataset, shuffle= False)

  data_train_tensor = torch.tensor(data_train.reset_index(drop = True), dtype=torch.float32)


In [6]:
def net():
    layers =  nn.ModuleList([
           nn.Sequential(
                nn.Linear(600, 1504),
                nn.Dropout(0.122517721),
                nn.PReLU()
            ),
            nn.Sequential(
                nn.Linear(1504, 1760),
                nn.Dropout(0.125659318),
                nn.PReLU()
            ),
            nn.Sequential(
                nn.Linear(1760, 736),
                nn.Dropout(0.125674157),
                nn.PReLU()
            )
        ])

    return nn.Sequential(*layers)


In [7]:
state_dict = torch.load('../models/molecule_polyBERT.pth')
state_dict.pop("my_layers.3.weight")
state_dict.pop("my_layers.3.bias")

OrderedDict([('my_layers.0.0.weight', tensor([[-0.0063, -0.0108,  0.1236,  ...,  0.0241,  0.1130,  0.0365],
        [ 0.0888, -0.0701,  0.1512,  ..., -0.0178,  0.0828, -0.0632],
        [ 0.0075,  0.0749, -0.1049,  ...,  0.0371,  0.0677, -0.0243],
        ...,
        [ 0.0657,  0.0548,  0.0581,  ...,  0.0889,  0.0037,  0.0107],
        [-0.1599, -0.0155, -0.0027,  ...,  0.0125, -0.0211, -0.1373],
        [-0.2703, -0.0034, -0.0578,  ...,  0.0481,  0.0143, -0.0188]],
       device='cuda:3')), ('my_layers.0.0.bias', tensor([-0.0520, -0.0220, -0.0389,  ..., -0.0527, -0.0120, -0.0783],
       device='cuda:3')), ('my_layers.0.2.weight', tensor([0.0144], device='cuda:3')), ('my_layers.1.0.weight', tensor([[ 0.0419, -0.0316,  0.1388,  ..., -0.0146, -0.1075, -0.0230],
        [-0.0773, -0.0028,  0.0360,  ...,  0.0285, -0.1143, -0.0702],
        [-0.2174,  0.0222, -0.0673,  ..., -0.0390, -0.0597, -0.0100],
        ...,
        [-0.2156, -0.0982, -0.0056,  ..., -0.0293, -0.0807, -0.0047],
     

In [None]:
def define_model(trial):
    n_layers = trial.suggest_int("n_layers", 1, 3)
    layers = net()
    layers.load_state_dict(state_dict, strict=False)


    for name, param in layers.named_parameters():
        if '0.0' in name or '0.2' in name or '1.0' in name or '1.2' in name or '2.0' in name or '2.2' in name:
            param.requires_grad = False

    in_features = 736
    for i in range(n_layers):
        out_features = trial.suggest_int("n_units_l{}".format(i), 352, 2144, step=64, log = False)
        p = trial.suggest_float("dropout_l{}".format(i), 0.1, 0.5, log = True)

        new_step =[
            nn.Linear(in_features, out_features),
            nn.Dropout(p),
            nn.PReLU()
            ]
        layers.append(nn.Sequential(*new_step))
       
        in_features = out_features
    layers.append(nn.Linear(in_features, 1))

    return nn.Sequential(*layers)

In [None]:
def objective(trial):

    # Generate the model.
    
    model = define_model(trial)    
    model.to(DEVICE)
  
    # Generate the optimizers.
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log = True)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    EPOCHS  = trial.suggest_int("EPOCHS", 50, 700)

    for epoch in range(EPOCHS):
        model.train()
        running_loss = 0.0
        for batch_idx,(data, target) in enumerate(train_loader):
            data, target = data.to(DEVICE), target.view(-1).to(DEVICE)
            optimizer.zero_grad()
            output = model(data)
            loss = F.mse_loss(output.view(-1), target)
            running_loss += loss.item()
            loss.backward()
            optimizer.step()
        
        epoch_loss = running_loss / batch_idx

        # Validation of the model.
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(test_loader):
                # Limiting validation data.

                data, target = data.to(DEVICE), target.to(DEVICE)
                pred = model(data)
                target_scaled = scalar.inverse_transform(target.cpu().numpy())
                pred_scaled = scalar.inverse_transform(pred.cpu().detach().numpy())

                test_loss = mean_squared_error(target_scaled, pred_scaled)
                val_loss  += test_loss

        avg_val_loss = val_loss/ batch_idx 

        trial.report(avg_val_loss, epoch)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return avg_val_loss

In [None]:
if __name__ == "__main__":
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials = 25000, n_jobs= 10)

    pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
    complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

    print("Study statistics: ")
    print("  Number of finished trials: ", len(study.trials))
    print("  Number of pruned trials: ", len(pruned_trials))
    print("  Number of complete trials: ", len(complete_trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    # Save results to csv file
    df = study.trials_dataframe().drop(['datetime_start', 'datetime_complete', 'duration'], axis=1)  # Exclude columns
    df = df.loc[df['state'] == 'COMPLETE']        # Keep only results that did not prune
    df = df.drop('state', axis=1)                 # Exclude state column
    df = df.sort_values('value')                  # Sort based on accuracy
    df.to_csv('op_add.csv', index=False)  # Save to csv file

    # Display results in a dataframe
    print("\nOverall Results (ordered by accuracy):\n {}".format(df))

    # Find the most important hyperparameters
    most_important_parameters = optuna.importance.get_param_importances(study, target=None)

    # Display the most important hyperparameters
    print('\nMost important hyperparameters:')
    for key, value in most_important_parameters.items():
        print('  {}:{}{:.2f}%'.format(key, (15-len(key))*' ', value*100))