In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from psmiles import PolymerSmiles as PS
from sklearn.metrics  import  mean_squared_error
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F 
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
from optuna.trial import TrialState 

# Seed the torch and NumPy global RNGs so weight initialisation and
# DataLoader shuffling are repeatable across runs.
random_seed = 123
torch.manual_seed(random_seed)
np.random.seed(random_seed)

# The notebook targets GPU index 2. If CUDA is available but the host exposes
# fewer GPUs than that, fall back to the first GPU instead of failing later
# with an "invalid device ordinal" error; without CUDA, run on the CPU.
PREFERRED_GPU_INDEX = 2
if torch.cuda.is_available():
    if torch.cuda.device_count() > PREFERRED_GPU_INDEX:
        DEVICE = torch.device("cuda", PREFERRED_GPU_INDEX)
    else:
        DEVICE = torch.device("cuda", 0)
else:
    DEVICE = torch.device("cpu")

In [None]:
# Polymer table; later cells read its "fingerprint_polyBERT" and "Egc" columns.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
polymer_table_path = "../data/updated_polymers.pth"
df = pd.read_pickle(polymer_table_path)

In [None]:
# Min-max scaler for the regression target. The name `scalar` is kept because
# the objective function uses it to undo the scaling during validation.
scalar = MinMaxScaler()
data = df["fingerprint_polyBERT"]
target = df["Egc"]

# Hold out 20% of the rows; reuse the notebook-wide seed for a repeatable split.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, test_size=0.2, random_state=random_seed
)

# Fit the scaler on the training targets only and merely apply it to the test
# targets, so the held-out values do not influence the scaling range.
target_train = scalar.fit_transform(target_train.values.reshape(-1, 1))
target_test = scalar.transform(target_test.values.reshape(-1, 1))


def _fingerprints_to_tensor(fingerprints):
    """Stack a Series of per-row fingerprint vectors into one float32 tensor.

    Stacking with NumPy first avoids handing the Series itself to
    ``torch.tensor``, which walks it element by element (slow, and only works
    when the index is 0..n-1).

    Assumes every entry is an array-like of the same length - TODO confirm
    against the pickled table.
    """
    return torch.tensor(np.stack(fingerprints.tolist()), dtype=torch.float32)


# Training data: shuffled mini-batches of 32.
data_train_tensor = _fingerprints_to_tensor(data_train)
target_train_tensor = torch.tensor(target_train, dtype=torch.float32)

train_dataset = TensorDataset(data_train_tensor, target_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Testing data: fixed order. The DataLoader default batch size of 1 is made
# explicit here, so validation runs one sample per batch.
data_test_tensor = _fingerprints_to_tensor(data_test)
target_test_tensor = torch.tensor(target_test, dtype=torch.float32)

test_dataset = TensorDataset(data_test_tensor, target_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
def define_model(trial):
    """Build the fixed regression MLP: 600 -> 1504 -> 1760 -> 736 -> 1.

    Each hidden stage is ``Linear -> Dropout -> PReLU`` wrapped in its own
    ``nn.Sequential``, followed by a plain ``Linear`` output layer, so the
    parameter names are ``"0.0.weight"``, ``"0.2.weight"``, ..., ``"3.bias"``.

    Args:
        trial: Accepted for an Optuna-style signature; not used by this function.

    Returns:
        nn.Sequential: the assembled network.
    """
    # (in_features, out_features, dropout probability) for each hidden stage.
    hidden_specs = (
        (600, 1504, 0.122517721),
        (1504, 1760, 0.125659318),
        (1760, 736, 0.125674157),
    )

    stages = []
    for n_in, n_out, drop_p in hidden_specs:
        stage = nn.Sequential(
            nn.Linear(n_in, n_out),
            nn.Dropout(drop_p),
            nn.PReLU(),
        )
        stages.append(stage)

    # Single-output regression head.
    stages.append(nn.Linear(736, 1))

    return nn.Sequential(*stages)

In [None]:
def objective(trial):
    """Fine-tune every layer of the pretrained network and return validation MSE.

    NOTE: a later cell defines another ``objective`` (the layer-freezing
    variant); when the notebook is run top to bottom that definition replaces
    this one.

    Args:
        trial: Optuna trial. Supplies ``lr`` (log-uniform in
            [1e-5, 0.00020108]) and ``EPOCHS`` (integer in [50, 700]) and
            receives the per-epoch validation loss for pruning.

    Returns:
        float: mean squared error over the whole test loader after the final
        epoch, computed on the inverse-scaled (original) target values.

    Raises:
        optuna.exceptions.TrialPruned: if the pruner stops the trial early.
    """
    # Start from the pretrained weights. map_location="cpu" lets a checkpoint
    # saved on a GPU load on any host; the model is moved to DEVICE afterwards.
    # strict=False ignores missing/unexpected keys, so a mismatched checkpoint
    # would go unnoticed - TODO(review): confirm the checkpoint keys match.
    model = define_model(trial)
    state_dict = torch.load('../models/molecule_circular.pth', map_location="cpu")
    model.load_state_dict(state_dict, strict=False)
    model = model.to(DEVICE)

    # Hyperparameters sampled by Optuna.
    lr = trial.suggest_float("lr", 1e-5, 0.00020108, log=True)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    EPOCHS = trial.suggest_int("EPOCHS", 50, 700)

    for epoch in range(EPOCHS):
        # One pass over the training data, optimising MSE on the scaled target.
        model.train()
        for data, target in train_loader:
            data, target = data.to(DEVICE), target.view(-1).to(DEVICE)
            optimizer.zero_grad()
            output = model(data)
            loss = F.mse_loss(output.view(-1), target)
            loss.backward()
            optimizer.step()

        # Validation: gather every prediction, undo the target scaling once,
        # and compute a single MSE over all test samples. This avoids dividing
        # by the last zero-based batch index (which under-counts the batches by
        # one) and does not depend on the test loader's batch size.
        model.eval()
        batch_preds = []
        batch_targets = []
        with torch.no_grad():
            for data, target in test_loader:
                batch_preds.append(model(data.to(DEVICE)).cpu())
                batch_targets.append(target)

        pred_unscaled = scalar.inverse_transform(torch.cat(batch_preds).numpy())
        target_unscaled = scalar.inverse_transform(torch.cat(batch_targets).numpy())
        avg_val_loss = mean_squared_error(target_unscaled, pred_unscaled)

        trial.report(avg_val_loss, epoch)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return avg_val_loss

In [None]:
def objective(trial):
    """Fine-tune the pretrained network with its first two blocks frozen.

    This definition shadows the ``objective`` from the previous cell, so it is
    the one the study optimises when the notebook is run top to bottom.

    Args:
        trial: Optuna trial. Supplies ``lr`` (log-uniform in [1e-5, 1e-1]) and
            ``EPOCHS`` (integer in [50, 700]) and receives the per-epoch
            validation loss for pruning.

    Returns:
        float: mean squared error over the whole test loader after the final
        epoch, computed on the inverse-scaled (original) target values.

    Raises:
        optuna.exceptions.TrialPruned: if the pruner stops the trial early.
    """
    # Start from the pretrained weights. map_location="cpu" lets a checkpoint
    # saved on a GPU load on any host; the model is moved to DEVICE afterwards.
    # strict=False ignores missing/unexpected keys, so a mismatched checkpoint
    # would go unnoticed - TODO(review): confirm the checkpoint keys match.
    model = define_model(trial)
    state_dict = torch.load('../models/molecule_circular.pth', map_location="cpu")
    model.load_state_dict(state_dict, strict=False)
    model = model.to(DEVICE)

    # Freeze the first two hidden blocks. Their parameters are exactly the ones
    # named "0.0.*", "0.2.*", "1.0.*" and "1.2.*", i.e. every name starting
    # with "0." or "1."; matching on the prefix cannot accidentally hit a
    # deeper layer the way a substring test could.
    frozen_prefixes = ("0.", "1.")
    for name, param in model.named_parameters():
        if name.startswith(frozen_prefixes):
            param.requires_grad = False

    # Hyperparameters sampled by Optuna. Only the trainable parameters are
    # handed to the optimizer; the frozen ones never receive gradients.
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(trainable_params, lr=lr)

    EPOCHS = trial.suggest_int("EPOCHS", 50, 700)

    for epoch in range(EPOCHS):
        # One pass over the training data, optimising MSE on the scaled target.
        model.train()
        for data, target in train_loader:
            data, target = data.to(DEVICE), target.view(-1).to(DEVICE)
            optimizer.zero_grad()
            output = model(data)
            loss = F.mse_loss(output.view(-1), target)
            loss.backward()
            optimizer.step()

        # Validation: gather every prediction, undo the target scaling once,
        # and compute a single MSE over all test samples. This avoids dividing
        # by the last zero-based batch index (which under-counts the batches by
        # one) and does not depend on the test loader's batch size.
        model.eval()
        batch_preds = []
        batch_targets = []
        with torch.no_grad():
            for data, target in test_loader:
                batch_preds.append(model(data.to(DEVICE)).cpu())
                batch_targets.append(target)

        pred_unscaled = scalar.inverse_transform(torch.cat(batch_preds).numpy())
        target_unscaled = scalar.inverse_transform(torch.cat(batch_targets).numpy())
        avg_val_loss = mean_squared_error(target_unscaled, pred_unscaled)

        trial.report(avg_val_loss, epoch)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return avg_val_loss

In [None]:
if __name__ == "__main__":
    # Run the hyperparameter search: minimise the validation MSE returned by
    # `objective`, with 5 trials running concurrently.
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=12000, n_jobs=5)

    pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
    complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

    print("Study statistics: ")
    print("  Number of finished trials: ", len(study.trials))
    print("  Number of pruned trials: ", len(pruned_trials))
    print("  Number of complete trials: ", len(complete_trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    # Save results to a csv file. A dedicated name is used so the polymer
    # table held in `df` is not overwritten and the data cells can be re-run.
    results_df = (
        study.trials_dataframe()
        .drop(['datetime_start', 'datetime_complete', 'duration'], axis=1)  # Exclude timing columns
        .loc[lambda frame: frame['state'] == 'COMPLETE']                    # Keep only trials that were not pruned
        .drop('state', axis=1)                                              # Exclude state column
        .sort_values('value')                                               # Lowest validation MSE first
    )
    results_df.to_csv('op_2_freeze.csv', index=False)

    # Display results; the objective value is a validation MSE, not an accuracy.
    print("\nOverall Results (ordered by validation MSE):\n {}".format(results_df))

    # Find the most important hyperparameters
    most_important_parameters = optuna.importance.get_param_importances(study, target=None)

    # Display the most important hyperparameters, with names padded into a column
    print('\nMost important hyperparameters:')
    for key, value in most_important_parameters.items():
        print('  {:<16}{:.2f}%'.format(key + ':', value * 100))