In [62]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import pandas as pd

from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

import os
import datetime
import json

In [9]:
# Load the pre-processed MOF table from disk; the path is relative to the
# notebook's working directory.
df = pd.read_csv('../data/processed_MOFs.csv')
# Preview the first rows (rendered as the cell output).
df.head()

Unnamed: 0,MOFname,CO2_uptake_P0.15bar_T298K [mmol/g],volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],largest_free_sphere_diameter [A],largest_included_sphere_diameter [A],metal_linker
0,str_m5_o16_o16_sra_sym.77,5.955197,2473.186302,1493.01184,613.3,0.14835,0.148,4.6137,4.6137,9
1,str_m5_o16_o16_sra_sym.37,5.715251,2419.885159,1444.7968,0.0,0.16099,0.1624,4.1021,4.10055,9
2,str_m5_o13_o18_sra_sym.149,5.524486,2514.627698,1504.68312,0.0,0.1798,0.181,4.24711,4.24603,9
3,str_m5_o2_o18_sra_sym.4,5.517258,2128.61292,1424.81024,0.0,0.10245,0.0922,4.11419,3.63925,9
4,str_m5_o16_o16_sra_sym.31,5.451593,2415.251225,1436.90552,1347.04,0.15634,0.1583,4.37077,4.16451,9


In [13]:
# Drop the MOF identifier: it is a string label, not a numeric feature.
# errors='ignore' keeps this cell idempotent -- re-running it after the column
# is already gone would otherwise raise KeyError.
df = df.drop(columns=['MOFname'], errors='ignore')

In [42]:
# Shuffle the whole table: frac=1 keeps every row, random_state fixes the order.
df_sample = df.sample(frac=1, random_state=42)

# Split the regression target off from the feature columns.
target_column = 'CO2_uptake_P0.15bar_T298K [mmol/g]'
X_df = df_sample.drop(columns=[target_column])
y = df_sample[target_column]

# Standardise every remaining column (StandardScaler's default centring/scaling).
# NOTE(review): the scaler is fitted on all rows, including those that later
# end up in the test split -- consider fitting on the training rows only.
scaler = StandardScaler()
X = scaler.fit_transform(X_df)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [43]:
# Features: the scaled matrix becomes a float32 tensor as-is.
X_tensor = torch.tensor(X, dtype=torch.float32)

# Target: build a flat float32 tensor first, then reshape it into a single
# column so it lines up with the model's one-unit output.
y_flat = torch.tensor(y.values, dtype=torch.float32)
y_tensor = y_flat.view(-1, 1)

# Hold out 20% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_tensor,
    y_tensor,
    test_size=0.2,
    random_state=42,
)

In [53]:
# Number of input features: the column count of the scaled feature matrix X.
input_size = X.shape[1]

# The bare string below is a no-op expression used as a heading for the class
# that follows.
'''
    Multilayer Perceptron
'''
class MLP(nn.Module):
    '''Multilayer perceptron for scalar regression.

    Architecture: Linear(in_features, 64) -> ReLU -> Linear(64, 32) -> ReLU
    -> Linear(32, 1).

    Args:
        in_features: Number of input features. When omitted, the module-level
            ``input_size`` is read once, at construction time, so ``MLP()``
            keeps working as before.
    '''

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: fall back to the notebook global.
            in_features = input_size
        # Remember the size this instance was built with, so __str__ stays
        # correct even if the global is reassigned later.
        self.in_features = in_features
        self.layers = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        '''Run ``x`` through the layer stack and return the prediction.'''
        return self.layers(x)

    def __str__(self):
        '''Return a multi-line summary: title, input size, one line per layer.

        Every line, including the last, is terminated by a newline.
        '''
        model_str = "Multilayer Perceptron\n"
        model_str += "Input size: {}\n".format(self.in_features)
        for layer in self.layers:
            model_str += "{}\n".format(layer)
        return model_str

Multilayer Perceptron
Input size: 8
Linear(in_features=8, out_features=64, bias=True)
ReLU()
Linear(in_features=64, out_features=32, bias=True)
ReLU()
Linear(in_features=32, out_features=1, bias=True)



In [64]:
# Custom Dataset class
class CustomDataset(Dataset):
    '''Pairs a feature container with a target container, row by row.

    Args:
        X: Indexable, sized collection of feature rows.
        y: Indexable, sized collection of targets, one per row of ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` have different lengths.
    '''

    def __init__(self, X, y):
        # Fail fast: a mismatch would otherwise surface only as an IndexError
        # in the middle of iteration, or silently ignore surplus targets.
        if len(X) != len(y):
            raise ValueError(
                'X and y must have the same length, got {} and {}'.format(len(X), len(y))
            )
        self.X = X
        self.y = y

    def __len__(self):
        '''Number of samples in the dataset.'''
        return len(self.X)

    def __getitem__(self, idx):
        '''Return the ``(features, target)`` pair stored at position ``idx``.'''
        return self.X[idx], self.y[idx]

def save_report(data, date, report_dir='../report'):
    '''Write ``data`` as pretty-printed JSON to ``<report_dir>/report_<date>.json``.

    The target directory is created if it does not exist yet. Previously a
    different directory ('../reports') was created than the one written to
    ('../report'), so the write failed unless '../report' already existed.

    Args:
        data: JSON-serialisable object to store.
        date: String embedded in the file name.
        report_dir: Directory that receives the report file.

    Raises:
        OSError: If the directory cannot be created or the file cannot be written.
        TypeError: If ``data`` is not JSON-serialisable.
    '''
    # exist_ok=True makes this a no-op when the directory is already there,
    # while genuine failures (e.g. permissions) still raise instead of being
    # swallowed.
    os.makedirs(report_dir, exist_ok=True)

    filename = os.path.join(report_dir, f'report_{date}.json')
    with open(filename, 'w') as f:
        json.dump(data, f, indent=4)

# Hyperparameters
batch_size = 32
learning_rate = 1e-4
epochs = 10
patience = 2  # Number of epochs to wait for improvement

# Create datasets and dataloaders.
# The earlier loader built on the full X_tensor/y_tensor was removed: it was
# overwritten straight away and, had it been used, would have trained on the
# held-out rows as well.
train_dataset = CustomDataset(X_train, y_train)
test_dataset = CustomDataset(X_test, y_test)
# Only the training batches are reshuffled; evaluation order stays fixed.
trainloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
testloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Fix the RNG seed so weight initialisation and batch shuffling are repeatable
torch.manual_seed(42)

# Model, loss function and optimizer
mlp = MLP()
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=learning_rate)

# Early-stopping bookkeeping
epochs_no_improve = 0          # consecutive epochs without a better test loss
training_loss = 0.0            # mean training loss at the best epoch
best_test_loss = float('inf')  # any finite loss improves on this
best_epoch = 0                 # 1-based, matching the printed epoch numbers; 0 = none yet

# NOTE(review): the test split doubles as the validation set for early
# stopping, so the reported test loss is optimistically biased. The weights of
# the best epoch are not checkpointed either; `mlp` ends in its last state.
for epoch in range(epochs):

    print('Epoch %s' % (epoch+1))

    # --- Training phase ---
    # Switch back to training mode; the evaluation phase below leaves the
    # model in eval mode at the end of every epoch.
    mlp.train()
    current_loss = 0.0

    for inputs, targets in trainloader:
        # Reset gradients accumulated by the previous step
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update
        outputs = mlp(inputs)
        loss = loss_function(outputs, targets)
        loss.backward()
        optimizer.step()

        current_loss += loss.item()

    # Average of the per-batch mean losses
    mean_loss = current_loss / len(trainloader)

    # --- Testing phase ---
    mlp.eval()
    test_loss = 0.0
    with torch.no_grad():
        for inputs, targets in testloader:
            outputs = mlp(inputs)
            loss = loss_function(outputs, targets)
            test_loss += loss.item()

    mean_test_loss = test_loss / len(testloader)

    print('Training loss: %.3f, Testing loss: %.3f' % (mean_loss, mean_test_loss))

    # --- Early stopping ---
    if mean_test_loss < best_test_loss:
        best_test_loss = mean_test_loss
        best_epoch = epoch + 1
        training_loss = mean_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print('Early stopping: no improvement for %s epochs' % patience)
            break


# Timestamp used both inside the report and in its file name
date = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

# Summary of the run: the best epoch recorded by the training loop, the losses
# at that epoch, and the model description as a list of lines.
data = {
        'epochs': best_epoch,
        'training_loss': training_loss,
        'testing_loss': best_test_loss,
        'date': date,
        # splitlines() instead of split('\n'): the model string ends with a
        # newline, which used to add a trailing empty entry to the list.
        'architecture': str(mlp).splitlines()
}

save_report(data, date)

# End
print('Training finished!')

Epoch 1
Training loss: 0.193, Testing loss: 0.164
Epoch 2
Training loss: 0.153, Testing loss: 0.139
Epoch 3
Training loss: 0.134, Testing loss: 0.125
Epoch 4
Training loss: 0.123, Testing loss: 0.118
Epoch 5
Training loss: 0.117, Testing loss: 0.112
Epoch 6
Training loss: 0.112, Testing loss: 0.108
Epoch 7
Training loss: 0.110, Testing loss: 0.106
Epoch 8
Training loss: 0.108, Testing loss: 0.105
Epoch 9
Training loss: 0.106, Testing loss: 0.103
Epoch 10
Training loss: 0.105, Testing loss: 0.101
Training finished!
