In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline

In [2]:
import multiprocessing as mp
max_cpus = mp.cpu_count()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
def load_data():
    """
    This function loads the data from the csv files and returns it as numpy arrays.

    input: None
    
    output: x_pretrain: np.ndarray, the features of the pretraining set
            y_pretrain: np.ndarray, the labels of the pretraining set
            x_train: np.ndarray, the features of the training set
            y_train: np.ndarray, the labels of the training set
            x_test: np.ndarray, the features of the test set
    """
    x_pretrain = pd.read_csv("public/pretrain_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1).to_numpy()
    y_pretrain = pd.read_csv("public/pretrain_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
    x_train = pd.read_csv("public/train_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1).to_numpy()
    y_train = pd.read_csv("public/train_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
    x_test = pd.read_csv("public/test_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1)
    return x_pretrain, y_pretrain, x_train, y_train, x_test

In [4]:
# Load data
x_pretrain, y_pretrain, x_train, y_train, x_test = load_data()

In [5]:
# Scale Lumo and gap labels
scaler_pretrain = StandardScaler()
y_pretrain_scaled = scaler_pretrain.fit_transform(y_pretrain.reshape(-1, 1)).flatten()

scaler_train = StandardScaler()
y_train_scaled = scaler_train.fit_transform(y_train.reshape(-1, 1)).flatten()

In [6]:
class AutoEncoder(nn.Module):
    """
    The model class, which defines our feature extractor used in pretraining.
    """
    def __init__(self, input_dim, latent_dim):
        """
        The constructor of the model.
        """
        super().__init__()
        # defining the architecture of the model.

        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 750),
            nn.ReLU(),
            nn.Linear(750, latent_dim),
        )

# 1000 -> 750 -> 500 achieves 0.0483 reconstruction error after 30 epochs
# 0.0442 after 40 epochs
# 0.0569 after 20 epochs

        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 750),
            nn.ReLU(),
            nn.Linear(750, input_dim),
            nn.Sigmoid()
        )

        self.predictor = nn.Linear(latent_dim, 1)

    def forward(self, x):
        """
        The forward pass of the model.

        input: x: torch.Tensor, the input to the model

        output: x: torch.Tensor, the output of the model
        """
        # implementation of the forward pass of the model.

        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        predicted = self.predictor(encoded)
        
        return encoded, decoded, predicted

In [7]:
def RMSELoss(y_pred,y):
    return torch.sqrt(torch.mean((y_pred-y)**2))

In [8]:
def make_feature_extractor(x, y, batch_size=256, eval_size=1000):
    """
    This function trains the feature extractor on the pretraining data and returns a function which
    can be used to extract features from the training and test data.

    input: x: np.ndarray, the features of the pretraining set
              y: np.ndarray, the labels of the pretraining set
                batch_size: int, the batch size used for training
                eval_size: int, the size of the validation set
            
    output: make_features: function, a function which can be used to extract features from the training and test data
    """

    # Pretraining data loading
    x_tr, x_val, y_tr, y_val  = train_test_split(x, y, test_size=eval_size, random_state=0, shuffle=True)
    x_tr, x_val, y_tr, y_val = torch.tensor(x_tr, dtype=torch.float), torch.tensor(x_val, dtype=torch.float), torch.tensor(y_tr, dtype=torch.float),  torch.tensor(y_val, dtype=torch.float)

    train_dataset = TensorDataset(x_tr, y_tr)
    val_dataset = TensorDataset(x_val, y_val)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)


    # model declaration
    model = AutoEncoder(1000, 32)
    model.train()
    model.to(device)
    
    # Implementation of the training loop.

    n_epochs = 20 # probably does not make big difference, but could consider a bit more to reduce reconstruction error
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

    for epoch in range(n_epochs):        
        for [X, Y] in train_dataloader:
            X, Y = X.to(device), Y.to(device)
            optimizer.zero_grad()
            _, reconstruct, prediction = model(X)

            loss_1 = RMSELoss(reconstruct, X)
            loss_2 = RMSELoss(prediction, Y) 
            #combinatorial loss to keep reconstruction and prediction accurate
            loss = loss_1 + loss_2
            loss.backward()
            optimizer.step()

        

        # Validation loop
        model.eval()
        val_loss_1 = 0.0
        val_loss_2 = 0.0

        with torch.no_grad():
            for [X_val, Y_val] in val_dataloader:
                X_val, Y_val = X_val.to(device), Y_val.to(device)
                _, reconstruct_val, prediction_val = model(X_val)
            
                loss_val_1 = RMSELoss(reconstruct_val, X_val)
                loss_val_2 = RMSELoss(prediction_val, Y_val)

                val_loss_1 += loss_val_1.item()
                val_loss_2 += loss_val_2.item()

        val_loss_1 /= len(val_dataloader)
        val_loss_2 /= len(val_dataloader)
        val_loss = val_loss_1 + val_loss_2


        print(f"Epoch {epoch+1}/{n_epochs}, Validation Loss Reconstruction: {val_loss_1:.4f}, Validation Loss Prediction: {val_loss_2:.4f}, Validation Loss Total: {val_loss:.4f}")

        # Switch back to train mode
        model.train()
    
    def make_features(x):
        """
        This function extracts features from the training and test data, used in the actual pipeline 
        after the pretraining.

        input: x: np.ndarray, the features of the training or test set

        output: features: np.ndarray, the features extracted from the training or test set, propagated
        further in the pipeline
        """
        model.eval()
        # implementation of the feature extraction, a part of a pretrained model used later in the pipeline.
        with torch.no_grad():
        
            features, _, _ = model(x)
            features = features.numpy() 
        return features, model

    return make_features


In [9]:
feature_extractor = make_feature_extractor(x_pretrain, y_pretrain_scaled)

Epoch 1/20, Validation Loss Reconstruction: 0.1845, Validation Loss Prediction: 0.9869, Validation Loss Total: 1.1714
Epoch 2/20, Validation Loss Reconstruction: 0.1828, Validation Loss Prediction: 0.9952, Validation Loss Total: 1.1780
Epoch 3/20, Validation Loss Reconstruction: 0.1761, Validation Loss Prediction: 0.9876, Validation Loss Total: 1.1637
Epoch 4/20, Validation Loss Reconstruction: 0.1658, Validation Loss Prediction: 0.9990, Validation Loss Total: 1.1647
Epoch 5/20, Validation Loss Reconstruction: 0.1543, Validation Loss Prediction: 0.9907, Validation Loss Total: 1.1450
Epoch 6/20, Validation Loss Reconstruction: 0.1429, Validation Loss Prediction: 0.9889, Validation Loss Total: 1.1318
Epoch 7/20, Validation Loss Reconstruction: 0.1338, Validation Loss Prediction: 0.9872, Validation Loss Total: 1.1211
Epoch 8/20, Validation Loss Reconstruction: 0.1257, Validation Loss Prediction: 0.9887, Validation Loss Total: 1.1143
Epoch 9/20, Validation Loss Reconstruction: 0.1189, Vali

In [None]:
x_tr = torch.tensor(x_train, dtype=torch.float)
x_ptr = torch.tensor(x_pretrain, dtype=torch.float)
x_tst = torch.tensor(x_test.to_numpy(), dtype=torch.float)

In [None]:
# Train linear regression on pretrain
pretrained_features, _ = feature_extractor(x_ptr)
pretrain_ridge = Ridge()
pretrain_ridge.fit(pretrained_features, y_pretrain_scaled)
pretrain_weights = pretrain_ridge.coef_

In [13]:
# Trying to predict the train data using ridge trained on pretrain just to get an estimate

trained_features, _ = feature_extractor(x_tr)
y_train_predicted = pretrain_ridge.predict(trained_features)
y_train_predicted = scaler_train.inverse_transform(y_train_predicted.reshape(-1, 1)).flatten()

np.sqrt(np.mean((y_train - y_train_predicted)**2))

In [17]:
# Train linear regression on train using weights from pretrain
trained_features, _ = feature_extractor(x_tr)
train_ridge = Ridge()
train_ridge.coef_ = pretrain_weights
train_ridge.fit(trained_features, y_train_scaled)

In [18]:
# Predict and save
test_features, _ = feature_extractor(x_tst)
y_pred = train_ridge.predict(test_features)
y_pred_original = scaler_train.inverse_transform(y_pred.reshape(-1, 1)).flatten()

assert y_pred.shape == (x_test.shape[0],)
y_pred = pd.DataFrame({"y": y_pred_original}, index=x_test.index)
y_pred.to_csv("results.csv", index_label="Id")