## QSAR Challenge with an xLSTM approach


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

In [9]:
# Read the training table; the stored index column is replaced by a fresh 0..n-1 index
data = pd.read_csv('resources/data_train.csv', index_col=0).reset_index(drop=True)

# The 'smiles' column is the model input; every column after the first is
# treated as a per-task label column.
smiles_list = data['smiles'].tolist()
task_labels = data.iloc[:, 1:].to_numpy()
# Vectorize the SMILES strings
def tokenize_smiles(smiles):
    """Split a SMILES string into a list of its individual characters."""
    return [symbol for symbol in smiles]

smiles_token = list(map(tokenize_smiles, smiles_list))

# Character-level vocabulary: ids start at 1 (sorted character order),
# id 0 is reserved for the padding token.
vocabulary = {
    char: idx
    for idx, char in enumerate(sorted(set(''.join(smiles_list))), start=1)
}
vocabulary['<PAD>'] = 0

# Encode every SMILES as a row of ids, right-padded with 0 up to the longest sequence
max_len = max(map(len, smiles_token))
X = np.array([
    [vocabulary.get(char, 0) for char in tokens] + [0] * (max_len - len(tokens))
    for tokens in smiles_token
])

# Build a Dataset and DataLoader
class SMILESDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing encoded SMILES rows with their target values."""

    def __init__(self, x, y):
        """Copy `x` into an int64 tensor and `y` into a float32 tensor."""
        # Token ids must be integer typed to index an embedding table.
        self.x = torch.tensor(x, dtype=torch.long)
        # Targets are kept as floats.
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples (length of the first axis of `x`)."""
        n_samples = len(self.x)
        return n_samples

    def __getitem__(self, idx):
        """Return the `(encoded_smiles, target)` pair stored at position `idx`."""
        features = self.x[idx]
        target = self.y[idx]
        return features, target
    

In [15]:
## Define the LSTM model
class LSTM(nn.Module):
    """Sequence classifier: embedding -> single-layer LSTM -> linear -> sigmoid.

    The prediction is made from the LSTM hidden state after the last *real*
    token of each sequence. Padding positions are skipped entirely, so the
    output does not depend on how much padding a row carries.

    Args:
        vocab_size: Number of rows in the embedding table (including padding).
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of outputs of the final linear layer.
        padding_idx: Token id used for padding. Its embedding is fixed at zero
            and it is excluded when measuring sequence lengths.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx=0):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid probabilities for a batch of encoded sequences.

        Args:
            x: Integer tensor of shape (batch, seq_len), right-padded with
                `padding_idx`.

        Returns:
            Tensor of shape (batch,) when `output_dim == 1`, otherwise
            (batch, output_dim).
        """
        # True length of each row = number of non-padding tokens. This relies
        # on padding being appended on the right only. Clamp to 1 because
        # packing rejects zero-length sequences.
        lengths = (x != self.padding_idx).sum(dim=1).clamp(min=1)

        embedded = self.embedding(x)
        # Packing makes the LSTM stop at each row's true length instead of
        # running over (and reading its output after) the padding.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)

        # h_n[-1]: final hidden state of the last layer, shape (batch, hidden_dim),
        # already restored to the original batch order.
        out = self.fc(h_n[-1])
        return self.sigmoid(out).squeeze(1)

In [16]:

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X, task_labels, test_size=0.2, random_state=42
)
print(
    f'Train shape X: {x_train.shape}',
    f'Test shape X: {x_test.shape}',
    f'Train shape Y: {y_train.shape}',
    f'Test shape Y: {y_test.shape}',
    sep='\n',
)

Train shape X: (9600, 1157)
Test shape X: (2400, 1157)
Train shape Y: (9600, 11)
Test shape Y: (2400, 11)


In [19]:
# Train the model
def train_lstm_for_task(X, y, n_epochs=10, batch_size=32, lr=0.001,
                        embedding_dim=128, hidden_dim=64):
    """Train a single-output LSTM classifier for one task.

    Args:
        X: Encoded SMILES rows (integer token ids), one row per sample.
        y: Binary targets (0/1), one per row of `X`.
        n_epochs: Number of passes over the training data.
        batch_size: Mini-batch size used by the training loader.
        lr: Learning rate of the Adam optimizer.
        embedding_dim: Size of the token embeddings.
        hidden_dim: Size of the LSTM hidden state.

    Returns:
        The trained model. It is left in training mode, so callers should
        call `model.eval()` before predicting.
    """
    train_dataset = SMILESDataset(X, y)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    model = LSTM(
        vocab_size=len(vocabulary),
        embedding_dim=embedding_dim,
        hidden_dim=hidden_dim,
        output_dim=1,
    )
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # The model already applies a sigmoid, hence BCELoss rather than BCEWithLogitsLoss.
    criterion = nn.BCELoss()

    # Nothing inside the loop switches the mode, so setting it once is enough.
    model.train()
    for _ in range(n_epochs):
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            output = model(batch_x)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()

    return model


def train_lstm(x_train, x_test, y_train, y_test, eval_batch_size=256):
    """Train one LSTM per task and predict on the test inputs.

    Labels in `y_train` are expected to be coded as -1 / +1. Entries that are
    NaN or 0 are treated as missing and excluded from that task's training set.

    Args:
        x_train: Encoded training sequences, shape (n_train, seq_len).
        x_test: Encoded test sequences, shape (n_test, seq_len).
        y_train: Training labels, shape (n_train, n_tasks). Not modified.
        y_test: Test labels. Currently unused; kept so existing calls still work.
        eval_batch_size: Number of test rows sent through the model at once.

    Returns:
        Tuple `(y_hats_probability, y_hats_class)`, both of shape
        (n_test, n_tasks): predicted probabilities and their 0/1 thresholding
        at 0.5.
    """
    n_tasks = y_train.shape[1]
    y_hats_probability = np.empty((x_test.shape[0], n_tasks))
    y_hats_class = np.empty_like(y_hats_probability)

    # Convert once; the same tensor is reused for every task.
    x_test_tensor = torch.tensor(x_test, dtype=torch.long)

    for i in tqdm(range(n_tasks), desc='Training for task:'):
        task_column = y_train[:, i]
        index = ~np.isnan(task_column) & (task_column != 0)
        # Map -1 -> 0 and +1 -> 1 on a copy. Writing back into `y_train` would
        # make a second call see former -1 labels as 0 and drop them as missing.
        task_targets = (task_column[index] + 1) // 2

        model = train_lstm_for_task(x_train[index], task_targets)
        model.eval()
        with torch.no_grad():
            # Predict in chunks so the whole test set never has to be embedded at once.
            y_hat = torch.cat(
                [model(chunk) for chunk in x_test_tensor.split(eval_batch_size)]
            )
        y_hats_probability[:, i] = y_hat.numpy()
        y_hats_class[:, i] = (y_hat > 0.5).numpy()

    return y_hats_probability, y_hats_class

# Fit one model per task and collect the test-set predictions
y_hats_probability, y_hats_class = train_lstm(
    x_train=x_train, x_test=x_test, y_train=y_train, y_test=y_test
)

Training for task::   0%|          | 0/11 [00:00<?, ?it/s]

Training for task::   0%|          | 0/11 [03:12<?, ?it/s]


KeyboardInterrupt: 