In [46]:
import MDAnalysis as mda
import glob
import re
import numpy as np
import matplotlib.pyplot as plt

## Making coordinate data (common atoms)

b2ar_traj_path = "/wrk/eurastof/binding_spots_project/gpcr_sampling/b2ar/b2ar_centered_aligned/"
b2ar_common_ndx = "/wrk/eurastof/binding_spots_project/HFSP---Lipid-binding-states/calculations/b2ar_common.ndx"

with open(b2ar_common_ndx) as f:
    lines = "".join(f.readlines())

resids = " ".join(re.findall(r"\d+", lines)[1:])


dirs = glob.glob(f"{b2ar_traj_path}*")
coordinates = []

for d in dirs:

    gro = glob.glob(f"{d}/*gro")[0]
    xtcs = glob.glob(f"{d}/*xtc")
    cosmos = mda.Universe(gro, xtcs)
    common_ca = cosmos.select_atoms(f"bynum {resids}")

    for ts in cosmos.trajectory[0:-1:5]:
        coords = common_ca.positions.flatten()
        coordinates.append(coords.reshape(1, coords.shape[0]))

X = np.concatenate(coordinates)
np.save("./b2ar_common_ca_coordinates.npy", X)

## Autoencoder pipeline

In [131]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV


import torch
from torch import nn, optim

import random


In [146]:

class Autoencoder(BaseEstimator, TransformerMixin, nn.Module):

    def __init__(self, in_shape=10, enc_shape=2, middle_shape=5, n_hidden=1, loss_fn=nn.L1Loss(), lr=1e-3):
        
        super().__init__()
        self.loss_fn = loss_fn
        self.lr = lr
        self.n_hidden = n_hidden
        self.in_shape = in_shape
        self.enc_shape = enc_shape
        self.middle_shape = middle_shape
        
        encoder_layers = [nn.Linear(self.in_shape, self.middle_shape), nn.ReLU(), nn.Dropout(0.2)]
        decoder_layers = [nn.Linear(self.enc_shape, self.middle_shape), nn.ReLU(), nn.Dropout(0.2)]

        for i in range(n_hidden - 1):
            encoder_layers.append(nn.Linear(self.middle_shape, self.middle_shape))
            encoder_layers.append(nn.ReLU())
            encoder_layers.append(nn.Dropout(0.2))
            decoder_layers.append(nn.Linear(self.middle_shape, self.middle_shape))
            decoder_layers.append(nn.ReLU())
            decoder_layers.append(nn.Dropout(0.2))
            
        encoder_layers.append(nn.Linear(self.middle_shape, self.enc_shape))
        decoder_layers.append(nn.Linear(self.middle_shape, self.in_shape))
        decoder_layers.append(nn.Sigmoid())

        self.encode = nn.Sequential(*encoder_layers)
        self.decode = nn.Sequential(*decoder_layers)
        

    def fit(self, X, y=None, n_epochs=20, batch_size=32, verbose=False):

        self.training = True
        X = torch.Tensor(X)
        indices = [i for i in range(X.shape[0])]
        self.optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        
        for epoch in range(n_epochs):
        
            random.shuffle(indices)
            batches = [i for i in range(0, len(indices), batch_size)]

            for i in range(len(batches) - 1):
                
                batch_X = X[indices[batches[i]:batches[i+1]]]
                self.optimizer.zero_grad()
                
                encoded = self.encode(batch_X)
                decoded = self.decode(encoded)

                loss = self.loss_fn(decoded, batch_X)
                loss.backward()
                self.optimizer.step()
            
            if verbose:
                print(f'epoch {epoch} \t Loss: {loss.item():.4g}')
        
        return self

    def transform(self, X, y=None):
        encoded = self.encode(torch.Tensor(X))
        return encoded.detach().numpy()
    

    def inverse_transform(self, X, y=None):
        decoded = self.decode(torch.Tensor(X))
        return decoded.detach().numpy()
    
    def score(self, X, y=None):
        encoded = self.transform(X)
        decoded = self.inverse_transform(encoded)
        return self.loss_fn(torch.Tensor(X), torch.Tensor(decoded))
        
    
    



In [138]:
X = np.load("./b2ar_common_ca_coordinates.npy")
print(X.shape)

pipe = Pipeline(
    steps=[
        ("Scaler", StandardScaler()),
        ("Autoencoder", Autoencoder(in_shape=X.shape[1], middle_shape=1024, enc_shape=2, n_hidden=2)),
    ]
)


(7900, 786)


In [140]:
gcv = GridSearchCV(pipe, param_grid={"Autoencoder__middle_shape": [1024, 2048]}, verbose=10).fit(X)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5; 1/2] START Autoencoder__middle_shape=1024..............................
epoch 0 	 Loss: 0.7655
epoch 1 	 Loss: 0.7266
epoch 2 	 Loss: 0.6884
[CV 1/5; 1/2] END Autoencoder__middle_shape=1024;, score=0.881 total time=   6.6s
[CV 2/5; 1/2] START Autoencoder__middle_shape=1024..............................
epoch 0 	 Loss: 0.7473
epoch 1 	 Loss: 0.6599
epoch 2 	 Loss: 0.7474
[CV 2/5; 1/2] END Autoencoder__middle_shape=1024;, score=0.690 total time=   6.5s
[CV 3/5; 1/2] START Autoencoder__middle_shape=1024..............................
epoch 0 	 Loss: 0.732
epoch 1 	 Loss: 0.7131
epoch 2 	 Loss: 0.8113
[CV 3/5; 1/2] END Autoencoder__middle_shape=1024;, score=0.681 total time=   6.2s
[CV 4/5; 1/2] START Autoencoder__middle_shape=1024..............................
epoch 0 	 Loss: 0.6927
epoch 1 	 Loss: 0.7394
epoch 2 	 Loss: 0.7448
[CV 4/5; 1/2] END Autoencoder__middle_shape=1024;, score=0.853 total time=   6.3s
[CV 5/5; 1/2]

In [142]:
print(gcv.best_estimator_)

Pipeline(steps=[('Scaler', StandardScaler()),
                ('Autoencoder',
                 Autoencoder(enc_shape=2, in_shape=786, middle_shape=1024,
                             n_hidden=2))])


In [143]:
param_grid = {
    "Autoencoder__middle_shape": [512, 1024, 2048],
    "Autoencoder__enc_shape": [2, 3, 4],
    "Autoencoder__n_hidden": [1, 2, 3, 4]
}

In [145]:
def pipeline(transformer, param_grid, X):
    
    pipe = Pipeline(
        steps=[
            ("Scaler", StandardScaler()),
            ("Autoencoder", transformer),
        ]   
    )
    
    gridsearch = GridSearchCV(pipe, param_grid=param_grid, verbose=10)
    gridsearch.fit(X)
    
    return gridsearch.best_estimator_
    
    
    

In [148]:

best = pipeline(Autoencoder(in_shape=X.shape[1]), param_grid, X)



Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5; 1/36] START Autoencoder__enc_shape=2, Autoencoder__middle_shape=512, Autoencoder__n_hidden=1
[CV 1/5; 1/36] END Autoencoder__enc_shape=2, Autoencoder__middle_shape=512, Autoencoder__n_hidden=1;, score=0.899 total time=   2.5s
[CV 2/5; 1/36] START Autoencoder__enc_shape=2, Autoencoder__middle_shape=512, Autoencoder__n_hidden=1
[CV 2/5; 1/36] END Autoencoder__enc_shape=2, Autoencoder__middle_shape=512, Autoencoder__n_hidden=1;, score=0.725 total time=   2.4s
[CV 3/5; 1/36] START Autoencoder__enc_shape=2, Autoencoder__middle_shape=512, Autoencoder__n_hidden=1
[CV 3/5; 1/36] END Autoencoder__enc_shape=2, Autoencoder__middle_shape=512, Autoencoder__n_hidden=1;, score=0.698 total time=   2.4s
[CV 4/5; 1/36] START Autoencoder__enc_shape=2, Autoencoder__middle_shape=512, Autoencoder__n_hidden=1
[CV 4/5; 1/36] END Autoencoder__enc_shape=2, Autoencoder__middle_shape=512, Autoencoder__n_hidden=1;, score=0.878 total time=   2.4

KeyboardInterrupt: 