In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.mixture import GaussianMixture
from sklearn.manifold import MDS, Isomap
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.cluster import DBSCAN

from scipy.special import kl_div
import MDAnalysis as mda
import sys
import itertools
from scipy.special import rel_entr
import matplotlib.pyplot as plt
import glob
import numpy as np
import itertools
import pickle
from scipy.spatial.distance import pdist, cdist
import random

import torch
import torch.nn as nn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

import copy
import skorch


## Making test data

selection = "name CA"
data_dict = {}

for system in glob.glob("../binding_spots_project/gpcr_sampling/b2ar/b2ar_centered_aligned/*"):
    name = system.split("/")[-1]
    cosmos = mda.Universe(glob.glob(f"{system}/*gro")[0], glob.glob(f"{system}/*xtc"))
    size = cosmos.select_atoms(selection).positions.flatten()
    pos = np.zeros(size.reshape(1, size.shape[0]).shape)
    for ts in cosmos.trajectory:
        pos = np.concatenate((pos, cosmos.select_atoms(selection).positions.reshape(1, pos.shape[1])))
    pos = pos[~np.all(pos == 0, axis=1)]
    data_dict[name] = pos
    
with open('data_dict.pkl', 'wb') as f:
    pickle.dump(data_dict, f)
    

In [2]:
with open('data_dict.pkl', 'rb') as handle:
    data_dict = pickle.load(handle)


In [3]:
system1, system2 = "popc", "chol-site-3"

X = np.concatenate([v for k, v in data_dict.items() if k == system1 or k == system2])
Y = np.concatenate([[-1 for i in range(data_dict[system1].shape[0])], [1 for i in range(data_dict[system2].shape[0])]])


## Autoencoder 

In [30]:

class Autoencoder(nn.Module):
    
    def __init__(self, input_dim=933, encoding_dim=2, hidden_dim=1024, n_hidden=1,
                 drop_rate=0.2, learning_rate=0.001, n_epochs=10, batch_size=32):
        super().__init__()
        
        self.input_dim = input_dim
        self.encoding_dim = encoding_dim
        self.hidden_dim = hidden_dim
        self.n_hidden = n_hidden
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs
        self.n_layers = n_hidden
        self.drop_rate = drop_rate
        self.batch_size = batch_size
        
        self.encoder = [nn.Linear(input_dim, hidden_dim), nn.ReLU(), nn.Dropout(self.drop_rate)]
        self.decoder = [nn.Linear(encoding_dim, hidden_dim), nn.ReLU(), nn.Dropout(self.drop_rate)]
        
        for _ in range(n_hidden - 1):
            self.encoder = self.encoder + [nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Dropout(self.drop_rate)]
            self.decoder = self.decoder + [nn.Linear(hidden_dim, hidden_dim), nn.ReLU(), nn.Dropout(self.drop_rate)]
            
        self.encoder = self.encoder + [nn.Linear(hidden_dim, encoding_dim)]
        self.decoder = self.decoder + [nn.Linear(hidden_dim, input_dim)]
        
        self.encoder = nn.Sequential(*self.encoder)
        self.decoder = nn.Sequential(*self.decoder)

        self.criterion = nn.MSELoss()
    

    def forward(self, X, y=None):
        
        X = torch.tensor(X, dtype=torch.float32)
        encoded = self.encoder(X)
        decoded = self.decoder(encoded)

        return encoded, decoded


    def fit(self, X, y=None):
        
        self.train()
        X = torch.tensor(X, dtype=torch.float32)
        indices = [i for i in range(X.shape[0])]
        self.optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        
        for epoch in range(self.n_epochs):
            
            random.shuffle(indices)
            batches = [i for i in range(0, len(indices), self.batch_size)]

            for i in range(len(batches) - 1):
                
                batch_X = X[indices[batches[i]:batches[i+1]]]
                encoded = self.encoder(batch_X)
                decoded = self.decoder(encoded)
                loss = self.criterion(decoded, batch_X)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                
            print(f"Epoch {epoch} \t loss {loss}")
            
        return self
    
    def transform(self, X, y=None):
        
        X = torch.tensor(X, dtype=torch.float32)
        encoded = self.encoder(X)
        
        return encoded.detach().numpy()
    
    def inverse_transform(self, X, y=None):
        
        X = torch.tensor(X, dtype=torch.float32)
        decoded = self.decoder(X)
        
        return decoded.detach().numpy()
    
    def score(self, X, y=None):
        
        encoded = self.encoder(X)
        decoded = self.decoder(encoded)
        
        return self.criterion(decoded, X)
    
    def get_params(self):
        return self.parameters
        
    

In [6]:
A = Autoencoder()
print(A.get_params())

<bound method Module.parameters of Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=933, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=1024, out_features=2, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=2, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=1024, out_features=933, bias=True)
  )
  (criterion): MSELoss()
)>


In [23]:

net = skorch.NeuralNetRegressor(
    module=Autoencoder,
    criterion=nn.MSELoss()
)

"""params = {
            "encoding_dim": [i for i in range(2, 3)],
            "n_hidden": [1],
            "drop_rate": [0.1]
        }


gs = GridSearchCV(net, params, scoring="accuracy")
"""

print(net.get_params)

<bound method NeuralNet.get_params of <class 'skorch.regressor.NeuralNetRegressor'>[uninitialized](
  module=<class '__main__.Autoencoder'>,
)>


In [29]:
a = Autoencoder()
a.forward(X)

(tensor([[-23.4124,  -1.8909],
         [ -3.4667, -13.6230],
         [-12.6900, -23.0109],
         ...,
         [-13.6303, -24.8089],
         [  0.7474, -25.6667],
         [-12.6690, -16.3448]], grad_fn=<AddmmBackward0>),
 tensor([[ 5.7261, -6.7858,  6.5306,  ..., -4.9186, -1.0361,  2.8839],
         [ 1.4169, -2.6755,  2.0896,  ..., -2.3727, -0.3473,  3.4662],
         [ 2.3329, -3.8313,  6.9970,  ..., -8.3758,  7.5888,  3.2690],
         ...,
         [ 4.8825, -5.6612,  3.8970,  ..., -5.0110,  6.9717,  1.4613],
         [ 1.6404,  0.2670, -1.5830,  ..., -0.6934,  0.6092,  3.5549],
         [ 2.5939, -1.5459,  1.4950,  ..., -0.3169,  5.5940,  2.3107]],
        grad_fn=<AddmmBackward0>))

In [31]:
net.fit(X, X)

Re-initializing module.
Re-initializing criterion.
Re-initializing optimizer.


  X = torch.tensor(X, dtype=torch.float32)


RuntimeError: mat1 and mat2 shapes cannot be multiplied (128x933 and 2x1024)

In [89]:
torch.manual_seed(0)
random.seed(42)
np.random.seed(0)

AE = Autoencoder(input_dim=933, encoding_dim=2, hidden_dim=1024, learning_rate=0.001, n_epochs=20, drop_rate=0.001)


pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('autoencoder', AE)
    ])

pipeline.fit(X)


9437
Epoch 0 	 loss 0.5305214524269104
8329
Epoch 1 	 loss 0.5061935782432556
4020
Epoch 2 	 loss 0.36296579241752625
968
Epoch 3 	 loss 0.3799836039543152
5560
Epoch 4 	 loss 0.37684839963912964
2777
Epoch 5 	 loss 0.3865011930465698
6400
Epoch 6 	 loss 0.3578476309776306
7677
Epoch 7 	 loss 0.40296921133995056
9400
Epoch 8 	 loss 0.389844685792923
8739
Epoch 9 	 loss 0.3926965594291687
5008
Epoch 10 	 loss 0.34247350692749023
4841
Epoch 11 	 loss 0.3560570180416107
7439
Epoch 12 	 loss 0.3429281413555145
1846
Epoch 13 	 loss 0.3621029853820801
3701
Epoch 14 	 loss 0.37056079506874084
4894
Epoch 15 	 loss 0.3108319044113159
5496
Epoch 16 	 loss 0.34832802414894104
5831
Epoch 17 	 loss 0.3463861346244812
5436
Epoch 18 	 loss 0.3102715313434601
2522
Epoch 19 	 loss 0.31523609161376953


In [106]:
def get_original_space_error(pipeline, X):
    
    scaler = pipeline["scaler"]
    autoencoder = pipeline["autoencoder"]
    X_scaled = scaler.transform(X)
    encoded = autoencoder.transform(X_scaled)
    decoded = autoencoder.inverse_transform(encoded)
    
    original_space_decoded = torch.Tensor(scaler.inverse_transform(decoded))

    original_space_error = autoencoder.criterion(X, original_space_decoded)
    
    return original_space_error

    
print(get_original_space_error(pipeline, torch.Tensor(X)))

tensor(3.1090)


In [107]:
device = ('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using {device} device.")

Using cuda device.


## FLOW

In [108]:
n_comps_dim_reduc = [i for i in range(2, 7)] if X.shape[1] > 5 else [i for i in range(2, X.shape[1])]
n_comps_cluster = [i for i in range(2, 3)]

In [109]:

default_steps = {
    "dim_reducer": {
        #PCA(): {
        #    "n_components": n_comps_dim_reduc
        #},
        Autoencoder(input_dim=X.shape[1]): {
            "encoding_dim": [i for i in range(2, 3)],
            "n_hidden": [1],
            "drop_rate": [0.1]
        }
    },
    "classifier": {
        GaussianMixture(): {
            "n_components": n_comps_cluster,
            "covariance_type": ["full", "spherical", "diag", "tied"]
        }
    }
}


In [110]:
def build_pipelines(step_grid, X):
    
    reducers = step_grid["dim_reducer"]
    classifiers = step_grid["classifier"]
    combinations = list(itertools.product(reducers.keys(), classifiers.keys()))
    pipelines = []
    
    for comb in combinations:
        print(f"fitting combination: {comb}")
        grid = {}
        for param, values in step_grid["dim_reducer"][comb[0]].items():
            key = f"dim_reducer__{param}"
            grid[key] = values
        for param, values in step_grid["classifier"][comb[1]].items():
            key = f"classifier__{param}"
            grid[key] = values
        print(f"with params {grid}")
        pipe = Pipeline(steps=[("dim_reducer", comb[0]), ("classifier", comb[1])])
        cv = GridSearchCV(pipe, param_grid=grid).fit(X)
        pipelines.append(cv.best_estimator_)

    return pipelines



In [111]:

def caluclate_KL(pipeline, X, Y):
    
    preds = pipeline.predict(X)
    cluster_populations = []
    for system in set(Y.flatten()):
        system_preds = preds[np.where(Y == system)[0]]
        populations = [system_preds[np.where(system_preds == i)].shape[0] / system_preds.shape[0] for i in set(preds)]
        cluster_populations.append(populations)
        
    return sum(rel_entr(cluster_populations[0], cluster_populations[1])) # only two systems atm
    


In [112]:

def flow(X, Y, step_grid):
    
    best_KL = 0
    
    pipelines = build_pipelines(step_grid=step_grid, X=X)
    best_pipe = pipelines[0]
    for pipe in pipelines:
        KL = caluclate_KL(pipe, X, Y)
        if KL > best_KL:
            best_KL = KL
            best_pipe = pipe
    
    return best_pipe, best_KL    
        


In [113]:
bp, bkl = flow(X, Y, default_steps)

fitting combination: (Autoencoder(
  (encoder): Sequential(
    (0): Linear(in_features=933, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=1024, out_features=2, bias=True)
  )
  (decoder): Sequential(
    (0): Linear(in_features=2, out_features=1024, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=1024, out_features=933, bias=True)
  )
  (criterion): MSELoss()
), GaussianMixture())
with params {'dim_reducer__encoding_dim': [2], 'dim_reducer__n_hidden': [1], 'dim_reducer__drop_rate': [0.1], 'classifier__n_components': [2], 'classifier__covariance_type': ['full', 'spherical', 'diag', 'tied']}


AttributeError: 'Autoencoder' object has no attribute 'set_params'

In [121]:
params = {
            "encoding_dim": [i for i in range(2, 3)],
            "n_hidden": [1],
            "drop_rate": [0.1]
        }
AE = Autoencoder(input_dim=X.shape[1])
m = GridSearchCV(AE, params)
m.fit(X)


TypeError: Autoencoder.get_params() got an unexpected keyword argument 'deep'

In [130]:
print(m.score(X))

4207333.237055468


In [111]:
"""data = np.load("/wrk/eurastof/somethingwithflow/drors_for_all.npy")

systems = [2,7]

X = data[(data[:,4] == systems[0]) | (data[:,4] == systems[1])]
Y = X[:,4]
X = X[:,:-1]"""


'data = np.load("/wrk/eurastof/somethingwithflow/drors_for_all.npy")\n\nsystems = [2,7]\n\nX = data[(data[:,4] == systems[0]) | (data[:,4] == systems[1])]\nY = X[:,4]\nX = X[:,:-1]'

In [132]:
AE = Autoencoder(input_dim=X.shape[1], encoding_dim=2, hidden_dim=1024, n_hidden=1, n_epochs=20)
AE.fit(X)

In [137]:
enc = AE.transform(X)
dec = AE.inverse_transform(enc)

print(dec)

[[ 0.       57.78432  79.060776 ...  0.        0.       43.51919 ]
 [ 0.       51.657375 91.10346  ...  0.        0.       60.530907]
 [ 0.       48.699955 88.55174  ...  0.        0.       63.272804]
 ...
 [ 0.       40.675125 81.28754  ...  0.        0.       56.257988]
 [ 0.       54.943604 95.27075  ...  0.        0.       49.25358 ]
 [ 0.       42.19254  72.70722  ...  0.        0.       60.36032 ]]


feature selection
    
random seeds


