# Analysis
In this notebook I analyze all the data collected over the iterations of the reinforcement TICA procedure.

### Import needed modules and useful functions

In [None]:
#-- helper module used for training the Deep-TICA CVs --#
# NOTE(review): the cells below use plt, np, torch, os, load_dataframe, compute_fes, ...
# without importing them, so they are presumably provided by this star import — confirm in utils.py
from utils import *

#-- silence every Python warning for the rest of the session --#
import warnings
warnings.filterwarnings('ignore')

### Simulation parameters 

In [None]:
#-- Boltzmann constant (presumably in kJ/(mol*K) — TODO confirm units) --#
kb = 0.008314

#-- SIMULATION PARAMETERS --#
sim_parameters = {}
sim_parameters['temp'] = 340
# inverse temperature, derived from the temperature stored just above
sim_parameters['beta'] = 1./(sim_parameters['temp']*kb)
sim_parameters['kbt'] = None
#-- parameters to compute the fes --#
sim_parameters['blocks'] = 2
sim_parameters['bandwidth'] = 0.02
sim_parameters['plot_max_fes'] = 10
#--------------------------------------#


# Results

In [None]:
# Folder prefix (with trailing slash) prepended to every file loaded or saved below
# (set+"COLVAR", set+"deeptica/").
# NOTE(review): this name shadows Python's built-in `set` for the rest of the notebook.
set = "long_unbias/"

In [None]:
#-- scatter of rmsd_ca against end for the loaded COLVAR file --#
fig,ax = plt.subplots(1,1,figsize=(6,6))
data = load_dataframe(set+"COLVAR")
# columns whose name matches the descriptor pattern (not used by the plot itself)
descriptors_names = data.filter(regex='^dd_[^a-z]').columns.values

# observables plotted in this section: rmsd_ca end hbonds
x_name, y_name = "end", "rmsd_ca"
data.plot.scatter(x=x_name,y=y_name,ax=ax)

ax.set_title("long simulation")
ax.set_xlabel(x_name)
ax.set_ylabel(y_name)
ax.grid()
#fig.savefig(root+"images/traj_bias"+str(k)+".png",dpi=300,facecolor="white",transparent=False)
plt.tight_layout()

In [None]:
#-- scatter of hbonds against end for the loaded COLVAR file --#
fig,ax = plt.subplots(1,1,figsize=(6,6))
data = load_dataframe(set+"COLVAR")
# columns whose name matches the descriptor pattern (not used by the plot itself)
descriptors_names = data.filter(regex='^dd_[^a-z]').columns.values

# observables plotted in this section: rmsd_ca end hbonds
x_name, y_name = "end", "hbonds"
data.plot.scatter(x=x_name,y=y_name,ax=ax)

ax.set_title("long simulation")
ax.set_xlabel(x_name)
ax.set_ylabel(y_name)
ax.grid()
#fig.savefig(root+"images/traj_bias"+str(k)+".png",dpi=300,facecolor="white",transparent=False)
plt.tight_layout()

In [None]:
#-- scatter of rmsd_ca against hbonds for the loaded COLVAR file --#
fig,ax = plt.subplots(1,1,figsize=(6,6))
data = load_dataframe(set+"COLVAR")
# columns whose name matches the descriptor pattern (not used by the plot itself)
descriptors_names = data.filter(regex='^dd_[^a-z]').columns.values

# observables plotted in this section: rmsd_ca end hbonds
x_name, y_name = "hbonds", "rmsd_ca"
data.plot.scatter(x=x_name,y=y_name,ax=ax)

ax.set_title("long simulation")
ax.set_xlabel(x_name)
ax.set_ylabel(y_name)
ax.grid()
#fig.savefig(root+"images/traj_bias"+str(k)+".png",dpi=300,facecolor="white",transparent=False)
plt.tight_layout()

In [None]:

#-- rmsd_ca, end and hbonds as a function of time, one panel each --#
fig,axs = plt.subplots(1,3,figsize=(14,4))
data = load_dataframe(set+"COLVAR")
for panel,observable in zip(axs,("rmsd_ca","end","hbonds")):
    data.plot.scatter(x="time",y=observable,ax=panel)

# only the left-most panel carries the title
axs[0].set_title("long simulation")
plt.tight_layout()

### FES estimate 

In [None]:
# Disabled cell: the FES estimate below is wrapped in a string literal, so none of it is executed.
# NOTE(review): as the last expression of the cell the string is presumably echoed as cell output;
# re-enabling it requires `descriptors_names`, `gridspec_fes` and `sim_parameters` to be in scope.
'''
#-- estimation of Free Energy Surface --#
X = data[descriptors_names].to_numpy()
s = data[["rmsd_ca","hbonds"]].to_numpy()
logweight=np.zeros(len(data))
gridspec_fes(s,logweight,sim_parameters)
plt.show()
'''

## Deep-TICA Analysis

In [None]:
#-- lag times used to build the time-lagged datasets --#
# NOTE(review): min_lag (20) is larger than max_lag (5); with n = 1 np.linspace returns only
# its start value, so the single lag actually produced is min_lag — confirm this is intended.
min_lag = 20
max_lag = 5 # original note: 0.2,20
n = 1 # how many lag times between min and max lag
lags = np.linspace(min_lag,max_lag,num=n)

train_sim = 1 # number of previous simulations to train the NN
shuffle = False # if shuffle the data between batches

#-- containers for the train / validation subsets, filled by the dataset-creation cell
train_datasets, valid_datasets = [], []

# torch seed
torch.manual_seed(21)

#-- load the trajectory and select the descriptor columns --#
data = load_dataframe(set+"COLVAR")
print(data.head())
descriptors_names = data.filter(regex='^dd[^a-z]').columns.values
size = len(data)
print(descriptors_names)
print( len(descriptors_names) )

In [None]:
#-- TRAINING PARAMETERS --#
n_output = 5
n_input = len(descriptors_names) # can change..
train_parameters = {
              'descriptors': '^dd[^a-z]', # can change during simulation
              'nodes':[n_input,256,256,n_output],#[n_input,60,30,n_output],
              'activ_type': 'tanh',#'relu','selu','tanh'
              'lag_time':10, 
              'loss_type': 'sum', 
              'n_eig': n_output,
              'trainsize':0.7, 
              'lrate':1e-3,
              'l2_reg':0.,
              'num_epochs':400,
              'batchsize': -1, #---> to be handled on the train loader and valid loader
              'es_patience':10,
              'es_consecutive':True,
              'standardize_outputs':True,
              'standardize_inputs': True,
              'log_every':50,
              }

print("layers: ",train_parameters["nodes"])
# number of samples in the training and validation subsets; each loader below uses
# the whole subset as a single batch
n_train = int( size*train_parameters["trainsize"] )
# the validation size is reduced by int(10*max_lag) samples
# NOTE(review): presumably a margin for the frames lost when pairing lagged samples — confirm
n_valid = int( size*(1-train_parameters["trainsize"])-int(10*max_lag) )
print("training samples: ",n_train, "\t validation samples", n_valid)

# DEVICE
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

t = data['time'].values
X = data[descriptors_names].values

# Start from empty lists so that re-running this cell does not append the same
# subsets again to lists left over from a previous execution (which would silently
# duplicate the data handed to the loaders).
train_datasets = []
valid_datasets = []

# create time lagged dataset with different lag times
for lag in lags:
    # TensorDataset (x_t,x_lag,w_t,w_lag)
    dataset = create_time_lagged_dataset(X,t=t,lag_time=lag,interval=[0,n_train+n_valid])
    # random split; random_split requires n_train+n_valid to match len(dataset)
    train_data, valid_data = random_split(dataset,[n_train,n_valid])
    train_datasets.append(train_data)
    valid_datasets.append(valid_data)

# the subsets are not merged: the loaders receive the lists holding one subset per lag time
#print(n*n_train)
#print(len(ConcatDataset(train_datasets)))

train_loader = FastTensorDataLoader(train_datasets, batch_size=n_train,shuffle=shuffle)
valid_loader = FastTensorDataLoader(valid_datasets, batch_size=n_valid,shuffle=shuffle)

In [None]:
#-- TRAIN --#
# MODEL
model = DeepTICA_CV(train_parameters['nodes'],activation=train_parameters['activ_type'],gaussian_random_initialization=True)
model.to(device)
# OPTIMIZER (Adam)
opt = torch.optim.Adam(model.parameters(), lr=train_parameters['lrate'], weight_decay=train_parameters['l2_reg'])
# lrscheduler
#model.set_LRScheduler(opt,min_lr=5e-5)
model.set_optimizer(opt)
if valid_loader is not None:
    # EarlyStopping
    model.set_earlystopping(patience=train_parameters['es_patience'],
                            min_delta=0.005,consecutive=train_parameters['es_consecutive'], save_best_model=True, log=False) 
# TRAIN
model.fit(train_loader=train_loader,valid_loader=valid_loader,
    standardize_inputs=train_parameters['standardize_inputs'],
    standardize_outputs=train_parameters['standardize_outputs'],
    loss_type=train_parameters['loss_type'],
    n_eig=train_parameters['n_eig'],
    nepochs=train_parameters['num_epochs'],
    info=False, log_every=train_parameters['log_every'])
#-- move the model back to cpu: needed before saving and convenient for the evaluation cells --#
model.to('cpu')
#-- export checkpoint (for loading the model back to python) and torchscript traced module --#
save_folder = set+"deeptica/"
try:
    os.mkdir(save_folder)
except FileExistsError:
    # only an already existing folder is tolerated; any other failure (missing parent
    # folder, permissions, ...) is raised instead of being reported as "already exists"
    print("already exists")
model.export(save_folder)

In [None]:
# presumably plots the loss history of the fitted model — helper not defined in this notebook, confirm in utils
plot_model_lossfunction(model)

In [None]:
#-- evaluate the trained model on every frame with a single forward pass --#
X = data[descriptors_names].to_numpy()
# transposed so that row i holds the i-th model output for all frames
cvs = np.transpose(model(torch.Tensor(X)).detach().cpu().numpy())
# only the first two outputs are stored as columns of `data`
data["cv1"] = cvs[0]
data["cv2"] = cvs[1]

#-- normalized histograms of the two CVs on a shared y axis --#
fig,axs = plt.subplots(1,2,figsize=(12,4),sharey=True)
data.plot.hist(y="cv1",bins=20,ax=axs[0],density=True,color="b")
data.plot.hist(y="cv2",bins=20,ax=axs[1],density=True,color="b")

In [None]:
#-- time series of the three observables, coloured first by cv1 and then by cv2 --#
for cv_name in ("cv1","cv2"):
    fig,axs = plt.subplots(1,3,figsize=(14,4))
    for panel,observable in zip(axs,("rmsd_ca","end","hbonds")):
        data.plot.scatter(x="time",y=observable,c=cv_name,cmap="fessa",ax=panel)

    # only the cv2 figure gets a title
    if cv_name == "cv2":
        axs[0].set_title("long simulation")
    plt.tight_layout()

In [None]:
#-- hexbin maps of the observable pairs, with the hexagons coloured by cv1 --#
fig,axs = plt.subplots(1,3,figsize=(18,6))

# (y, x) pair shown in each panel; observables: rmsd_ca end hbonds
panel_pairs = [("rmsd_ca","end"),("rmsd_ca","hbonds"),("hbonds","end")]
for panel,(y_name,x_name) in zip(axs,panel_pairs):
    data.plot.hexbin(y=y_name,x=x_name,C="cv1",cmap="fessa",ax=panel)
    panel.grid()
    panel.set_ylabel(y_name)
    panel.set_xlabel(x_name)

# only the left-most panel carries the title
axs[0].set_title("unbias simulation, cv1")

plt.tight_layout()

In [None]:
#-- hexbin maps of the observable pairs, with the hexagons coloured by cv2 --#
fig,axs = plt.subplots(1,3,figsize=(18,6))

# (y, x) pair shown in each panel; observables: rmsd_ca end hbonds
panel_pairs = [("rmsd_ca","end"),("rmsd_ca","hbonds"),("hbonds","end")]
for panel,(y_name,x_name) in zip(axs,panel_pairs):
    data.plot.hexbin(y=y_name,x=x_name,C="cv2",cmap="fessa",ax=panel)
    panel.grid()
    panel.set_ylabel(y_name)
    panel.set_xlabel(x_name)

# only the left-most panel carries the title
axs[0].set_title("unbias simulation, cv2")

plt.tight_layout()

In [None]:
#-- estimation of Free Energy Surface --#
# one column per CV stored in `data` (columns whose name starts with "cv")
s = data.filter(regex="^cv").to_numpy()
# zero log-weights, i.e. every frame enters with weight exp(0) = 1
logweight=np.zeros(s.shape[0])

fig, ax = plt.subplots(figsize=(6,6))
# 1D estimate along each of the first two CVs, drawn on the same axes
for i in range(2):
    fes,grid,bounds,error = compute_fes(s[:,i], weights=np.exp(logweight),
                                        temp=sim_parameters["temp"],
                                        kbt=sim_parameters["kbt"],
                                        blocks=sim_parameters["blocks"],
                                        bandwidth=sim_parameters["bandwidth"],scale_by='range',
                                        plot=True, plot_max_fes=sim_parameters["plot_max_fes"], ax = ax)
ax.legend(["F(cv1) estimate","F(cv2) estimate"])   
ax.grid()
ax.set_xlabel(r"$(cv1,cv2)$")
ax.set_ylabel("FES [kJ/mol]")
# layout is computed after the labels are set so that they are taken into account
plt.tight_layout()

plt.show()

In [None]:
#-- 2D free energy surface over the CV columns of `data` --#
# raise the plotting cap; note that this overwrites the shared sim_parameters entry
sim_parameters["plot_max_fes"] = 40
s = data.filter(regex="^cv").to_numpy()[::1]
# zero log-weights, i.e. every frame enters with weight exp(0) = 1
logweight=np.zeros(len(s))
fig,ax = plt.subplots(1,1,figsize=(10,10))

#-- 2D plot --#
fes_options = dict(temp=sim_parameters["temp"],
                   kbt=sim_parameters["kbt"],
                   blocks=sim_parameters["blocks"],
                   bandwidth=sim_parameters["bandwidth"],
                   scale_by='range',
                   plot=True,
                   ax=ax,
                   plot_max_fes=sim_parameters["plot_max_fes"])
fes,grid,bounds,error = compute_fes(s, weights=np.exp(logweight), **fes_options)

ax.grid()
# NOTE(review): x is labelled "Deep-TICA 2" and y "Deep-TICA 1" — verify this matches the
# axis order compute_fes uses for the columns of s
ax.set_xlabel("Deep-TICA 2")
ax.set_ylabel("Deep-TICA 1")