### How to quantify the best performing simulation:

- If you are interested in stability, look for simulations with minimal fluctuations in **temperature**, **pressure**, and **total energy**, combined with a **potential energy** indicative of a stable, equilibrated system.

- For simulations aimed at exploring thermodynamic properties, metrics like **enthalpy** and **density** at equilibrium can be valuable.

Terms to analyze in the energy file (`.edr`):

- **Potential Energy:** Gives an insight into the stability of the system. Lower and more stable potential energy often indicates a well-equilibrated system.
- **Total Energy:** (The sum of potential and kinetic energies) Should remain constant or relatively stable. Provides a check for energy conservation and overall simulation stability. 
- **Density:** Can indicate whether the system has reached an equilibrium state at the expected density for the given temperature. 

Other Terms 
- **RMSD:** Common metric for assessing structural stability. 


## Potential Energy

In [None]:
# Iterate over the hotkey folders and extract the potential energy of each
# run into <hotkey>/potential.xvg using `gmx energy`.
import os
import subprocess

miner_paths = ['data/5oxe/max_steps_5000/miners']
for loc in miner_paths:
    for key in os.listdir(loc):
        hotkey_path = os.path.join(loc, key)
        # The miners directory may also hold non-folder entries (e.g. saved
        # plot images); only folders can contain an .edr file.
        if not os.path.isdir(hotkey_path):
            continue
        edr_file = os.path.join(hotkey_path, 'md_0_1.edr')
        output_file = os.path.join(hotkey_path, 'potential.xvg')
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through untouched.
        command = ['gmx', 'energy', '-f', edr_file, '-o', output_file]
        try:
            # `gmx energy` reads the term to extract from stdin.
            subprocess.run(command, input='Potential\n', check=True, text=True)
        except (OSError, subprocess.CalledProcessError) as e:
            # OSError covers a missing `gmx` executable, which the shell
            # previously reported as a non-zero exit status.
            print(f"Error running command '{' '.join(command)}': {e}")


In [None]:
import pandas as pd
import plotly.express as px
import plotly.io as pio
import os
 
def _xvg_header_length(path: str) -> int:
    """Return the number of leading ``#`` comment / ``@`` directive lines in an XVG file."""
    count = 0
    with open(path) as handle:
        for line in handle:
            if not line.lstrip().startswith(("#", "@")):
                break
            count += 1
    return count


def find_and_plot(energy_term: str, miner_paths: str):
    """Plot ``<energy_term>.xvg`` of every hotkey folder against time.

    Args:
        energy_term: Base name of the XVG file inside each hotkey folder
            (``<energy_term>.xvg``); also used as the y-column name and in
            the plot title and image file name.
        miner_paths: Directory whose sub-folders are the hotkey folders.

    Returns:
        Whatever ``fig.show()`` returns. The figure is additionally written
        to ``<miner_paths>/<energy_term>.png``.

    Raises:
        FileNotFoundError: If a hotkey folder has no ``<energy_term>.xvg``.
        ValueError: If no hotkey folder is found under ``miner_paths``.
    """
    data_frames = []
    # Sorted, non-hidden names mirror what `ls` lists; the isdir check skips
    # files such as the PNG this function saves into the same directory.
    for key in sorted(os.listdir(miner_paths)):
        hotkey_path = os.path.join(miner_paths, key)
        if key.startswith(".") or not os.path.isdir(hotkey_path):
            continue
        xvg_path = os.path.join(hotkey_path, f"{energy_term}.xvg")
        df = pd.read_csv(
            xvg_path,
            # Header length is read from the file instead of assuming 24 lines.
            skiprows=_xvg_header_length(xvg_path),
            sep=r"\s+",
            names=["time", energy_term],
        )
        # Label each trace with the first five characters of the folder name,
        # independent of how long the parent path is.
        df["hotkey"] = key[:5]
        data_frames.append(df)

    if not data_frames:
        raise ValueError(f"No hotkey folders found in {miner_paths}")
    dfs = pd.concat(data_frames)

    # plot {energy_term} vs time
    fig = px.line(dfs, x="time", y=energy_term, color="hotkey")
    fig.update_layout(title=f"{energy_term} vs Time")
    fig.write_image(os.path.join(miner_paths, f"{energy_term}.png"))
    return fig.show()

# Example: plot the extracted potential energy for every hotkey folder of this run.
find_and_plot(
    energy_term='potential',
    miner_paths='/root/sergio/folding/data/5oxe/max_steps_5000/miners',
)

In [None]:
# Potential energy vs. time, one line per hotkey; shown inline and saved as PNG.
# NOTE(review): this cell expects a notebook-level `dfs` with a
# 'potential_energy' column. No cell visible above creates one (find_and_plot
# keeps its frame local and names the column after `energy_term`) - confirm
# which cell is meant to provide it.
import plotly.express as px
import plotly.io as pio

fig = px.line(
    dfs,
    x='time',
    y='potential_energy',
    color='hotkey',
).update_layout(title='Potential Energy vs Time for Protein 5oxe')
fig.show()
fig.write_image("data/5oxe/max_steps_50/miners/potential_energy.png")

## Total Energy 


In [None]:
# Extract the total energy from each hotkey's .edr file into <hotkey>/total.xvg

import os
import subprocess

miner_paths = ['data/5oxe/max_steps_50/miners']
for loc in miner_paths:
    for key in os.listdir(loc):
        hotkey_path = os.path.join(loc, key)
        # Plot images are saved into this same directory; only folders are
        # hotkey runs.
        if not os.path.isdir(hotkey_path):
            continue
        # Define the command as a list
        cmd = [
            'gmx', 'energy',
            '-f', os.path.join(hotkey_path, 'md_0_1.edr'),
            '-o', os.path.join(hotkey_path, 'total.xvg'),
        ]
        # Answer the energy-term prompt of `gmx energy` through stdin and
        # capture stdout/stderr so failures can be reported per hotkey.
        result = subprocess.run(cmd, input='Total-Energy\n', capture_output=True, text=True)
        if result.returncode == 0:
            print(f"Total Energy Extraction successful for {key}")
        else:
            print(f"Total Energy Extraction unsuccessful for {key}: {result.stderr}")



In [None]:
# create a list of hotkey folder paths: paths
import os

miner_paths = ['data/5oxe/max_steps_50/miners']
paths = []
for loc in miner_paths:  # for each path in the list
    # Sorted, non-hidden names mirror what `ls` lists; the isdir check keeps
    # out the PNG plots that are saved into the same directory.
    for key in sorted(os.listdir(loc)):
        hotkey_path = os.path.join(loc, key)
        if not key.startswith('.') and os.path.isdir(hotkey_path):
            paths.append(hotkey_path)

# create a list of paths to the total.xvg files: total_paths
total_paths = [os.path.join(folder, 'total.xvg') for folder in paths]

total_paths

In [None]:
# create dfs for each total.xvg file and stack them: dfs
import os
import pandas as pd

data_frames = []
for path in total_paths:
    # Count the leading '#' comment / '@' directive lines so the number of
    # skipped rows follows the file instead of a fixed 24.
    header_rows = 0
    with open(path) as handle:
        for line in handle:
            if not line.lstrip().startswith(('#', '@')):
                break
            header_rows += 1
    df = pd.read_csv(path, skiprows=header_rows, sep=r'\s+', names=["time", "total_energy"])
    # First five characters of the hotkey folder name, taken from the path
    # components rather than from a fixed character offset.
    df['hotkey'] = os.path.basename(os.path.dirname(path))[:5]
    data_frames.append(df)

dfs = pd.concat(data_frames)

In [None]:
# Total energy vs. time, one line per hotkey; shown inline and saved as PNG.
import plotly.express as px
import plotly.io as pio

fig = px.line(
    dfs,
    x='time',
    y='total_energy',
    color='hotkey',
).update_layout(title='Total Energy vs Time for Protein 5oxe')
fig.show()
fig.write_image("data/5oxe/max_steps_50/miners/total_energy.png")

## RMSD

- `-s` specifies the reference structure (often the .tpr)
- `-f` specifies the trajectory file (.xtc or .trr)
- `-o` specifies the output file of the RMSD value (in .xvg format)
- `-tu ns` specifies the time unit for the x-axis in the plot (nanoseconds)

The cell below produces the `rmsd.xvg` files. It is important to run the command through a subprocess so that we can answer the two group prompts: the group for the least squares fitting process, and the group for the actual RMSD calculation (which atoms to consider when computing the RMSD between the aligned structures).
1. Selecting 'Protein' for the lsq fit indicates that the least squares fit should be performed based on the entire protein's atomic positions.
2. Selecting 'Protein' for the RMSD calculation tells GROMACS which atoms to consider when computing the RMSD between the aligned structures.

Focusing on the whole protein, i.e. choosing 'Protein' for both of the above choices, provides a general view of structural stability and conformational changes. Selecting other specific groups yields insights into more detailed aspects of protein dynamics. **Comparing these metrics across simulations can help identify the most stable conformations.**

Example Output: 

```
(venv) root@fd7a38380ffb:~/sergio/folding# gmx rms -s data/5oxe/max_steps_50/miners/5f6er/md_0_1.tpr -f data/5oxe/max_steps_50/miners/5f6er/md_0_1.trr -o data/5oxe/max_steps_50/miners/5f6er/rmsd.xvg -tu ns
                       :-) GROMACS - gmx rms, 2024.1 (-:

Executable:   /usr/local/gromacs/bin/gmx
Data prefix:  /usr/local/gromacs
Working dir:  /root/sergio/folding
Command line:
  gmx rms -s data/5oxe/max_steps_50/miners/5f6er/md_0_1.tpr -f data/5oxe/max_steps_50/miners/5f6er/md_0_1.trr -o data/5oxe/max_steps_50/miners/5f6er/rmsd.xvg -tu ns

Reading file data/5oxe/max_steps_50/miners/5f6er/md_0_1.tpr, VERSION 2024.1 (single precision)
Reading file data/5oxe/max_steps_50/miners/5f6er/md_0_1.tpr, VERSION 2024.1 (single precision)
Select group for least squares fit
Group     0 (         System) has 44747 elements
Group     1 (        Protein) has  1037 elements
Group     2 (      Protein-H) has   505 elements
Group     3 (        C-alpha) has    70 elements
Group     4 (       Backbone) has   210 elements
Group     5 (      MainChain) has   279 elements
Group     6 (   MainChain+Cb) has   339 elements
Group     7 (    MainChain+H) has   348 elements
Group     8 (      SideChain) has   689 elements
Group     9 (    SideChain-H) has   226 elements
Group    10 (    Prot-Masses) has  1037 elements
Group    11 (    non-Protein) has 43710 elements
Group    12 (          Water) has 43707 elements
Group    13 (            SOL) has 43707 elements
Group    14 (      non-Water) has  1040 elements
Group    15 (            Ion) has     3 elements
Group    16 ( Water_and_ions) has 43710 elements
Select a group: Protein
Selected 1: 'Protein'
Select group for RMSD calculation
Group     0 (         System) has 44747 elements
Group     1 (        Protein) has  1037 elements
Group     2 (      Protein-H) has   505 elements
Group     3 (        C-alpha) has    70 elements
Group     4 (       Backbone) has   210 elements
Group     5 (      MainChain) has   279 elements
Group     6 (   MainChain+Cb) has   339 elements
Group     7 (    MainChain+H) has   348 elements
Group     8 (      SideChain) has   689 elements
Group     9 (    SideChain-H) has   226 elements
Group    10 (    Prot-Masses) has  1037 elements
Group    11 (    non-Protein) has 43710 elements
Group    12 (          Water) has 43707 elements
Group    13 (            SOL) has 43707 elements
Group    14 (      non-Water) has  1040 elements
Group    15 (            Ion) has     3 elements
Group    16 ( Water_and_ions) has 43710 elements
Select a group: Protein
Selected 1: 'Protein'
trr version: GMX_trn_file (single precision)
Last frame         50 time    0.00
```

In [None]:
# Create RMSD files: run `gmx rms` for every hotkey folder -> <hotkey>/rmsd.xvg

import os
import subprocess

miner_paths = ['data/5oxe/max_steps_50/miners']
for loc in miner_paths:
    for key in os.listdir(loc):
        hotkey_path = os.path.join(loc, key)
        # Plot images are saved into this same directory; only folders are
        # hotkey runs.
        if not os.path.isdir(hotkey_path):
            continue
        # Define the command as a list
        cmd = [
            'gmx', 'rms',
            '-s', os.path.join(hotkey_path, 'md_0_1.tpr'),
            '-f', os.path.join(hotkey_path, 'md_0_1.trr'),
            '-o', os.path.join(hotkey_path, 'rmsd.xvg'),
            '-tu', 'ns'
        ]
        # Send 'Protein' twice through stdin: once for the least squares fit
        # group prompt and once for the RMSD calculation group prompt.
        result = subprocess.run(cmd, input='Protein\nProtein\n', capture_output=True, text=True)
        if result.returncode == 0:
            print(f"RMSD calculation successful for {key}")
        else:
            print(f"Error in RMSD calculation for {key}: {result.stderr}")



In [None]:
# create a list of hotkey folder paths: paths
import os

miner_paths = ['data/5oxe/max_steps_50/miners']
paths = []
for loc in miner_paths:  # for each path in the list
    # Sorted, non-hidden names mirror what `ls` lists; the isdir check keeps
    # out the PNG plots that are saved into the same directory.
    for key in sorted(os.listdir(loc)):
        hotkey_path = os.path.join(loc, key)
        if not key.startswith('.') and os.path.isdir(hotkey_path):
            paths.append(hotkey_path)

# create a list of paths to the rmsd.xvg files: rmsd_paths
rmsd_paths = [os.path.join(folder, 'rmsd.xvg') for folder in paths]

rmsd_paths

In [None]:
# Create DFs out of the rmsd data and stack them: dfs
import os
import pandas as pd

data_frames = []
for path in rmsd_paths:
    # Count the leading '#' comment / '@' directive lines so the number of
    # skipped rows follows the file instead of a fixed 24.
    header_rows = 0
    with open(path) as handle:
        for line in handle:
            if not line.lstrip().startswith(('#', '@')):
                break
            header_rows += 1
    df = pd.read_csv(path, skiprows=header_rows, sep=r'\s+', names=["time", "rmsd"])
    # First five characters of the hotkey folder name, taken from the path
    # components rather than from a fixed character offset.
    df['hotkey'] = os.path.basename(os.path.dirname(path))[:5]
    data_frames.append(df)

dfs = pd.concat(data_frames)
dfs


In [None]:
# RMSD vs. time, one line per hotkey; shown inline and saved as PNG.
import plotly.express as px
import plotly.io as pio

fig = px.line(
    dfs,
    x='time',
    y='rmsd',
    color='hotkey',
).update_layout(title='RMSD vs Time for Protein 5oxe')
fig.show()
fig.write_image("data/5oxe/max_steps_50/miners/rmsd.png")