### Base pair parameters bimodality at the hexamer level 
***

**Jupyter Notebook** designed to demonstrate the power of the **hexABC REST-API programmatic interface**. 

The **REST-API** is used to extract **base-pair step parameters** *(Rise, Shift, Slide, Roll, Twist, Tilt)* for specific **tetramer** (e.g. TACT) and **hexamer** (e.g. TGTATA) sequences found in different **MD simulations** and different **sequences**. 

The workflow is powered by the [hexABC database REST API](https://mmb.irbbarcelona.org/webdev3/hexABC/rest)
***


### Importing auxiliary libraries

In [281]:
import itertools
import json
import urllib
import urllib.request
from math import ceil

import ipywidgets
import plotly
import requests
from IPython.display import display

### Defining auxiliary functions

In [282]:
#
# find_seq: finding MD simulations containing a sequence fragment; Returns metadata for the systems found.
#
def find_seq(json_data, pattern):
    """Find the projects whose first sequence contains a motif or its reverse complement.

    The first entry of each project's 'sequences' list is searched (it is
    treated as the Watson strand). The two first and two last bases of that
    strand are excluded from the search (flanking regions). Overlapping
    occurrences are all reported.

    Args:
        json_data: iterable of project dicts. Each one is read for the keys
            'sequences', 'id' and 'name'.
        pattern: motif to look for, written with the upper-case letters
            A, C, G and T.

    Returns:
        A list with one dict per matching project, holding 'id', 'name',
        'sequence' (the full searched strand) and 'positions', a list of
        (motif, index) tuples where index is the 0-based start of the match
        in the full (untrimmed) strand.

    Raises:
        KeyError: if pattern contains a character other than A, C, G or T,
            or if a matching project has no 'id' or 'name'.
    """
    complement_map = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    reverse_complement = ''.join(complement_map[base] for base in reversed(pattern))

    # A self-complementary motif (e.g. ATAT) is identical to its reverse
    # complement: search it only once so every occurrence is reported once.
    if reverse_complement == pattern:
        motifs = [pattern]
    else:
        motifs = [pattern, reverse_complement]

    matching_projects = []

    for project in json_data:
        # A missing key and an empty list are both treated as "no sequence"
        sequences = project.get('sequences') or [None]
        watson_seq = sequences[0]

        if not watson_seq or len(watson_seq) <= 4:
            continue

        trimmed_seq = watson_seq[2:-2]  # Skip first and last 2 bases (flanking regions)
        positions = []

        for motif in motifs:
            pos = trimmed_seq.find(motif)
            while pos != -1:
                # Adjust position relative to original sequence
                positions.append((motif, pos + 2))
                # Restart one base further so overlapping matches are found
                pos = trimmed_seq.find(motif, pos + 1)

        if positions:
            matching_projects.append({
                'id': project['id'],
                'name': project['name'],
                'sequence': watson_seq,
                'positions': positions
            })

    return matching_projects

### Base REST-API URL

In [283]:
# Root URL of the hexABC REST API; endpoint paths are appended to this string
API_BASE_URL = "https://mmb.irbbarcelona.org/webdev3/hexABC/api"

### Getting projects info

Retrieving all the **projects metadata** from the **hexABC database**.

* Endpoint used: https://mmb.irbbarcelona.org/webdev3/hexABC/api/projects

In [284]:
# Ask the projects endpoint for up to 1000 entries in a single request,
# decode the JSON reply and pretty-print it
url_get_projects = f'{API_BASE_URL}/projects?limit=1000'
with urllib.request.urlopen(url_get_projects) as response:
    raw_payload = response.read()
r_projects = json.loads(raw_payload.decode("utf-8"))
print(json.dumps(r_projects, indent=4))

{
    "total": 380,
    "projects": [
        {
            "id": "seq001-1",
            "accession": "seq001-1",
            "name": "hexABC Sequence 001 - replica 1",
            "sequences": [
                "GCAAACTTGTATATGTGTGC",
                "GCACACATATACAAGTTTGC"
            ],
            "analyses": [
                "clusters",
                "clusters-00",
                "clusters-01",
                "dist-perres",
                "energies",
                "fluctuation",
                "hbonds",
                "interactions",
                "pca",
                "rgyr",
                "rmsd-pairwise",
                "rmsd-perres",
                "rmsds",
                "sasa",
                "helical"
            ]
        },
        {
            "id": "seq001-2",
            "accession": "seq001-2",
            "name": "hexABC Sequence 001 - replica 2",
            "sequences": [
                "GCAAACTTGTATATGTGTGC",
                "GCACACATATACAAGTTT

### Pagination

If the number of **projects** exceeds the default limit (50), **pagination** is needed. In this case, we need to loop over the returned pages to store all the desired information.  


In [285]:
# Accumulators for the mined data: full metadata records and accession values
projects_metadata = []
accessions = []

# Total number of projects, as reported by the previous response
n_projects = r_projects['total']

# Page size to request, and the number of pages needed to cover every project
limit = 100
pages = ceil(n_projects / limit)

# Walk through the pages; the first page requested is number 1
for page in range(1, pages + 1):

    print(f'Requesting page {page}/{pages}', end='\r')

    # Projects endpoint URL carrying both the limit and the page parameters
    paginated_url = f'{API_BASE_URL}/projects?limit={limit}&page={page}'

    # Query the API and decode the JSON payload
    with urllib.request.urlopen(paginated_url) as resp:
        response = json.loads(resp.read().decode("utf-8"))

    # Mine target data: keep the records and their accession values
    projects = response['projects']
    project_accessions = [project['accession'] for project in projects]
    accessions.extend(project_accessions)
    projects_metadata.extend(projects)

print(f'We have obtained metadata information for {len(accessions)} simulations')

We have obtained metadata information for 380 simulations


## TETRAMERS

### Select the tetramer

Selecting the desired **tetramer** from the list of all possible **DNA tetramers** (e.g. GTAT)

In [286]:
# Enumerate every DNA tetramer (4**4 = 256 sequences), ordered as A, T, C, G
bases = ['A', 'T', 'C', 'G']
tetramer_list = list(map(''.join, itertools.product(bases, repeat=4)))

# Dropdown used to choose the tetramer to analyse
mdsel = ipywidgets.Dropdown(
    description='Sel. tetramer:',
    options=tetramer_list,
    value='GTAT',  # default value
    disabled=False,
)
display(mdsel)

Dropdown(description='Sel. tetramer:', index=209, options=('AAAA', 'AAAT', 'AAAC', 'AAAG', 'AATA', 'AATT', 'AA…

### Searching for tetramers

Looking for a specific **tetrameric sequence** within the **dataset**. 


In [287]:
# Search the selected tetramer in the metadata of every project
tetramer = mdsel.value
results = find_seq(projects_metadata, tetramer)

# Report each matching simulation followed by the occurrences found in it
for hit in results:
    print(f"{hit['id']} - {hit['name']}")
    print(f"  Watson strand: {hit['sequence']}")
    for found_motif, start in hit['positions']:
        print(f"    ↳ found '{found_motif}' at position {start}")


seq001-1 - hexABC Sequence 001 - replica 1
  Watson strand: GCAAACTTGTATATGTGTGC
    ↳ found 'GTAT' at position 8
seq001-2 - hexABC Sequence 001 - replica 2
  Watson strand: GCAAACTTGTATATGTGTGC
    ↳ found 'GTAT' at position 8
seq029-1 - hexABC Sequence 029 - replica 1
  Watson strand: GCAGCTTTAGAGATACAGGC
    ↳ found 'ATAC' at position 12
seq029-2 - hexABC Sequence 029 - replica 2
  Watson strand: GCAGCTTTAGAGATACAGGC
    ↳ found 'ATAC' at position 12
seq030-1 - hexABC Sequence 030 - replica 1
  Watson strand: GCAGGAATAGTCGTATCAGC
    ↳ found 'GTAT' at position 12
seq030-2 - hexABC Sequence 030 - replica 2
  Watson strand: GCAGGAATAGTCGTATCAGC
    ↳ found 'GTAT' at position 12
seq038-1 - hexABC Sequence 038 - replica 1
  Watson strand: GCATACCATAGACACAAAGC
    ↳ found 'ATAC' at position 2
seq038-2 - hexABC Sequence 038 - replica 2
  Watson strand: GCATACCATAGACACAAAGC
    ↳ found 'ATAC' at position 2
seq039-1 - hexABC Sequence 039 - replica 1
  Watson strand: GCATACGGTCGATGCGGGGC
   

### Select the base pair step helical parameter pair

Selecting the desired **base pair step helical parameter pair** from the list of all possible **pairs** (e.g. roll-twist)

In [288]:
# The six base pair step helical parameters
hel_parms = ['rise', 'shift', 'slide', 'roll', 'twist', 'tilt']

# Every unordered pair of distinct parameters, labelled "first-second"
hel_parm_pairs = ['-'.join(pair) for pair in itertools.combinations(hel_parms, 2)]

# Dropdown used to choose the pair of parameters to compare
helsel = ipywidgets.Dropdown(
    description='Sel. pair:',
    options=hel_parm_pairs,
    value='shift-slide',  # default value  #roll-twist
    disabled=False,
)
display(helsel)

Dropdown(description='Sel. pair:', index=5, options=('rise-shift', 'rise-slide', 'rise-roll', 'rise-twist', 'r…

### Extract base pair step parameter values

For each **tetramer** found, extract the **base pair step parameter** values **along time** from the **MD dataset**. 

* Endpoint used: https://mmb.irbbarcelona.org/webdev3/hexABC/api/projects/{id}/analyses/time/{type}

In [289]:
param1, param2 = helsel.value.split('-')


def _collect_time_series(matches, parameter, offset):
    """Download one helical parameter for every motif occurrence and pool the values.

    Args:
        matches: list of dicts as returned by find_seq ('id', 'name',
            'sequence', 'positions').
        parameter: name of the helical parameter, used in the endpoint path.
        offset: value added to each reported position to build the `base`
            query argument.

    Returns:
        A dict with the keys of the API responses. Values that are lists are
        concatenated across all requests (in request order); any other value
        keeps the content of the last response.
    """
    collected = {}

    for result in matches:
        print(f"{result['id']} - {result['name']}")
        print(f"  Watson strand: {result['sequence']}")

        for motif, pos in result['positions']:
            print(f"    ↳ found '{motif}' at position {pos}")

            # pos is the 0-based start of the motif in the Watson strand.
            # NOTE(review): the resulting index presumably selects the central
            # base pair step of the motif - confirm against the API docs.
            # NOTE(review): occurrences found as the reverse complement are
            # read along the Watson strand; shift and tilt conventionally
            # change sign with the reading strand - confirm whether those
            # values must be flipped before pooling.
            pos_index = pos + offset
            url_param = f"{API_BASE_URL}/projects/{result['id']}/analyses/time/{parameter}?base={pos_index}"
            print(url_param)
            with urllib.request.urlopen(url_param) as response:
                payload = json.loads(response.read().decode("utf-8"))

            # Pool the series instead of overwriting them: dict.update() would
            # keep only the values of the last simulation requested.
            # Assumes the response is a JSON object mapping the parameter
            # name to a list of values (as read by the plotting cell) - TODO confirm.
            for key, value in payload.items():
                if isinstance(value, list) and isinstance(collected.get(key, []), list):
                    collected.setdefault(key, []).extend(value)
                else:
                    collected[key] = value

    return collected


# Tetramer: the requested base is 2 positions after the reported motif start
aggregated_data_param1 = _collect_time_series(results, param1, 2)

print("\n\n\nStarting PARAM 2...\n\n\n")

aggregated_data_param2 = _collect_time_series(results, param2, 2)
    

seq001-1 - hexABC Sequence 001 - replica 1
  Watson strand: GCAAACTTGTATATGTGTGC
    ↳ found 'GTAT' at position 8
https://mmb.irbbarcelona.org/webdev3/hexABC/api/projects/seq001-1/analyses/time/shift?base=10
seq001-2 - hexABC Sequence 001 - replica 2
  Watson strand: GCAAACTTGTATATGTGTGC
    ↳ found 'GTAT' at position 8
https://mmb.irbbarcelona.org/webdev3/hexABC/api/projects/seq001-2/analyses/time/shift?base=10
seq029-1 - hexABC Sequence 029 - replica 1
  Watson strand: GCAGCTTTAGAGATACAGGC
    ↳ found 'ATAC' at position 12
https://mmb.irbbarcelona.org/webdev3/hexABC/api/projects/seq029-1/analyses/time/shift?base=14
seq029-2 - hexABC Sequence 029 - replica 2
  Watson strand: GCAGCTTTAGAGATACAGGC
    ↳ found 'ATAC' at position 12
https://mmb.irbbarcelona.org/webdev3/hexABC/api/projects/seq029-2/analyses/time/shift?base=14
seq030-1 - hexABC Sequence 030 - replica 1
  Watson strand: GCAGGAATAGTCGTATCAGC
    ↳ found 'GTAT' at position 12
https://mmb.irbbarcelona.org/webdev3/hexABC/api/pro

seq001-2 - hexABC Sequence 001 - replica 2
  Watson strand: GCAAACTTGTATATGTGTGC
    ↳ found 'GTAT' at position 8
https://mmb.irbbarcelona.org/webdev3/hexABC/api/projects/seq001-2/analyses/time/slide?base=10
seq029-1 - hexABC Sequence 029 - replica 1
  Watson strand: GCAGCTTTAGAGATACAGGC
    ↳ found 'ATAC' at position 12
https://mmb.irbbarcelona.org/webdev3/hexABC/api/projects/seq029-1/analyses/time/slide?base=14
seq029-2 - hexABC Sequence 029 - replica 2
  Watson strand: GCAGCTTTAGAGATACAGGC
    ↳ found 'ATAC' at position 12
https://mmb.irbbarcelona.org/webdev3/hexABC/api/projects/seq029-2/analyses/time/slide?base=14
seq030-1 - hexABC Sequence 030 - replica 1
  Watson strand: GCAGGAATAGTCGTATCAGC
    ↳ found 'GTAT' at position 12
https://mmb.irbbarcelona.org/webdev3/hexABC/api/projects/seq030-1/analyses/time/slide?base=14
seq030-2 - hexABC Sequence 030 - replica 2
  Watson strand: GCAGGAATAGTCGTATCAGC
    ↳ found 'GTAT' at position 12
https://mmb.irbbarcelona.org/webdev3/hexABC/api/pr

### Plotting values

Plotting **density maps** of the **base pair step helical parameter pair** for the **tetramer** on different **simulations**.

In [290]:
# Draw the joint density of the two selected parameters (central contour map)
# together with the density of each parameter alone (top and right panels).
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import gaussian_kde

# Unit written next to each helical parameter in the axis titles
units = {
    'rise': 'Å',
    'shift': 'Å',
    'slide': 'Å',
    'roll': '°',
    'twist': '°',
    'tilt': '°'
}

# Convert inputs to arrays
# (x and y are stacked into one 2-D dataset below, so they must have the same length)
x = np.array(aggregated_data_param1[param1])
y = np.array(aggregated_data_param2[param2])

# KDE grid: joint density estimate evaluated on a 200 x 200 mesh spanning the data range
kde = gaussian_kde([x, y])
xi, yi = np.mgrid[x.min():x.max():200j, y.min():y.max():200j]
coords = np.vstack([xi.ravel(), yi.ravel()])
zi = kde(coords).reshape(xi.shape)

# Subplot grid: 2 x 2 cells with a large bottom-left cell and thin top/right strips
fig = make_subplots(
    rows=2, cols=2,
    column_widths=[0.8, 0.2],
    row_heights=[0.2, 0.8],
    shared_xaxes=True,
    shared_yaxes=True,
    horizontal_spacing=0.02,
    vertical_spacing=0.02
)

# Central density plot (blue palette)
# zi is indexed [x, y]; it is transposed so that its rows run along y
fig.add_trace(
    go.Contour(
        x=xi[:, 0], y=yi[0, :], z=zi.T,
        colorscale='Blues',
        contours_coloring='fill',
        showscale=False
    ),
    row=2, col=1
)

# Top marginal KDE of x (blue line, translucent fill down to zero)
x_kde = gaussian_kde(x)
x_vals = np.linspace(x.min(), x.max(), 200)
fig.add_trace(
    go.Scatter(x=x_vals, y=x_kde(x_vals), mode='lines', line=dict(color='royalblue'), fill='tozeroy',fillcolor='rgba(65, 105, 225, 0.3)'),
    row=1, col=1
)

# Right marginal KDE of y, drawn sideways (blue line, translucent fill towards x = 0)
y_kde = gaussian_kde(y)
y_vals = np.linspace(y.min(), y.max(), 200)
fig.add_trace(
    go.Scatter(y=y_vals, x=y_kde(y_vals), mode='lines', line=dict(color='royalblue'), fill='tozerox', fillcolor='rgba(65, 105, 225, 0.3)'),
    row=2, col=2
)


# Clean background and axis labels
fig.update_layout(
    title=tetramer,
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='white',
    showlegend=False,
    margin=dict(t=60, r=60, b=60, l=60),
    yaxis1=dict(showticklabels=False),  # Hide y-ticks on top marginal
    xaxis4=dict(showticklabels=False)   # Hide x-ticks on right marginal
)

# Axis pair 3 is used for the titles of the central density plot (row 2, col 1)
fig.layout['xaxis3'].title = f'{param1} ({units[param1]})'  # x-axis title
fig.layout['yaxis3'].title = f'{param2} ({units[param2]})'  # y-axis title


fig.show()


## HEXAMERS

### Select the hexamer

Selecting the desired **hexamer** from the list of all possible **DNA hexamers** (e.g. TGTATA)

In [291]:
# Enumerate every DNA hexamer (4**6 = 4096 sequences), ordered as A, T, C, G
bases = ['A', 'T', 'C', 'G']
hexamer_list = list(map(''.join, itertools.product(bases, repeat=6)))

# Dropdown used to choose the hexamer to analyse
mdsel = ipywidgets.Dropdown(
    description='Sel. hexamer:',
    options=hexamer_list,
    value='TGTATA',  # default value
    disabled=False,
)
display(mdsel)

Dropdown(description='Sel. hexamer:', index=1860, options=('AAAAAA', 'AAAAAT', 'AAAAAC', 'AAAAAG', 'AAAATA', '…

### Searching for hexamers

Looking for a specific **hexameric sequence** within the **dataset**. 


In [292]:
# Search the selected hexamer in the metadata of every project
hexamer = mdsel.value
results = find_seq(projects_metadata, hexamer)

# Report each matching simulation followed by the occurrences found in it
for hit in results:
    print(f"{hit['id']} - {hit['name']}")
    print(f"  Watson strand: {hit['sequence']}")
    for found_motif, start in hit['positions']:
        print(f"    ↳ found '{found_motif}' at position {start}")


seq001-1 - hexABC Sequence 001 - replica 1
  Watson strand: GCAAACTTGTATATGTGTGC
    ↳ found 'TGTATA' at position 7
seq001-2 - hexABC Sequence 001 - replica 2
  Watson strand: GCAAACTTGTATATGTGTGC
    ↳ found 'TGTATA' at position 7


### Select the base pair step helical parameter pair

Selecting the desired **base pair step helical parameter pair** from the list of all possible **pairs** (e.g. slide-roll)

In [293]:
# The six base pair step helical parameters
hel_parms = ['rise', 'shift', 'slide', 'roll', 'twist', 'tilt']

# Every unordered pair of distinct parameters, labelled "first-second"
hel_parm_pairs = ['-'.join(pair) for pair in itertools.combinations(hel_parms, 2)]

# Dropdown used to choose the pair of parameters to compare
helsel = ipywidgets.Dropdown(
    description='Sel. pair:',
    options=hel_parm_pairs,
    value='shift-slide',  # default value
    disabled=False,
)
display(helsel)

Dropdown(description='Sel. pair:', index=5, options=('rise-shift', 'rise-slide', 'rise-roll', 'rise-twist', 'r…

### Extract base pair step parameter values

For each **hexamer** found, extract the **base pair step parameter** values **along time** from the **MD dataset**. 

* Endpoint used: https://mmb.irbbarcelona.org/webdev3/hexABC/api/projects/{id}/analyses/time/{type}

In [294]:
param1, param2 = helsel.value.split('-')


def _collect_time_series(matches, parameter, offset):
    """Download one helical parameter for every motif occurrence and pool the values.

    Args:
        matches: list of dicts as returned by find_seq ('id', 'name',
            'sequence', 'positions').
        parameter: name of the helical parameter, used in the endpoint path.
        offset: value added to each reported position to build the `base`
            query argument.

    Returns:
        A dict with the keys of the API responses. Values that are lists are
        concatenated across all requests (in request order); any other value
        keeps the content of the last response.
    """
    collected = {}

    for result in matches:
        print(f"{result['id']} - {result['name']}")
        print(f"  Watson strand: {result['sequence']}")

        for motif, pos in result['positions']:
            print(f"    ↳ found '{motif}' at position {pos}")

            # pos is the 0-based start of the motif in the Watson strand.
            # NOTE(review): the resulting index presumably selects the central
            # base pair step of the motif - confirm against the API docs.
            # NOTE(review): occurrences found as the reverse complement are
            # read along the Watson strand; shift and tilt conventionally
            # change sign with the reading strand - confirm whether those
            # values must be flipped before pooling.
            pos_index = pos + offset
            url_param = f"{API_BASE_URL}/projects/{result['id']}/analyses/time/{parameter}?base={pos_index}"
            print(url_param)
            with urllib.request.urlopen(url_param) as response:
                payload = json.loads(response.read().decode("utf-8"))

            # Pool the series instead of overwriting them: dict.update() would
            # keep only the values of the last simulation requested.
            # Assumes the response is a JSON object mapping the parameter
            # name to a list of values (as read by the plotting cell) - TODO confirm.
            for key, value in payload.items():
                if isinstance(value, list) and isinstance(collected.get(key, []), list):
                    collected.setdefault(key, []).extend(value)
                else:
                    collected[key] = value

    return collected


# Hexamer: the requested base is 3 positions after the reported motif start
aggregated_data_param1 = _collect_time_series(results, param1, 3)

print("\n\n\nStarting PARAM 2...\n\n\n")

aggregated_data_param2 = _collect_time_series(results, param2, 3)
    

seq001-1 - hexABC Sequence 001 - replica 1
  Watson strand: GCAAACTTGTATATGTGTGC
    ↳ found 'TGTATA' at position 7
https://mmb.irbbarcelona.org/webdev3/hexABC/api/projects/seq001-1/analyses/time/shift?base=10
seq001-2 - hexABC Sequence 001 - replica 2
  Watson strand: GCAAACTTGTATATGTGTGC
    ↳ found 'TGTATA' at position 7
https://mmb.irbbarcelona.org/webdev3/hexABC/api/projects/seq001-2/analyses/time/shift?base=10



Starting PARAM 2...



seq001-1 - hexABC Sequence 001 - replica 1
  Watson strand: GCAAACTTGTATATGTGTGC
    ↳ found 'TGTATA' at position 7
https://mmb.irbbarcelona.org/webdev3/hexABC/api/projects/seq001-1/analyses/time/slide?base=10
seq001-2 - hexABC Sequence 001 - replica 2
  Watson strand: GCAAACTTGTATATGTGTGC
    ↳ found 'TGTATA' at position 7
https://mmb.irbbarcelona.org/webdev3/hexABC/api/projects/seq001-2/analyses/time/slide?base=10


### Plotting values

Plotting **density maps** of the **base pair step helical parameter pair** for the **hexamer** on different **simulations**.

In [295]:
# Draw the joint density of the two selected parameters (central contour map)
# together with the density of each parameter alone (top and right panels).
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import gaussian_kde

# Unit written next to each helical parameter in the axis titles
units = {
    'rise': 'Å',
    'shift': 'Å',
    'slide': 'Å',
    'roll': '°',
    'twist': '°',
    'tilt': '°'
}

# Convert inputs to arrays
# (x and y are stacked into one 2-D dataset below, so they must have the same length)
x = np.array(aggregated_data_param1[param1])
y = np.array(aggregated_data_param2[param2])

# KDE grid: joint density estimate evaluated on a 200 x 200 mesh spanning the data range
kde = gaussian_kde([x, y])
xi, yi = np.mgrid[x.min():x.max():200j, y.min():y.max():200j]
coords = np.vstack([xi.ravel(), yi.ravel()])
zi = kde(coords).reshape(xi.shape)

# Subplot grid: 2 x 2 cells with a large bottom-left cell and thin top/right strips
fig = make_subplots(
    rows=2, cols=2,
    column_widths=[0.8, 0.2],
    row_heights=[0.2, 0.8],
    shared_xaxes=True,
    shared_yaxes=True,
    horizontal_spacing=0.02,
    vertical_spacing=0.02
)

# Central density plot (blue palette)
# zi is indexed [x, y]; it is transposed so that its rows run along y
fig.add_trace(
    go.Contour(
        x=xi[:, 0], y=yi[0, :], z=zi.T,
        colorscale='Blues',
        contours_coloring='fill',
        showscale=False
    ),
    row=2, col=1
)

# Top marginal KDE of x (blue line, translucent fill down to zero)
x_kde = gaussian_kde(x)
x_vals = np.linspace(x.min(), x.max(), 200)
fig.add_trace(
    go.Scatter(x=x_vals, y=x_kde(x_vals), mode='lines', line=dict(color='royalblue'), fill='tozeroy',fillcolor='rgba(65, 105, 225, 0.3)'),
    row=1, col=1
)

# Right marginal KDE of y, drawn sideways (blue line, translucent fill towards x = 0)
y_kde = gaussian_kde(y)
y_vals = np.linspace(y.min(), y.max(), 200)
fig.add_trace(
    go.Scatter(y=y_vals, x=y_kde(y_vals), mode='lines', line=dict(color='royalblue'), fill='tozerox', fillcolor='rgba(65, 105, 225, 0.3)'),
    row=2, col=2
)

# Clean background and axis labels
fig.update_layout(
    title=hexamer,
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='white',
    showlegend=False,
    margin=dict(t=60, r=60, b=60, l=60),
    yaxis1=dict(showticklabels=False),  # Hide y-ticks on top marginal
    xaxis4=dict(showticklabels=False)   # Hide x-ticks on right marginal
)

# Axis pair 3 is used for the titles of the central density plot (row 2, col 1)
fig.layout['xaxis3'].title = f'{param1} ({units[param1]})'  # x-axis title
fig.layout['yaxis3'].title = f'{param2} ({units[param2]})'  # y-axis title


fig.show()
