### Hydrogen Bonding 
***

**Jupyter Notebook** designed to demonstrate the power of the **hexABC REST-API programmatic interface**. 

The **REST-API** is used to extract **Hydrogen Bond** statistics (number of HBs, lifetimes) for specific **base pairs** (e.g. T-A, G-C) in different **MD simulations** and for different **trimers** (e.g. TCA). The example focuses on **terminal trimers**, in order to study **fraying events**.

The workflow is powered by the [hexABC database REST API](https://mmb.irbbarcelona.org/webdev3/hexABC/rest)
***

### Importing auxiliary libraries

In [None]:
import itertools
import json
import urllib
import urllib.request
from math import ceil

import ipywidgets
import plotly
import requests
from IPython.display import display

### Defining auxiliary functions

In [None]:
#
# find_seq_term: finding MD simulations whose Watson strand starts or ends with a sequence
# fragment (or with its reverse complement); returns metadata for the systems found.
#
def find_seq_term(json_data, pattern):
    """Find projects whose first sequence starts or ends with a motif.

    The motif is searched both as given and as its reverse complement.

    Args:
        json_data: Iterable of project dicts. Each matching project must
            provide 'id' and 'name'; 'sequences' is read with ``.get`` and
            only its first entry is inspected.
        pattern: DNA fragment made of the letters A, C, G, T.

    Returns:
        A list of dicts with keys 'id', 'name', 'sequence' and 'positions'.
        'positions' is a list of ``(motif, position)`` tuples, where
        ``position`` is the 1-based index of the motif's second nucleotide
        (the central one when the motif is a trimer).

    Raises:
        KeyError: if ``pattern`` contains a character other than A, C, G, T,
            or a matching project lacks 'id' or 'name'.
    """
    complement_map = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    complement = ''.join(complement_map[base] for base in reversed(pattern))

    # A pattern equal to its own reverse complement must be searched only
    # once, otherwise every hit would be reported (and later counted) twice.
    motifs = [pattern] if complement == pattern else [pattern, complement]

    matching_projects = []

    for project in json_data:
        # Tolerate a missing, None or empty 'sequences' entry
        sequences = project.get('sequences') or [None]
        watson_seq = sequences[0]

        # Skip projects without a usable sequence (more than 4 nucleotides)
        if not watson_seq or len(watson_seq) <= 4:
            continue

        positions = []
        for motif in motifs:
            if watson_seq.startswith(motif):
                positions.append((motif, 2))
            if watson_seq.endswith(motif):
                # 0-based start of the trailing motif, shifted to the
                # 1-based index of its second nucleotide
                start = len(watson_seq) - len(motif)
                positions.append((motif, start + 2))

        if positions:
            matching_projects.append({
                'id': project['id'],
                'name': project['name'],
                'sequence': watson_seq,
                'positions': positions
            })

    return matching_projects


### Base REST-API URL

In [None]:
# Root URL that every endpoint path used in this notebook is appended to
API_BASE_URL = "https://mmb.irbbarcelona.org/webdev3/hexABC/api"

### Getting projects info

Retrieving all the **projects metadata** from the **hexABC database**.

* Endpoint used: https://mmb.irbbarcelona.org/webdev3/hexABC/api/projects

In [None]:
# Single request to the projects endpoint, asking for up to 1000 entries
url_get_projects = API_BASE_URL + '/projects?limit=1000'

with urllib.request.urlopen(url_get_projects) as response:
    raw_body = response.read()

# Decode the JSON payload and pretty-print it for inspection
r_projects = json.loads(raw_body.decode("utf-8"))
print(json.dumps(r_projects, indent=4))

### Pagination

If the number of **projects** exceeds the default limit (50), **pagination** is needed. In this case, we need to loop over the returned pages to store all the desired information.  


In [None]:
# Accumulators filled while walking the pages
projects_metadata = []  # one metadata record per project
accessions = []         # the 'accession' field of every project

# Total project count reported by the previous (single-request) response
n_projects = r_projects['total']

# Page size to request
limit = 100

# Pages required to cover n_projects at this page size
pages = ceil(n_projects / limit)

for page in range(1, pages + 1):

    print(f'Requesting page {page}/{pages}', end='\r')

    # Projects endpoint, with explicit page size and page number
    paginated_url = f'{API_BASE_URL}/projects?limit={limit}&page={page}'

    # Query the API and decode the JSON payload
    with urllib.request.urlopen(paginated_url) as resp:
        payload = resp.read()
    response = json.loads(payload.decode("utf-8"))

    # Keep the full records and, separately, their accessions
    projects = response['projects']
    project_accessions = [entry['accession'] for entry in projects]
    accessions.extend(project_accessions)
    projects_metadata.extend(projects)

print(f'We have obtained metadata information for {len(accessions)} simulations')

## TRIMERS

### Select the trimers

Selecting the desired **trimers** from the list of all possible **DNA trimers** (e.g. GCG)

In [None]:
# Enumerate every ordered triple of the four DNA bases (64 trimers)
bases = ['A', 'T', 'C', 'G']
trimer_list = list(map(''.join, itertools.product(bases, repeat=3)))

# Dropdown used to pick the trimer to analyse; 'GCG' is preselected
dropdown_settings = dict(
    options=trimer_list,
    description='Sel. trimer:',
    disabled=False,
    value='GCG',
)
mdsel = ipywidgets.Dropdown(**dropdown_settings)
display(mdsel)

### Searching for trimers

Looking for the selected **trimer sequence** (or its reverse complement) at either end of the Watson strand of every simulation in the **dataset**.


In [None]:
# Take the trimer currently chosen in the dropdown and look for it
# (and its reverse complement) at the ends of every Watson strand
trimer = mdsel.value
results = find_seq_term(projects_metadata, trimer)

# Report each matching project together with the hits found in it
for result in results:
    header = f"{result['id']} - {result['name']}"
    strand = f"  Watson strand: {result['sequence']}"
    print(header)
    print(strand)
    for motif, pos in result['positions']:
        print(f"    ↳ found '{motif}' at position {pos}")


### Extract hydrogen bond values

For each **trimer** found, extract the **hydrogen bond** values **along time** for the central nucleotide (e.g. G**C**G). 

* Endpoint used: https://mmb.irbbarcelona.org/webdev3/hexABC/api/projects/{id}/analyses/hbonds

In [None]:
from collections import Counter, defaultdict
from itertools import groupby

# Hydrogen-bond values pooled over every matching terminal base pair
aggregated_hbs = []
for result in results:
    print(f"{result['id']} - {result['name']}")
    print(f"  Watson strand: {result['sequence']}")

    # Hydrogen-bond analysis of this project
    # NOTE(review): from/to/factor presumably select and subsample the frames — confirm against the API docs
    url_param = f"{API_BASE_URL}/projects/{result['id']}/analyses/hbonds?from=1&to=-1&factor=500"
    print(url_param)

    with urllib.request.urlopen(url_param) as response:
        hbs = json.loads(response.read().decode("utf-8"))

    # Position reported by find_seq_term for a motif at the end of the strand.
    # Derived from the actual sequence length instead of assuming 20-mers.
    end_pos = len(result['sequence']) - 1

    for motif, pos in result['positions']:
        print(f"    ↳ found '{motif}' at position {pos}")

        # Only interested in terminal (flanking) nucleotides
        if 2 < pos < end_pos:
            continue

        # Prefix to look for in the base-pair labels: position followed by
        # the second (central) base of the motif
        code = f"{pos}{motif[1]}"
        print("Code: " + code)

        # Complementary case: the hit is the reverse complement of the trimer
        if motif != trimer:
            print("Code -rev-: " + code)

        for bp_item in hbs['hbonds']:
            print(bp_item['bp'])
            if not bp_item['bp'].startswith(code):
                continue

            list_hbonds = bp_item['hbonds']
            aggregated_hbs.extend(list_hbonds)
            print(len(list_hbonds))

            # Percentages: share of entries found in each state
            counts = Counter(list_hbonds)
            total = len(list_hbonds)
            percentages = {k: round((v / total) * 100, 2) for k, v in counts.items()}

            # Lifetimes: length of every run of consecutive identical values
            lifetimes = defaultdict(list)
            for value, group in groupby(list_hbonds):
                lifetimes[value].append(sum(1 for _ in group))

            # Output
            print("Percentages:")
            for value in sorted(percentages):
                print(f"  {value}: {percentages[value]}%")

            # Output min and max per state
            print("Min and Max Lifetimes:")
            for state in sorted(lifetimes):
                durations = lifetimes[state]
                print(f"  {state}: min = {min(durations)}, max = {max(durations)}")

print("FINAL NUMBERS:")

# Percentages over the pooled values (read by the pie-chart cell)
counts = Counter(aggregated_hbs)
total = len(aggregated_hbs)
percentages = {k: round((v / total) * 100, 2) for k, v in counts.items()}

# Output
print("Percentages:")
for value in sorted(percentages):
    print(f"  {value}: {percentages[value]}%")

### Plotting hydrogen bond values

**Pie Chart** representing HBs statistics for the specific **trimer**. <br>
Fewer than 3 HBs (for G-C pairs) or fewer than 2 HBs (for A-T pairs) indicate **unstable/broken base pairs**.

In [None]:
import plotly.graph_objects as go

# One slice per hydrogen-bond state, in the order stored in `percentages`
labels = [f"{state} HBs" for state in percentages]
values = list(percentages.values())

# Slices below 5% are pulled out slightly so they remain visible
slice_pull = [0.05 if share < 5 else 0 for share in values]

donut = go.Pie(
    labels=labels,
    values=values,
    hole=0.4,  # Makes it a donut
    pull=slice_pull,
    textinfo='label+percent',
    marker=dict(colors=['#FFDD57', '#FF6B6B', '#4ECDC4', '#556270'])
)
fig = go.Figure(data=[donut])

# Title plus the trimer name written in the centre of the donut
centre_label = dict(text=f"{trimer}", x=0.5, y=0.5, font_size=14, showarrow=False)
fig.update_layout(
    title_text=f"State Distribution for {trimer} trimer",
    title_font_size=20,
    showlegend=True,
    annotations=[centre_label]
)

fig.show()


## TRIMERS COMPARISON

### Select the trimers for comparison

Selecting the desired **trimers** from the list of all possible **DNA trimers** (e.g. GCC - GCG)

In [None]:
# Enumerate every ordered triple of the four DNA bases (64 trimers)
bases = ['A', 'T', 'C', 'G']
trimer_list = list(map(''.join, itertools.product(bases, repeat=3)))

# Every unordered pair of distinct trimers, written as "first-second"
trimer_pairs_list = ['-'.join(pair) for pair in itertools.combinations(trimer_list, 2)]

# Dropdown used to pick the pair to compare; 'GCC-GCG' is preselected
dropdown_settings = dict(
    options=trimer_pairs_list,
    description='Sel. trimer pair:',
    disabled=False,
    value='GCC-GCG',
)
mdsel = ipywidgets.Dropdown(**dropdown_settings)
display(mdsel)

### Extract hydrogen bond values

For each **trimer** found, extract the **hydrogen bond** values **along time** for the central nucleotide (e.g. G**C**G). 

* Endpoint used: https://mmb.irbbarcelona.org/webdev3/hexABC/api/projects/{id}/analyses/hbonds

In [None]:
from collections import Counter, defaultdict
from itertools import groupby


def _state_percentages(values):
    """Return {state: percentage of entries in that state}, rounded to 2 decimals."""
    counts = Counter(values)
    total = len(values)
    return {k: round((v / total) * 100, 2) for k, v in counts.items()}


def _collect_terminal_hbonds(target_trimer):
    """Gather hydrogen-bond values for the terminal occurrences of a trimer.

    Searches `projects_metadata` for strands starting or ending with
    `target_trimer` (or its reverse complement), downloads the hbonds
    analysis of each matching project and pools the values of the base
    pairs whose label starts with "<position><central base>".

    Returns:
        (pooled_values, pooled_lifetimes): the concatenated values, and a
        dict mapping each state to the lengths of its runs of consecutive
        identical values.
    """
    pooled_values = []
    pooled_lifetimes = defaultdict(list)

    for result in find_seq_term(projects_metadata, target_trimer):
        print(f"{result['id']} - {result['name']}")
        print(f"  Watson strand: {result['sequence']}")

        # NOTE(review): from/to/factor presumably select and subsample the frames — confirm against the API docs
        url_param = f"{API_BASE_URL}/projects/{result['id']}/analyses/hbonds?from=1&to=-1&factor=500"
        print(url_param)

        with urllib.request.urlopen(url_param) as response:
            hbs = json.loads(response.read().decode("utf-8"))

        # Position reported by find_seq_term for a motif at the end of the
        # strand. Derived from the sequence length instead of assuming 20-mers.
        end_pos = len(result['sequence']) - 1

        for motif, pos in result['positions']:
            print(f"    ↳ found '{motif}' at position {pos}")

            # Only interested in terminal (flanking) nucleotides
            if 2 < pos < end_pos:
                continue

            # Prefix to look for in the base-pair labels: position followed
            # by the second (central) base of the motif
            code = f"{pos}{motif[1]}"
            print("Code: " + code)

            # Complementary case: compare against the trimer being processed
            # (not the one selected in the single-trimer section)
            if motif != target_trimer:
                print("Code -rev-: " + code)

            for bp_item in hbs['hbonds']:
                print(bp_item['bp'])
                if not bp_item['bp'].startswith(code):
                    continue

                list_hbonds = bp_item['hbonds']
                pooled_values.extend(list_hbonds)

                # Percentages for this base pair
                percentages = _state_percentages(list_hbonds)

                # Lifetimes: length of every run of identical values
                lifetimes = defaultdict(list)
                for value, group in groupby(list_hbonds):
                    length = sum(1 for _ in group)
                    lifetimes[value].append(length)
                    pooled_lifetimes[value].append(length)

                # Output
                print("Percentages:")
                for value in sorted(percentages):
                    print(f"  {value}: {percentages[value]}%")

                # Output min and max per state
                print("Min and Max Lifetimes:")
                for state in sorted(lifetimes):
                    durations = lifetimes[state]
                    print(f"  {state}: min = {min(durations)}, max = {max(durations)}")

    return pooled_values, pooled_lifetimes


trimer1, trimer2 = mdsel.value.split('-')

###############
# TRIMER 1
###############

aggregated_hbs_trimer1, aggregated_hbs_lt_trimer1 = _collect_terminal_hbonds(trimer1)

print("FINAL NUMBERS:")

# Percentages
percentages_trimer1 = _state_percentages(aggregated_hbs_trimer1)

# Output
print("Percentages Trimer 1:")
for value in sorted(percentages_trimer1):
    print(f"  {value}: {percentages_trimer1[value]}%")

###############
# TRIMER 2
###############

aggregated_hbs_trimer2, aggregated_hbs_lt_trimer2 = _collect_terminal_hbonds(trimer2)

print("FINAL NUMBERS:")

# Percentages
percentages_trimer2 = _state_percentages(aggregated_hbs_trimer2)

# Output
print("Percentages Trimer 2:")
for value in sorted(percentages_trimer2):
    print(f"  {value}: {percentages_trimer2[value]}%")

# Output min and max per state
print("Min and Max Lifetimes Trimer 1:")
for state in sorted(aggregated_hbs_lt_trimer1):
    durations = aggregated_hbs_lt_trimer1[state]
    print(f"  {state}: min = {min(durations)}, max = {max(durations)}")

# Output min and max per state
print("Min and Max Lifetimes Trimer 2:")
for state in sorted(aggregated_hbs_lt_trimer2):
    durations = aggregated_hbs_lt_trimer2[state]
    print(f"  {state}: min = {min(durations)}, max = {max(durations)}")


### Plotting hydrogen bond values

**Pie Charts** representing HBs statistics for the specific **trimers**. <br>
Fewer than 3 HBs (for G-C pairs) or fewer than 2 HBs (for A-T pairs) indicate **unstable/broken base pairs**.

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Fixed colour per hydrogen-bond state, shared by both pies
color_map = {
    0: '#1f77b4',  # blue
    1: '#ff7f0e',  # orange
    2: '#2ca02c',  # green
    3: '#d62728'   # red
}

# Two side-by-side pie slots
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])

# One donut per trimer: (percentages, trace name), left to right
pie_inputs = [
    (percentages_trimer1, "Study 1"),
    (percentages_trimer2, "Study 2"),
]
for column, (shares, trace_name) in enumerate(pie_inputs, start=1):
    pie_values = list(shares.values())
    fig.add_trace(go.Pie(
        labels=[f"{state} HBs" for state in shares],
        values=pie_values,
        hole=0.4,
        textinfo='label+percent',
        marker=dict(colors=[color_map[state] for state in shares]),
        # Slices below 5% are pulled out slightly so they remain visible
        pull=[0.05 if share < 5 else 0 for share in pie_values],
        name=trace_name
    ), row=1, col=column)

# Figure title and the trimer names written in the centre of each donut
tit = f"{trimer1}-{trimer2}"
title_settings = {
    'text': tit,
    'x': 0.45,
    'y': 0.98,
    'xanchor': 'left',
    'yanchor': 'top',
    'font': dict(size=22)
}
centre_labels = [
    dict(text=f"{trimer1}", x=0.205, y=0.5, font_size=14, showarrow=False),
    dict(text=f"{trimer2}", x=0.795, y=0.5, font_size=14, showarrow=False)
]
fig.update_layout(title=title_settings, annotations=centre_labels)

fig.show()


### Plotting hydrogen bond lifetimes 

**Histograms** representing HBs lifetimes for the specific **trimers**. <br>


In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# States observed for the first trimer, in ascending order
states = sorted(aggregated_hbs_lt_trimer1)
n_states = len(states)

# One row with one panel per state
panel_titles = [f"{s} HBs" for s in states]
fig = make_subplots(rows=1, cols=n_states, subplot_titles=panel_titles)

# Draw the lifetime histogram of each state in its own panel
for column, state in enumerate(states, start=1):
    run_lengths = aggregated_hbs_lt_trimer1[state]
    # At most 20 bins, fewer when the observed range is narrower
    bin_count = min(20, max(run_lengths) - min(run_lengths) + 1)
    histogram = go.Histogram(
        x=run_lengths,
        nbinsx=bin_count,
        marker_color='#1f77b4',
        name=f"{state} HBs",
        showlegend=False
    )
    fig.add_trace(histogram, row=1, col=column)

# Overall size, title and spacing
fig.update_layout(
    title_text=f"Lifetime Distributions per State - {trimer1}",
    height=400,
    width=300 * n_states,
    bargap=0.1,
    margin=dict(t=50, r=20, l=20, b=40)
)

# Every panel gets an x-axis title; only the first panel gets a y-axis title
fig.update_yaxes(title_text="Frequency", row=1, col=1)
for column in range(1, n_states + 1):
    fig.update_xaxes(title_text="Lifetime (frames)", row=1, col=column)

fig.show()


In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# States observed for the second trimer, in ascending order
states = sorted(aggregated_hbs_lt_trimer2)
n_states = len(states)

# One row with one panel per state
panel_titles = [f"{s} HBs" for s in states]
fig = make_subplots(rows=1, cols=n_states, subplot_titles=panel_titles)

# Draw the lifetime histogram of each state in its own panel
for column, state in enumerate(states, start=1):
    run_lengths = aggregated_hbs_lt_trimer2[state]
    # At most 20 bins, fewer when the observed range is narrower
    bin_count = min(20, max(run_lengths) - min(run_lengths) + 1)
    histogram = go.Histogram(
        x=run_lengths,
        nbinsx=bin_count,
        marker_color='#1f77b4',
        name=f"{state} HBs",
        showlegend=False
    )
    fig.add_trace(histogram, row=1, col=column)

# Overall size, title and spacing
fig.update_layout(
    title_text=f"Lifetime Distributions per State - {trimer2}",
    height=400,
    width=300 * n_states,
    bargap=0.1,
    margin=dict(t=50, r=20, l=20, b=40)
)

# Every panel gets an x-axis title; only the first panel gets a y-axis title
fig.update_yaxes(title_text="Frequency", row=1, col=1)
for column in range(1, n_states + 1):
    fig.update_xaxes(title_text="Lifetime (frames)", row=1, col=column)

fig.show()
