# Overview of the SN25 pipelines for Product

In [18]:
import bittensor as bt
from typing import List, Dict
import requests

import pandas as pd 


# This is a plot of the last part of their simulation but we will need the entire curve. We don't have this yet. 
import plotly.graph_objects as go
import numpy as np

In [19]:
# necessary preprocessing pipelines 

def response_to_dict(response) -> List[Dict]:
    response = response.json()["results"][0]
    if "error" in response.keys():
        raise ValueError(f"Failed to get all PDBs: {response['error']}")
    elif "values" not in response.keys():
        return {}
    columns = response["columns"]
    values = response["values"]
    data = [dict(zip(columns, row)) for row in values]
    return data

# data cols that are present in the GJP
data_columns = [
    "id", 
    "pdb_id", 
    "system_config", 
    "s3_links", 
    "priority", 
    "hotkeys", 
    "is_organic", 
    "active", 
    "update_interval", 
    "max_time", 
    "epsilon", 
    "min_updates", 
    "updated_at", 
    "best_loss", 
    "best_loss_at", 
    "best_hotkey", 
    "updated", 
    "created_at", 
    "best_cpt_links", 
    "job_type", 
    "event", 
    "validator_hotkey", 
    "job_id", 
    "computed_rewards"
]

## Things have changed 

We have changed since the last dashboard but the data is more or less the same, just comes in a different format. We should be entirely reliant on the Global Job Pool (GJP), which is a centralized db that automatically syncs across all miners and validators. 

## The Global Job Pool
Properies: 
1. Centrally hosted by us

    a. Production db: "174.138.3.61:4001"

    b. Test db: "167.99.209.27:4001"

2. queryable using simple http requests 

I will break down the major sections on Tizi's dashboard with the code that is necessary to get the represenative data. 


![title](dashboard/proteins_folded.png)

In [33]:
# Here you can get all jobs from the GJP, but you just need to choose the right db. 
GJP_ADDRESS = "174.138.3.61:4001" #prod 
# GJP_ADDRESS = "167.99.209.27:4001" #test 

response = requests.get(
    f"http://{GJP_ADDRESS}/db/query",
    params={
        "q": f"SELECT * FROM jobs"
    },
)


In [34]:
data = response_to_dict(response)
print(len(data))

226


In [35]:
data[2]

{'id': 3,
 'pdb_id': '2ajn',
 'system_config': '{"ff": "amber14-all.xml", "box": "cube", "water": "amber14/tip3pfb.xml", "system_kwargs": {"friction": 1.05, "temperature": 323.32}}',
 's3_links': '{"cpt": "https://nyc3.digitaloceanspaces.com/sn25-folding-mainnet/inputs/2ajn/5HYk8DMK/2025-03-31_13-58-47/em.cpt", "pdb": "https://nyc3.digitaloceanspaces.com/sn25-folding-mainnet/inputs/2ajn/5HYk8DMK/2025-03-31_13-58-47/2ajn.pdb"}',
 'priority': '1',
 'hotkeys': '["5HN5q9FaM6vPwnqptbXaiLFtzd8bYa4VyNwJt96fyWgYpt1x", "5F6aRdsBHajN2NhZHBTB6ibBFu7YuZZEWruWzB8x6B6GiZ4D", "5Dcj9Vp77t55fYC3o1aoH1HM2wVP4TSXReqfwSV343t7P8LT", "5Fy3MjrdKRvUWSuJa4Yd5dmBYunzKNmXnLcvP22NfaTvhQCY", "5HYficC7RnNWdqqo7cLToUyaWYsaf2KwkMEgZnfxaC1uRZH4", "5G3onMPUu1PiXcAFtVqe5rduPJgejG3CJB6DL29h8qdkAadA", "5EUF8Ydyj87cggYZxvANwEPNDtrZdaF533AzWkJr5kyFSuhg", "5E7u7wM5iskNjhvzP7WJ8vbyLjKmQeBEucmyUPycYSuwj7P6", "5E9rJadEwvYkn75xcznoysgJtuk33xRuU8QP2BuTsGT9qcgG", "5HKMcnD5iPDBvfL3HXiaEpN4J6T2Ro8wE2GjDzmVKqEYGapN", "5HU3t5NcoezYQaC

As you can see, there are some time columns that you need: 
1. created_at 
2. updated_at --> this is the time that the job was actually closed, since RIGHT NOW we only do only one update. 

In practice, we should also have a "time_closed" param 

But using the above information, you should have what you need to make the above plot. 

![title](dashboard/gjp.png)

You're going to use the same query as above, but you could segement them by job status and merge if you want

In [36]:
response_active = requests.get(
    f"http://{GJP_ADDRESS}/db/query",
    params={
        "q": f"SELECT * FROM jobs WHERE active = 1"
    },
)

response_inactive = requests.get(
    f"http://{GJP_ADDRESS}/db/query",
    params={
        "q": f"SELECT * FROM jobs WHERE active = 0"
    },
)

data_active = response_to_dict(response_active)
data_inactive = response_to_dict(response_inactive)



In [37]:
data_active[-1]

{'id': 226,
 'pdb_id': '2bc8',
 'system_config': '{"ff": "charmm36.xml", "box": "cube", "water": "charmm36/water.xml", "system_kwargs": {"friction": 0.92, "temperature": 335.0}}',
 's3_links': '{"cpt": "https://nyc3.digitaloceanspaces.com/sn25-folding-mainnet/inputs/2bc8/5F4tQyWr/2025-04-02_00-30-43/em.cpt", "pdb": "https://nyc3.digitaloceanspaces.com/sn25-folding-mainnet/inputs/2bc8/5F4tQyWr/2025-04-02_00-30-43/2bc8.pdb"}',
 'priority': '1',
 'hotkeys': '[""]',
 'is_organic': '0',
 'active': '1',
 'update_interval': '11241',
 'max_time_no_improvement': '1',
 'epsilon': '1.0',
 'min_updates': '1',
 'updated_at': '2025-04-02T00:30:51',
 'best_loss': '100000000000000.0',
 'best_loss_at': '0001-01-01T00:00:00',
 'best_hotkey': '',
 'updated_count': '0',
 'created_at': '2025-04-02T00:30:51',
 'best_cpt_links': None,
 'job_type': 'SyntheticMD',
 'event': '{"box": "cube", "epsilon": 1, "ff": "charmm36.xml", "hp_sample_time": 18.099489450454712, "hp_tries": 0, "init_energy": -35250.7963170179

In [38]:
# Importantly, the KEY column variable to know the job TYPE (organic or synthetic) is job_type.
# Right now, we only have SyntheticMD in the GJP, but eventually we will have more (OrganicMD) 

print(data_active[0]["job_type"])

SyntheticMD


![title](dashboard/completed_tasks.png)

In [39]:
# you will be able to access the pdb files using the s3_links column. 
import json

s3_links = json.loads(data_inactive[0]["s3_links"])
print(s3_links)
print(s3_links["pdb"]) # this is the one you need. 

{'cpt': 'https://nyc3.digitaloceanspaces.com/sn25-folding-mainnet/inputs/1yxr/5HYk8DMK/2025-03-31_13-57-18/em.cpt', 'pdb': 'https://nyc3.digitaloceanspaces.com/sn25-folding-mainnet/inputs/1yxr/5HYk8DMK/2025-03-31_13-57-18/1yxr.pdb'}
https://nyc3.digitaloceanspaces.com/sn25-folding-mainnet/inputs/1yxr/5HYk8DMK/2025-03-31_13-57-18/1yxr.pdb


In [40]:
data_inactive[0]["event"]

'{"best_cpt": ["", "", "", "/home/ubuntu/folding/data/1yxr/5Dyv2T3R/md_0_1_old.cpt", ""], "block": 5249844, "box": "cube", "checked_energy": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, [-302935.0094707416, -302639.5962412626, -302735.4712121861, -302953.7007446001, -302264.0444120374, -302921.6811376186, -301979.8771782007, -302277.51207125734, -302808.8773502619, -303030.70163857134, -302944.1748522695, -302722.371177393, -302484.03820947465, -302498.56518408866, -302788.49207382184, -302315.16452659416, -303276.65925520856, -303107.8775322455, -302981.76582503243, -302903.3377526503, -303188.3910690468, -302988.86965750006, -303383.3548369606, -302274.9291888563, -303287.3363222218, -303235.1484155728, -303131.224109049, -303515.0352381334, -303089.1238436369, -303530.5639466479, -303341.87227850925, -303588.94743142894, -302616.98860365537, -302728.6973886322, -302470.2821264464, -302952.2458933016, -302873.8316413905, -303363.6415334599, -302543.1061163111, -302736.1289402758, -302371.102782340

In [43]:
# Miner energies can be found 
event = json.loads(data_inactive[0]["event"])
event.keys()

dict_keys(['best_cpt', 'block', 'box', 'checked_energy', 'energies', 'epsilon', 'ff', 'hp_sample_time', 'hp_tries', 'init_energy', 'input_source', 'is_duplicate', 'is_run_valid_time', 'is_valid', 'job_type', 'md_inputs', 'md_inputs_sizes', 'miner_energy', 'ns_computed', 'pdb_complexity', 'pdb_id', 'process_md_output_time', 'reason', 'reported_energy', 'response_returned_files', 'response_returned_files_sizes', 'response_status_codes', 'response_status_messages', 'response_times', 'rmsds', 's3_links', 'seed', 'step_length', 'system_kwargs', 'uid_search_time', 'uids', 'validator_search_status', 'water'])

In [45]:
# Here you will get: 
# 1. energies, which is the miner's final energy
# 2. miner_energy curve, which is the energy of the miner at each step (miner reported)
# 3. checked_energy curve, which is the energy of the miner at each step (validator reprod)
# 4. computed_rewards, which is the reward of the miner at each step (validator distributed)

sorted_data = sorted(
    zip(event["energies"], event["miner_energy"], event["checked_energy"], json.loads(data_inactive[0]["computed_rewards"])),
    key=lambda x: x[3], # sort by computed rewards
    reverse=True
)

# This is important because you will also need the top K results to plot the top K results in an other panel. 
sorted_data

[(-304013.42446201446,
  [-303068.7811164282,
   -302925.7873032392,
   -302745.6511405377,
   -303283.9474764734,
   -302826.56689027965,
   -303405.3732351778,
   -302589.5023686608,
   -303256.5320554704,
   -303594.06518998515,
   -303624.49934287346,
   -303678.66916431783,
   -302875.4421088337,
   -303862.8738857659,
   -303737.8385735652,
   -303777.76027569326,
   -303043.4333860837,
   -302974.9587267325,
   -303206.8391935121,
   -302865.5526187601,
   -303309.4326147946,
   -302511.8742842072,
   -302831.3731274162,
   -302666.65952819807,
   -303333.4654638876,
   -303149.4858570973,
   -303557.7413332152,
   -302142.5129017747,
   -302152.47718912084,
   -303364.30362666846,
   -303614.1207268747,
   -302897.78115496645,
   -302974.71808980417,
   -303085.8399040642,
   -302936.6709241024,
   -302805.82112130045,
   -303089.60729229136,
   -302615.11381897243,
   -303322.8787637515,
   -302941.9470565173,
   -303494.11339378974,
   -302591.63631362066,
   -302900.63454197

In [46]:
print(sorted_data[0][0]) # THIS IS THE FINAL MINER ENERGY FOR THE BEST MINER. 
print(sorted_data[1][0]) # THIS IS THE FINAL MINER ENERGY FOR THE SECOND BEST MINER. 
#....

-304013.42446201446
-303739.85838806594


![title](dashboard/energy_vs_time.png)

In [47]:
# This is an example of plotting the miner and checked energy curves for the FIRST (best) miner. 
example_label_data = {key:sorted_data[0][ii]  for ii, key in enumerate(["final_miner_energy", "miner_energy_curve", "checked_energy_curve", "computed_rewards"])}

fig = go.Figure()
# Add each line as a separate trace
fig.add_trace(go.Line(x = np.arange(len(example_label_data["miner_energy_curve"])), y = example_label_data["miner_energy_curve"], name='Miner Energy'))
fig.add_trace(go.Line(x = np.arange(len(example_label_data["checked_energy_curve"])), y = example_label_data["checked_energy_curve"], name='Checked Energy'))
fig.update_layout(title="Energy Curves for a SMALL SEGMENT OF THE TOTAL MINER ENERGY CURVE", xaxis_title="Step", yaxis_title="Energy")



plotly.graph_objs.Line is deprecated.
Please replace it with one of the following more specific types
  - plotly.graph_objs.scatter.Line
  - plotly.graph_objs.layout.shape.Line
  - etc.




## important: The above energy curve is simply a short REPRODUCED segement vs the miner's self reported energy. This is the reproduced energy lines that Tizi has

### We need to start saving the miner energy logs.... I thought we did but looks like im mistaken 

## Submission to the GJP. 
There are several considerations: 

1. The data that goes into the GJP must conform to the FoldingParams class, which is created inside of https://github.com/macrocosm-os/gjp-models and recreated inside of the folding_api.schemas folder 

2. The endpoint is requests.post(
            f"http://{address}/organic" ) 


3. You have to use Epistula, which is a cryptographic signature manager for secure message handling. It is signed by the users bittensor coldkey

In [49]:
from atom.epistula.epistula import Epistula
from folding_api.schemas import FoldingParams

In [1]:
# Easiest thing to try is to submit to the test GJP -- but I think you need the proper credentials.
folding_params = FoldingParams(
    pdb_id=pdb_id,  # Ensure it's a string
    source="rcsb",  # Default to RCSB source
    ff=forcefield,
    water=water_model,
    box=box_shape,
    temperature=temperature,
    friction=friction,
    epsilon=1.0,  # Default epsilon value
    priority=priority,  # Add priority parameter
)

def make_request(
    address: str, folding_params: FoldingParams
) -> requests.Response:
    try:
        # Convert params to JSON and encode
        body_bytes = json.dumps(
            folding_params.model_dump(), default=str, sort_keys=True
        ).encode("utf-8")

        # Generate headers using Epistula
        epistula = Epistula()
        headers = epistula.generate_header(
            wallet.hotkey, body_bytes
        )

        # Make the request with timeout
        response = requests.post(
            f"http://{address}/organic",
            data=body_bytes,
            headers=headers,
            timeout=30,  # Add timeout
        )
        response.raise_for_status()  # Raise exception for bad status codes
        return response
    except requests.exceptions.Timeout:
        raise TimeoutError("Request timed out. Please try again.")
    except requests.exceptions.RequestException as e:
        raise ConnectionError(f"Failed to connect to server: {str(e)}")

# Make the request
response = make_request(
    address=API_ADDRESS, folding_params=folding_params
)

NameError: name 'FoldingParams' is not defined

## Current Searchable DB 

In [50]:
from folding.utils.ops import load_pkl

In [55]:
# these are pdbs that are saved locally. 
pkl_path = "./pdb_ids.pkl"
pdb_ids = load_pkl(pkl_path)


In [66]:
all_pdbs = [] 
for source_name, values in pdb_ids.items(): 
    all_pdbs.extend(values['pdbs'])

len(all_pdbs)

224572

In [71]:
# when it comes to actually downloading the pdbs, you can use the following code: 
import os
from folding.utils.ops import check_and_download_pdbs

path_to_save = "./pdbs"
os.makedirs(path_to_save, exist_ok=True)

#select the pdb id from the list of all pdbs, select the first one for now 
pdb_id = all_pdbs[1]
input_source = "rcsb" # but you can also use "pdbe", it is indicated in source_name

await check_and_download_pdbs(pdb_directory=path_to_save, pdb_id=pdb_id + ".pdb", input_source=input_source, force=True)

True