In [3]:
import json
import sys


def parse_topology_file(top_path: str, json_out_path: str) -> dict:
    """
    Parse a CHARMM-style residue topology file and convert it to JSON.

    Structure:
    {
        "RESIDUE_NAME": {
            "name": str,
            "type": str ("RESI"),
            "charge": float,
            "atoms": {
                "ATOM_NAME": {
                    "name": str,
                    "type": str,
                    "charge": float
                }
            },
            "bonds": [["ATOM1", "ATOM2"], ...],
            "donors": [["H1", "N"], ...],
            "acceptors": [["O", "C"], ...],
            "impropers": [[...], ...]
        }
    }

    Parsing rules:
        * Lines starting with '*' (titles) are skipped; everything from the
          first '!' on a line is treated as a comment and dropped, for every
          record type.
        * Records are recognised by their first whitespace-separated token,
          so tab-separated files work as well as space-separated ones.
        * Recognised keywords: RESI, ATOM, BOND, DONOR/DONO, ACCEPTOR/ACCE,
          IMPROPER/IMPRO/IMPR. Anything else inside a residue is ignored.
        * A PRES (patch) record ends the current residue; the patch's own
          records are skipped and never merged into a residue.
        * If a residue name occurs more than once, the last definition wins.
        * ATOM records with fewer than three fields after the keyword are
          ignored; an odd trailing atom on a BOND line is ignored.

    Args:
        top_path: Path to .top file
        json_out_path: Output JSON file path

    Returns:
        Dictionary of residues keyed by residue name (same content that is
        written to ``json_out_path``).

    Raises:
        ValueError: If a RESI record has no name, or a residue/atom charge
            is not a valid number. The message includes file and line number.
        OSError: If either file cannot be opened.
    """
    # Keyword spellings for records stored as one token list per line,
    # mapped to the residue field they populate.
    list_records = {
        'DONOR': 'donors',
        'DONO': 'donors',
        'ACCEPTOR': 'acceptors',
        'ACCE': 'acceptors',
        'IMPROPER': 'impropers',
        'IMPRO': 'impropers',
        'IMPR': 'impropers',
    }

    def to_charge(text: str, line_no: int, what: str) -> float:
        """Convert a charge field to float, reporting the location on failure."""
        try:
            return float(text)
        except ValueError as err:
            raise ValueError(
                f"{top_path}:{line_no}: invalid {what} charge {text!r}"
            ) from err

    residues = {}
    current_res = None

    with open(top_path, 'r') as f:
        for line_no, raw_line in enumerate(f, 1):
            line = raw_line.strip()

            # Title lines are skipped whole.
            if line.startswith('*'):
                continue

            # Drop the inline comment before tokenising so that comment
            # words never end up in atom lists or charge fields.
            tokens = line.partition('!')[0].split()
            if not tokens:
                continue
            keyword, args = tokens[0], tokens[1:]

            # === NEW RESIDUE ===
            if keyword == 'RESI':
                # Save previous residue
                if current_res is not None:
                    residues[current_res['name']] = current_res

                if not args:
                    raise ValueError(
                        f"{top_path}:{line_no}: RESI record has no residue name"
                    )

                res_charge = (
                    to_charge(args[1], line_no, "residue")
                    if len(args) > 1 else 0.0
                )
                current_res = {
                    "name": args[0],
                    "type": "RESI",
                    "charge": res_charge,
                    "atoms": {},        # Dict keyed by atom name
                    "bonds": [],
                    "donors": [],
                    "acceptors": [],
                    "impropers": []
                }

            # === PATCH ===
            elif keyword == 'PRES':
                # A patch is not a residue: close the open residue so the
                # patch's ATOM/BOND records are not appended to it.
                if current_res is not None:
                    residues[current_res['name']] = current_res
                current_res = None

            # Only process inside residue block
            elif current_res is None:
                continue

            # === ATOM ===
            elif keyword == 'ATOM':
                if len(args) >= 3:
                    atom_name, atom_type = args[0], args[1]
                    current_res["atoms"][atom_name] = {
                        "name": atom_name,
                        "type": atom_type,
                        "charge": to_charge(args[2], line_no, "atom")
                    }

            # === BOND ===
            elif keyword == 'BOND':
                # Consecutive tokens form pairs; zip drops an unpaired
                # trailing token.
                for first, second in zip(args[0::2], args[1::2]):
                    current_res["bonds"].append([first, second])

            # === DONOR / ACCEPTOR / IMPROPER ===
            elif keyword in list_records:
                if args:
                    current_res[list_records[keyword]].append(args)

    # Save last residue
    if current_res is not None:
        residues[current_res['name']] = current_res

    # Write JSON file
    with open(json_out_path, 'w') as f:
        json.dump(residues, f, indent=2)

    # Report the number of distinct residues actually stored.
    print(f"✓ Parsed {len(residues)} residues and saved to '{json_out_path}'")
    return residues


def print_summary(residues: dict):
    """
    Print a short overview of parsed residues to stdout.

    Prints the total residue count, then one formatted row for each of the
    first three residues in the dict's iteration order; those (up to) three
    names are sorted alphabetically before printing.

    Args:
        residues: Mapping of residue name to a residue record containing
            'charge', 'atoms', 'bonds', 'donors' and 'acceptors'.
    """
    print(f"\nSummary: {len(residues)} residues")
    print("\nFirst 3 residues:")

    # Slice first, sort second: only the leading three keys are considered.
    leading_names = list(residues)[:3]
    leading_names.sort()

    for res_name in leading_names:
        entry = residues[res_name]
        n_atoms = len(entry['atoms'])
        n_bonds = len(entry['bonds'])
        n_donors = len(entry['donors'])
        n_acceptors = len(entry['acceptors'])
        row = (
            f"  {res_name:8s} | charge={entry['charge']:6.2f} | "
            f"atoms={n_atoms:3d} | bonds={n_bonds:3d} | "
            f"donors={n_donors:2d} | acceptors={n_acceptors:2d}"
        )
        print(row)

In [4]:
# Input topology file and output JSON file; both are relative paths, so they
# resolve against the current working directory.
top_file = "../data/raw/ff.top"
json_file = "topology.json"

# Parse the topology, write it to json_file, and keep the resulting dict in
# `residues`; then print the residue count and a few sample rows.
residues = parse_topology_file(top_file, json_file)
print_summary(residues)

✓ Parsed 41 residues and saved to 'topology.json'

Summary: 41 residues

First 3 residues:
  H3O      | charge=  1.00 | atoms=  4 | bonds=  4 | donors= 0 | acceptors= 0
  HOH      | charge=  0.00 | atoms=  3 | bonds=  3 | donors= 0 | acceptors= 0
  SOD      | charge=  1.00 | atoms=  1 | bonds=  0 | donors= 0 | acceptors= 0
