In [194]:
from bs4 import BeautifulSoup, Tag
import requests
import pandas as pd
from pathlib import Path

import MDAnalysis as mda
from collections import Counter
import os



# For https://www.charmm-gui.org/?doc=archive&lib=lipid

In [2]:
def parse_lipid_from_charmm_gui1() -> pd.core.frame.DataFrame:
    """Parses lipid information from the CHARMM-GUI website and saves it as a CSV file.

    Returns
    -------
        pd.core.frame.DataFrame
            A DataFrame containing the lipid information with columns:
                - "Name": The name of the lipid.
                - "Alias": The alias of the lipid extracted from the download link.
                - "Category": The category under which the lipid is listed.
                - "Link": The download link for the lipid file.
    """
    url = "https://www.charmm-gui.org/?doc=archive&lib=lipid"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    
    base_url = "https://www.charmm-gui.org"
    col_names = ["Name", "Alias", "Category", "Link"]
    recordings = []
        
    rows = soup.find_all('div', id='main')
    for row in rows:
        for div in row.find_all("div"):
            for title in div.find_all("b"):
                for ul in div.find_all("ul"):
                    for li in ul.find_all('li'):
                        sterol_name = li.find("span").get_text().split()[0]
                        
                        download_link = base_url + li.find("a", href=True)["href"]
                        alias = download_link.split("/")[-1]
                        alias = alias.split('.')[0]
                        
                        recording = {"Name": sterol_name, 
                                     "Alias": alias, 
                                     "Category": title.get_text().strip(),
                                     "Link": download_link }
                        recordings.append(recording)
        
    df = pd.DataFrame(recordings)
    df.to_csv('lipid_CHARMM_GUI1.csv', sep=';', index=False, header=True, columns=col_names)
    return df

In [3]:
lipid_list = parse_lipid_from_charmm_gui1()
lipid_list

Unnamed: 0,Name,Alias,Category,Link
0,CHOLESTEROL,chl1,Sterols,https://www.charmm-gui.orgarchive/lipid/chl1.t...
1,ERGOSTEROL,erg,Sterols,https://www.charmm-gui.orgarchive/lipid/erg.ta...
2,DPOP,dpop,Sterols,https://www.charmm-gui.orgarchive/lipid/dpop.t...
3,β-SITOSTEROL,sito,Sterols,https://www.charmm-gui.orgarchive/lipid/sito.t...
4,STIGMASTEROL,stig,Sterols,https://www.charmm-gui.orgarchive/lipid/stig.t...
...,...,...,...,...
746,BDTM,bdtm,Thio Maltosides,https://www.charmm-gui.orgarchive/lipid/bdtm.t...
747,AUDTM,audtm,Thio Maltosides,https://www.charmm-gui.orgarchive/lipid/audtm....
748,BUDTM,budtm,Thio Maltosides,https://www.charmm-gui.orgarchive/lipid/budtm....
749,ADDTM,addtm,Thio Maltosides,https://www.charmm-gui.orgarchive/lipid/addtm....


# For https://www.charmm-gui.org/?doc=archive&lib=csml

In [203]:
def get_formula_res_name_from_pdb_file(link: str) -> tuple(str, str):
    """Fetches a PDB file from a given link, analyzes it to extract the formula and residue name.

    Ressources
    ----------
    https://docs.mdanalysis.org/2.0.0/documentation_pages/topology/guessers.html#guessing-elements-from-atom-names
    https://docs.mdanalysis.org/2.0.0/documentation_pages/topology/guessers.html#MDAnalysis.topology.guessers.guess_atom_type 

    Parameters
    ----------
        link: str
            The URL link to the PDB file.

    Returns
    -------
        tuple(str, str)
            A tuple containing the chemical formula and the name of the residue.
    
    Raises
    ------
        requests.exceptions.HTTPError: If an HTTP error occurs during the request.
        Exception: For other unexpected errors.
    """
    try:
        response = requests.get(link)
        response.raise_for_status()
        filename = link.split("/")[-1]
        open(filename, "wb").write(response.content)

    #In case the given link leads to nothing
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
        return None, None
    except Exception as err:
        print(f"An error occurred: {err}")
        return None, None

    #In case, the file is empty
    if os.path.getsize(filename) == 0:
        return None, None

    molecule = mda.Universe(filename)
    atom_name = []
    res_name = set()

    for atom in molecule.atoms:
        atom_name.append(f"{mda.topology.guessers.guess_atom_type(atom.name)}")
        # print(mda.topology.guessers.guess_atom_element(atom.name))
        res_name.add(atom.resname)

    atom_names = Counter(atom_name)
    sorted_atom_counts = sorted(atom_names.items())
    formula = ''.join(f"{atom}{count}" if count > 1 else atom for atom, count in sorted_atom_counts)

    #To delete the file we just download
    current_dir = Path.cwd()
    absolute_file_path = current_dir / filename
    relative_path = absolute_file_path.relative_to(current_dir)
    relative_path.unlink()

    print(f"File {filename} : {formula} // {res_name}")
    return formula, res_name.pop()

In [205]:
def parse_lipid_from_charmm_gui2() -> pd.core.frame.DataFrame:
    """Parses lipid information from the CHARMM-GUI website and saves it as a CSV file.

    Returns
    -------
        pd.core.frame.DataFrame
            A DataFrame containing the lipid information with columns:
                - "Name": The name of the lipid.
                - "Alias": The alias of the lipid extracted from the download link.
                - "Category": The category under which the lipid is listed.
                - "Link": The download link for the lipid file.
    """
    url = "https://www.charmm-gui.org/?doc=archive&lib=csml"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    
    base_url = "https://www.charmm-gui.org/"
    col_names = ["Category", "Alias", "Name", "View_Link", "PDB_Link", "Formula", "Res_name_PDB"]
    recordings = []
        
    rows = soup.find_all("tbody")
    for row in rows:
        category = row.get("id")
        for tr in row.find_all("tr"):
            cells = tr.find_all("td")
            if len(cells) > 1:
                alias = cells[0].text.strip()
                name = cells[1].text.strip()
                view_structure_link = f"https://www.charmm-gui.org/?doc=visualization.ngl.archive&pdb_id={alias.lower()}&arg=csml"
                download_link = base_url + cells[4].find('a')['href'] if cells[4].find('a') else None

                formula, resname_PDB = None, None
                if download_link!=None:
                    formula, resname_PDB = get_formula_res_name_from_pdb_file(download_link)


                recording = {"Category": category, 
                            "Alias": alias, 
                            "Name": name,
                            "View_Link": view_structure_link, 
                            "PDB_Link": download_link, 
                            "Formula": formula, 
                            "Res_name_PDB": resname_PDB}
                recordings.append(recording)
    df = pd.DataFrame(recordings)
    df.to_csv('lipid_CHARMM_GUI2.csv', sep=';', index=False, header=True, columns=col_names)
    return df

In [204]:
lipid_list2 = parse_lipid_from_charmm_gui2()
lipid_list2 #12min

File alad.pdb : C4CL2H12N2O2 // {'ALA'}
File aall.pdb : C6H12O6 // {'AAL'}
File aalt.pdb : C6H12O6 // {'AAL'}
File aarb.pdb : C5H10O5 // {'AAR'}
File abeq.pdb : C6H12O4 // {'ABE'}
File adeo.pdb : C5H10O4 // {'ADE'}
File afru.pdb : C6H12O6 // {'AFR'}
File afuc.pdb : C6H12O5 // {'AFU'}
File agal.pdb : C6H12O6 // {'AGA'}
File agalna.pdb : C8H15NO6 // {'AGA'}
File aglc.pdb : C6H12O6 // {'AGL'}
File aglca.pdb : C6H9O7 // {'AGL'}
File aglcna.pdb : C8H15NO6 // {'AGL'}
File agul.pdb : C6H12O6 // {'AGU'}
File aido.pdb : C6H12O6 // {'AID'}
File aidoa.pdb : C6H9O7 // {'AID'}
File allose.pdb : C6H12O6 // {'ALL'}
File alyf.pdb : C5H10O5 // {'ALY'}
File aman.pdb : C6H12O6 // {'AMA'}
File ane5ac.pdb : C11H18NO9 // {'ANE'}
File arhm.pdb : C6H12O5 // {'ARH'}
File arhmoa.pdb : C8H14O6 // {'ARH'}
File arib.pdb : C5H10O5 // {'ARI'}
File atal.pdb : C6H12O6 // {'ATA'}
File axyf.pdb : C5H10O5 // {'AXY'}
File axyl.pdb : C5H10O5 // {'AXY'}
File ball.pdb : C6H12O6 // {'BAL'}
File balt.pdb : C6H12O6 // {'BAL'}
F



File alf4.pdb : ALF4 // {'ALF'}
File amcp.pdb : C4H10N // {'AMC'}
File amdn.pdb : C2H7N2 // {'AMD'}
File amet.pdb : C2H5N // {'AME'}
File amm1.pdb : H3N // {'AMM'}
File amol.pdb : C3CAH7O4 // {'AMO'}
File amop.pdb : C3CAH7O3 // {'AMO'}
File antr.pdb : C14H10 // {'ANT'}
File aobt.pdb : C7CSH12N2O2S // {'AOB'}
File apnh.pdb : C7H12N2O2 // {'APN'}
File ar3p.pdb : C4H7O6P // {'AR3'}
File arao.pdb : C4H8O3 // {'ARA'}
File arim.pdb : C7H10N2O2 // {'ARI'}
File armo.pdb : C5H10O3 // {'ARM'}
HTTP error occurred: 404 Client Error: Not Found for url: https://www.charmm-gui.org/archive/csml/asbb.pdb
File azdo.pdb : C3H5NO // {'AZD'}
File azul.pdb : C10H8 // {'AZU'}
File b2fo.pdb : C4H4O2 // {'B2F'}
File b5np.pdb : C8H14O5P // {'B5N'}
File b5sp.pdb : C8H14O5P // {'B5S'}
File bab1.pdb : C29H47N2O6 // {'BAB'}
File bab2.pdb : C29H47N2O6 // {'BAB'}
File bald.pdb : C7H6O // {'BAL'}
File bam1.pdb : C13H24 // {'BAM'}
File bami.pdb : C7H9N2 // {'BAM'}
File bben.pdb : C10H14 // {'BBE'}
File bca.pdb : C14H8O



File lit.pdb : LI // {'LIT'}
File mg.pdb : MG // {'MG'}
File oh.pdb : HO // {'OH'}
File pot.pdb : K // {'POT'}




File rub.pdb : R // {'RUB'}
File sod.pdb : NA // {'SOD'}
File tip3.pdb : H2O // {'TIP3'}
File tp3m.pdb : H2O // {'TP3'}
File zn2.pdb : ZN // {'ZN2'}
File mgu1.pdb : C2H7N3 // {'MGU'}
File mgu2.pdb : C2H7N3 // {'MGU'}
File mgu3.pdb : C2H7N3 // {'MGU'}
File mgun.pdb : C2H7N3 // {'MGU'}
File pgun.pdb : C4H11N3 // {'PGU'}
File aldd.pdb : C4CL2H12N2O2 // {'ALD'}
File dfet.pdb : C2F2H4 // {'DFE'}
File feth.pdb : C2FH5 // {'FET'}
File tfe.pdb : C2F3H3O // {'TFE'}
File tfet.pdb : C2F3H3 // {'TFE'}
File co.pdb : CO // {'CO'}
File co2.pdb : CO2 // {'CO2'}
File heme.pdb : C34FEH30N3NAO4 // {'HEM'}
HTTP error occurred: 404 Client Error: Not Found for url: https://www.charmm-gui.org/archive/csml/mes2.pdb
File o2.pdb : O2 // {'O2'}
File bdfd.pdb : C7F2H5O3P // {'BDF'}
File bdfp.pdb : C7F2H6O3P // {'BDF'}
File bmpd.pdb : C7H7O3P // {'BMP'}
File bmph.pdb : C7H8O3P // {'BMP'}
File ep_2.pdb : C2H5O4P // {'EP_'}
File ip_2.pdb : C3H7O4P // {'IP_'}
File nucl.pdb : C9H14N2O6P // {'NUC'}
File pph1.pdb : C6H6



File c2n.pdb : C3CLH6NO2P // {'C2N'}
File fla.pdb : C3F3H4NO2 // {'FLA'}
File nmglyd.pdb : C7H14N2O2 // {'NMG'}
File sm014.pdb : C6CL2FH13N2O2 // {'SM0'}
File sm023.pdb : C6CL2H12N2O2 // {'SM0'}
File sm032.pdb : C6H12N2O2 // {'SM0'}
File sm035.pdb : C3CL2H9NO // {'SM0'}
File sm056.pdb : C5H9N2O3 // {'SM0'}
File sm070.pdb : C5CL2H9N2O4 // {'SM0'}
File sm071.pdb : C5CL2H12N2O2 // {'SM0'}
File sm076.pdb : C4CL2H9N2O4 // {'SM0'}
File sm078.pdb : C5H10N2O2 // {'SM0'}
File sm095.pdb : C5CL2H15NO // {'SM0'}
File sm114.pdb : C5CL2H14N2O2 // {'SM1'}
File sm115.pdb : C2CL2H9NO // {'SM1'}
File sm119.pdb : C7CL2H13N3O2 // {'SM1'}
File sm120.pdb : C8H11N3O2 // {'SM1'}
File sm136.pdb : C7H14N2O // {'SM1'}
File sm138.pdb : C5CL2H12N2O2S // {'SM1'}
File sm140.pdb : C6CL2H14N2O3 // {'SM1'}
File sm154.pdb : C7CL2H13N3O4 // {'SM1'}
File sm155.pdb : C5CL2H10N2O3 // {'SM1'}
File sm190.pdb : C7H12N2O2 // {'SM1'}
File sm191.pdb : C9H15N2O4 // {'SM1'}
File sm192.pdb : C11H14N2O2 // {'SM1'}
File sm195.pdb : C8



Unnamed: 0,Category,Alias,Name,View_Link,PDB_Link,Formula,Res_name_PDB
0,top_all36_prot.rtf_tbody,ALA,alanine,https://www.charmm-gui.org/?doc=visualization....,,,
1,top_all36_prot.rtf_tbody,ALAD,alanine dipeptide,https://www.charmm-gui.org/?doc=visualization....,https://www.charmm-gui.org/archive/csml/alad.pdb,C4CL2H12N2O2,{ALA}
2,top_all36_prot.rtf_tbody,ARG,arginine,https://www.charmm-gui.org/?doc=visualization....,,,
3,top_all36_prot.rtf_tbody,ASN,asparagine,https://www.charmm-gui.org/?doc=visualization....,,,
4,top_all36_prot.rtf_tbody,ASP,aspartic acid,https://www.charmm-gui.org/?doc=visualization....,,,
...,...,...,...,...,...,...,...
2164,toppar_all36_label_spin.str_tbody,R5,SPIN LABEL FOR PROTEINS CHI2POT = 0.04066,https://www.charmm-gui.org/?doc=visualization....,https://www.charmm-gui.org/archive/csml/r5.pdb,C10H20NOS2,{R5}
2165,toppar_all36_label_spin.str_tbody,R5U,SPIN LABEL FOR PROTEINS CHI2POT = 0.04066,https://www.charmm-gui.org/?doc=visualization....,https://www.charmm-gui.org/archive/csml/r5u.pdb,C10H20NOS2,{R5U}
2166,toppar_all36_label_spin.str_tbody,SLH,SPIN LABEL FOR LIPID HEADGROUP CHI2POT = 0.04202,https://www.charmm-gui.org/?doc=visualization....,https://www.charmm-gui.org/archive/csml/slh.pdb,,
2167,toppar_all36_label_spin.str_tbody,SLT,SPIN LABEL FOR LIPID TAILS CHI2POT = 0.04376,https://www.charmm-gui.org/?doc=visualization....,https://www.charmm-gui.org/archive/csml/slt.pdb,,
