In [1]:
from bs4 import BeautifulSoup
import requests

# Retrieve an HTML page

In [239]:
url = "https://mad.ibcp.fr/explore"
# Bound the request: without a timeout, requests.get() can wait forever on a
# stalled server and freeze the kernel.
response = requests.get(url, timeout=30)

In [240]:
# look at the status of the response, if it worked or not
# 200: it works // 404: this page doesn't exist // 500: server encountered an error
?response

[0;31mType:[0m        Response
[0;31mString form:[0m <Response [200]>
[0;31mFile:[0m        ~/miniconda3/envs/grodecoder-env/lib/python3.11/site-packages/requests/models.py
[0;31mDocstring:[0m  
The :class:`Response <Response>` object, which contains a
server's response to an HTTP request.

In [241]:
# Retrieve the page's body as raw bytes
response.content

b'<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="/favicon.ico"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="MArtini Database"/><link rel="apple-touch-icon" href="/apple-touch-icon.png"/><link rel="stylesheet" href="https://fonts.googleapis.com/css?family=Roboto:300,400,500,700&display=swap"/><link rel="stylesheet" href="https://fonts.googleapis.com/icon?family=Material+Icons"/><script src="/all.min.js" defer="defer" async></script><title>MAD - MArtini Database - Database for coarse grained biomolecules</title><meta name="keywords" content="MAD, MArtini Database, Martinize, CGMartini, INSANE, Coarse grained"><meta name="title" content="MArtini Database"><meta name="description" content="MArtini Database references high quality coarse grained models and let you upload yours."><meta property="og:type" content="website"><meta property="og:url" content="h

# Parse the page's content

In [242]:
# Parse the raw HTML bytes with the "html.parser" backend to get a
# BeautifulSoup object, then check the type of the result
soup = BeautifulSoup(response.content, "html.parser")
type(soup)

bs4.BeautifulSoup

In [243]:
# Display the parsed HTML re-formatted with indentation
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <link href="/favicon.ico" rel="icon"/>
  <meta content="width=device-width,initial-scale=1" name="viewport"/>
  <meta content="#000000" name="theme-color"/>
  <meta content="MArtini Database" name="description"/>
  <link href="/apple-touch-icon.png" rel="apple-touch-icon"/>
  <link href="https://fonts.googleapis.com/css?family=Roboto:300,400,500,700&amp;display=swap" rel="stylesheet"/>
  <link href="https://fonts.googleapis.com/icon?family=Material+Icons" rel="stylesheet"/>
  <script async="" defer="defer" src="/all.min.js">
  </script>
  <title>
   MAD - MArtini Database - Database for coarse grained biomolecules
  </title>
  <meta content="MAD, MArtini Database, Martinize, CGMartini, INSANE, Coarse grained" name="keywords"/>
  <meta content="MArtini Database" name="title"/>
  <meta content="MArtini Database references high quality coarse grained models and let you upload yours." name="description"/>
  <meta content=

In [244]:
# List every public name (no leading underscore) available on the BeautifulSoup object
methods = ", ".join(name for name in dir(soup) if not name.startswith("_"))
print(methods)



On peut voir qu'il y en a beaucoup. On va se concentrer sur les méthodes ```.find()``` et ```.find_all()``` qui permettent de récupérer des éléments HTML en fonction de leur nom, de leur classe, de leur id, et la méthode ```.get_text()``` qui permet de récupérer le texte contenu dans un élément HTML.

1. ***Méthode ```.find_all(name, attrs, recursive, string, limit, **kwargs)```*** permet de rechercher et de récupérer tous les éléments HTML correspondant à certains critères. Elle renvoie une liste (`ResultSet`) d'objets `Tag` contenant tous les éléments trouvés.<br>
Paramètres :<br>
- ```name``` (facultatif) : Le nom de la balise HTML à rechercher. (Exemple : ```name="title"```)
- ```attrs``` (facultatif) : Un dictionnaire contenant des attributs et leurs valeurs pour filtrer les éléments. (Exemple : ```attrs={"class": "my-class", "id": "my-id"}```)
- ```recursive``` (facultatif) : Un booléen qui indique si la recherche doit être effectuée dans les sous-éléments. (Exemple : ```recursive=False```)
- ```string``` (facultatif) : Un filtre sur le contenu textuel des éléments. (Exemple : ```string="my-text"```)
- ```limit``` (facultatif) : Un entier qui indique le nombre maximum d'éléments à renvoyer. (Exemple : ```limit=5```)
- ```**kwargs``` (facultatif) : Des filtres supplémentaires sur les attributs. (Exemple : ```class_="my-class", id="my-id"```)

2. ***Méthode ```.find(name, attrs, recursive, string, **kwargs)```*** permet de rechercher et de récupérer le premier élément HTML correspondant à certains critères, tels que le nom de la balise, la classe, l'ID, ou d'autres attributs. Elle renvoie un objet `Tag` correspondant au premier élément trouvé, ou `None` si aucun élément ne correspond.

3. ***Méthode ```.get_text(separator, strip, types)```*** permet de récupérer le texte contenu dans un élément HTML.<br>
Paramètres :<br>
- ```separator``` (facultatif) : Le séparateur à utiliser entre les éléments. (Exemple : ```separator=" "```)
- ```strip``` (facultatif) : Un booléen qui indique si les espaces doivent être supprimés. (Exemple : ```strip=True```)
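- ```types``` (facultatif) : Un tuple de classes de chaînes (sous-classes de `NavigableString`) dont le texte doit être inclus ; les chaînes d'un autre type sont ignorées. (Exemple : ```types=(NavigableString, CData)```)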

# The previous method cannot read content generated by JavaScript, so we must try another method
https://www.zenrows.com/blog/playwright-scraping#what-is-playwright  
https://oxylabs.io/blog/playwright-web-scraping  
https://medium.com/thedevproject/how-to-scrape-javascript-heavy-sites-like-a-pro-with-python-1ecf6f829538  
  
Use Playwright (`async_playwright`) to load a website's JavaScript-generated content by running the page, scripts included, in a real browser, whereas BeautifulSoup only parses HTML and XML code.

In [245]:
import asyncio  # https://docs.python.org/3/library/asyncio.html
from playwright.async_api import async_playwright  # imports the asynchronous version of Playwright, a library used to automate browsers.
from bs4 import BeautifulSoup  # parse HTML and XML documents.

In [246]:
# 'async def' declares a coroutine: the function body runs asynchronously
async def scrape_with_playwright(url):
    # The context manager starts Playwright and makes sure its resources are
    # released once the block is left.
    async with async_playwright() as playwright:
        # Every 'await' below pauses this coroutine until the awaited operation is done.
        # Chromium is used here; 'firefox' or 'webkit' can be used instead.
        chromium_browser = await playwright.chromium.launch(headless=True)
        tab = await chromium_browser.new_page()  # open a new page in that browser

        # Navigate to the requested URL and wait for the page to load
        await tab.goto(url)

        # Hold on until the JavaScript-generated <tbody> is present
        await tab.wait_for_selector('tbody')

        # Grab the whole HTML of the page as it is after JavaScript execution
        rendered_html = await tab.content()

        # Wait for the browser to be fully closed before returning the markup
        await chromium_browser.close()
        return rendered_html


In [247]:
async def main(url):
    # Render the page in the browser first, then parse the resulting HTML
    rendered_html = await scrape_with_playwright(url)
    return BeautifulSoup(rendered_html, 'html.parser')

In [248]:
url = 'https://mad.ibcp.fr/explore'
# NOTE(review): a top-level `await` is not valid in a plain Python script;
# this cell relies on the notebook running cells inside an event loop.
HTML_content = await main(url)

In [249]:
# Inspect how the first table row is organized: print each cell's HTML, then its text
rows = HTML_content.find("tbody").find_all("tr")
for row in rows:
    cells = row.find_all("td")
    for cell in cells:
        print(cell)
        print(cell.get_text(), '\n')
    break  # stop after the first row; one example is enough

<td class="MuiTableCell-root MuiTableCell-body MuiTableCell-sizeSmall"><a class="jss38" href="/molecule/THC?version=1112458835848635523"> tetrahydrocannabinol </a></td>
 tetrahydrocannabinol  

<td class="MuiTableCell-root MuiTableCell-body MuiTableCell-sizeSmall"><a class="jss38" href="/molecule/THC?version=1112458835848635523">THC</a></td>
THC 

<td class="MuiTableCell-root MuiTableCell-body MuiTableCell-alignRight MuiTableCell-sizeSmall"><div class="">Small molecule</div></td>
Small molecule 

<td class="MuiTableCell-root MuiTableCell-body MuiTableCell-alignRight MuiTableCell-sizeSmall"><div class="">2024-03-14 22:37</div></td>
2024-03-14 22:37 



# Collect everything in a function, to automate the process

In [250]:
import pandas as pd
import nest_asyncio
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import requests

In [251]:
async def scrape_with_playwright(url: str, wait_selector: str = 'tbody') -> str:
    """Scrapes the HTML content of a webpage using Playwright.

    The page is opened in a headless Chromium browser and the HTML is read
    only once an element matching ``wait_selector`` is present, so that
    content generated by JavaScript is included.

    Parameters
    ----------
        url: str
            The URL of the webpage to be scraped.
        wait_selector: str
            Selector of the element to wait for before reading the page.
            Defaults to ``'tbody'``.

    Returns
    -------
        str
            The HTML content of the specified webpage.

    Raises
    ------
        Any error raised by Playwright (navigation failure, selector never
        appearing, ...) propagates unchanged; the browser is closed first.
    """
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)
            # Wait for the JavaScript-generated element before taking the snapshot.
            await page.wait_for_selector(wait_selector)
            return await page.content()
        finally:
            # Close the browser even when navigation or waiting failed.
            await browser.close()


In [252]:
async def main(url: str) -> BeautifulSoup:
    """Load a webpage through Playwright and return it parsed by BeautifulSoup.

    Parameters
    ----------
        url: str
            Address of the webpage to scrape.

    Returns
    -------
        BeautifulSoup
            The page's HTML, as returned by ``scrape_with_playwright``,
            parsed with the ``'html.parser'`` backend.
    """
    return BeautifulSoup(await scrape_with_playwright(url), 'html.parser')

In [253]:
def is_max(soup: BeautifulSoup) -> bool:
    """Check if the current page is the last page of results.

    The decision is read from the first ``<p>`` found in the table footer
    (``<tfoot>``). Its text is split on whitespace: the first token must be
    a ``<first>-<last>`` range and the last token the total number of
    elements. The page is the last one when ``<last>`` has reached the total.

    Parameters
    ----------
        soup: BeautifulSoup
            A BeautifulSoup object containing the parsed HTML
            of the current page.

    Returns
    -------
        bool
            True if the current page is the last page of results, False otherwise.

    Raises
    ------
        ValueError
            If the footer label is missing or does not have the expected form.
    """
    footer = soup.find("tfoot")
    label = footer.find("p") if footer is not None else None
    if label is None:
        # Fail loudly: silently returning None would read as "not the last
        # page" for callers that test `not is_max(...)`.
        raise ValueError("Pagination label (<tfoot> ... <p>) not found in the page.")

    text = label.get_text()
    # Accept an en dash as range separator as well as the plain hyphen.
    tokens = text.replace("\u2013", "-").split()
    try:
        last_shown = int(tokens[0].split("-")[1])
        total = int(tokens[-1])
    except (IndexError, ValueError) as err:
        raise ValueError(f"Unexpected pagination label: {text!r}") from err
    return last_shown >= total

In [254]:
def parse_MAD_one(page: int, recordings: list[dict[str, str]]) -> list[dict[str, str]]:
    """Parse data from the MAD website, from the given page up to the last one.

    Each table row becomes one dictionary keyed by the table's column
    headers (the "Created at" column is skipped) plus a "Lien" entry holding
    the absolute URL of the row's first link. Parsing continues on the
    following pages until ``is_max`` reports the last page or a page has no
    rows.

    Parameters
    ----------
        page: int
            The page number to start parsing from.
        recordings: list[dict[str, str]]
            A list of dictionaries containing previously parsed recordings.
            It is extended in place.

    Returns
    -------
        list[dict[str, str]]
            The same ``recordings`` list, updated with every parsed row.
    """
    base_url = "https://mad.ibcp.fr"

    # Inside a notebook an event loop is already running; a plain script has none.
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        loop = None
    if loop is not None:
        # run_until_complete() refuses to run on an already-running loop
        # unless that loop has been patched to allow nested use.
        nest_asyncio.apply(loop)

    # Iterate over the pages instead of recursing once per page.
    while True:
        url = f"{base_url}/explore?page={page}"
        fetch = main(url)
        soup = loop.run_until_complete(fetch) if loop is not None else asyncio.run(fetch)

        rows = soup.find("tbody").find_all("tr")
        col_names = [col.get_text() for col in soup.select("th")]

        for row in rows:
            cells = row.find_all("td")
            recording = {name: cell.get_text(strip=True)
                         for name, cell in zip(col_names, cells)
                         if name != "Created at"}
            link = row.find("a")
            # A row without any link gets an empty "Lien" instead of crashing.
            recording["Lien"] = base_url + link.get("href") if link is not None else ""
            recordings.append(recording)

        # An empty page also ends the pagination, so a site that keeps
        # answering past the last page cannot make this loop forever.
        if not rows or is_max(soup):
            return recordings
        page += 1

In [255]:
def parse_MAD_loop(output_path: str = 'lipid_MAD.csv') -> pd.DataFrame:
    """Parse every page of the MAD website and save the result as a CSV file.

    Parsing starts at page 1 through ``parse_MAD_one``. The collected
    records are turned into a DataFrame, which is written to ``output_path``
    with ``;`` as separator and without the index column.

    Parameters
    ----------
        output_path: str
            Path of the CSV file to write. Defaults to ``'lipid_MAD.csv'``.

    Returns
    -------
        pd.DataFrame
            DataFrame containing the parsed data from the MAD website.
    """
    records = parse_MAD_one(1, [])
    df = pd.DataFrame(records)
    # Header and all columns are written by default.
    df.to_csv(output_path, sep=';', index=False)
    return df

In [256]:
lipid_data = parse_MAD_loop()
lipid_data # took about 1 min 30 s

Unnamed: 0,Name,Alias,Category,Lien
0,tetrahydrocannabinol,THC,Small molecule,https://mad.ibcp.fr/molecule/THC?version=11124...
1,N-arachidonyl-ethanolamide,AEA,Small molecule,https://mad.ibcp.fr/molecule/AEA?version=11124...
2,C60 fullerene model,F16,Synthetic nanoparticles,https://mad.ibcp.fr/molecule/F16?version=66864...
3,F216 fullerene model,F216,Synthetic nanoparticles,https://mad.ibcp.fr/molecule/F216?version=6686...
4,F576 fullerene model,F576,Synthetic nanoparticles,https://mad.ibcp.fr/molecule/F576?version=6686...
...,...,...,...,...
398,beta-Lactose,LAC,Sugars,https://mad.ibcp.fr/molecule/LAC?version=82073...
399,Sucrose,SUCR,Sugars,https://mad.ibcp.fr/molecule/SUCR?version=8207...
400,Trehalose,TREH,Sugars,https://mad.ibcp.fr/molecule/TREH?version=8207...
401,Monosialodihexosylganglioside,DPG3,Lipids,https://mad.ibcp.fr/molecule/DPG3?version=8389...
