In [26]:
import json
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
import os
import stat
from bs4 import Tag
import lxml


def load_credentials():
    """Load the request headers and cookies stored in ``account.json``.

    The file is opened relative to the current working directory and
    must contain a JSON object with the following structure:
    {
        "headers": {
            "User-Agent": "your_user_agent"
        },
        "cookies": {
            "cookie_name": "cookie_value"
        }
    }

    Returns:
        The decoded JSON document (a dict when the file follows the
        structure above).

    Raises:
        FileNotFoundError: if ``account.json`` does not exist.
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default text encoding, which is not UTF-8 on every system.
    with open("account.json", encoding="utf-8") as f:
        return json.load(f)


def make_request(url: str, timeout: float = 30):
    """Send an authenticated GET request and return the response.

    The headers and cookies are read from ``account.json`` through
    ``load_credentials`` on every call.

    Args:
        url: Address to fetch.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Without a timeout a stalled connection
            would block the notebook cell indefinitely.

    Returns:
        The ``requests`` response object.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (raised by ``raise_for_status``).
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    credentials = load_credentials()

    res = requests.get(
        url,
        headers=credentials["headers"],
        cookies=credentials["cookies"],
        timeout=timeout,
    )

    res.raise_for_status()

    return res


def save_to_file(content: str | bytes, filename: str, executable=False):
    with open(filename, "w") as f:
        f.write(content)

    if executable:
        file = os.stat(filename)

        os.chmod(filename, file.st_mode | stat.S_IXUSR)


def download_image(url: str, filename: str, force_download=False, timeout: float = 30):
    """Download ``url`` into ``images/<filename>``.

    The ``images`` directory is relative to the current working
    directory and is created when missing.

    Args:
        url: Address of the image to fetch.
        filename: Name of the file to write inside ``images``.
        force_download: When false (the default), nothing is fetched if
            the target file already exists.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get`` so a stalled download cannot hang forever.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    file = os.path.join("images", filename)

    # Already downloaded: skip the network round trip.
    if os.path.exists(file) and not force_download:
        return

    res = requests.get(url, timeout=timeout)

    res.raise_for_status()

    # Do not rely on the caller having created the target directory.
    os.makedirs("images", exist_ok=True)

    with open(file, "wb") as f:
        f.write(res.content)


def create_nested_directories(directories: str):
    """Create ``directories`` together with any missing parent directories.

    A directory that already exists is not treated as an error.
    """
    os.makedirs(name=directories, exist_ok=True)

In [2]:
# Project page to scrape; the response is kept in ``res`` for later cells.
PROJECT_URL = "https://intranet.hbtn.io/projects/2275"
res = make_request(PROJECT_URL)

In [32]:
import mdformat


def create_readme(soup: BeautifulSoup):
    """Write ``README.md`` from the project description found in ``soup``.

    Every ``<img>`` of the page is downloaded into ``images/`` and its
    ``src`` is rewritten in place to the local copy. The description
    container is then converted to Markdown, saved as ``README.md`` in
    the current working directory and formatted with ``mdformat``.

    Note that ``soup`` is mutated: the rewritten ``src`` attributes stay
    on the tree after this function returns.

    Args:
        soup: Parsed project page.

    Raises:
        ValueError: if the page has no ``<div id="project-description">``.
    """
    container_desc = soup.find("div", {"id": "project-description"})

    if container_desc is None:
        # Fail with a readable message instead of an AttributeError on
        # ``None.prettify()`` further down.
        raise ValueError('no <div id="project-description"> found in the page')

    image_tags: list[Tag] = soup.find_all("img")

    if image_tags:
        os.makedirs("images", exist_ok=True)

    for tag in image_tags:
        src = tag.get("src")

        # <img> tags without a usable src cannot be downloaded.
        if not src:
            continue

        # Last path segment of the URL, without the query string.
        filename = src.split("/")[-1].split("?")[0]

        if not filename:
            continue

        download_image(src, filename)

        # The file is stored under images/, so the link has to carry the
        # directory as well or it would not resolve from README.md.
        tag["src"] = f"images/{filename}"

    file = "README.md"

    save_to_file(md(container_desc.prettify()), file)

    mdformat.file(file)

In [36]:
def _docstring_safe(text: str) -> str:
    """Escape ``text`` so it survives unchanged between triple double quotes."""
    # Quotes at the very end would merge with the closing delimiter, so
    # they are split off and escaped one by one.
    body = text.rstrip('"')
    trailing_quotes = len(text) - len(body)

    # Backslashes first, so the escapes added afterwards are not doubled.
    body = body.replace("\\", "\\\\")
    # Break up every run of three quotes: it would end the docstring early.
    body = body.replace('"""', '""\\"')

    return body + '\\"' * trailing_quotes


def create_task_files(tasks: list[Tag]):
    """Create one executable Python stub per task.

    For every task element the title (``<h3>``) and the body
    (``<div class="panel-body">``) are read. The target location comes
    from the ``Directory:`` and ``File:`` entries of the task's
    ``list-group-item`` block; when an entry is missing, a name derived
    from the task title is used instead. The file receives a shebang
    line followed by the task body, converted to Markdown, as docstring.

    Args:
        tasks: Task container elements of the project page.
    """
    for task in tasks:
        # Strip the surrounding markup whitespace so it cannot leak into
        # the fallback file and directory names.
        task_title = task.find("h3").text.strip()
        task_content = task.find("div", {"class": "panel-body"}).prettify()

        repo_spec = task.find("div", {"class": "list-group-item"})

        # A task without a repository block simply has no requirements;
        # the title-based fallbacks below then apply.
        requirements = repo_spec.find_all("li") if repo_spec is not None else []

        filename = ""
        directory = ""

        # NOTE(review): a "File:" entry listing several names is used
        # verbatim as a single file name - confirm whether that occurs.
        for req in requirements:
            # maxsplit=1 keeps values that contain a colon themselves.
            if "Directory:" in req.text:
                directory = req.text.split(":", 1)[1].strip()

            if "File:" in req.text:
                filename = req.text.split(":", 1)[1].strip()

        if not directory:
            directory = task_title.replace(" ", "_").lower()

        if not filename:
            filename = task_title.replace(" ", "_") + ".py"

        target = os.path.join(directory, filename)

        # Use the parent of the final path so a "File:" value with its
        # own sub-directory also gets its directory created.
        parent = os.path.dirname(target)

        if parent:
            create_nested_directories(parent)

        task_content = _docstring_safe(mdformat.text(md(task_content)))

        save_to_file(f'#!/usr/bin/env python3\n"""{task_content}"""', target, executable=True)


In [37]:
def scrap_the_shit_out_of_the_page(html: str):
    """Generate the README and the task stub files from a project page.

    The HTML is parsed with the ``lxml`` parser. The README is written
    first, then every ``<div>`` whose id starts with ``task-num-`` is
    handed to ``create_task_files``.
    """

    def is_task_id(element_id):
        # The truthiness guard covers elements whose id is missing or empty.
        return element_id and element_id.startswith("task-num-")

    page = BeautifulSoup(html, "lxml")

    create_readme(page)

    create_task_files(page.find_all("div", id=is_task_id))

# ``res`` is the response stored by the earlier request cell.
page_html = res.text
scrap_the_shit_out_of_the_page(page_html)