# Freecodecamp to Epub

## Importa Bibliotecas

In [1]:
import os
import requests
from bs4 import BeautifulSoup
from ebooklib import epub
import urllib

## Constantes

In [2]:
PROJECT_DIR = os.getcwd()
URL = 'https://www.freecodecamp.org/news/learn-java-object-oriented-programming/'
OUTPUT_DIR = './output/epub/'
CHAPTER_TAG = 'h1'

## Funções

In [3]:
def remove_currrent_dir(path):
    return path.replace(PROJECT_DIR + '/', '')

In [4]:
def get_images_url(html):
    url_images = []

    for img_tag in content.find_all('img'):
        url = img_tag.get('src')
        if url:
            url_images.append(url)
    
    return url_images

In [5]:
def url_to_local_path(url):
    base_dir = PROJECT_DIR + '/downloads'
    
    parsed_url = urllib.parse.urlparse(url)
    domain = parsed_url.netloc
    path = parsed_url.path.lstrip('/')
    domain_cleaned = ''.join(c if c.isalnum() or c in ['.', '-'] else '_' for c in domain)
    file_path = os.path.join(base_dir, domain_cleaned, path)
    
    return file_path

In [6]:
def download_images(urls):
    paths = [url_to_local_path(url) for url in images_url]
    downloaded_images = []
    
    for i, (url, path) in enumerate(zip(urls, paths)):
        os.makedirs(os.path.dirname(path), exist_ok=True)
        response = requests.get(url)
        
        if response.status_code == 200:
            with open(path, 'wb') as f:
                f.write(response.content)
            downloaded_images.append(remove_currrent_dir(path))

    return downloaded_images

In [7]:
def replace_img_srcs(html):

    soup = BeautifulSoup(str(html), 'html.parser')

    for img in soup.find_all('img'):
        img['src'] = remove_currrent_dir(url_to_local_path(img['src']))
    
    return str(soup)

In [8]:
def add_images(book, images_path):
    for index, image in enumerate(images_path):
        
        img = epub.EpubImage(
            uid = f"image{index}",
            file_name = image,
            media_type = 'image/gif',
            content = open(image, "rb").read(),
        )
    
        book.add_item(img)


In [9]:
def add_chapters(book, html, chapter_tag):
    spine = ['nav']
    chapters = []
    content = BeautifulSoup(str(html), 'html.parser')
    chapter_content = ''
    
    for element in content.find('section').contents:
        
        if (element.name == chapter_tag):
            
            chapters.append(chapter_content)
            chapter_content = ''

        chapter_content += str(element)

    chapters.append(chapter_content)
        
    
    for index, chapter in enumerate(chapters):
        
        soup = BeautifulSoup(str(chapter), 'html.parser')
        c_title = soup.find(chapter_tag).text if soup.find(chapter_tag) else ''
        c = epub.EpubHtml(title=title, file_name=('chap_' + str(index) + '.xhtml'))
        c.content = str(soup)

        book.add_item(c)
        spine.append(c)

    ebook.spine = spine

## Faz scrapping do HTML

In [10]:
response = requests.get(URL)
soup = BeautifulSoup(response.content, 'lxml')
ebook = epub.EpubBook()

In [11]:
title = soup.title.text.strip()
content = soup.find('section', {'class': 'post-content'});
author = soup.find('a', {'data-test-label': 'profile-link'}).text.strip()

## Gera o eBook

In [12]:
ebook.add_author(author)
ebook.set_title(title)

### Adiciona imagens no ebook

In [13]:
images_url = get_images_url(content)

In [14]:
ebook_images_path = download_images(images_url)

In [15]:
add_images(ebook, ebook_images_path)

### Gera os capítulos

In [16]:
ebook_content = replace_img_srcs(content)

In [17]:
add_chapters(ebook, ebook_content, CHAPTER_TAG)

### Gera o documento

In [18]:
epub.write_epub( OUTPUT_DIR + (title + '.epub'), ebook, {})

  return self._open_to_write(zinfo, force_zip64=force_zip64)
