Skip to content

Commit

Permalink
Refactored code for pypi
Browse files Browse the repository at this point in the history
gutenberg/ moved to gutenbergtozim
dump-gutenberg.py moved to gutenberg2zim
  • Loading branch information
rgaudin committed Apr 7, 2017
1 parent 3fc8d3a commit 26b67fc
Show file tree
Hide file tree
Showing 147 changed files with 179 additions and 52 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,11 +43,11 @@ virtualenv gut-env (or any name you want)

## Getting started

After setting up the whole environment you can just run the main script `dump-gutenberg.py`.
After setting up the whole environment you can just run the main script `gutenberg2zim`.
It will download, process and export the content.

```
./dump-gutenberg.py
./gutenberg2zim
```

#### Arguments
Expand All @@ -57,7 +57,7 @@ Only want books with the Id 100-200? Books only in French? English? Or only thos
You can also include or exclude book formats.

```
./dump-gutenberg.py -l en,fr -f pdf --books 100-200
./gutenberg2zim -l en,fr -f pdf --books 100-200
```
This will download English and French books that have the Id 100 to 200 in the html (default) and pdf format.

Expand Down
14 changes: 7 additions & 7 deletions dump-gutenberg.py → gutenberg2zim
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@
from docopt import docopt
from path import Path as path

from gutenberg import logger
from gutenberg.database import setup_database
from gutenberg.rdf import setup_rdf_folder, parse_and_fill
from gutenberg.download import download_all_books
from gutenberg.export import export_all_books, export_skeleton
from gutenberg.zim import build_zimfile
from gutenberg.checkdeps import check_dependencies
from gutenbergtozim import logger
from gutenbergtozim.database import setup_database
from gutenbergtozim.rdf import setup_rdf_folder, parse_and_fill
from gutenbergtozim.download import download_all_books
from gutenbergtozim.export import export_all_books, export_skeleton
from gutenbergtozim.zim import build_zimfile
from gutenbergtozim.checkdeps import check_dependencies


help = ("""Usage: dump-gutenberg.py [-y] [-F] [-l LANGS] [-f FORMATS] """
Expand Down
6 changes: 0 additions & 6 deletions gutenberg/__init__.py → gutenbergtozim/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,6 @@
import logging
from logging.config import dictConfig

try:
import lxml
except ImportError:
XML_PARSER = "html.parser"
else:
XML_PARSER = "lxml"

LOGGING = {
'version': 1,
Expand Down
2 changes: 1 addition & 1 deletion gutenberg/checkdeps.py → gutenbergtozim/checkdeps.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
division, print_function)
import subprocess

from gutenberg import logger
from gutenbergtozim import logger


def check_dependencies():
Expand Down
4 changes: 2 additions & 2 deletions gutenberg/database.py → gutenbergtozim/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
CharField, BooleanField,
IntegerField, ForeignKeyField)

from gutenberg import logger
from gutenbergtozim import logger

db = SqliteDatabase('gutenberg.db')
db.connect()
Expand Down Expand Up @@ -157,7 +157,7 @@ def to_array(self):
]

def formats(self):
from gutenberg.utils import main_formats_for
from gutenbergtozim.utils import main_formats_for
return main_formats_for(self)


Expand Down
10 changes: 5 additions & 5 deletions gutenberg/download.py → gutenbergtozim/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@
import requests
from path import Path as path

from gutenberg import logger, TMP_FOLDER
from gutenberg.urls import get_urls
from gutenberg.database import BookFormat, Format
from gutenberg.export import get_list_of_filtered_books, fname_for
from gutenberg.utils import download_file, FORMAT_MATRIX
from gutenbergtozim import logger, TMP_FOLDER
from gutenbergtozim.urls import get_urls
from gutenbergtozim.database import BookFormat, Format
from gutenbergtozim.export import get_list_of_filtered_books, fname_for
from gutenbergtozim.utils import download_file, FORMAT_MATRIX


def resource_exists(url):
Expand Down
29 changes: 17 additions & 12 deletions gutenberg/export.py → gutenbergtozim/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,18 @@
from jinja2 import Environment, PackageLoader
from multiprocessing.dummy import Pool

import gutenberg
from gutenberg import logger, TMP_FOLDER
from gutenberg.utils import (FORMAT_MATRIX, main_formats_for,
get_list_of_filtered_books, exec_cmd,
get_langs_with_count, get_lang_groups,
is_bad_cover, path_for_cmd, read_file, zip_epub)
from gutenberg.database import Book, Format, BookFormat, Author
from gutenberg.iso639 import language_name
from gutenberg.l10n import l10n_strings

jinja_env = Environment(loader=PackageLoader('gutenberg', 'templates'))
import gutenbergtozim
from gutenbergtozim import logger, TMP_FOLDER
from gutenbergtozim.utils import (FORMAT_MATRIX, main_formats_for,
get_list_of_filtered_books, exec_cmd,
get_langs_with_count, get_lang_groups,
is_bad_cover, path_for_cmd, read_file,
zip_epub, critical_error)
from gutenbergtozim.database import Book, Format, BookFormat, Author
from gutenbergtozim.iso639 import language_name
from gutenbergtozim.l10n import l10n_strings

jinja_env = Environment(loader=PackageLoader('gutenbergtozim', 'templates'))

UTF8 = 'utf-8'
DEBUG_COUNT = []
Expand Down Expand Up @@ -93,7 +94,7 @@ def save_bs_output(soup, fpath, encoding=UTF8):


def tmpl_path():
return os.path.join(path(gutenberg.__file__).parent, 'templates')
return os.path.join(path(gutenbergtozim.__file__).parent, 'templates')


def get_list_of_all_languages():
Expand Down Expand Up @@ -146,6 +147,10 @@ def export_all_books(static_folder,
formats=formats,
only_books=only_books)

if not len(get_langs_with_count(books=books)):
critical_error("Unable to proceed. Combination of lamguages, "
"books and formats has no result.")

sz = len(list(books))
logger.debug("\tFiltered book collection size: {}".format(sz))

Expand Down
File renamed without changes.
File renamed without changes.
10 changes: 5 additions & 5 deletions gutenberg/rdf.py → gutenbergtozim/rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@
from path import Path as path
from bs4 import BeautifulSoup

from gutenberg import logger, XML_PARSER
from gutenberg.utils import exec_cmd, download_file
from gutenberg.database import (Author, Format, BookFormat, License, Book)
from gutenberg.utils import BAD_BOOKS_FORMATS, FORMAT_MATRIX, normalize
from gutenbergtozim import logger
from gutenbergtozim.utils import exec_cmd, download_file
from gutenbergtozim.database import (Author, Format, BookFormat, License, Book)
from gutenbergtozim.utils import BAD_BOOKS_FORMATS, FORMAT_MATRIX, normalize


def setup_rdf_folder(rdf_url, rdf_path, force=False):
Expand Down Expand Up @@ -113,7 +113,7 @@ def __init__(self, rdf_data, gid):
self.last_name = None

def parse(self):
soup = BeautifulSoup(self.rdf_data, XML_PARSER)
soup = BeautifulSoup(self.rdf_data, "lxml")

# The tile of the book: this may or may not be divided
# into a new-line-seperated title and subtitle.
Expand Down
File renamed without changes.
11 changes: 11 additions & 0 deletions gutenbergtozim/templates/author.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{% extends "base.html" %}
{# Author page: fills the "title" and "content" blocks of base.html for a
   single author. Expects an `author` object exposing a name() method in
   the template context. #}
{% block title %}{{ author.name() }}{% endblock %}
{% block content %}
{# Empty table shell; presumably populated client-side by showBooks() — TODO confirm. #}
<table class="data-table display" cellpadding="0" cellspacing="0" border="0" id="books_table"></table>
<script type="text/javascript">
{# Put this author's name into the #author_filter field, fire its change
   handlers, persist the value, then call showBooks(). `persist_options`,
   `$.persistValue` and `showBooks` are not defined in this template —
   presumably provided by base.html or the scripts it loads; verify. #}
{# NOTE(review): author.name() is interpolated as-is into a JavaScript
   string literal; a name containing a double quote or backslash would
   presumably break the script unless escaping happens upstream — confirm. #}
$( "#author_filter" ).val("{{ author.name() }}");
$( "#author_filter" ).change();
$.persistValue("author_filter", $( "#author_filter" ).val(), persist_options);
showBooks();
</script>
{% endblock %}
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes.
File renamed without changes
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
6 changes: 3 additions & 3 deletions gutenberg/urls.py → gutenbergtozim/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@

from collections import defaultdict

from gutenberg.database import Book, BookFormat
from gutenberg.utils import FORMAT_MATRIX
from gutenberg import logger
from gutenbergtozim.database import Book, BookFormat
from gutenbergtozim.utils import FORMAT_MATRIX
from gutenbergtozim import logger


class UrlBuilder:
Expand Down
14 changes: 10 additions & 4 deletions gutenberg/utils.py → gutenbergtozim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from __future__ import (unicode_literals, absolute_import,
division, print_function)
import os
import sys
import re
import hashlib
import subprocess
Expand All @@ -17,9 +18,9 @@
import chardet
from path import Path as path

from gutenberg import logger
from gutenberg.iso639 import language_name
from gutenberg.database import Book, BookFormat, Format
from gutenbergtozim import logger
from gutenbergtozim.iso639 import language_name
from gutenbergtozim.database import Book, BookFormat, Format


FORMAT_MATRIX = collections.OrderedDict([
Expand All @@ -37,6 +38,11 @@
NB_MAIN_LANGS = 5


def critical_error(message):
    """Log ``message`` at CRITICAL level, then exit the process with status 1."""
    report = "ERROR: {}".format(message)
    logger.critical(report)
    sys.exit(1)


def normalize(text=None):
    """Return ``text`` in Unicode NFC form; ``None`` is passed through as-is."""
    if text is None:
        return None
    return unicodedata.normalize('NFC', text)

Expand Down Expand Up @@ -94,7 +100,7 @@ def get_list_of_filtered_books(languages, formats, only_books=[]):
qs = Book.select()

if len(only_books):
print(only_books)
# print(only_books)
qs = qs.where(Book.id << only_books)

if len(languages):
Expand Down
8 changes: 4 additions & 4 deletions gutenberg/zim.py → gutenbergtozim/zim.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@

from path import Path as path

from gutenberg import logger
from gutenberg.utils import exec_cmd
from gutenberg.iso639 import ISO_MATRIX
from gutenberg.export import export_skeleton
from gutenbergtozim import logger
from gutenbergtozim.utils import exec_cmd
from gutenbergtozim.iso639 import ISO_MATRIX
from gutenbergtozim.export import export_skeleton


def build_zimfile(static_folder, zim_path=None,
Expand Down
69 changes: 69 additions & 0 deletions pypi-readme.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
=============
gutenberg2zim
=============

A scraper that downloads the whole repository of
`Project Gutenberg <http://www.gutenberg.org>`_ and puts it into a
locally browsable directory and then in a ZIM file
(http://www.openzim.org), a clean and user friendly format for storing
content for offline usage.

------------
Dependencies
------------

Ubuntu/debian
-------------

.. code-block:: sh

    python-pip python-dev libxml2-dev libxslt-dev advancecomp jpegoptim pngquant p7zip-full gifsicle

macOS
-----

.. code-block:: sh

    brew install advancecomp jpegoptim pngquant p7zip gifsicle

------
Usage
------

.. code-block:: sh

    gutenberg2zim

By default (no argument), it runs all the steps: download, parse, export and zim.


.. code-block:: sh

    -h --help Display this help message
    -y --wipe-db Do not wipe the DB during parse stage
    -F --force Redo step even if target already exists
    -l --languages=<list> Comma-separated list of lang codes to filter export to (preferably ISO 639-1, else ISO 639-3)
    -f --formats=<list> Comma-separated list of formats to filter export to (epub, html, pdf, all)
    -m --mirror=<url> Use URL as base for all downloads.
    -r --rdf-folder=<folder> Don't download rdf-files.tar.bz2 and use extracted folder instead
    -e --static-folder=<folder> Use-as/Write-to this folder static HTML
    -z --zim-file=<file> Write ZIM into this file path
    -t --zim-title=<title> Set ZIM title
    -n --zim-desc=<description> Set ZIM description
    -d --dl-folder=<folder> Folder to use/write-to downloaded ebooks
    -u --rdf-url=<url> Alternative rdf-files.tar.bz2 URL
    -b --books=<ids> Execute the processes for specific books, separated by commas, or dashes for intervals
    -c --concurrency=<nb> Number of concurrent processes for download and parsing tasks
    -x --zim-title=<title> Custom title for the ZIM file
    -q --zim-desc=<desc> Custom description for the ZIM file
    --check Check dependencies
    --prepare Download & extract rdf-files.tar.bz2
    --parse Parse all RDF files and fill-up the DB
    --download Download ebooks based on filters
    --export Export downloaded content to zim-friendly static HTML
    --dev Exports *just* Home+JS+CSS files (overwritten by --zim step)
    --zim Create a ZIM file
42 changes: 42 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4 nu

""" Project Gutenberg ZIM creator for Offline Use """

# codecs.open accepts an encoding argument on both Python 2.7 and 3.x.
from codecs import open

from setuptools import setup, find_packages

# Long description displayed on PyPI.
with open('pypi-readme.rst', 'r', 'utf-8') as f:
    readme = f.read()

# One requirement per non-blank line of requirements.pip.
with open('requirements.pip', 'r') as f:
    requirements = [line.strip() for line in f.readlines() if line.strip()]

setup(
    name='gutenberg2zim',
    version="1.0",
    # The module docstring is padded with spaces; strip them for the metadata.
    description=__doc__.strip(),
    long_description=readme,
    author="Kiwix",
    author_email="reg@kiwix.org",
    url='http://github.com/kiwix/gutenberg',
    keywords="gutenberg zim kiwix openzim offline",
    license="GNU GPL",
    packages=find_packages('.'),
    zip_safe=False,
    platforms='any',
    include_package_data=True,
    package_data={'': ['pypi-readme.rst', 'LICENSE']},
    # The package was renamed from ``gutenberg`` to ``gutenbergtozim``.
    package_dir={'gutenbergtozim': 'gutenbergtozim'},
    install_requires=requirements,
    scripts=['gutenberg2zim'],
    classifiers=[
        'Intended Audience :: Developers',
        # The trailing comma matters: without it this string is implicitly
        # concatenated with the next one into a single invalid classifier.
        'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2.7',
        'Programming Language :: Python :: 3.6',
    ],
)

0 comments on commit 26b67fc

Please sign in to comment.