Skip to content

Commit

Permalink
Fixed #15: database is kept by default
Browse files Browse the repository at this point in the history
Added force support to parse step
Cleaned-up readme
  • Loading branch information
rgaudin committed Apr 7, 2017
1 parent 7ee7b96 commit 3fc8d3a
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 44 deletions.
42 changes: 17 additions & 25 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,66 +1,56 @@
######################################################################
# Project Gutenberg Offline
######################################################################

A scraper that downloads the whole repository of [Project Gutenberg]
(http://www.gutenberg.org) and puts it into a localy browsable
(http://www.gutenberg.org) and puts it into a locally browsable
directory and then in a ZIM file (http://www.openzim.org), a clean and
user friendly format for storing content for offline usage. It was
created during a Kiwix Hackathon in Lyon, France in July 2014.

## Setting up the environment ########################################
## Setting up the environment

It's recommended that you use `virtualenv`.
It's recommended that you use `virtualenv`. `py2.7.x` and `py3.6+` are supported.

### Install the dependencies #########################################
### Install the dependencies

#### Linux

```
sudo apt-get install python-pip python-dev libxml2-dev libxslt-dev advancecomp jpegoptim pngquant p7zip-full gifsicle
sudo pip install virtualenvwrapper
sudo pip install virtualenv
```

#### Mac OS X

```
sudo easy_install pip
sudo pip install virtualenvwrapper
sudo pip install virtualenv
brew install advancecomp jpegoptim pngquant p7zip gifsicle
```

#### Finalize the setup #############################################

Finally, add this to your `.bashrc`:

```
source /usr/local/bin/virtualenvwrapper.sh
```

### Set up the project ##############################################
### Set up the project

```
git clone git@github.com:kiwix/gutenberg.git
cd gutenberg
mkvirtualenv gut (or any name you want)
virtualenv gut-env (or any name you want)
./gut-env/bin/pip install -r requirements.pip
```

### Working in the environment ######################################
### Working in the environment

* Activate the environment: `workon gut`
* Activate the environment: `source gut-env/bin/activate`
* Quit the environment: `deactivate`
* Install the python dependencies: `pip install -r requirements.pip`

## Getting started

After setting up the whole enviroment you can just run the main script `dump-gutenberg.py`.
After setting up the whole environment you can just run the main script `dump-gutenberg.py`.
It will download, process and export the content.

```
./dump-gutenberg.py
```

#### Arguments #####################################################
#### Arguments

You can also specify parameters to customize the content.
Only want books with IDs 100-200? Only books in French? In English? Or in both? No problem!
Expand All @@ -75,7 +65,8 @@ You can find the full arguments list below.

``` sh
-h --help Display this help message
-k --keep-db Do not wipe the DB during parse stage
-y --wipe-db Wipe the DB before the parse stage (the DB is kept by default)
-F --force Redo step even if target already exists

-l --languages=<list> Comma-separated list of lang codes to filter export to (preferably ISO 639-1, else ISO 639-3)
-f --formats=<list> Comma-separated list of formats to filter export to (epub, html, pdf, all)
Expand All @@ -99,11 +90,12 @@ You can find the full arguments list below.
--parse Parse all RDF files and fill-up the DB
--download Download ebooks based on filters
--export Export downloaded content to zim-friendly static HTML
--dev Exports *just* Home+JS+CSS files (overwritten by --zim step)
--zim Create a ZIM file
```
## Screenshots #####################################################
## Screenshots
![](http://i.imgur.com/A4NnS2K.png?1)
Expand Down
15 changes: 9 additions & 6 deletions dump-gutenberg.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from gutenberg.checkdeps import check_dependencies


help = ("""Usage: dump-gutenberg.py [-k] [-F] [-l LANGS] [-f FORMATS] """
help = ("""Usage: dump-gutenberg.py [-y] [-F] [-l LANGS] [-f FORMATS] """
"""[-r RDF_FOLDER] [-m URL_MIRROR] [-d CACHE_PATH] [-e STATIC_PATH] """
"""[-z ZIM_PATH] [-u RDF_URL] [-b BOOKS] """
"""[-t ZIM_TITLE] [-n ZIM_DESC] """
Expand All @@ -28,7 +28,7 @@
"""[--zim] [--complete]
-h --help Display this help message
-k --keep-db Do not wipe the DB during parse stage
-y --wipe-db Do not wipe the DB during parse stage
-F --force Redo step even if target already exist
-l --languages=<list> Comma-separated list of lang codes to filter"""
Expand Down Expand Up @@ -84,7 +84,7 @@ def main(arguments):
RDF_FOLDER = arguments.get('--rdf-folder') or os.path.join('rdf-files')
STATIC_FOLDER = arguments.get('--static-folder') or os.path.join('static')
ZIM_FILE = arguments.get('--zim-file')
WIPE_DB = not arguments.get('--keep-db') or False
WIPE_DB = arguments.get('--wipe-db') or False
RDF_URL = arguments.get('--rdf-url') \
or 'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
DL_CACHE = arguments.get('--dl-folder') or os.path.join('dl-cache')
Expand Down Expand Up @@ -138,13 +138,16 @@ def main(arguments):

if DO_PREPARE:
logger.info("PREPARING rdf-files cache from {}".format(RDF_URL))
setup_rdf_folder(rdf_url=RDF_URL, rdf_path=RDF_FOLDER)
setup_rdf_folder(rdf_url=RDF_URL, rdf_path=RDF_FOLDER, force=FORCE)

if WIPE_DB:
logger.info("RESETING DATABASE" if WIPE_DB else "SETTING UP DATABASE")
setup_database(wipe=WIPE_DB)

if DO_PARSE:
logger.info("PARSING rdf-files in {}".format(RDF_FOLDER))
setup_database(wipe=WIPE_DB)
parse_and_fill(rdf_path=RDF_FOLDER, only_books=BOOKS,
concurrency=CONCURRENCY)
concurrency=CONCURRENCY, force=FORCE)

if DO_DOWNLOAD:
logger.info("DOWNLOADING ebooks from mirror using filters")
Expand Down
19 changes: 14 additions & 5 deletions gutenberg/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,16 @@
db.connect()


class License(Model):
class BaseModel(Model):
    """Base model adding a lookup helper that returns None instead of raising."""

    @classmethod
    def get_or_none(cls, *query, **kwargs):
        """Return the row matching the given filters, or None if there is none.

        Args:
            *query: positional query expressions, forwarded to ``cls.get``.
            **kwargs: ``field=value`` filters, forwarded to ``cls.get``.

        Returns:
            The matching model instance, or None when ``cls.get`` raises
            ``cls.DoesNotExist``.
        """
        try:
            # The keyword filters must be forwarded as well: without them a
            # call such as get_or_none(id=gid) degrades to an unfiltered
            # get() and matches whatever row happens to exist.
            return cls.get(*query, **kwargs)
        except cls.DoesNotExist:
            return None


class License(BaseModel):

class Meta:
database = db
Expand All @@ -34,7 +43,7 @@ def __unicode__(self):
return self.name


class Format(Model):
class Format(BaseModel):

class Meta:
database = db
Expand All @@ -47,7 +56,7 @@ def __unicode__(self):
return self.mime


class Author(Model):
class Author(BaseModel):

class Meta:
database = db
Expand Down Expand Up @@ -105,7 +114,7 @@ def to_array(self):
]


class Book(Model):
class Book(BaseModel):

class Meta:
database = db
Expand Down Expand Up @@ -152,7 +161,7 @@ def formats(self):
return main_formats_for(self)


class BookFormat(Model):
class BookFormat(BaseModel):

class Meta:
database = db
Expand Down
20 changes: 12 additions & 8 deletions gutenberg/rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@
from gutenberg.utils import BAD_BOOKS_FORMATS, FORMAT_MATRIX, normalize


def setup_rdf_folder(rdf_url, rdf_path):
def setup_rdf_folder(rdf_url, rdf_path, force=False):
""" Download and Extract rdf-files """

rdf_tarball = download_rdf_file(rdf_url)
extract_rdf_files(rdf_tarball, rdf_path)
extract_rdf_files(rdf_tarball, rdf_path, force=force)


def download_rdf_file(rdf_url):
Expand All @@ -37,8 +37,8 @@ def download_rdf_file(rdf_url):
return fname


def extract_rdf_files(rdf_tarball, rdf_path):
if path(rdf_path).exists():
def extract_rdf_files(rdf_tarball, rdf_path, force=False):
if path(rdf_path).exists() and not force:
logger.info("\tRDF-files folder already exists in {}".format(rdf_path))
return

Expand All @@ -53,7 +53,7 @@ def extract_rdf_files(rdf_tarball, rdf_path):
return


def parse_and_fill(rdf_path, concurrency, only_books=[]):
def parse_and_fill(rdf_path, concurrency, only_books=[], force=False):
logger.info("\tLooping throught RDF files in {}".format(rdf_path))

fpaths = []
Expand All @@ -75,16 +75,20 @@ def parse_and_fill(rdf_path, concurrency, only_books=[]):

fpaths.append(os.path.join(root, fname))

Pool(concurrency).map(parse_and_process_file, fpaths)
ppf = lambda x: parse_and_process_file(x, force)
Pool(concurrency).map(ppf, fpaths)


def parse_and_process_file(rdf_file):
logger.info("\tParsing file {}".format(rdf_file))
def parse_and_process_file(rdf_file, force=False):
if not path(rdf_file).exists():
raise ValueError(rdf_file)

gid = re.match(r'.*/pg([0-9]+).rdf', rdf_file).groups()[0]
if Book.get_or_none(id=gid) and not force:
logger.info("Skipping existing {}".format(rdf_file))
return

logger.info("\tParsing file {}".format(rdf_file))
with open(rdf_file, 'r') as f:
parser = RdfParser(f.read(), gid).parse()

Expand Down

0 comments on commit 3fc8d3a

Please sign in to comment.