Fixed #15: database is kept by default

Added force support to parse step. Cleaned up readme.
openzim · Apr 7, 2017 · 3fc8d3a · 3fc8d3a
1 parent 7ee7b96
commit 3fc8d3a
Show file tree

Hide file tree

Showing 4 changed files with 52 additions and 44 deletions.
diff --git a/README.md b/README.md
@@ -1,66 +1,56 @@
-######################################################################
 # Project Gutenberg Offline
-######################################################################
 
 A scraper that downloads the whole repository of [Project Gutenberg]
-(http://www.gutenberg.org) and puts it into a localy browsable
+(http://www.gutenberg.org) and puts it into a locally browsable
 directory and then in a ZIM file (http://www.openzim.org), a clean and
 user friendly format for storing content for offline usage. It was
 created during a Kiwix Hackathon in Lyon, France in July 2014.
 
-## Setting up the environment ########################################
+## Setting up the environment
 
-It's recommended that you use `virtualenv`.
+It's recommended that you use `virtualenv`. `py2.7.x` and `py3.6+` are supported.
 
-### Install the dependencies #########################################
+### Install the dependencies
 
 #### Linux 
 
 ```
 sudo apt-get install python-pip python-dev libxml2-dev libxslt-dev advancecomp jpegoptim pngquant p7zip-full gifsicle
-sudo pip install virtualenvwrapper
+sudo pip install virtualenv
 ```
 
 #### Mac OS X
 
 ```
 sudo easy_install pip
-sudo pip install virtualenvwrapper
+sudo pip install virtualenv
 brew install advancecomp jpegoptim pngquant p7zip gifsicle
 ```
 
-#### Finalize the setup #############################################
-
-Finally, add this to your `.bashrc`:
-
-```
-source /usr/local/bin/virtualenvwrapper.sh
-```
-
-### Set up the project ##############################################
+### Set up the project
 
 ```
 git clone git@github.com:kiwix/gutenberg.git
 cd gutenberg
-mkvirtualenv gut (or any name you want)
+virtualenv gut-env  # or any name you want
+./gut-env/bin/pip install -r requirements.pip
 ```
 
-### Working in the environment ######################################
+### Working in the environment
 
-* Activate the environment:  `workon gut`
+* Activate the environment:  `source gut-env/bin/activate`
 * Quit the environment: `deactivate`
-* Install the python dependencies: `pip install -r requirements.pip`
 
 ## Getting started
 
-After setting up the whole enviroment you can just run the main script `dump-gutenberg.py`.   
+After setting up the whole environment you can just run the main script `dump-gutenberg.py`.   
 It will download, process and export the content.
 
 ```
 ./dump-gutenberg.py 
 ```
 
-#### Arguments #####################################################
+#### Arguments
 
 You can also specify parameters to customize the content.   
 Only want books with IDs 100-200? Only books in French? In English? Or only those in both? No problem!  
@@ -75,7 +65,8 @@ You can find the full arguments list below.
 
 ``` sh
 -h --help                       Display this help message
--k --keep-db                    Do not wipe the DB during parse stage
+-y --wipe-db                    Wipe the DB before the parse stage (kept by default)
+-F --force                      Redo step even if target already exists
 
 -l --languages=<list>           Comma-separated list of lang codes to filter export to (preferably ISO 639-1, else ISO 639-3)
 -f --formats=<list>             Comma-separated list of formats to filter export to (epub, html, pdf, all)
@@ -99,11 +90,12 @@ You can find the full arguments list below.
 --parse                         Parse all RDF files and fill-up the DB
 --download                      Download ebooks based on filters
 --export                        Export downloaded content to zim-friendly static HTML
+--dev                           Exports *just* Home+JS+CSS files (overwritten by --zim step)
 --zim                           Create a ZIM file
 ```
 
 
-## Screenshots #####################################################
+## Screenshots
 
 ![](http://i.imgur.com/A4NnS2K.png?1)
 

diff --git a/dump-gutenberg.py b/dump-gutenberg.py
@@ -19,7 +19,7 @@
 from gutenberg.checkdeps import check_dependencies
 
 
-help = ("""Usage: dump-gutenberg.py [-k] [-F] [-l LANGS] [-f FORMATS] """
+help = ("""Usage: dump-gutenberg.py [-y] [-F] [-l LANGS] [-f FORMATS] """
         """[-r RDF_FOLDER] [-m URL_MIRROR] [-d CACHE_PATH] [-e STATIC_PATH] """
         """[-z ZIM_PATH] [-u RDF_URL] [-b BOOKS] """
         """[-t ZIM_TITLE] [-n ZIM_DESC] """
@@ -28,7 +28,7 @@
         """[--zim] [--complete]
 
 -h --help                       Display this help message
--k --keep-db                    Do not wipe the DB during parse stage
+-y --wipe-db                    Do not wipe the DB during parse stage
 -F --force                      Redo step even if target already exist
 
 -l --languages=<list>           Comma-separated list of lang codes to filter"""
@@ -84,7 +84,7 @@ def main(arguments):
     RDF_FOLDER = arguments.get('--rdf-folder') or os.path.join('rdf-files')
     STATIC_FOLDER = arguments.get('--static-folder') or os.path.join('static')
     ZIM_FILE = arguments.get('--zim-file')
-    WIPE_DB = not arguments.get('--keep-db') or False
+    WIPE_DB = arguments.get('--wipe-db') or False
     RDF_URL = arguments.get('--rdf-url') \
         or 'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
     DL_CACHE = arguments.get('--dl-folder') or os.path.join('dl-cache')
@@ -138,13 +138,16 @@ def main(arguments):
 
     if DO_PREPARE:
         logger.info("PREPARING rdf-files cache from {}".format(RDF_URL))
-        setup_rdf_folder(rdf_url=RDF_URL, rdf_path=RDF_FOLDER)
+        setup_rdf_folder(rdf_url=RDF_URL, rdf_path=RDF_FOLDER, force=FORCE)
+
+    if WIPE_DB:
+        logger.info("RESETING DATABASE" if WIPE_DB else "SETTING UP DATABASE")
+        setup_database(wipe=WIPE_DB)
 
     if DO_PARSE:
         logger.info("PARSING rdf-files in {}".format(RDF_FOLDER))
-        setup_database(wipe=WIPE_DB)
         parse_and_fill(rdf_path=RDF_FOLDER, only_books=BOOKS,
-                       concurrency=CONCURRENCY)
+                       concurrency=CONCURRENCY, force=FORCE)
 
     if DO_DOWNLOAD:
         logger.info("DOWNLOADING ebooks from mirror using filters")

diff --git a/gutenberg/database.py b/gutenberg/database.py
@@ -15,7 +15,16 @@
 db.connect()
 
 
-class License(Model):
+class BaseModel(Model):  # shared base for the model classes declared below
+    @classmethod
+    def get_or_none(cls, *query, **kwargs):  # like cls.get(), but returns None when no row matches
+        try:
+            return cls.get(*query, **kwargs)  # forward keyword filters (e.g. id=gid) instead of dropping them
+        except cls.DoesNotExist:
+            return None
+
+
+class License(BaseModel):
 
     class Meta:
         database = db
@@ -34,7 +43,7 @@ def __unicode__(self):
         return self.name
 
 
-class Format(Model):
+class Format(BaseModel):
 
     class Meta:
         database = db
@@ -47,7 +56,7 @@ def __unicode__(self):
         return self.mime
 
 
-class Author(Model):
+class Author(BaseModel):
 
     class Meta:
         database = db
@@ -105,7 +114,7 @@ def to_array(self):
         ]
 
 
-class Book(Model):
+class Book(BaseModel):
 
     class Meta:
         database = db
@@ -152,7 +161,7 @@ def formats(self):
         return main_formats_for(self)
 
 
-class BookFormat(Model):
+class BookFormat(BaseModel):
 
     class Meta:
         database = db

diff --git a/gutenberg/rdf.py b/gutenberg/rdf.py
@@ -17,11 +17,11 @@
 from gutenberg.utils import BAD_BOOKS_FORMATS, FORMAT_MATRIX, normalize
 
 
-def setup_rdf_folder(rdf_url, rdf_path):
+def setup_rdf_folder(rdf_url, rdf_path, force=False):
     """ Download and Extract rdf-files """
 
     rdf_tarball = download_rdf_file(rdf_url)
-    extract_rdf_files(rdf_tarball, rdf_path)
+    extract_rdf_files(rdf_tarball, rdf_path, force=force)
 
 
 def download_rdf_file(rdf_url):
@@ -37,8 +37,8 @@ def download_rdf_file(rdf_url):
     return fname
 
 
-def extract_rdf_files(rdf_tarball, rdf_path):
-    if path(rdf_path).exists():
+def extract_rdf_files(rdf_tarball, rdf_path, force=False):
+    if path(rdf_path).exists() and not force:
         logger.info("\tRDF-files folder already exists in {}".format(rdf_path))
         return
 
@@ -53,7 +53,7 @@ def extract_rdf_files(rdf_tarball, rdf_path):
     return
 
 
-def parse_and_fill(rdf_path, concurrency, only_books=[]):
+def parse_and_fill(rdf_path, concurrency, only_books=[], force=False):
     logger.info("\tLooping throught RDF files in {}".format(rdf_path))
 
     fpaths = []
@@ -75,16 +75,20 @@ def parse_and_fill(rdf_path, concurrency, only_books=[]):
 
             fpaths.append(os.path.join(root, fname))
 
-    Pool(concurrency).map(parse_and_process_file, fpaths)
+    ppf = lambda x: parse_and_process_file(x, force)
+    Pool(concurrency).map(ppf, fpaths)
 
 
-def parse_and_process_file(rdf_file):
-    logger.info("\tParsing file {}".format(rdf_file))
+def parse_and_process_file(rdf_file, force=False):
     if not path(rdf_file).exists():
         raise ValueError(rdf_file)
 
     gid = re.match(r'.*/pg([0-9]+).rdf', rdf_file).groups()[0]
+    if Book.get_or_none(id=gid) and not force:
+        logger.info("Skipping existing {}".format(rdf_file))
+        return
 
+    logger.info("\tParsing file {}".format(rdf_file))
     with open(rdf_file, 'r') as f:
         parser = RdfParser(f.read(), gid).parse()