Skip to content
Browse files

Initial version of book generator

  • Loading branch information...
0 parents commit c5dcc1e9bfd25516e1bccff91616006d6df926ec @palfrey committed May 20, 2012
Showing with 161 additions and 0 deletions.
  1. +8 −0 .gitignore
  2. +3 −0 .gitmodules
  3. +12 −0 Makefile
  4. +15 −0 blog.proto
  5. +2 −0 cmd
  6. +12 −0 dumper.py
  7. +75 −0 grabber.py
  8. +10 −0 loader.py
  9. +15 −0 makebooks.py
  10. +8 −0 series.txt
  11. +1 −0 urlgrab
8 .gitignore
@@ -0,0 +1,8 @@
+series.list
+*.mobi
+*.zip
+cache
+*_pb2.py
+dump.html
+dump
+Tales of Mu*
3 .gitmodules
@@ -0,0 +1,3 @@
+[submodule "urlgrab"]
+ path = urlgrab
+ url = git://github.com/palfrey/urlgrab.git
12 Makefile
@@ -0,0 +1,12 @@
+# Default target: generate the protobuf bindings imported by the Python
+# scripts (loader.py, dumper.py, grabber.py).
+all: blog_pb2.py
+
+blog_pb2.py: blog.proto
+	protoc --python_out=. blog.proto
+
+# Compile the hand-edited text listing (series.txt) into the binary
+# series.list. loader.py imports blog_pb2, so the bindings must exist first.
+load:: blog_pb2.py
+	python loader.py series.txt series.list
+
+# Dump the binary series.list back into its text form (series.txt).
+# dumper.py imports blog_pb2 as well.
+dump:: blog_pb2.py
+	python dumper.py series.list series.txt
+
+.PHONY: all load dump
15 blog.proto
@@ -0,0 +1,15 @@
+// Schema of the series database stored in series.list (binary) and
+// series.txt (text format).
+package blog;
+
+// Scraping recipe for a single serial, as consumed by grabber.py.
+message Series
+{
+ // Series name; grabber.py matches it against the selected series and uses
+ // it as the prefix of the generated book folders.
+ required string name = 1;
+ // URL of the first page to fetch.
+ required string startPage = 2;
+ // Regular expression whose first capture group is the chapter title.
+ required string titlePattern = 3;
+ // Regular expression whose first capture group is the chapter body HTML.
+ required string contentPattern = 4;
+ // Regular expression whose first capture group is the URL of the next
+ // page; grabber.py stops crawling when it does not match.
+ required string nextPattern = 5;
+}
+
+// Top-level message: the full list of known series.
+message All
+{
+ repeated Series series = 1;
+}
2 cmd
@@ -0,0 +1,2 @@
+#!/bin/bash
+# Rebuild tom.zip from the pages in "Tales of Mu/" and convert the archive
+# to tom.mobi with ebook-convert. Each step runs only if the previous one
+# succeeded.
+rm -f tom.zip \
+    && zip -j tom.zip Tales\ of\ Mu/* \
+    && ebook-convert tom.zip tom.mobi \
+        --output-profile kindle \
+        --margin-top 0 --margin-bottom 0 --margin-left 0 \
+        --authors="Alexandra Erin" \
+        --enable-heuristics
12 dumper.py
@@ -0,0 +1,12 @@
+from google.protobuf import text_format
+from sys import argv
+from codecs import open
+
+from blog_pb2 import All
+
+db = All()
+db.ParseFromString(open(argv[1],"rb").read())
+out = text_format.MessageToString(db)
+open(argv[2],"wb").write(out)
+
+
75 grabber.py
@@ -0,0 +1,75 @@
+from urlgrab import Cache
+from blog_pb2 import All
+from re import compile, DOTALL, MULTILINE
+from os import mkdir
+from os.path import exists, join
+from hashlib import md5
+
+c = Cache()
+
+series = "Tales of Mu"
+
+db = All()
+db.ParseFromString(open("series.list", "rb").read())
+
+for s in db.series:
+ if s.name == series:
+ print s
+ page = s.startPage
+ index = 1
+ while page!=None:
+ folder = "%s #%02d"%(s.name, index)
+ if not exists(folder):
+ mkdir(folder)
+ toc = open(join(folder, "toc.html"), "wb")
+ toc.write("""<html xmlns="http://www.w3.org/1999/xhtml">
+ <head>
+ <title>%s</title>
+ </head>
+ <body class="vcenter">
+ <div style="display:none">
+ """ % folder)
+ titlePattern = compile(s.titlePattern, DOTALL | MULTILINE)
+ contentPattern = compile(s.contentPattern, DOTALL | MULTILINE)
+ nextPattern = compile(s.nextPattern, DOTALL | MULTILINE)
+ for x in range(20):
+ print "generating", page
+ data = c.get(page, max_age=-1).read()
+ open("dump", "wb").write(data)
+
+ title = titlePattern.search(data)
+ assert title != None, page
+ title = title.groups()[0]
+
+ link = nextPattern.search(data)
+
+ content = contentPattern.search(data)
+ assert content != None, page
+ content = content.groups()[0]
+
+ fname = md5(page).hexdigest() + ".html"
+ open (join(folder, fname), "wb").write("""<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+ <head>
+ <style type="text/css" title="override_css">
+ @page {padding: 0pt; margin:0pt}
+ </style>
+ <title>%s</title>
+ </head>
+ <body>
+ <h1>%s</h1>
+ %s
+ </body>
+ </html>"""%(title, title, content))
+ toc.write("\t\t\t<a title=\"%s\" href=\"%s\" />\n"%(title, fname))
+ if link != None:
+ link = link.groups()[0]
+ page = link
+ if page == None:
+ break
+ toc.write("""\t\t</div>
+ </body>
+ </html>""")
+ toc.close()
+
+ if page != None:
+ index +=1
10 loader.py
@@ -0,0 +1,10 @@
+from google.protobuf import text_format
+from sys import argv
+from codecs import open
+
+from blog_pb2 import All
+
+db = All()
+text_format.Merge(open(argv[1],"rb","utf-8").read(),db)
+open(argv[2],"wb").write(db.SerializeToString())
+
15 makebooks.py
@@ -0,0 +1,15 @@
+from os import listdir, system
+from os.path import isdir, exists
+
+for x in sorted(listdir(".")):
+ if not isdir(x):
+ continue
+ if x in ("urlgrab", "cache") or x[0] == ".":
+ continue
+ fname = x + ".mobi"
+ if not exists(fname):
+ print fname
+ cmd = "rm -f tom.zip && zip -j tom.zip %s/* && ebook-convert tom.zip \"%s\" --output-profile kindle --margin-top 0 --margin-bottom 0 --margin-left 0 --authors=\"Alexandra Erin\" --enable-heuristics" %(x.replace(" ", "\\ "), fname)
+ print cmd
+ system(cmd)
+
8 series.txt
@@ -0,0 +1,8 @@
+series
+{
+ name: "Tales of Mu"
+ startPage: "http://www.talesofmu.com/story/book01/1"
+ titlePattern: "<div class=\"date\"><h2>(.+?)</h2>"
+ contentPattern: "<div class=\"entry\">(.*?)<center>"
+ nextPattern: "<a href=\"([^\"]+)\" rel=\"next\">"
+}
1 urlgrab
@@ -0,0 +1 @@
+Subproject commit 62192a2b120b0118ac60240afa412bfd27360c65

0 comments on commit c5dcc1e

Please sign in to comment.
Something went wrong with that request. Please try again.