-
Notifications
You must be signed in to change notification settings - Fork 15
/
gutenberg.py
108 lines (96 loc) · 3.12 KB
/
gutenberg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# gutenberg.py
#
# Reformats and renames the downloaded etexts.
#
# Software by Michiel Overtoom, motoom@xs4all.nl, July 2009, amended April 2016.
#
import os
import re
import codecs
import glob
# Paragraphs that start with one of these phrases are boilerplate and are
# left out of the reformatted text.
remove = [
    "Produced by",
    "End of the Project Gutenberg",
    "End of Project Gutenberg",
]
def encoding(fn):
    """Return the character set named in the header of the etext *fn*.

    The file is scanned for the first line that starts with
    "Character set encoding:". Everything after that prefix, stripped of
    surrounding whitespace, is returned. When no such line exists the
    result is "latin1".
    """
    marker = b"Character set encoding:"
    # Read raw bytes: the encoding is not known yet, so decoding the file as
    # text at this point could fail on exactly the files being classified.
    with open(fn, "rb") as f:
        for line in f:
            if line.startswith(marker):
                # Slice instead of splitting on ":" so that a value which
                # itself contains a colon does not break the lookup.
                name = line[len(marker):].strip()
                # Return a native string: on Python 2 bytes already is str,
                # on Python 3 the bytes have to be decoded.
                if not isinstance(name, str):
                    name = name.decode("latin1")
                return name
    return "latin1"
# Translates the encoding names looked up from an etext header into Python
# codec names.
codecmap = {
    # Plain 7-bit text.
    "ASCII": "ascii",
    # The various spellings of Latin-1.
    "ISO Latin-1": "latin1",
    "ISO-8859-1": "latin1",
    "latin1": "latin1",
    # Unicode.
    "UTF-8": "utf8",
}
def _flush_paragraph(paragraph, outlines):
    """Append *paragraph* and a blank separator to *outlines* unless it is empty or boilerplate."""
    paragraph = paragraph.strip()
    if paragraph and not paragraph.startswith(tuple(remove)):
        outlines.append(paragraph)
        outlines.append("")


def _output_name(title, fn):
    """Build the output file name from the book title and the input file's base name."""
    # Only the first 150 characters of the title are used (presumably to keep
    # the name within file system limits).
    ofn = title[:150] + ", " + os.path.basename(fn)
    # Replace some illegal file name characters with alternatives. The order
    # matters: doubled commas are collapsed last.
    for old, new in (("&", "en"), ("/", "-"), ("\"", "'"), (":", ";"), (",,", ",")):
        ofn = ofn.replace(old, new)
    return ofn


def beautify(fn, outputdir):
    """Reformat a raw Project Gutenberg etext and write it under a readable name.

    Reads *fn* using the codec looked up from its header, joins the lines
    between the start and end markers into one-line paragraphs separated by
    blank lines, and drops paragraphs that start with one of the phrases in
    ``remove``. The title (plus any subtitle lines directly following the
    "Title: " line) becomes part of the output file name.

    Parameters:
        fn: path of the raw etext to read.
        outputdir: directory in which the reformatted file is written.

    Returns nothing. The result is written as UTF-8 to *outputdir*. A missing
    title, start marker or end marker is reported on standard output but does
    not stop the conversion.
    """
    codec = codecmap.get(encoding(fn), "latin1")
    with codecs.open(fn, "r", codec) as f:
        lines = [line.strip() for line in f]

    lookforsubtitle = False
    outlines = []
    startseen = endseen = False
    title = ""
    paragraph = ""
    for line in lines:
        if line.startswith("Title: "):
            title = line[7:]
            lookforsubtitle = True
            continue
        if lookforsubtitle:
            # Non-blank lines right after the title line extend the title;
            # the first blank line ends it.
            if line:
                title += ", " + line.strip(".")
            else:
                lookforsubtitle = False
        if ("*** START" in line) or ("***START" in line) or line.startswith("*END THE SMALL PRINT!"):
            # NOTE(review): a later start marker restarts the pending
            # paragraph but keeps paragraphs already collected, so text
            # between two start markers stays in the output. Confirm that
            # this is intended.
            startseen = True
            paragraph = ""
            continue
        if ("*** END" in line) or ("***END" in line):
            endseen = True
            break
        if not startseen:
            continue
        if line:
            paragraph += " " + line
        else:
            # A blank line ends the current paragraph.
            _flush_paragraph(paragraph, outlines)
            paragraph = ""
    # Keep a final paragraph that was not followed by a blank line before the
    # end marker or the end of the file.
    _flush_paragraph(paragraph, outlines)

    ofn = _output_name(title, fn)

    # Report on anomalous situations, but don't make it a showstopper.
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    if not title:
        print(ofn)
        print(" Problem: No title found\n")
    if not startseen:
        print(ofn)
        print(" Problem: No '*** START' seen\n")
    if not endseen:
        print(ofn)
        print(" Problem: No '*** END' seen\n")

    with codecs.open(os.path.join(outputdir, ofn), "w", "utf8") as f:
        f.write("\n".join(outlines))
# Script body: make sure the "ebooks" output directory exists, then reformat
# every text file found in "ebooks-unzipped" into it.
if not os.path.exists("ebooks"):
    os.mkdir("ebooks")
for etext_path in glob.glob("ebooks-unzipped/*.txt"):
    beautify(etext_path, "ebooks")