This repository has been archived by the owner on Oct 25, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
common.py
97 lines (80 loc) · 2.71 KB
/
common.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# Common functions and definitions.
from optparse import OptionParser
import logging
import os
import pycurl
import time
# Maximum age of a cached file, in seconds (it is compared against
# time.time() - st_mtime in CacheCurl). Older files are re-downloaded
# when we are online.
MAX_CACHE_AGE = 60
def CreateOptionsParser():
    """Return a parser for the command line with the common options.

    The returned OptionParser defines three string options, each of which
    is None when not given on the command line:

      -c / --cache_dir  stored as ``cache_dir``
      -z / --online     stored as ``online`` (the raw string, not a bool)
      -v / --log        stored as ``loglevel``

    Callers may add their own options to the parser before parsing.
    """
    parser = OptionParser()
    # The metavar makes the usage line read "--cache_dir=DIRNAME", matching
    # the placeholder the help text refers to.
    parser.add_option('-c', '--cache_dir', dest='cache_dir',
                      metavar='DIRNAME',
                      help='store data to DIRNAME')
    parser.add_option('-z', '--online', dest='online',
                      metavar='BOOL',
                      help='whether we are online; pass "false" when offline.')
    parser.add_option('-v', '--log', dest='loglevel',
                      metavar='LEVEL',
                      help='set the log level')
    return parser
def ApplyOptionsParser(parser, args=None):
    """Apply a parser created with CreateOptionsParser.

    Parses the command line, configures logging when a log level was
    given, and normalises ``options.online`` to a bool.

    Args:
      parser: an OptionParser providing the ``cache_dir``, ``online`` and
        ``loglevel`` destinations.
      args: optional list of argument strings to parse; when None the
        parser falls back to ``sys.argv[1:]``.

    Returns:
      The parsed options object, with ``online`` replaced by True/False.

    Exits (via ``parser.error``, i.e. SystemExit with status 2) when the log
    level is not a known logging level name or when no cache directory was
    given.
    """
    options, _ = parser.parse_args(args)

    if options.loglevel:
        level = getattr(logging, options.loglevel.upper(), None)
        # Guard against names that are missing or are not numeric levels
        # (e.g. "--log basicconfig" must not resolve to a function).
        if not isinstance(level, int):
            parser.error('Unknown log level: %s' % options.loglevel)
        logging.basicConfig(level=level)

    # Online unless explicitly switched off. The comparison must be by value:
    # strings coming from the command line are never the same object as a
    # literal, so an identity test would always report "online".
    online = True
    if options.online is not None and options.online.strip().lower() == "false":
        online = False
    logging.info("Online: %s", online)
    options.online = online

    if not options.cache_dir:
        parser.error('You need to specify a cache directory (--cache_dir).')
    return options
def CacheCurl(base_fn, page_url, online, cache_dir, max_cache_age=None):
    """Download an URL into the cache if possible and needed.

    The page is stored as ``os.path.join(cache_dir, base_fn)``. It is
    downloaded when that file does not exist, or when it is older than the
    maximum cache age and ``online`` is true. A stale file is kept as-is
    when we are not online.

    Args:
      base_fn: file name of the cached copy, relative to cache_dir.
      page_url: URL to fetch.
      online: whether stale files may be refreshed.
      cache_dir: cache directory; created if missing before a download.
      max_cache_age: maximum age in seconds before a file counts as stale;
        defaults to the module-level MAX_CACHE_AGE.

    Returns:
      The path of the cached file.

    Raises:
      IOError: if the transfer fails. A previously cached copy, if any, is
        left untouched in that case.
    """
    if max_cache_age is None:
        max_cache_age = MAX_CACHE_AGE
    html_fn = os.path.join(cache_dir, base_fn)

    if not _CacheNeedsDownload(html_fn, online, max_cache_age):
        return html_fn

    # makedirs also handles nested cache directories; tolerate the directory
    # appearing between the check and the call.
    try:
        os.makedirs(cache_dir)
    except OSError:
        if not os.path.isdir(cache_dir):
            raise

    _DownloadToFile(page_url, html_fn)
    return html_fn


def _CacheNeedsDownload(html_fn, online, max_cache_age):
    """Return True if html_fn is missing, or is stale while we are online."""
    try:
        mtime = os.stat(html_fn).st_mtime
    except OSError:
        logging.info("The file '%s' doesn't exist, downloading.", html_fn)
        return True

    file_age = time.time() - mtime
    if file_age <= max_cache_age:
        logging.info("The file '%s' is recent enough.", html_fn)
        return False
    if online:
        logging.info("The file '%s' is %d seconds old, redownloading.",
                     html_fn, file_age)
        return True
    logging.info("The file '%s' is %d seconds old " +
                 "but not online, so no redownload.",
                 html_fn, file_age)
    return False


def _DownloadToFile(page_url, dest_fn):
    """Fetch page_url with pycurl and move it to dest_fn only on success."""
    # Download next to the destination so that a failed transfer never
    # truncates or deletes an existing cached copy.
    tmp_fn = dest_fn + ".part"
    # os.replace (Python 3.3+) overwrites an existing destination on every
    # platform; older interpreters only offer os.rename.
    replace = getattr(os, "replace", os.rename)
    try:
        with open(tmp_fn, "wb") as fp:
            curl = pycurl.Curl()
            try:
                curl.setopt(pycurl.URL, page_url)
                curl.setopt(pycurl.FOLLOWLOCATION, 1)
                curl.setopt(pycurl.MAXREDIRS, 5)
                curl.setopt(pycurl.CONNECTTIMEOUT, 30)
                curl.setopt(pycurl.TIMEOUT, 300)
                curl.setopt(pycurl.NOSIGNAL, 1)
                curl.setopt(pycurl.WRITEDATA, fp)
                curl.perform()
            finally:
                # Release the handle whether or not the transfer succeeded.
                curl.close()
        replace(tmp_fn, dest_fn)
    except pycurl.error:
        # Only transfer errors are translated; anything else (for example
        # KeyboardInterrupt) propagates unchanged after cleanup.
        logging.exception("Download of '%s' failed.", page_url)
        raise IOError("Not able to download '%s'" % page_url)
    finally:
        if os.path.exists(tmp_fn):
            os.remove(tmp_fn)
    logging.info("Download of '%s' done.", page_url)