Permalink
Browse files

Move everything and make grab-site installable with pip3

  • Loading branch information...
ivan committed Jul 18, 2015
1 parent 1266cf6 commit 43d8a9594ff18d60c8806c2546e220a20200f3ce
View
@@ -1 +1 @@
-/__pycache__
+__pycache__
View
@@ -37,17 +37,18 @@ echo "global,$igsets" > "$dir/igsets"
touch "$dir/igoff"
touch "$dir/ignores"
+LIBGRABSITE="$(python3 -c 'import os, libgrabsite; print(os.path.dirname(libgrabsite.__file__))')"
+
# Note: we use the default html5lib parser instead of the lxml parser that ArchiveBot uses.
# html5lib is slower, but it parses better and does not corrupt the heap, which lxml (rarely) does.
-
-GRAB_SITE_WORKING_DIR="$dir" PYTHONPATH="$self" "$self/wpull" \
+GRAB_SITE_WORKING_DIR="$dir" "$self/patched-wpull" \
-U "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0" \
--header="Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8" \
--header="Accept-Language: en-US,en;q=0.5" \
-o "$dir/wpull.log" \
--database "$dir/wpull.db" \
- --plugin-script "$self/plugin.py" \
- --python-script "$self/wpull_hooks.py" \
+ --plugin-script "$LIBGRABSITE/plugin.py" \
+ --python-script "$LIBGRABSITE/wpull_hooks.py" \
--plugin-args " --dupes-db $dir/dupes_db" \
--save-cookies "$dir/cookies.txt" \
--no-check-certificate \
View
@@ -0,0 +1,4 @@
+#!/usr/bin/python3
+
+from libgrabsite import server  # the server code lives in the libgrabsite package
+server.main()  # thin launcher: hand control straight to the package's main()
View
@@ -0,0 +1 @@
+__version__ = '0.1.0'  # package version; setup.py passes libgrabsite.__version__ as its version= argument
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
@@ -6,8 +6,8 @@
from wpull.document.html import HTMLReader
import wpull.processor.rule
-import dupespotter
-from dupes import DupesInMemory, DupesOnDisk
+from libgrabsite import dupespotter
+from libgrabsite.dupes import DupesInMemory, DupesOnDisk
File renamed without changes.
@@ -7,7 +7,7 @@
import trollius as asyncio
from urllib.request import urlopen
from autobahn.asyncio.websocket import WebSocketClientFactory, WebSocketClientProtocol
-from ignoracle import Ignoracle, parameterize_record_info
+from libgrabsite.ignoracle import Ignoracle, parameterize_record_info
realStdoutWrite = sys.stdout.buffer.write
realStderrWrite = sys.stderr.buffer.write
View
File renamed without changes.
View
@@ -0,0 +1,28 @@
+#!/usr/bin/python3
+
+try:
+ from setuptools import setup
+except ImportError:
+ from distutils.core import setup  # fallback used only when setuptools cannot be imported
+
+import libgrabsite  # imported here only to read __version__ for the setup() call below
+
+setup(
+ name="grab-site",
+ version=libgrabsite.__version__,  # taken from the package so the version is not duplicated here
+ description="The archivist's web crawler: WARC output, dashboard for all crawls, dynamic ignore patterns",
+ url="https://github.com/ludios/grab-site",
+ author="Ivan Kozik",
+ author_email="ivan@ludios.org",
+ classifiers=[
+ "Programming Language :: Python :: 3",
+ "Development Status :: 3 - Alpha",
+ "Intended Audience :: End Users/Desktop",
+ "License :: OSI Approved :: MIT License",
+ "Topic :: Internet :: WWW/HTTP",
+ ],
+ scripts=["grab-site", "gs-server", "patched-wpull"],  # executable files copied into the installation's scripts directory
+ packages=["libgrabsite"],
+ package_data={"libgrabsite": ["*.html"]},  # also ship any .html files found inside the package directory
+ install_requires=["wpull", "manhole", "lmdb", "autobahn", "aiohttp", "trollius"],  # NOTE(review): a setuptools option; the distutils fallback presumably ignores it — confirm
+)

0 comments on commit 43d8a95

Please sign in to comment.