Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
data.py downloads pre-parsed data from s3
Parsing the records provided by Google is slow and unnecessary. These records will not change: they are the basis for the performance evaluations, so they will be provided as static files on S3.
- Loading branch information
1 parent
5ea57bd
commit 2eb84b3
Showing
3 changed files
with
95 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,8 @@ build/ | |
*.zip | ||
*.xml | ||
*.old | ||
*.gz | ||
*.tgz | ||
|
||
MANIFEST | ||
!README.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
# -*- coding: UTF-8 -*- | ||
|
||
import os.path | ||
import logging | ||
import argparse | ||
import sys | ||
# import socket | ||
|
||
# import urlparse | ||
# import httplib | ||
import urllib2 | ||
# import tempfile | ||
import gzip | ||
|
||
# import requests | ||
|
||
|
||
__version__ = "0.0.1"

logger = logging.getLogger(__name__)

# Template for the gzipped data files on S3; the single %02i slot takes the
# file number (see urls(), which numbers files starting at 1).
URL = "https://s3-us-west-1.amazonaws.com/esbench/assn_%02i.json.gz"
|
||
def urls(count=1):
    """Generate the URLs of the first `count` data files.

    File numbers substituted into the URL template start at 1, so
    ``count=2`` yields the URLs for files 01 and 02. A `count` of zero or
    less yields nothing.
    """
    for n in range(count):
        yield URL % (n + 1)
|
||
|
||
def download(url):
    """Download `url` into the current working directory; return the path.

    The local file is named after the last path component of `url`. If a
    file with that name already exists it is treated as a cached copy and
    returned without opening the URL.

    The payload is streamed to a temporary ``.part`` file which is renamed
    to the final name only after the transfer completes, so an interrupted
    download is never mistaken for a complete cached copy on a later run.

    Returns:
        Absolute path of the downloaded (or already cached) file.

    Raises:
        Whatever ``urllib2.urlopen`` or the file operations raise; the
        response is closed and any partial file removed before propagating.
    """
    fn = os.path.abspath(os.path.basename(url))
    if os.path.exists(fn):
        return fn

    partial = fn + ".part"
    resp = urllib2.urlopen(url)
    try:
        # Binary mode: the payload is gzip data, not text.
        with open(partial, 'wb') as f:
            chunk = resp.read(2**16)
            while chunk:
                f.write(chunk)
                sys.stderr.write(".")  # one progress dot per chunk
                chunk = resp.read(2**16)
        os.rename(partial, fn)
    finally:
        resp.close()
        # Only present here if the transfer or rename failed.
        if os.path.exists(partial):
            os.remove(partial)

    return fn
|
||
|
||
def unzip(fn):
    """Yield each line of the gzip file `fn`, stripped of surrounding whitespace.

    The file is opened in binary mode and read lazily, one line at a time.
    """
    with gzip.open(fn, 'rb') as f:
        for line in f:
            yield line.strip()
|
||
|
||
def feed(nocache=False, count=1):
    """Yield the stripped lines of the first `count` data files, in order.

    Each file is obtained with download() (which reuses a local copy when
    one exists) and read with unzip().

    Args:
        nocache: if true, delete each local file once it has been consumed
            or iteration over it is abandoned.
        count: number of data files to process; forwarded to urls().
    """
    for url in urls(count):
        fn = download(url)
        try:
            for line in unzip(fn):
                yield line
        finally:
            # Also runs when the consumer stops early and the generator is
            # closed, so the nocache promise holds for partial reads.
            if nocache:
                os.remove(fn)
|
||
|
||
def args_parser():
    """Build the command-line parser for the downloader script.

    Options:
        -v / --version: print the module version and exit.
        --nocache: boolean flag, false unless given.
    """
    parser = argparse.ArgumentParser(description="esbench USPTO patent assignment downloader.")
    parser.add_argument('-v', '--version', action='version', version=__version__)
    parser.add_argument('--nocache', action='store_true', help="if set, delete downloaded data (default: %(default)s)")
    return parser
|
||
|
||
def main():
    """Script entry point: print every line of the data feed to stdout.

    Exits with status 0 on success or when the reader of stdout closes the
    pipe early (e.g. ``| head``). Any other I/O error is logged and the
    process exits with status 1 instead of failing silently.
    """
    import errno

    logging.basicConfig(level=logging.WARNING)
    args = args_parser().parse_args()

    try:
        for line in feed(nocache=args.nocache):
            print(line)
    except IOError as exc:
        # A closed downstream pipe is a normal way for this script to end;
        # everything else (failed download, unreadable file, ...) is not.
        if getattr(exc, 'errno', None) != errno.EPIPE:
            logger.error("failed to produce data feed: %s", exc)
            sys.exit(1)

    sys.exit(0)
|
||
|
||
# Run the downloader only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters