Skip to content

Commit

Permalink
data.py downloads pre-parsed data from s3
Browse files Browse the repository at this point in the history
Parsing the records provided by Google is slow and unnecessary.
These records will not change (they are the basis for the
performance evaluations), so they will be provided as static files on S3.
  • Loading branch information
mkocikowski committed Sep 2, 2013
1 parent 5ea57bd commit 2eb84b3
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .gitignore
Expand Up @@ -14,6 +14,8 @@ build/
*.zip
*.xml
*.old
*.gz
*.tgz

MANIFEST
!README.txt
92 changes: 92 additions & 0 deletions esbench/data.py
@@ -0,0 +1,92 @@
# -*- coding: UTF-8 -*-

import os.path
import logging
import argparse
import sys
# import socket

# import urlparse
# import httplib
import urllib2
# import tempfile
import gzip

# import requests


# Version string reported by the -v/--version command-line flag.
__version__ = "0.0.1"

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Template for the data file locations; the placeholder is the file's
# two-digit, 1-based sequence number.
URL = "https://s3-us-west-1.amazonaws.com/esbench/assn_%02i.json.gz"


def urls(count=1):
    """Yield the URLs of the first `count` data files, in order.

    Files are numbered starting at 1, so ``count=3`` produces the URLs for
    files 01, 02 and 03. A `count` of zero (or less) yields nothing.
    """
    for sequence_number in range(1, count + 1):
        yield URL % sequence_number


def download(url):
    """Fetch `url` into the current working directory and return the local path.

    The local file is named after the last path component of `url`. If a
    file with that name already exists it is treated as a cached copy and
    returned without touching the network.

    The payload is first written to a sibling ``<name>.part`` file and only
    renamed to its final name once the transfer has completed, so an
    interrupted download is never mistaken for a valid cached file on the
    next run.

    Args:
        url: location of the file to fetch.

    Returns:
        Absolute path of the local copy.

    Raises:
        Whatever ``urllib2.urlopen`` or the file operations raise; the
        partial file is removed before the exception propagates.
    """
    fn = os.path.abspath(os.path.basename(url))
    if os.path.exists(fn):
        return fn

    partial = fn + ".part"
    resp = urllib2.urlopen(url)
    try:
        # Binary mode: the payload is gzip data and must not go through
        # newline translation or text encoding.
        with open(partial, 'wb') as f:
            chunk = resp.read(2**16)
            while chunk:
                f.write(chunk)
                chunk = resp.read(2**16)
                # One dot per 64 KiB chunk as a crude progress indicator.
                sys.stderr.write(".")
        os.rename(partial, fn)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave a truncated file behind.
        if os.path.exists(partial):
            os.remove(partial)
        raise
    finally:
        resp.close()

    return fn


def unzip(fn):
    """Yield each line of the gzip-compressed file `fn`, stripped of
    leading and trailing whitespace.

    The file is opened lazily (on first iteration) and closed once the
    generator is exhausted or discarded.
    """
    with gzip.open(fn, 'rb') as archive:
        for raw_line in archive:
            stripped = raw_line.strip()
            yield stripped


def feed(nocache=False):
    """Yield the stripped lines of every data file, downloading as needed.

    Args:
        nocache: when true, each downloaded file is deleted as soon as its
            lines have been consumed (or the generator is abandoned or
            fails part-way through); otherwise the file is left on disk.
    """
    for source in urls():
        local_path = download(source)
        try:
            records = unzip(local_path)
            for record in records:
                yield record
        finally:
            # Runs on normal completion, on error, and on generator close.
            if nocache:
                os.remove(local_path)


def args_parser():
    """Build and return the command-line argument parser.

    Recognised options: ``-v/--version`` prints the module version and
    exits; ``--nocache`` is a boolean flag, false unless given.
    """
    cli = argparse.ArgumentParser(
        description="esbench USPTO patent assignment downloader.",
    )
    cli.add_argument('-v', '--version', action='version', version=__version__)
    cli.add_argument(
        '--nocache',
        action='store_true',
        help="if set, delete downloaded data (default: %(default)s)",
    )
    return cli


def main():
    """Command-line entry point: print every record of the feed to stdout.

    Exits with status 0 once all records have been printed. If the consumer
    of stdout closes the pipe early (e.g. ``... | head``) the function
    returns quietly. Any other I/O failure (download error, unreadable
    archive, ...) is logged and the process exits with status 1 instead of
    being silently ignored.
    """
    # Local import: errno is only needed for the broken-pipe check below.
    import errno

    logging.basicConfig(level=logging.WARNING)
    args = args_parser().parse_args()

    try:
        for line in feed(nocache=args.nocache):
            print(line)
    except IOError as exc:
        if exc.errno == errno.EPIPE:
            # Downstream reader went away; nothing more to do, not an error.
            return
        logger.error("failed to retrieve data: %s", exc)
        sys.exit(1)

    sys.exit(0)


if __name__ == "__main__":
    main()

2 changes: 1 addition & 1 deletion esbench/uspto/assn/parse.py
Expand Up @@ -239,4 +239,4 @@ def main():
if __name__ == "__main__":
main()

# ls -1 /Volumes/MK/pat/frontside/xml/*.xml | xargs -n 1 python parse.py 2> frontside.log
# ls -1 /Volumes/MK/uspto/frontside/xml/*.xml | xargs -n 1 python parse.py 2> /Volumes/MK/uspto/frontside/frontside.log

0 comments on commit 2eb84b3

Please sign in to comment.