Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP

Loading…

Added gzip support to upload.py #1

Open
wants to merge 2 commits into from

1 participant

@tapichu

Hi,

I just modified the upload.py script a bit so it can handle both compressed and uncompressed files.

I guess it would've been easier to just modify the README and remove the .gz extension from the upload.py example, but where's the fun in that?

Anyway, thanks for the MapReduce post.

Cheers

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.
Commits on Jan 15, 2011
  1. @tapichu

    Removed unused import

    tapichu authored
  2. @tapichu
This page is out of date. Refresh to see the latest.
Showing with 35 additions and 34 deletions.
  1. +35 −34 aviation/upload.py
View
69 aviation/upload.py
@@ -1,52 +1,53 @@
-from couchdbkit import Server, Database
+from couchdbkit import Server
from couchdbkit.loaders import FileSystemDocsLoader
from csv import DictReader
-import time, sys
+import time, sys, gzip
+
def upload(db, reader, checkpoint = 1000):
    """Bulk-save documents from *reader* into *db* in fixed-size batches.

    db         -- object exposing bulk_save(list_of_docs) (e.g. a couchdbkit Database)
    reader     -- iterable yielding documents (e.g. csv.DictReader rows)
    checkpoint -- batch size; a bulk_save is issued every `checkpoint` docs

    Prints progress per batch and a summary line with the upload rate.
    """
    docs = []
    ndocs = 0
    start = time.time()

    for doc in reader:
        ndocs += 1
        docs.append(doc)
        # docs is reset after every flush, so len(docs) never exceeds checkpoint
        if len(docs) == checkpoint:
            print('upload:\t%i' % ndocs)
            db.bulk_save(docs)
            docs = []

    # don't forget the last (possibly partial) batch; skip the call entirely
    # when there is nothing left (original issued a pointless bulk_save([]))
    if docs:
        db.bulk_save(docs)

    # print summary statistics
    delta = time.time() - start
    # BUG FIX: rate was computed from `checkpoint` (the batch size), not from
    # the number of docs actually uploaded; also guard against delta == 0 on
    # low-resolution clocks.
    rate = float(ndocs) / delta if delta > 0 else 0.0
    print('uploaded: %i docs in: %i seconds for a rate: %f docs/sec' % (ndocs, delta, rate))
if __name__ == '__main__':

    # Usage: upload.py <file[.gz]> <server-uri> <dbname>
    fname = sys.argv[1]
    uri = sys.argv[2]
    dbname = sys.argv[3]

    print('Upload contents of %s to %s/%s' % (fname, uri, dbname))

    # connect to the db
    cloudant = Server(uri)
    db = cloudant.get_or_create_db(dbname)
    print(db.info())

    # sync the views for prebuilt indices
    loader = FileSystemDocsLoader('_design/')
    loader.sync(db, verbose=True)

    # loop on file for upload.
    # BUG FIX: gzip.open() succeeds on a plain-text file and only raises
    # IOError lazily, on the first read -- which happens inside upload().
    # By then some batches may already have been bulk-saved, and the old
    # `except IOError` fallback would re-upload them. Detect gzip up front
    # by its magic bytes instead, and close the handle when done.
    with open(fname, 'rb') as probe:
        is_gzip = probe.read(2) == b'\x1f\x8b'

    fobj = gzip.open(fname, 'rb') if is_gzip else open(fname)
    try:
        reader = DictReader(fobj, delimiter='|')
        upload(db, reader)
    finally:
        fobj.close()
Something went wrong with that request. Please try again.