Permalink
Browse files

First Commit

  • Loading branch information...
mlmiller committed Jan 8, 2011
1 parent bb03dc3 commit eb7e1cb0d0ede7706846a1996699069b340ba169
Binary file not shown.
View
@@ -0,0 +1,20 @@
+This is a simple example of using Cloudant's mapreduce for textual and numerical data analysis.
+
+* Requirements
+ python (2.5+?)
+ couchdbkit: http://couchdbkit.org/download.html (=> sudo easy_install -U couchdbkit)
+
+* Input data:
+ AviationData.txt.gz taken from data.gov
+
+* _design/:
+ a directory of map.js and reduce.js files that get mapped to a design document and uploaded
+
+* upload.py
+ a python script to create the database, upload the data in AviationData.txt.gz, and build the mapreduce views
+ execute via:
+ > python upload.py AviationData.txt.gz 'http://<username>:<password>@<username>.cloudant.com' planes
+
+* analyze.sh
+ a bash script to demonstrate example queries of our prebuilt mapreduce views for data analysis
+
@@ -0,0 +1,9 @@
+function(doc) {
+ var then = new Date(Date.parse(doc['Event Date']));
+ var fatalities = 0;
+ if (doc['Total Fatal Injuries']!="")
+ {
+ fatalities = parseInt(doc['Total Fatal Injuries']);
+ }
+ emit([then.getFullYear(), then.getMonth()], [1, fatalities]);
+}
@@ -0,0 +1 @@
+_sum
@@ -0,0 +1,4 @@
+function(doc) {
+ var then = new Date(Date.parse(doc['Event Date']));
+ emit(doc.Make, 1);
+}
@@ -0,0 +1 @@
+_sum
View
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
View
@@ -0,0 +1,60 @@
+from couchdbkit import Server, Database
+from couchdbkit.loaders import FileSystemDocsLoader
+import ROOT
+import sys
+
+if __name__=='__main__':
+
+ uri = sys.argv[1]
+ dbname = sys.argv[2]
+
+ print 'Analyze contents of %s/%s' % (uri, dbname)
+
+ # #connect to the db
+ cloudant = Server(uri)
+ db = cloudant.get_or_create_db(dbname)
+ print db.info()
+
+ #let's plot the number of accidents and fatalities vs time
+ nt = ROOT.TNtuple('nt','Aviation Accident Ntuple','year:month:accidents:fatalities')
+
+
+ for row in db.view('example/date',group=True,stale='ok'):
+ year = row['key'][0]
+ month = row['key'][1]
+ accidents = row['value'][0]
+ fatalities = row['value'][1]
+ nt.Fill(year,month,accidents,fatalities)
+ #
+
+ #and plot
+ c = ROOT.TCanvas('c')
+ c.SetFillColor(0)
+ ROOT.gStyle.SetOptStat(0)
+
+ nt.SetMarkerStyle(20)
+ nt.SetMarkerColor(1)
+ nt.SetLineColor(1)
+ nt.Draw('year>>+hacc(29,1981.5, 2010.5)','accidents','lp')
+ hacc = ROOT.gROOT.Get('hacc')
+
+ nt.SetMarkerStyle(24)
+ nt.SetMarkerColor(2)
+ nt.SetLineColor(2)
+ nt.Draw('year>>+hfat(29,1981.5,2010.5)','fatalities','lpsame')
+ hfat = ROOT.gROOT.Get('hfat')
+
+ hacc.SetTitle('FAA Data by Year')
+ hacc.SetXTitle('Year')
+
+ #now find the year with the most accidents and inspect those 5 individually. This is a job for search
+
+ leg = ROOT.TLegend(0.5,0.65, 0.85, 0.8)
+ leg.SetFillColor(0)
+ leg.AddEntry(hacc,'Accidents','lp')
+ leg.AddEntry(hfat,'Fatalities','lp')
+ leg.Draw('same')
+
+
+ c.SaveAs('accidents.png')
+
View
@@ -0,0 +1,52 @@
+from couchdbkit import Server, Database
+from couchdbkit.loaders import FileSystemDocsLoader
+from csv import DictReader
+import time, sys
+
+if __name__=='__main__':
+
+
+ fname = sys.argv[1]
+ uri = sys.argv[2]
+ dbname = sys.argv[3]
+
+ print 'Upload contents of %s to %s/%s' % (fname, uri, dbname)
+
+ # #connect to the db
+ cloudant = Server(uri)
+ db = cloudant.get_or_create_db(dbname)
+ print db.info()
+
+ #sync the views for prebuilt indices
+ loader = FileSystemDocsLoader('_design/')
+ loader.sync(db, verbose=True)
+
+
+ #loop on file for upload
+ reader = DictReader(open(fname),delimiter='|')
+
+ docs = list()
+ checkpoint = 1000
+ n=0
+ start = time.time()
+
+ for doc in reader:
+ n+=1
+ docs.append(doc)
+ # print doc
+ if (len(docs)%checkpoint==0):
+ print 'upload:\t%i' % n
+ db.bulk_save(docs)
+ del docs
+ docs = list()
+
+ #don't forget the last batch
+ db.bulk_save(docs)
+
+ #print summary statistics
+
+ delta = time.time() - start
+ rate = float(checkpoint)/float(delta)
+ ndocs = n
+ print 'uploaded: %i docs in: %i seconds for a rate: %f docs/sec' % (ndocs, delta,rate)
+

0 comments on commit eb7e1cb

Please sign in to comment.