Permalink
Browse files

Merge pull request #42 from mpobrien/master

Sample mapreduce job - enron email corpus
  • Loading branch information...
2 parents fb9b345 + bc20c01 commit 66c99484c5dfaefa338e9657f32257b226666694 @bwmcadams bwmcadams committed Apr 9, 2012
View
20 streaming/examples/enron/enron_map.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python
+
+import sys
+sys.path.append(".")
+
+from pymongo_hadoop import BSONMapper
+
+def mapper(documents):
+ i = 0
+ for doc in documents:
+ i = i + 1
+ if 'headers' in doc and 'To' in doc['headers'] and 'From' in doc['headers']:
+ from_field = doc['headers']['From']
+ to_field = doc['headers']['To']
+ recips = [x.strip() for x in to_field.split(',')]
+ for r in recips:
+ yield {'_id': {'f':from_field, 't':r}, 'count': 1}
+
+BSONMapper(mapper)
+print >> sys.stderr, "Done Mapping."
View
15 streaming/examples/enron/enron_reduce.py
@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+
+import sys
+sys.path.append(".")
+
+from pymongo_hadoop import BSONReducer
+
+def reducer(key, values):
+ print >> sys.stderr, "Processing from/to %s" % str(key)
+ _count = 0
+ for v in values:
+ _count += v['count']
+ return {'_id': key, 'count': _count}
+
+BSONReducer(reducer)
View
1 streaming/examples/enron/run_enron.sh
@@ -0,0 +1 @@
+hadoop jar target/mongo-hadoop-streaming-assembly*.jar -mapper examples/enron/enron_map.py -reducer examples/enron/enron_reduce.py -inputURI mongodb://127.0.0.1/enron_mail.messages -outputURI mongodb://127.0.0.1/enron_mail.output -file examples/enron/enron_map.py -file examples/enron/enron_reduce.py

0 comments on commit 66c9948

Please sign in to comment.