Browse files

Merge pull request #1 from irskep/random_improvements

Various improvements to docs and mrjob 0.4 compatibility
  • Loading branch information...
2 parents ce3e70e + 330fb47 commit bdbc5a65ca25d7ad7f0125e9733ef4e0b2288280 @marcelcaraciolo committed Aug 24, 2012
View
8 README
@@ -1 +1,7 @@
-python Documents/Projects/articles/mapreduce/moviesSimilarities.py Documents/Projects/articles/mapreduce/sampling.csv --python-archive Documents/Projects/articles/mapreduce/metrics.tar.gz > similarities.dat
+For more information, read the blog post:
+http://aimotion.blogspot.com/2012/08/introduction-to-recommendations-with.html
+
+# Python files that aren't in the MRJob file itself need to be uploaded
+# separately, so put them in a tarball and use --python-archive
+> tar -czvf data/metrics.tar.gz metrics.py
+> python moviesSimilarities.py data/sampling.csv --python-archive data/metrics.tar.gz
View
0 ratings.csv → data/ratings.csv
File renamed without changes.
View
0 ratings_books.csv → data/ratings_books.csv
File renamed without changes.
View
0 sample_songs.csv → data/sample_songs.csv
File renamed without changes.
View
0 sampling.csv → data/sampling.csv
File renamed without changes.
View
29 moviesSimilarities.py
@@ -15,7 +15,7 @@
from mrjob.job import MRJob
from metrics import correlation
-from metrics import jaccard, cosine, regularized_correlation
+from metrics import cosine, regularized_correlation
from math import sqrt
try:
@@ -28,14 +28,26 @@
PRIOR_CORRELATION = 0
+class SemicolonValueProtocol(object):
+
+ # don't need to implement read() since we aren't using it
+
+ def write(self, key, values):
+ return ';'.join(str(v) for v in values)
+
+
class MoviesSimilarities(MRJob):
+ OUTPUT_PROTOCOL = SemicolonValueProtocol
+
def steps(self):
- return [self.mr(self.group_by_user_rating,
- self.count_ratings_users_freq),
- self.mr(self.pairwise_items, self.calculate_similarity),
- self.mr(self.calculate_ranking, self.top_similar_items)
- ]
+ return [
+ self.mr(mapper=self.group_by_user_rating,
+ reducer=self.count_ratings_users_freq),
+ self.mr(mapper=self.pairwise_items,
+ reducer=self.calculate_similarity),
+ self.mr(mapper=self.calculate_ranking,
+ reducer=self.top_similar_items)]
def group_by_user_rating(self, key, line):
"""
@@ -168,8 +180,9 @@ def top_similar_items(self, key_sim, similar_ns):
'''
item_x, corr_sim, cos_sim, reg_corr_sim, jaccard_sim = key_sim
for item_y, n in similar_ns:
- print '%s;%s;%f;%f;%f;%f;%d' % (item_x, item_y, corr_sim, cos_sim,
- reg_corr_sim, jaccard_sim, n)
+ yield None, (item_x, item_y, corr_sim, cos_sim, reg_corr_sim,
+ jaccard_sim, n)
+
if __name__ == '__main__':
MoviesSimilarities.run()
View
27 vectorSimilarities.py
@@ -64,15 +64,26 @@ def combinations(iterable, r):
MIN_INTERSECTION = 0
+class SemicolonValueProtocol(object):
+
+ # don't need to implement read() since we aren't using it
+
+ def write(self, key, values):
+ return ';'.join(str(v) for v in values)
+
+
class VectorSimilarities(MRJob):
+ OUTPUT_PROTOCOL = SemicolonValueProtocol
+
def steps(self):
- return [self.mr(self.input,
- self.group_by_user_rating),
- self.mr(None, self.count_ratings_users_freq),
- self.mr(self.pairwise_items, self.calculate_similarity),
- self.mr(self.calculate_ranking, self.top_similar_items)
- ]
+ return [
+ self.mr(mapper=self.input, reducer=self.group_by_user_rating),
+ self.mr(reducer=self.count_ratings_users_freq),
+ self.mr(mapper=self.pairwise_items,
+ reducer=self.calculate_similarity),
+ self.mr(mapper=self.calculate_ranking,
+ reducer=self.top_similar_items)]
def configure_options(self):
super(VectorSimilarities, self).configure_options()
@@ -243,8 +254,8 @@ def top_similar_items(self, key_sim, similar_ns):
'''
item_x, corr_sim, cos_sim, reg_corr_sim, jaccard_sim = key_sim
for item_y, n in similar_ns:
- yield '%s;%s;%f;%f;%f;%f;%d' % (item_x, item_y, corr_sim, cos_sim,
- reg_corr_sim, jaccard_sim, n), None
+ yield None, (item_x, item_y, corr_sim, cos_sim, reg_corr_sim,
+ jaccard_sim, n)
def correlation(size, dot_product, rating_sum, \

0 comments on commit bdbc5a6

Please sign in to comment.