Permalink
Browse files

Added refreshed movieSimilarities.

  • Loading branch information...
1 parent 1f13055 commit 41feff4a4264233218006da5fb09af65d345c88f @marcelcaraciolo committed Aug 17, 2012
Showing with 136 additions and 11 deletions.
  1. +1 −0 README
  2. +40 −1 metrics.py
  3. +95 −10 moviesSimilarities.py
View
1 README
@@ -0,0 +1 @@
+python Documents/Projects/articles/mapreduce/moviesSimilarities.py Documents/Projects/articles/mapreduce/sampling.csv --python-archive Documents/Projects/articles/mapreduce/metrics.tar.gz > similarities.dat
View
41 metrics.py
@@ -15,7 +15,8 @@ def correlation(size, dot_product, rating_sum, \
rating2sum, rating_norm_squared, rating2_norm_squared):
'''
The correlation between two vectors A, B is
- cov(A, B) / (stdDev(A) * stdDev(B))
+ [n * dotProduct(A, B) - sum(A) * sum(B)] /
+ sqrt{ [n * norm(A)^2 - sum(A)^2] [n * norm(B)^2 - sum(B)^2] }
'''
numerator = size * dot_product - rating_sum * rating2sum
@@ -25,6 +26,16 @@ def correlation(size, dot_product, rating_sum, \
return (numerator / (float(denominator))) if denominator else 0.0
+def jaccard(users_in_common, total_users1, total_users2):
+ '''
+ The Jaccard similarity between two vectors A, B:
+ |Intersection(A, B)| / |Union(A, B)|
+
+ users_in_common -- size of the intersection of A and B
+ total_users1, total_users2 -- sizes of A and of B respectively
+ Returns 0.0 when the union is empty.
+ '''
+ union = total_users1 + total_users2 - users_in_common
+
+ return (users_in_common / (float(union))) if union else 0.0
+
+
def normalized_correlation(size, dot_product, rating_sum, \
rating2sum, rating_norm_squared, rating2_norm_squared):
'''
@@ -39,6 +50,34 @@ def normalized_correlation(size, dot_product, rating_sum, \
return (similarity + 1.0) / 2.0
+def cosine(dot_product, rating_norm_squared, rating2_norm_squared):
+ '''
+ The cosine between two vectors A, B
+ dotProduct(A, B) / (norm(A) * norm(B))
+
+ NOTE(review): despite the *_norm_squared parameter names, the caller
+ (calculate_similarity) passes sqrt(sum_xx) and sqrt(sum_yy), i.e. the
+ norms themselves, so the product below is norm(A) * norm(B) as the
+ formula requires. The names look misleading -- confirm before renaming.
+ Returns 0.0 when either norm is zero (denominator == 0).
+ '''
+ numerator = dot_product
+ denominator = rating_norm_squared * rating2_norm_squared
+
+ return (numerator / (float(denominator))) if denominator else 0.0
+
+
+def regularized_correlation(size, dot_product, rating_sum, \
+ rating2sum, rating_norm_squared, rating2_norm_squared,
+ virtual_cont, prior_correlation):
+ '''
+ The Regularized Correlation between two vectors A, B
+
+ RegularizedCorrelation = w * ActualCorrelation + (1 - w) * PriorCorrelation
+ where w = # actualPairs / (# actualPairs + # virtualPairs).
+
+ size -- # actualPairs: the number of observed co-ratings (n)
+ virtual_cont -- # virtualPairs: a pseudo-count that shrinks the
+ estimate toward prior_correlation when size is small (the caller
+ passes PRIOR_COUNT = 10 with PRIOR_CORRELATION = 0)
+ The remaining arguments are forwarded unchanged to correlation().
+ '''
+ unregularizedCorrelation = correlation(size, dot_product, rating_sum, \
+ rating2sum, rating_norm_squared, rating2_norm_squared)
+
+ w = size / float(size + virtual_cont)
+
+ return w * unregularizedCorrelation + (1.0 - w) * prior_correlation
+
+
def combinations(iterable, r):
"""
Implementation of itertools combinations method. Re-implemented here because
View
105 moviesSimilarities.py
@@ -14,13 +14,20 @@
__author__ = 'Marcel Caraciolo <caraciol@gmail.com>'
from mrjob.job import MRJob
-from metrics import correlation, normalized_correlation
+from metrics import correlation
+from metrics import jaccard, cosine, regularized_correlation
+from math import sqrt
+
try:
from itertools import combinations
except ImportError:
from metrics import combinations
+PRIOR_COUNT = 10
+PRIOR_CORRELATION = 0
+
+
class MoviesSimilarities(MRJob):
def steps(self):
@@ -32,8 +39,17 @@ def steps(self):
def group_by_user_rating(self, key, line):
"""
- Mapper: send score from a single movie to
- other movies
+ Emit the user_id and group by their ratings (item and rating)
+
+ 17 70,3
+ 35 21,1
+ 49 19,2
+ 49 21,1
+ 49 70,4
+ 87 19,1
+ 87 21,2
+ 98 19,2
+
"""
user_id, item_id, rating = line.split('|')
#yield (item_id, int(rating)), user_id
@@ -42,6 +58,17 @@ def group_by_user_rating(self, key, line):
#yield (user_id, item_id), int(rating)
def count_ratings_users_freq(self, user_id, values):
+ """
+ For each user, emit a row containing their "postings"
+ (item,rating pairs)
+ Also emit user rating sum and count for use in later steps.
+
+ 17 1,3,(70,3)
+ 35 1,1,(21,1)
+ 49 3,7,(19,2 21,1 70,4)
+ 87 2,3,(19,1 21,2)
+ 98 1,2,(19,2)
+ """
item_count = 0
item_sum = 0
final = []
@@ -53,6 +80,21 @@ def count_ratings_users_freq(self, user_id, values):
yield user_id, (item_count, item_sum, final)
def pairwise_items(self, user_id, values):
+ '''
+ The output drops the user from the key entirely, instead it emits
+ the pair of items as the key:
+
+ 19,21 2,1
+ 19,70 2,4
+ 21,70 1,4
+ 19,21 1,2
+
+ This mapper is the main performance bottleneck. One improvement
+ would be to create a java Combiner to aggregate the
+ outputs by key before writing to hdfs, another would be to use
+ a vector format and SequenceFiles instead of streaming text
+ for the matrix data.
+ '''
item_count, item_sum, ratings = values
#print item_count, item_sum, [r for r in combinations(ratings, 2)]
#bottleneck at combinations
@@ -61,6 +103,19 @@ def pairwise_items(self, user_id, values):
(item1[1], item2[1])
def calculate_similarity(self, pair_key, lines):
+ '''
+ Sum components of each corating pair across all users who rated both
+ item x and item y, then calculate pairwise pearson similarity and
+ corating counts. The similarities are normalized to the [0,1] scale
+ because we do a numerical sort.
+
+ 19,21 0.4,2
+ 21,19 0.4,2
+ 19,70 0.6,1
+ 70,19 0.6,1
+ 21,70 0.1,1
+ 70,21 0.1,1
+ '''
sum_xx, sum_xy, sum_yy, sum_x, sum_y, n = (0.0, 0.0, 0.0, 0.0, 0.0, 0)
item_pair, co_ratings = pair_key, lines
item_xname, item_yname = item_pair
@@ -71,20 +126,50 @@ def calculate_similarity(self, pair_key, lines):
sum_y += item_y
sum_x += item_x
n += 1
- similarity = normalized_correlation(n, sum_xy, sum_x, sum_y, \
- sum_xx, sum_yy)
- yield (item_xname, item_yname), (similarity, n)
+
+ corr_sim = correlation(n, sum_xy, sum_x, \
+ sum_y, sum_xx, sum_yy)
+
+ reg_corr_sim = regularized_correlation(n, sum_xy, sum_x, \
+ sum_y, sum_xx, sum_yy, PRIOR_COUNT, PRIOR_CORRELATION)
+
+ cos_sim = cosine(sum_xy, sqrt(sum_xx), sqrt(sum_yy))
+
+ jaccard_sim = 0.0
+
+ yield (item_xname, item_yname), (corr_sim, \
+ cos_sim, reg_corr_sim, jaccard_sim, n)
def calculate_ranking(self, item_keys, values):
- similarity, n = values
+ '''
+ Emit items with similarity in key for ranking:
+
+ 19,0.4 70,1
+ 19,0.6 21,2
+ 21,0.6 19,2
+ 21,0.9 70,1
+ 70,0.4 19,1
+ 70,0.9 21,1
+
+ '''
+ corr_sim, cos_sim, reg_corr_sim, jaccard_sim, n = values
item_x, item_y = item_keys
if int(n) > 0:
- yield (item_x, similarity), (item_y, n)
+ yield (item_x, corr_sim, cos_sim, reg_corr_sim, jaccard_sim), \
+ (item_y, n)
def top_similar_items(self, key_sim, similar_ns):
- item_x, similarity = key_sim
+ '''
+ For each item emit the K closest items as semicolon-separated lines:
+
+ De La Soul;A Tribe Called Quest;0.6;1
+ De La Soul;2Pac;0.4;2
+
+ '''
+ item_x, corr_sim, cos_sim, reg_corr_sim, jaccard_sim = key_sim
for item_y, n in similar_ns:
- print '%s;%s;%f;%d' % (item_x, item_y, similarity, n)
+ print '%s;%s;%f;%f;%f;%f;%d' % (item_x, item_y, corr_sim, cos_sim,
+ reg_corr_sim, jaccard_sim, n)
if __name__ == '__main__':
MoviesSimilarities.run()

0 comments on commit 41feff4

Please sign in to comment.