Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Added new files to Similarities

  • Loading branch information...
commit 1bb15fea06590f209707d7be644ca6b8526f8f56 1 parent d9fa499
@marcelcaraciolo authored
Showing with 201,769 additions and 0 deletions.
  1. +35 −0 moviesSimilarities.py
  2. +28 −0 movies_count.py
  3. +24 −0 normalize.py
  4. +100,000 −0 ratings.csv
  5. +100,000 −0 u.data
  6. +1,682 −0 u.item
View
35 moviesSimilarities.py
@@ -0,0 +1,35 @@
+#-*-coding: utf-8 -*-
+
+'''
+ Given a dataset of movies and their ratings by different
+ users, how can we compute the similarity between pairs of
+ movies?
+
+ This module computes similarities between movies
+ by representing each movie as a vector of ratings and
+ computing similarity scores over these vectors.
+
+
+'''
+
+__author__ = 'Marcel Caraciolo <caraciol@gmail.com>'
+
+from mrjob.job import MRJob
+
+
class MoviesSimilarities(MRJob):
    """MapReduce job skeleton for computing similarities between movies.

    Only the record-parsing mapper is functional; the reducer is still a
    placeholder.
    """

    def mapper(self, key, line):
        """Parse one rating record and emit it keyed by user.

        Args:
            key: Ignored.
            line: One text record of the form ``user_id|item_id|rating``.

        Yields:
            ``(user_id, (item_id, rating))`` with all parts kept as the
            strings read from the record.

        Raises:
            ValueError: If the record does not have exactly three fields.
        """
        # The record is split once and unpacked as a whole. Looping over the
        # split fields (as before) tried to unpack each field string into
        # three names. '|' is the delimiter normalize.py writes and
        # movies_count.py reads.
        user_id, item_id, rating = line.split('|')
        # Emit through the job instead of printing to stdout.
        yield user_id, (item_id, rating)

    def reducer(self, _, values):
        """Placeholder; the similarity computation is not implemented yet."""
        pass


if __name__ == '__main__':
    MoviesSimilarities.run()
View
28 movies_count.py
@@ -0,0 +1,28 @@
+#-*-coding: utf-8 -*-
+
+'''
+This module computes the number of ratings received by each
+movie.
+
+'''
+
+__author__ = 'Marcel Caraciolo <caraciol@gmail.com>'
+
+from mrjob.job import MRJob
+
+
class MoviesCount(MRJob):
    """MapReduce job that tallies how many rating records mention each item."""

    def mapper(self, key, line):
        """Emit ``(item_id, 1)`` for one ``user_id|item_id|rating`` record.

        The incoming ``key`` is ignored. A record that does not split into
        exactly three '|'-separated fields raises ValueError.
        """
        fields = line.split('|')
        # Unpacking enforces the three-field shape; only the item is used.
        _user, movie, _score = fields
        yield movie, 1

    def reducer(self, movie, values):
        """Emit the item together with the sum of its per-record counts."""
        total = sum(values)
        yield movie, total


if __name__ == '__main__':
    MoviesCount.run()
View
24 normalize.py
@@ -0,0 +1,24 @@
+
def items(input):
    """Build a lookup table from a '|'-delimited item file.

    Args:
        input: Path of the item file. Each non-blank line must contain at
            least two '|'-separated fields, the first being an integer id.

    Returns:
        A dict mapping the integer id (first field) to the second field
        (presumably the movie title -- confirm against the item file).

    Raises:
        ValueError: If the first field of a line is not an integer.
        IndexError: If a non-blank line has no '|' separator.
    """
    mapping = {}
    # The context manager closes the file even if a line fails to parse.
    with open(input) as fh:
        for line in fh:
            # Skip blank lines (e.g. a trailing newline at end of file)
            # instead of crashing on int('').
            if not line.strip():
                continue
            fields = line.split('|')
            mapping[int(fields[0])] = fields[1]
    return mapping
+
+
def normalize(input, item_input, output):
    """Rewrite tab-separated ratings as '|'-separated lines with item labels.

    Every line of ``input`` must hold exactly four tab-separated fields
    (user id, item id, rating, timestamp). For each one, a line
    ``user_id|<label>|rating`` is written to ``output``, where ``<label>``
    is the value that ``items(item_input)`` maps the integer item id to.
    The timestamp is dropped.

    Args:
        input: Path of the tab-separated ratings file.
        item_input: Path of the item file passed to ``items``.
        output: Path of the file to create or overwrite.

    Raises:
        ValueError: If a line does not have exactly four fields or the
            item id is not an integer.
        KeyError: If an item id is missing from the item table.
    """
    # Load the lookup table first so a bad item file does not truncate an
    # existing output file.
    info_items = items(item_input)
    # Both handles are closed even when a record fails to parse.
    with open(input) as fh, open(output, 'w') as fo:
        for line in fh:
            user_id, item_id, rating, _timestamp = line.split('\t')
            fo.write('|'.join([user_id, info_items[int(item_id)], rating]) + '\n')


# Script-level call: runs whenever this module is executed or imported.
normalize('u.data', 'u.item', 'ratings.csv')
View
100,000 ratings.csv
100,000 additions, 0 deletions not shown
View
100,000 u.data
100,000 additions, 0 deletions not shown
View
1,682 u.item
1,682 additions, 0 deletions not shown
Please sign in to comment.
Something went wrong with that request. Please try again.