In [28]:
# Sample data set: each movie critic maps to the ratings that critic
# gave to a handful of films.
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    },
}

In [36]:
from math import sqrt


# Euclidean-distance similarity between two entries of prefs
def sim_distance(prefs, person1, person2):
    """Score how close person1's and person2's ratings are.

    Only items present in both prefs[person1] and prefs[person2] count.
    Returns 0 when there is no such item; otherwise 1 / (1 + d), where d
    is the Euclidean distance between the two sets of shared ratings.
    """
    # Shared items, visited in the iteration order of person1's ratings
    shared = [item for item in prefs[person1] if item in prefs[person2]]
    if not shared:
        return 0

    squared_gap = sum(
        (prefs[person1][item] - prefs[person2][item]) ** 2 for item in shared
    )
    return 1 / (1 + sqrt(squared_gap))

In [30]:
# Returns the Pearson correlation coefficient for p1 and p2
def sim_pearson(prefs, p1, p2):
    """Pearson correlation of the ratings p1 and p2 have in common.

    Args:
        prefs: mapping of name -> {item: numeric rating}.
        p1, p2: keys of ``prefs`` to compare.

    Returns:
        0 when the two share no items, or when either side's shared
        ratings have no variance (the correlation is undefined there);
        otherwise the correlation r as a float clamped to [-1, 1].
    """
    # Mutually rated items, in the iteration order of p1's ratings
    shared = [item for item in prefs[p1] if item in prefs[p2]]

    # If there are no ratings in common, return 0
    n = len(shared)
    if n == 0:
        return 0

    xs = [prefs[p1][item] for item in shared]
    ys = [prefs[p2][item] for item in shared]

    # Sums of the preferences, of their squares, and of their products
    sum1 = sum(xs)
    sum2 = sum(ys)
    sum1Sq = sum(x ** 2 for x in xs)
    sum2Sq = sum(y ** 2 for y in ys)
    pSum = sum(x * y for x, y in zip(xs, ys))

    num = pSum - (sum1 * sum2 / n)
    var1 = sum1Sq - sum1 ** 2 / n
    var2 = sum2Sq - sum2 ** 2 / n

    # Each term is a difference of nearly equal numbers when the ratings are
    # constant, so rounding can leave it slightly negative. Taking sqrt of a
    # negative product would raise ValueError; treat "<= 0" as "no variance".
    if var1 <= 0 or var2 <= 0:
        return 0

    den = sqrt(var1 * var2)
    if den == 0:
        return 0

    # Rounding can also push r marginally outside the valid range
    r = num / den
    return max(-1.0, min(1.0, r))

In [31]:
# Rank every other entry of prefs by similarity to `person`.
# The result count and the similarity function are optional.
def topMatches(prefs,person,n=5,similarity=sim_pearson):
    """Return up to n (score, other) tuples, highest score first.

    Each key of ``prefs`` other than ``person`` is scored with
    ``similarity(prefs, person, other)``.
    """
    ranked = []
    for other in prefs:
        if other != person:
            ranked.append((similarity(prefs, person, other), other))

    # Ascending tuple sort, then flip so the best matches lead
    best_first = sorted(ranked)[::-1]
    return best_first[:n]

In [32]:
# Recommend items for a person using a similarity-weighted average
# of every other user's ratings
def getRecommendations(prefs,person,similarity=sim_pearson):
    """Return (predicted_rating, item) tuples for ``person``, best first.

    Every other key in ``prefs`` with a positive similarity to ``person``
    contributes its ratings, weighted by that similarity. Only items that
    ``person`` has not rated (or has rated 0) are scored.
    """
    weighted = {}   # item -> sum of rating * similarity
    weights = {}    # item -> sum of similarities

    for other, their_ratings in prefs.items():
        # Never compare the person with themselves
        if other==person:
            continue

        weight = similarity(prefs, person, other)
        # Non-positive similarities carry no information here
        if weight <= 0:
            continue

        for item, rating in their_ratings.items():
            # Skip anything the person already has a non-zero rating for
            if item in prefs[person] and prefs[person][item] != 0:
                continue
            weighted[item] = weighted.get(item, 0) + rating * weight
            weights[item] = weights.get(item, 0) + weight

    # Normalise each total by its weight, then order from highest to lowest
    ordered = sorted((weighted[item] / weights[item], item) for item in weighted)
    return ordered[::-1]

In [33]:
def transformPrefs(prefs):
    """Swap the two key levels of a nested mapping.

    {person: {item: rating}} becomes {item: {person: rating}}.
    """
    flipped = {}
    for person, ratings in prefs.items():
        for item, rating in ratings.items():
            # Create the per-item dict on first sight, then file the rating
            flipped.setdefault(item, {})[person] = rating
    return flipped

In [84]:
def calculateSimilarItems(prefs,n=10):
    """Map every item to its n most similar items.

    The preferences are first inverted with transformPrefs so that items
    are the outer keys; each item is then ranked against the others with
    topMatches using sim_distance. A progress line is printed every 100
    items.
    """
    # Item-centric view of the preference matrix
    item_prefs = transformPrefs(prefs)
    total = len(item_prefs)

    similar = {}
    for count, item in enumerate(item_prefs, start=1):
        # Status updates for large datasets
        if count % 100 == 0:
            print("%d / %d" % (count, total))
        similar[item] = topMatches(item_prefs, item, n=n, similarity=sim_distance)
    return similar

def calculateSimilarUser(prefs,n=10):
    """Map every user in ``prefs`` to their n most similar users.

    Users are ranked with topMatches using sim_distance. A progress line
    is printed after every 100th user.
    """
    neighbours = {}
    processed = 0
    for user in prefs:
        processed += 1
        report_due = processed % 100 == 0
        if report_due:
            print("%d / %d" % (processed, len(prefs)))

        neighbours[user] = topMatches(prefs, user, n=n, similarity=sim_distance)
    return neighbours

In [38]:
def getRecommendedItems(prefs,itemMatch,user):
    """Recommend unrated items to ``user`` from an item-similarity table.

    Args:
        prefs: mapping of user -> {item: rating}.
        itemMatch: mapping of item -> list of (similarity, other_item)
            tuples, e.g. the output of calculateSimilarItems.
        user: key of ``prefs`` to recommend for.

    Returns:
        List of (predicted_rating, item) tuples, highest rating first.
        Candidates whose similarities sum to 0 are left out because no
        weighted average can be formed for them.

    Raises:
        KeyError: if ``user`` is not in ``prefs`` or one of the user's
            rated items is not in ``itemMatch``.
    """
    userRatings = prefs[user]
    scores = {}     # candidate -> sum of similarity * rating
    totalSim = {}   # candidate -> sum of similarities

    # Loop over items rated by this user
    for item, rating in userRatings.items():
        # Loop over items similar to this one
        for similarity, item2 in itemMatch[item]:
            # Ignore if this user has already rated this item
            if item2 in userRatings:
                continue
            scores[item2] = scores.get(item2, 0) + similarity * rating
            totalSim[item2] = totalSim.get(item2, 0) + similarity

    # Divide each total score by its total weighting to get an average.
    # A candidate reached only through zero similarities has weight 0;
    # dividing would raise ZeroDivisionError, so it is skipped.
    rankings = [
        (score / totalSim[item], item)
        for item, score in scores.items()
        if totalSim[item] != 0
    ]

    # Return the rankings from highest to lowest
    rankings.sort(reverse=True)
    return rankings

def getRecommendedUsers(prefs, userMatch, user):
    """Placeholder for recommendations driven by a user-similarity table.

    Not implemented yet: calling it raises NotImplementedError.

    NOTE(review): presumably the user-based counterpart of
    getRecommendedItems, taking a {user: [(similarity, other_user), ...]}
    table as ``userMatch`` -- TODO confirm the intended behaviour.
    """
    # An explicit body keeps the cell parseable until the logic is written
    raise NotImplementedError("getRecommendedUsers is not implemented yet")

In [42]:
# Build the item-to-item similarity table for the small `critics` data set
# (default n=10 neighbours per item) and display it.
itemsim = calculateSimilarItems(critics)
itemsim
#getRecommendedItems(critics, itemsim, 'Toby')

{'Just My Luck': [(0.3483314773547883, 'Lady in the Water'),
  (0.32037724101704074, 'You, Me and Dupree'),
  (0.2989350844248255, 'The Night Listener'),
  (0.2553967929896867, 'Snakes on a Plane'),
  (0.20799159651347807, 'Superman Returns')],
 'Lady in the Water': [(0.4494897427831781, 'You, Me and Dupree'),
  (0.38742588672279304, 'The Night Listener'),
  (0.3483314773547883, 'Snakes on a Plane'),
  (0.3483314773547883, 'Just My Luck'),
  (0.2402530733520421, 'Superman Returns')],
 'Snakes on a Plane': [(0.3483314773547883, 'Lady in the Water'),
  (0.32037724101704074, 'The Night Listener'),
  (0.3090169943749474, 'Superman Returns'),
  (0.2553967929896867, 'Just My Luck'),
  (0.1886378647726465, 'You, Me and Dupree')],
 'Superman Returns': [(0.3090169943749474, 'Snakes on a Plane'),
  (0.252650308587072, 'The Night Listener'),
  (0.2402530733520421, 'Lady in the Water'),
  (0.20799159651347807, 'Just My Luck'),
  (0.1918253663634734, 'You, Me and Dupree')],
 'The Night Listener': [

In [58]:
def loadMovieLens(path='../data/movielens'):
    """Load MovieLens ratings into a nested dict keyed by user id.

    Reads two ISO-8859-1 encoded files under ``path``:
      * ``u.item`` -- '|'-separated; the first two fields are movie id and title.
      * ``u.data`` -- tab-separated: user id, movie id, rating, timestamp.

    Args:
        path: directory containing ``u.item`` and ``u.data``.

    Returns:
        {user_id: {movie_title: rating}} with ids as strings and ratings
        as floats.

    Raises:
        OSError: if either file cannot be opened.
        KeyError: if ``u.data`` references a movie id missing from ``u.item``.
        ValueError: if a line does not have the expected fields.
    """
    # Get movie titles (id -> title); `with` closes the file when done
    movies = {}
    with open(path + '/u.item', encoding="ISO-8859-1") as item_file:
        for line in item_file:
            movie_id, title = line.split('|')[0:2]
            movies[movie_id] = title

    # Load data
    prefs = {}
    with open(path + '/u.data', encoding="ISO-8859-1") as data_file:
        for line in data_file:
            # The timestamp is part of the record but is not used
            user, movieid, rating, _timestamp = line.split('\t')
            prefs.setdefault(user, {})[movies[movieid]] = float(rating)
    return prefs

In [60]:
# Load the MovieLens ratings from the default path ('../data/movielens')
# as {user_id: {movie_title: rating}}.
prefs = loadMovieLens()

{'1': {'101 Dalmatians (1996)': 2.0,
  '12 Angry Men (1957)': 5.0,
  '20,000 Leagues Under the Sea (1954)': 3.0,
  '2001: A Space Odyssey (1968)': 4.0,
  'Abyss, The (1989)': 3.0,
  'Ace Ventura: Pet Detective (1994)': 3.0,
  'Air Bud (1997)': 1.0,
  'Akira (1988)': 4.0,
  'Aladdin (1992)': 4.0,
  'Alien (1979)': 5.0,
  'Aliens (1986)': 5.0,
  'All Dogs Go to Heaven 2 (1996)': 1.0,
  'Amadeus (1984)': 5.0,
  'Angels and Insects (1995)': 4.0,
  "Antonia's Line (1995)": 5.0,
  'Apocalypse Now (1979)': 3.0,
  'Apollo 13 (1995)': 4.0,
  'Aristocats, The (1970)': 2.0,
  'Army of Darkness (1993)': 4.0,
  'Austin Powers: International Man of Mystery (1997)': 4.0,
  'Babe (1995)': 1.0,
  'Back to the Future (1985)': 5.0,
  'Bad Boys (1995)': 2.0,
  'Basic Instinct (1992)': 3.0,
  'Batman & Robin (1997)': 1.0,
  'Batman Forever (1995)': 1.0,
  'Batman Returns (1992)': 1.0,
  'Beavis and Butt-head Do America (1996)': 3.0,
  'Bedknobs and Broomsticks (1971)': 2.0,
  'Belle de jour (1967)': 3.0,
 

In [62]:
# First 30 recommendations for user '87', using the default similarity
# function of getRecommendations (sim_pearson).
getRecommendations(prefs, '87')[:30]

[(5.0, 'They Made Me a Criminal (1939)'),
 (5.0, 'Star Kid (1997)'),
 (5.0, 'Santa with Muscles (1996)'),
 (5.0, 'Saint of Fort Washington, The (1993)'),
 (5.0, 'Marlene Dietrich: Shadow and Light (1996) '),
 (5.0, 'Great Day in Harlem, A (1994)'),
 (5.0, 'Entertaining Angels: The Dorothy Day Story (1996)'),
 (5.0, 'Boys, Les (1997)'),
 (4.89884443128923, 'Legal Deceit (1997)'),
 (4.815019082242709, 'Letter From Death Row, A (1998)'),
 (4.7321082983941425, 'Hearts and Minds (1996)'),
 (4.696244466490867, 'Pather Panchali (1955)'),
 (4.652397061026758, 'Lamerica (1994)'),
 (4.538723693474813, 'Leading Man, The (1996)'),
 (4.535081339106104, 'Mrs. Dalloway (1997)'),
 (4.532337612572981, 'Innocents, The (1961)'),
 (4.527998574747075, 'Casablanca (1942)'),
 (4.510270149719864, 'Everest (1998)'),
 (4.4939677554284385, 'Dangerous Beauty (1998)'),
 (4.485151301801342, 'Wallace & Gromit: The Best of Aardman Animation (1996)'),
 (4.4632874612902205, 'Wrong Trousers, The (1993)'),
 (4.4509794369

In [87]:
#itemsim = calculateSimilarItems(prefs, n=50)
# Precompute the 5 most similar users (by sim_distance) for every user in
# prefs; a progress line is printed every 100 users.
usersim = calculateSimilarUser(prefs, n=5)

100 / 943
200 / 943
300 / 943
400 / 943
500 / 943
600 / 943
700 / 943
800 / 943
900 / 943


In [86]:
# Display the user-similarity table.
# NOTE(review): the saved output lists far more than 5 neighbours per user
# and has a lower execution count than the n=5 run, so it looks stale --
# re-run this cell to refresh it.
usersim

{'1': [(1.0, '812'),
  (1.0, '418'),
  (1.0, '155'),
  (0.5, '729'),
  (0.5, '631'),
  (0.5, '351'),
  (0.5, '309'),
  (0.5, '273'),
  (0.4142135623730951, '876'),
  (0.4142135623730951, '485'),
  (0.4142135623730951, '111'),
  (0.36602540378443865, '687'),
  (0.36602540378443865, '105'),
  (0.3333333333333333, '895'),
  (0.3333333333333333, '811'),
  (0.3333333333333333, '685'),
  (0.3333333333333333, '531'),
  (0.3333333333333333, '39'),
  (0.3333333333333333, '356'),
  (0.3333333333333333, '341'),
  (0.3333333333333333, '282'),
  (0.3333333333333333, '260'),
  (0.3333333333333333, '107'),
  (0.3090169943749474, '9'),
  (0.3090169943749474, '842'),
  (0.3090169943749474, '820'),
  (0.3090169943749474, '696'),
  (0.3090169943749474, '547'),
  (0.3090169943749474, '520'),
  (0.3090169943749474, '516'),
  (0.3090169943749474, '433'),
  (0.3090169943749474, '400'),
  (0.3090169943749474, '359'),
  (0.3090169943749474, '34'),
  (0.3090169943749474, '241'),
  (0.3090169943749474, '166'),
 

In [64]:
#getRecommendedItems(prefs, itemsim, '87')[:30]


[(5.0, "What's Eating Gilbert Grape (1993)"),
 (5.0, 'Vertigo (1958)'),
 (5.0, 'Usual Suspects, The (1995)'),
 (5.0, 'Toy Story (1995)'),
 (5.0, 'Titanic (1997)'),
 (5.0, 'Sword in the Stone, The (1963)'),
 (5.0, 'Stand by Me (1986)'),
 (5.0, 'Sling Blade (1996)'),
 (5.0, 'Silence of the Lambs, The (1991)'),
 (5.0, 'Shining, The (1980)'),
 (5.0, 'Shine (1996)'),
 (5.0, 'Sense and Sensibility (1995)'),
 (5.0, 'Scream (1996)'),
 (5.0, 'Rumble in the Bronx (1995)'),
 (5.0, 'Rock, The (1996)'),
 (5.0, 'Robin Hood: Prince of Thieves (1991)'),
 (5.0, 'Reservoir Dogs (1992)'),
 (5.0, 'Police Story 4: Project S (Chao ji ji hua) (1993)'),
 (5.0, 'House of the Spirits, The (1993)'),
 (5.0, 'Fresh (1994)'),
 (5.0, 'Denise Calls Up (1995)'),
 (5.0, 'Day the Sun Turned Cold, The (Tianguo niezi) (1994)'),
 (5.0, 'Before the Rain (Pred dozhdot) (1994)'),
 (5.0, 'Assignment, The (1997)'),
 (5.0, '1-900 (1994)'),
 (4.875, "Ed's Next Move (1996)"),
 (4.833333333333333, 'Anna (1996)'),
 (4.8, 'Dark City 

In [70]:
def sim_tanimoto(prefs, p1, p2):
    """Tanimoto coefficient between the rating vectors of p1 and p2.

    Treating unrated items as 0, the score is

        A.B / (|A|^2 + |B|^2 - A.B)

    where A.B runs over the items both have rated and the squared norms
    run over each one's full set of ratings. Identical rating sets score
    1.0.

    Args:
        prefs: mapping of name -> {item: numeric rating}.
        p1, p2: keys of ``prefs`` to compare.

    Returns:
        0 when the two share no items or when the denominator is 0 (all
        ratings are zero); otherwise the coefficient as a float.
    """
    shared = [item for item in prefs[p1] if item in prefs[p2]]
    if not shared:
        return 0

    dot = sum(prefs[p1][it] * prefs[p2][it] for it in shared)
    norm1_sq = sum(value ** 2 for value in prefs[p1].values())
    norm2_sq = sum(value ** 2 for value in prefs[p2].values())

    # The dot product is subtracted, not added: this makes the denominator
    # equal to the numerator for identical vectors, giving a score of 1.
    denom = norm1_sq + norm2_sq - dot
    if denom == 0:
        return 0

    return dot / denom