# 提供推荐

## 基于用户的协作型过滤
核心思想：找出和自己品味相近的一群人

In [1]:
from pprint import pprint
# A dictionary of movie critics and their ratings of a small
# set of movies
critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
 'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 
 'The Night Listener': 3.0},
'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 
 'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0, 
 'You, Me and Dupree': 3.5}, 
'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
 'Superman Returns': 3.5, 'The Night Listener': 4.0},
'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
 'The Night Listener': 4.5, 'Superman Returns': 4.0, 
 'You, Me and Dupree': 2.5},
'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0, 
 'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
 'You, Me and Dupree': 2.0}, 
'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
 'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}
pprint(critics)

{'Claudia Puig': {'Just My Luck': 3.0,
                  'Snakes on a Plane': 3.5,
                  'Superman Returns': 4.0,
                  'The Night Listener': 4.5,
                  'You, Me and Dupree': 2.5},
 'Gene Seymour': {'Just My Luck': 1.5,
                  'Lady in the Water': 3.0,
                  'Snakes on a Plane': 3.5,
                  'Superman Returns': 5.0,
                  'The Night Listener': 3.0,
                  'You, Me and Dupree': 3.5},
 'Jack Matthews': {'Lady in the Water': 3.0,
                   'Snakes on a Plane': 4.0,
                   'Superman Returns': 5.0,
                   'The Night Listener': 3.0,
                   'You, Me and Dupree': 3.5},
 'Lisa Rose': {'Just My Luck': 3.0,
               'Lady in the Water': 2.5,
               'Snakes on a Plane': 3.5,
               'Superman Returns': 3.5,
               'The Night Listener': 3.0,
               'You, Me and Dupree': 2.5},
 'Michael Phillips': {'Lady in the Water': 2.5,
    

In [2]:
from math import sqrt

# Returns a distance-based similarity score for person1 and person2
def sim_distance(prefs,person1,person2):
  # Get the list of shared_items
  si={}
  for item in prefs[person1]: 
    if item in prefs[person2]: si[item]=1

  # if they have no ratings in common, return 0
  if len(si)==0: return 0

  # Add up the squares of all the differences
  sum_of_squares=sum([pow(prefs[person1][item]-prefs[person2][item],2) 
                      for item in prefs[person1] if item in prefs[person2]])

  return 1/(1+sum_of_squares)

In [3]:
sim_distance(critics,'Lisa Rose','Gene Seymour')
names = list(critics.keys())
for person1 in names:
    for person2 in names:
        if person1 == person2:
            break
        distance = sim_distance(critics,person1,person2)
        print('%f : %s and %s.' %(distance,person1,person2))

0.210526 : Gene Seymour and Michael Phillips.
0.444444 : Lisa Rose and Michael Phillips.
0.148148 : Lisa Rose and Gene Seymour.
0.181818 : Jack Matthews and Michael Phillips.
0.800000 : Jack Matthews and Gene Seymour.
0.210526 : Jack Matthews and Lisa Rose.
0.285714 : Mick LaSalle and Michael Phillips.
0.129032 : Mick LaSalle and Gene Seymour.
0.333333 : Mick LaSalle and Lisa Rose.
0.137931 : Mick LaSalle and Jack Matthews.
0.571429 : Claudia Puig and Michael Phillips.
0.133333 : Claudia Puig and Gene Seymour.
0.285714 : Claudia Puig and Lisa Rose.
0.181818 : Claudia Puig and Jack Matthews.
0.173913 : Claudia Puig and Mick LaSalle.
0.285714 : Toby and Michael Phillips.
0.108108 : Toby and Gene Seymour.
0.222222 : Toby and Lisa Rose.
0.117647 : Toby and Jack Matthews.
0.307692 : Toby and Mick LaSalle.
0.235294 : Toby and Claudia Puig.


In [4]:
# Returns the Pearson correlation coefficient for p1 and p2
def sim_pearson(prefs,p1,p2):
  # Get the list of mutually rated items
  si={}
  for item in prefs[p1]: 
    if item in prefs[p2]: si[item]=1

  # if they are no ratings in common, return 0
  if len(si)==0: return 0

  # Sum calculations
  n=len(si)
  
  # Sums of all the preferences
  sum1=sum([prefs[p1][it] for it in si])
  sum2=sum([prefs[p2][it] for it in si])
  
  # Sums of the squares
  sum1Sq=sum([pow(prefs[p1][it],2) for it in si])
  sum2Sq=sum([pow(prefs[p2][it],2) for it in si])	
  
  # Sum of the products
  pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])
  
  # Calculate r (Pearson score)
  num=pSum-(sum1*sum2/n)
  den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
  if den==0: return 0

  r=num/den

  return r

In [5]:
for person1 in names:
    for person2 in names:
        if person1 == person2:
            break
        distance = sim_pearson(critics,person1,person2)
        print('%f : %s and %s.' %(distance,person1,person2))

0.204598 : Gene Seymour and Michael Phillips.
0.404520 : Lisa Rose and Michael Phillips.
0.396059 : Lisa Rose and Gene Seymour.
0.134840 : Jack Matthews and Michael Phillips.
0.963796 : Jack Matthews and Gene Seymour.
0.747018 : Jack Matthews and Lisa Rose.
-0.258199 : Mick LaSalle and Michael Phillips.
0.411765 : Mick LaSalle and Gene Seymour.
0.594089 : Mick LaSalle and Lisa Rose.
0.211289 : Mick LaSalle and Jack Matthews.
1.000000 : Claudia Puig and Michael Phillips.
0.314970 : Claudia Puig and Gene Seymour.
0.566947 : Claudia Puig and Lisa Rose.
0.028571 : Claudia Puig and Jack Matthews.
0.566947 : Claudia Puig and Mick LaSalle.
-1.000000 : Toby and Michael Phillips.
0.381246 : Toby and Gene Seymour.
0.991241 : Toby and Lisa Rose.
0.662849 : Toby and Jack Matthews.
0.924473 : Toby and Mick LaSalle.
0.893405 : Toby and Claudia Puig.


In [6]:
# Returns the best matches for person from the prefs dictionary. 
# Number of results and similarity function are optional params.
def topMatches(prefs,person,n=5,similarity=sim_pearson):
  scores=[(similarity(prefs,person,other),other) 
                  for other in prefs if other!=person]
  scores.sort()
  scores.reverse()
  return scores[0:n]

# My own funtion
def myTopMatches(data,similarity=sim_pearson):
    names = list(data.keys())
    results = {}
    
    for person1 in names:
        for person2 in names:
            if person1 == person2:
                break
            results[person1+' and '+person2]=similarity(data,person1,person2)

    #dict sorted by value of key
    return sorted(results.items(),key=lambda d:d[1],reverse=True)          

In [7]:
results = myTopMatches(critics,sim_pearson)
pprint(results)

[('Claudia Puig and Michael Phillips', 1.0),
 ('Toby and Lisa Rose', 0.9912407071619299),
 ('Jack Matthews and Gene Seymour', 0.963795681875635),
 ('Toby and Mick LaSalle', 0.9244734516419049),
 ('Toby and Claudia Puig', 0.8934051474415647),
 ('Jack Matthews and Lisa Rose', 0.7470178808339965),
 ('Toby and Jack Matthews', 0.66284898035987),
 ('Mick LaSalle and Lisa Rose', 0.5940885257860044),
 ('Claudia Puig and Mick LaSalle', 0.5669467095138411),
 ('Claudia Puig and Lisa Rose', 0.5669467095138396),
 ('Mick LaSalle and Gene Seymour', 0.41176470588235276),
 ('Lisa Rose and Michael Phillips', 0.40451991747794525),
 ('Lisa Rose and Gene Seymour', 0.39605901719066977),
 ('Toby and Gene Seymour', 0.38124642583151164),
 ('Claudia Puig and Gene Seymour', 0.31497039417435607),
 ('Mick LaSalle and Jack Matthews', 0.21128856368212925),
 ('Gene Seymour and Michael Phillips', 0.20459830184114206),
 ('Jack Matthews and Michael Phillips', 0.13483997249264842),
 ('Claudia Puig and Jack Matthews', 0.0

In [8]:
# Gets recommendations for a person by using a weighted average
# of every other user's rankings
def getRecommendations(prefs,person,similarity=sim_pearson):
  totals={}
  simSums={}
  for other in prefs:
    # don't compare me to myself
    if other==person: continue
    sim=similarity(prefs,person,other)

    # ignore scores of zero or lower
    if sim<=0: continue
    for item in prefs[other]:
  
      # only score movies I haven't seen yet
      if item not in prefs[person] or prefs[person][item]==0:
        # Similarity * Score
        totals.setdefault(item,0)
        totals[item]+=prefs[other][item]*sim
        # Sum of similarities
        simSums.setdefault(item,0)
        simSums[item]+=sim

  # Create the normalized list
  rankings=[(item,total/simSums[item]) for item,total in totals.items()]

  # Return the sorted list
  rankings.sort()
  rankings.reverse()
  return rankings

In [9]:
getRecommendations(critics,'Toby')

[('The Night Listener', 3.3477895267131013),
 ('Lady in the Water', 2.8325499182641614),
 ('Just My Luck', 2.5309807037655645)]

In [10]:
def transformPrefs(prefs):
  result={}
  for person in prefs:
    for item in prefs[person]:
      result.setdefault(item,{})
      
      # Flip item and person
      result[item][person]=prefs[person][item]
  return result

In [11]:
items = transformPrefs(critics)
pprint(items)
pprint(myTopMatches(items))
pprint(getRecommendations(items,'Just My Luck'))

{'Just My Luck': {'Claudia Puig': 3.0,
                  'Gene Seymour': 1.5,
                  'Lisa Rose': 3.0,
                  'Mick LaSalle': 2.0},
 'Lady in the Water': {'Gene Seymour': 3.0,
                       'Jack Matthews': 3.0,
                       'Lisa Rose': 2.5,
                       'Michael Phillips': 2.5,
                       'Mick LaSalle': 3.0},
 'Snakes on a Plane': {'Claudia Puig': 3.5,
                       'Gene Seymour': 3.5,
                       'Jack Matthews': 4.0,
                       'Lisa Rose': 3.5,
                       'Michael Phillips': 3.0,
                       'Mick LaSalle': 4.0,
                       'Toby': 4.5},
 'Superman Returns': {'Claudia Puig': 4.0,
                      'Gene Seymour': 5.0,
                      'Jack Matthews': 5.0,
                      'Lisa Rose': 3.5,
                      'Michael Phillips': 3.5,
                      'Mick LaSalle': 3.0,
                      'Toby': 4.0},
 'The Night Listener': {

#### https://github.com/pcao10/pydelicious/issues/43

In [12]:
from pydelicious import get_popular,get_userposts,get_urlposts
import time

def initializeUserDict(tag,count=5):
  user_dict={}
  # get the top count' popular posts
  for p1 in get_popular(tag=tag)[0:count]:
    # find all users who posted this
    for p2 in get_urlposts(p1['href']):
      user=p2['user']
      user_dict[user]={}
  return user_dict

def fillItems(user_dict):
  all_items={}
  # Find links posted by all users
  for user in user_dict:
    for i in range(3):
      try:
        posts=get_userposts(user)
        break
      except:
        print ("Failed user "+user+", retrying")
        time.sleep(4)
    for post in posts:
      url=post['href']
      user_dict[user][url]=1.0
      all_items[url]=1
  
  # Fill in missing items with 0
  for ratings in user_dict.values():
    for item in all_items:
      if item not in ratings:
        ratings[item]=0.0

In [13]:
# delusers = initializeUserDict('programming')
# delusers['tsegaran']={}
# fillItems(delusers)

In [14]:
# get_popular(tag='programming')[:10]

## 基于物品的过滤

In [15]:
def calculateSimilarItems(prefs,n=10):
  # Create a dictionary of items showing which other items they
  # are most similar to.
  result={}
  # Invert the preference matrix to be item-centric
  itemPrefs=transformPrefs(prefs)
  c=0
  for item in itemPrefs:
    # Status updates for large datasets
    c+=1
    if c%100==0: 
        print ("%d / %d" % (c,len(itemPrefs)))
    # Find the most similar items to this one
    scores=topMatches(itemPrefs,item,n=n,similarity=sim_distance)
    result[item]=scores
  return result

In [16]:
itemsim=calculateSimilarItems(critics)
pprint(itemsim)

{'Just My Luck': [(0.2222222222222222, 'Lady in the Water'),
                  (0.18181818181818182, 'You, Me and Dupree'),
                  (0.15384615384615385, 'The Night Listener'),
                  (0.10526315789473684, 'Snakes on a Plane'),
                  (0.06451612903225806, 'Superman Returns')],
 'Lady in the Water': [(0.4, 'You, Me and Dupree'),
                       (0.2857142857142857, 'The Night Listener'),
                       (0.2222222222222222, 'Snakes on a Plane'),
                       (0.2222222222222222, 'Just My Luck'),
                       (0.09090909090909091, 'Superman Returns')],
 'Snakes on a Plane': [(0.2222222222222222, 'Lady in the Water'),
                       (0.18181818181818182, 'The Night Listener'),
                       (0.16666666666666666, 'Superman Returns'),
                       (0.10526315789473684, 'Just My Luck'),
                       (0.05128205128205128, 'You, Me and Dupree')],
 'Superman Returns': [(0.16666666666666666, '

In [17]:
def getRecommendedItems(prefs,itemMatch,user):
  userRatings=prefs[user]
  scores={}
  totalSim={}
  # Loop over items rated by this user
  for (item,rating) in userRatings.items( ):

    # Loop over items similar to this one
    for (similarity,item2) in itemMatch[item]:

      # Ignore if this user has already rated this item
      if item2 in userRatings: continue
      # Weighted sum of rating times similarity
      scores.setdefault(item2,0)
      scores[item2]+=similarity*rating
      # Sum of all the similarities
      totalSim.setdefault(item2,0)
      totalSim[item2]+=similarity

  # Divide each total score by total weighting to get an average
  rankings=[(score/totalSim[item],item) for item,score in scores.items( )]

  # Return the rankings from highest to lowest
  rankings.sort( )
  rankings.reverse( )
  return rankings

In [18]:
getRecommendedItems(critics,itemsim,'Toby')

[(3.182634730538922, 'The Night Listener'),
 (2.5983318700614575, 'Just My Luck'),
 (2.4730878186968837, 'Lady in the Water')]

In [19]:
critics

{'Claudia Puig': {'Just My Luck': 3.0,
  'Snakes on a Plane': 3.5,
  'Superman Returns': 4.0,
  'The Night Listener': 4.5,
  'You, Me and Dupree': 2.5},
 'Gene Seymour': {'Just My Luck': 1.5,
  'Lady in the Water': 3.0,
  'Snakes on a Plane': 3.5,
  'Superman Returns': 5.0,
  'The Night Listener': 3.0,
  'You, Me and Dupree': 3.5},
 'Jack Matthews': {'Lady in the Water': 3.0,
  'Snakes on a Plane': 4.0,
  'Superman Returns': 5.0,
  'The Night Listener': 3.0,
  'You, Me and Dupree': 3.5},
 'Lisa Rose': {'Just My Luck': 3.0,
  'Lady in the Water': 2.5,
  'Snakes on a Plane': 3.5,
  'Superman Returns': 3.5,
  'The Night Listener': 3.0,
  'You, Me and Dupree': 2.5},
 'Michael Phillips': {'Lady in the Water': 2.5,
  'Snakes on a Plane': 3.0,
  'Superman Returns': 3.5,
  'The Night Listener': 4.0},
 'Mick LaSalle': {'Just My Luck': 2.0,
  'Lady in the Water': 3.0,
  'Snakes on a Plane': 4.0,
  'Superman Returns': 3.0,
  'The Night Listener': 3.0,
  'You, Me and Dupree': 2.0},
 'Toby': {'Snak

## 使用MovieLens数据集

In [20]:
def loadMovieLens(path='data/movielens'):
  # Get movie titles
  movies={}
  # ingnore
  for line in open(path+'/u.item',errors='ignore'):
    (id,title)=line.split('|')[0:2]
    movies[id]=title
  
  # Load data
  prefs={}
  for line in open(path+'/u.data'):
    (user,movieid,rating,ts)=line.split('\t')
    prefs.setdefault(user,{})
    prefs[user][movies[movieid]]=float(rating)
  return prefs

In [21]:
prefs=loadMovieLens()
getRecommendations(prefs,'87')

[('unknown', 3.541386439780948),
 ('Zeus and Roxanne (1997)', 2.206510468856659),
 ("Young Poisoner's Handbook, The (1995)", 3.3819077059083824),
 ('You So Crazy (1994)', 3.0000000000000004),
 ('Year of the Horse (1997)', 2.0052819344400112),
 ('Yankee Zulu (1994)', 1.0),
 ('Wrong Trousers, The (1993)', 4.463287461290221),
 ('World of Apu, The (Apur Sansar) (1959)', 4.177871176381616),
 ("Wooden Man's Bride, The (Wu Kui) (1994)", 2.25602297941656),
 ('Wonderland (1997)', 3.3117604632609665),
 ('Wonderful, Horrible Life of Leni Riefenstahl, The (1993)',
  4.14590252004904),
 ('Women, The (1939)', 3.6135928801753763),
 ('Woman in Question, The (1950)', 1.0),
 ('Wolf (1994)', 2.7204784550506167),
 ('Witness (1985)', 4.0),
 ('Withnail and I (1987)', 3.542764908918436),
 ('With Honors (1994)', 3.0330851077633376),
 ('Wishmaster (1997)', 2.35572879338577),
 ('Winter Guest, The (1997)', 2.855705393098161),
 ('Winnie the Pooh and the Blustery Day (1968)', 3.9281109691491984),
 ('Wings of the D

In [22]:
itemsim=calculateSimilarItems(prefs,n=50)
getRecommendedItems(prefs,itemsim,'87')[:30]

100 / 1664
200 / 1664
300 / 1664
400 / 1664
500 / 1664
600 / 1664
700 / 1664
800 / 1664
900 / 1664
1000 / 1664
1100 / 1664
1200 / 1664
1300 / 1664
1400 / 1664
1500 / 1664
1600 / 1664


[(5.0, "What's Eating Gilbert Grape (1993)"),
 (5.0, 'Vertigo (1958)'),
 (5.0, 'Usual Suspects, The (1995)'),
 (5.0, 'Toy Story (1995)'),
 (5.0, 'Titanic (1997)'),
 (5.0, 'Sword in the Stone, The (1963)'),
 (5.0, 'Stand by Me (1986)'),
 (5.0, 'Sling Blade (1996)'),
 (5.0, 'Silence of the Lambs, The (1991)'),
 (5.0, 'Shining, The (1980)'),
 (5.0, 'Shine (1996)'),
 (5.0, 'Sense and Sensibility (1995)'),
 (5.0, 'Scream (1996)'),
 (5.0, 'Rumble in the Bronx (1995)'),
 (5.0, 'Rock, The (1996)'),
 (5.0, 'Robin Hood: Prince of Thieves (1991)'),
 (5.0, 'Reservoir Dogs (1992)'),
 (5.0, 'Police Story 4: Project S (Chao ji ji hua) (1993)'),
 (5.0, 'House of the Spirits, The (1993)'),
 (5.0, 'Fresh (1994)'),
 (5.0, 'Day the Sun Turned Cold, The (Tianguo niezi) (1994)'),
 (5.0, 'Before the Rain (Pred dozhdot) (1994)'),
 (5.0, 'Assignment, The (1997)'),
 (5.0, '1-900 (1994)'),
 (4.888888888888889, "Ed's Next Move (1996)"),
 (4.833333333333333, 'Anna (1996)'),
 (4.8, 'Dark City (1998)'),
 (4.77777777