# 提供推荐

## 基于用户的协作型过滤
核心思想：找出和自己品味相近的一群人

In [34]:
from pprint import pprint
# A dictionary of movie critics and their ratings of a small
# set of movies
critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
 'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5, 
 'The Night Listener': 3.0},
'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5, 
 'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0, 
 'You, Me and Dupree': 3.5}, 
'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
 'Superman Returns': 3.5, 'The Night Listener': 4.0},
'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
 'The Night Listener': 4.5, 'Superman Returns': 4.0, 
 'You, Me and Dupree': 2.5},
'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0, 
 'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
 'You, Me and Dupree': 2.0}, 
'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
 'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}
pprint(critics)

{'Claudia Puig': {'Just My Luck': 3.0,
                  'Snakes on a Plane': 3.5,
                  'Superman Returns': 4.0,
                  'The Night Listener': 4.5,
                  'You, Me and Dupree': 2.5},
 'Gene Seymour': {'Just My Luck': 1.5,
                  'Lady in the Water': 3.0,
                  'Snakes on a Plane': 3.5,
                  'Superman Returns': 5.0,
                  'The Night Listener': 3.0,
                  'You, Me and Dupree': 3.5},
 'Jack Matthews': {'Lady in the Water': 3.0,
                   'Snakes on a Plane': 4.0,
                   'Superman Returns': 5.0,
                   'The Night Listener': 3.0,
                   'You, Me and Dupree': 3.5},
 'Lisa Rose': {'Just My Luck': 3.0,
               'Lady in the Water': 2.5,
               'Snakes on a Plane': 3.5,
               'Superman Returns': 3.5,
               'The Night Listener': 3.0,
               'You, Me and Dupree': 2.5},
 'Michael Phillips': {'Lady in the Water': 2.5,
    

In [11]:
from math import sqrt

# Returns a distance-based similarity score for person1 and person2
def sim_distance(prefs,person1,person2):
  # Get the list of shared_items
  si={}
  for item in prefs[person1]: 
    if item in prefs[person2]: si[item]=1

  # if they have no ratings in common, return 0
  if len(si)==0: return 0

  # Add up the squares of all the differences
  sum_of_squares=sum([pow(prefs[person1][item]-prefs[person2][item],2) 
                      for item in prefs[person1] if item in prefs[person2]])

  return 1/(1+sum_of_squares)

In [12]:
sim_distance(critics,'Lisa Rose','Gene Seymour')
names = list(critics.keys())
for person1 in names:
    for person2 in names:
        if person1 == person2:
            break
        distance = sim_distance(critics,person1,person2)
        print('%f : %s and %s.' %(distance,person1,person2))

0.129032 : Mick LaSalle and Gene Seymour.
0.133333 : Claudia Puig and Gene Seymour.
0.173913 : Claudia Puig and Mick LaSalle.
0.148148 : Lisa Rose and Gene Seymour.
0.333333 : Lisa Rose and Mick LaSalle.
0.285714 : Lisa Rose and Claudia Puig.
0.210526 : Michael Phillips and Gene Seymour.
0.285714 : Michael Phillips and Mick LaSalle.
0.571429 : Michael Phillips and Claudia Puig.
0.444444 : Michael Phillips and Lisa Rose.
0.800000 : Jack Matthews and Gene Seymour.
0.137931 : Jack Matthews and Mick LaSalle.
0.181818 : Jack Matthews and Claudia Puig.
0.210526 : Jack Matthews and Lisa Rose.
0.181818 : Jack Matthews and Michael Phillips.
0.108108 : Toby and Gene Seymour.
0.307692 : Toby and Mick LaSalle.
0.235294 : Toby and Claudia Puig.
0.222222 : Toby and Lisa Rose.
0.285714 : Toby and Michael Phillips.
0.117647 : Toby and Jack Matthews.


In [13]:
# Returns the Pearson correlation coefficient for p1 and p2
def sim_pearson(prefs,p1,p2):
  # Get the list of mutually rated items
  si={}
  for item in prefs[p1]: 
    if item in prefs[p2]: si[item]=1

  # if they are no ratings in common, return 0
  if len(si)==0: return 0

  # Sum calculations
  n=len(si)
  
  # Sums of all the preferences
  sum1=sum([prefs[p1][it] for it in si])
  sum2=sum([prefs[p2][it] for it in si])
  
  # Sums of the squares
  sum1Sq=sum([pow(prefs[p1][it],2) for it in si])
  sum2Sq=sum([pow(prefs[p2][it],2) for it in si])	
  
  # Sum of the products
  pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])
  
  # Calculate r (Pearson score)
  num=pSum-(sum1*sum2/n)
  den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
  if den==0: return 0

  r=num/den

  return r

In [14]:
for person1 in names:
    for person2 in names:
        if person1 == person2:
            break
        distance = sim_pearson(critics,person1,person2)
        print('%f : %s and %s.' %(distance,person1,person2))

0.411765 : Mick LaSalle and Gene Seymour.
0.314970 : Claudia Puig and Gene Seymour.
0.566947 : Claudia Puig and Mick LaSalle.
0.396059 : Lisa Rose and Gene Seymour.
0.594089 : Lisa Rose and Mick LaSalle.
0.566947 : Lisa Rose and Claudia Puig.
0.204598 : Michael Phillips and Gene Seymour.
-0.258199 : Michael Phillips and Mick LaSalle.
1.000000 : Michael Phillips and Claudia Puig.
0.404520 : Michael Phillips and Lisa Rose.
0.963796 : Jack Matthews and Gene Seymour.
0.211289 : Jack Matthews and Mick LaSalle.
0.028571 : Jack Matthews and Claudia Puig.
0.747018 : Jack Matthews and Lisa Rose.
0.134840 : Jack Matthews and Michael Phillips.
0.381246 : Toby and Gene Seymour.
0.924473 : Toby and Mick LaSalle.
0.893405 : Toby and Claudia Puig.
0.991241 : Toby and Lisa Rose.
-1.000000 : Toby and Michael Phillips.
0.662849 : Toby and Jack Matthews.


In [15]:
# Returns the best matches for person from the prefs dictionary. 
# Number of results and similarity function are optional params.
def topMatches(prefs,person,n=5,similarity=sim_pearson):
  scores=[(similarity(prefs,person,other),other) 
                  for other in prefs if other!=person]
  scores.sort()
  scores.reverse()
  return scores[0:n]

# My own funtion
def myTopMatches(data,similarity=sim_pearson):
    names = list(data.keys())
    results = {}
    
    for person1 in names:
        for person2 in names:
            if person1 == person2:
                break
            results[person1+' and '+person2]=similarity(data,person1,person2)

    #dict sorted by value of key
    return sorted(results.items(),key=lambda d:d[1],reverse=True)          

In [16]:
results = myTopMatches(critics,sim_pearson)
pprint(results)

[('Michael Phillips and Claudia Puig', 1.0),
 ('Toby and Lisa Rose', 0.9912407071619299),
 ('Jack Matthews and Gene Seymour', 0.963795681875635),
 ('Toby and Mick LaSalle', 0.9244734516419049),
 ('Toby and Claudia Puig', 0.8934051474415647),
 ('Jack Matthews and Lisa Rose', 0.7470178808339965),
 ('Toby and Jack Matthews', 0.66284898035987),
 ('Lisa Rose and Mick LaSalle', 0.5940885257860044),
 ('Claudia Puig and Mick LaSalle', 0.5669467095138411),
 ('Lisa Rose and Claudia Puig', 0.5669467095138396),
 ('Mick LaSalle and Gene Seymour', 0.41176470588235276),
 ('Michael Phillips and Lisa Rose', 0.40451991747794525),
 ('Lisa Rose and Gene Seymour', 0.39605901719066977),
 ('Toby and Gene Seymour', 0.38124642583151164),
 ('Claudia Puig and Gene Seymour', 0.31497039417435607),
 ('Jack Matthews and Mick LaSalle', 0.21128856368212925),
 ('Michael Phillips and Gene Seymour', 0.20459830184114206),
 ('Jack Matthews and Michael Phillips', 0.13483997249264842),
 ('Jack Matthews and Claudia Puig', 0.0

In [17]:
# Gets recommendations for a person by using a weighted average
# of every other user's rankings
def getRecommendations(prefs,person,similarity=sim_pearson):
  totals={}
  simSums={}
  for other in prefs:
    # don't compare me to myself
    if other==person: continue
    sim=similarity(prefs,person,other)

    # ignore scores of zero or lower
    if sim<=0: continue
    for item in prefs[other]:
  
      # only score movies I haven't seen yet
      if item not in prefs[person] or prefs[person][item]==0:
        # Similarity * Score
        totals.setdefault(item,0)
        totals[item]+=prefs[other][item]*sim
        # Sum of similarities
        simSums.setdefault(item,0)
        simSums[item]+=sim

  # Create the normalized list
  rankings=[(item,total/simSums[item]) for item,total in totals.items()]

  # Return the sorted list
  rankings.sort()
  rankings.reverse()
  return rankings

In [18]:
getRecommendations(critics,'Toby')

[('The Night Listener', 3.3477895267131013),
 ('Lady in the Water', 2.832549918264162),
 ('Just My Luck', 2.5309807037655645)]

In [19]:
def transformPrefs(prefs):
  result={}
  for person in prefs:
    for item in prefs[person]:
      result.setdefault(item,{})
      
      # Flip item and person
      result[item][person]=prefs[person][item]
  return result

In [35]:
items = transformPrefs(critics)
pprint(items)
pprint(myTopMatches(items))
pprint(getRecommendations(items,'Just My Luck'))

{'Just My Luck': {'Claudia Puig': 3.0,
                  'Gene Seymour': 1.5,
                  'Lisa Rose': 3.0,
                  'Mick LaSalle': 2.0},
 'Lady in the Water': {'Gene Seymour': 3.0,
                       'Jack Matthews': 3.0,
                       'Lisa Rose': 2.5,
                       'Michael Phillips': 2.5,
                       'Mick LaSalle': 3.0},
 'Snakes on a Plane': {'Claudia Puig': 3.5,
                       'Gene Seymour': 3.5,
                       'Jack Matthews': 4.0,
                       'Lisa Rose': 3.5,
                       'Michael Phillips': 3.0,
                       'Mick LaSalle': 4.0,
                       'Toby': 4.5},
 'Superman Returns': {'Claudia Puig': 4.0,
                      'Gene Seymour': 5.0,
                      'Jack Matthews': 5.0,
                      'Lisa Rose': 3.5,
                      'Michael Phillips': 3.5,
                      'Mick LaSalle': 3.0,
                      'Toby': 4.0},
 'The Night Listener': {

#### https://github.com/pcao10/pydelicious/issues/43

In [36]:
from pydelicious import get_popular,get_userposts,get_urlposts
import time

def initializeUserDict(tag,count=5):
  user_dict={}
  # get the top count' popular posts
  for p1 in get_popular(tag=tag)[0:count]:
    # find all users who posted this
    for p2 in get_urlposts(p1['href']):
      user=p2['user']
      user_dict[user]={}
  return user_dict

def fillItems(user_dict):
  all_items={}
  # Find links posted by all users
  for user in user_dict:
    for i in range(3):
      try:
        posts=get_userposts(user)
        break
      except:
        print ("Failed user "+user+", retrying")
        time.sleep(4)
    for post in posts:
      url=post['href']
      user_dict[user][url]=1.0
      all_items[url]=1
  
  # Fill in missing items with 0
  for ratings in user_dict.values():
    for item in all_items:
      if item not in ratings:
        ratings[item]=0.0

In [37]:
# delusers = initializeUserDict('programming')
# delusers['tsegaran']={}
# fillItems(delusers)

In [38]:
#get_popular(tag='programming')[:10]

## 基于物品的过滤

In [39]:
def calculateSimilarItems(prefs,n=10):
  # Create a dictionary of items showing which other items they
  # are most similar to.
  result={}
  # Invert the preference matrix to be item-centric
  itemPrefs=transformPrefs(prefs)
  c=0
  for item in itemPrefs:
    # Status updates for large datasets
    c+=1
    if c%100==0: 
        print ("%d / %d" % (c,len(itemPrefs)))
    # Find the most similar items to this one
    scores=topMatches(itemPrefs,item,n=n,similarity=sim_distance)
    result[item]=scores
  return result

In [40]:
itemsim=calculateSimilarItems(critics)
pprint(itemsim)

{'Just My Luck': [(0.2222222222222222, 'Lady in the Water'),
                  (0.18181818181818182, 'You, Me and Dupree'),
                  (0.15384615384615385, 'The Night Listener'),
                  (0.10526315789473684, 'Snakes on a Plane'),
                  (0.06451612903225806, 'Superman Returns')],
 'Lady in the Water': [(0.4, 'You, Me and Dupree'),
                       (0.2857142857142857, 'The Night Listener'),
                       (0.2222222222222222, 'Snakes on a Plane'),
                       (0.2222222222222222, 'Just My Luck'),
                       (0.09090909090909091, 'Superman Returns')],
 'Snakes on a Plane': [(0.2222222222222222, 'Lady in the Water'),
                       (0.18181818181818182, 'The Night Listener'),
                       (0.16666666666666666, 'Superman Returns'),
                       (0.10526315789473684, 'Just My Luck'),
                       (0.05128205128205128, 'You, Me and Dupree')],
 'Superman Returns': [(0.16666666666666666, '

In [41]:
def getRecommendedItems(prefs,itemMatch,user):
  userRatings=prefs[user]
  scores={}
  totalSim={}
  # Loop over items rated by this user
  for (item,rating) in userRatings.items( ):

    # Loop over items similar to this one
    for (similarity,item2) in itemMatch[item]:

      # Ignore if this user has already rated this item
      if item2 in userRatings: continue
      # Weighted sum of rating times similarity
      scores.setdefault(item2,0)
      scores[item2]+=similarity*rating
      # Sum of all the similarities
      totalSim.setdefault(item2,0)
      totalSim[item2]+=similarity

  # Divide each total score by total weighting to get an average
  rankings=[(score/totalSim[item],item) for item,score in scores.items( )]

  # Return the rankings from highest to lowest
  rankings.sort( )
  rankings.reverse( )
  return rankings

In [42]:
getRecommendedItems(critics,itemsim,'Toby')

[(3.182634730538922, 'The Night Listener'),
 (2.5983318700614575, 'Just My Luck'),
 (2.4730878186968837, 'Lady in the Water')]

In [32]:
critics

{'Just My Luck': {'Claudia Puig': 3.0,
  'Gene Seymour': 1.5,
  'Lisa Rose': 3.0,
  'Mick LaSalle': 2.0},
 'Lady in the Water': {'Gene Seymour': 3.0,
  'Jack Matthews': 3.0,
  'Lisa Rose': 2.5,
  'Michael Phillips': 2.5,
  'Mick LaSalle': 3.0},
 'Snakes on a Plane': {'Claudia Puig': 3.5,
  'Gene Seymour': 3.5,
  'Jack Matthews': 4.0,
  'Lisa Rose': 3.5,
  'Michael Phillips': 3.0,
  'Mick LaSalle': 4.0,
  'Toby': 4.5},
 'Superman Returns': {'Claudia Puig': 4.0,
  'Gene Seymour': 5.0,
  'Jack Matthews': 5.0,
  'Lisa Rose': 3.5,
  'Michael Phillips': 3.5,
  'Mick LaSalle': 3.0,
  'Toby': 4.0},
 'The Night Listener': {'Claudia Puig': 4.5,
  'Gene Seymour': 3.0,
  'Jack Matthews': 3.0,
  'Lisa Rose': 3.0,
  'Michael Phillips': 4.0,
  'Mick LaSalle': 3.0},
 'You, Me and Dupree': {'Claudia Puig': 2.5,
  'Gene Seymour': 3.5,
  'Jack Matthews': 3.5,
  'Lisa Rose': 2.5,
  'Mick LaSalle': 2.0,
  'Toby': 1.0}}

## 使用MovieLens数据集

In [53]:
def loadMovieLens(path='data/movielens'):
  # Get movie titles
  movies={}
#   for line in open(path+'/u.item'):
#     (id,title)=line.split('|')[0:2]
#     movies[id]=title
  
  # Load data
  prefs={}
  for line in open(path+'/u.data'):
    (user,movieid,rating,ts)=line.split('\t')
    prefs.setdefault(user,{})
    prefs[user][movies[movieid]]=float(rating)
  return prefs

In [54]:
prefs=loadMovieLens()

KeyError: '242'