In [18]:
"""Movie ratings given by each critic."""
# Nested mapping: critic name -> {movie title -> rating}.
# The ratings in this dataset lie between 1.0 and 5.0, and not every critic
# has rated every movie (e.g. 'Toby' has only three entries).
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5, 
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0, 
        'Superman Returns': 3.5, 
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0
    },

    'Gene Seymour': {
        'Lady in the Water': 3.0, 
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5, 
        'Superman Returns': 5.0, 
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5
    },

    'Michael Phillips': {
        'Lady in the Water': 2.5, 
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5, 
        'The Night Listener': 4.0
    },

    'Claudia Puig': {
        'Snakes on a Plane': 3.5, 
        'Just My Luck': 3.0,
        'The Night Listener': 4.5, 
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5
    },

    'Mick LaSalle': {
        'Lady in the Water': 3.0, 
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0, 
        'Superman Returns': 3.0, 
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0
    },
    
    'Jack Matthews': {
        'Lady in the Water': 3.0, 
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0, 
        'Superman Returns': 5.0, 
        'You, Me and Dupree': 3.5
    },

    # The critic with the fewest ratings in this dataset.
    'Toby': {
        'Snakes on a Plane': 4.5, 
        'You, Me and Dupree': 1.0, 
        'Superman Returns': 4.0
    }
}

In [19]:
def sim_distance(person1, person2, prefs=None):
    """Euclidean-distance-based similarity between two entries of ``prefs``.

    Only items rated by both ``person1`` and ``person2`` contribute to the
    distance.

    Args:
        person1: Key of the first entry in ``prefs``.
        person2: Key of the second entry in ``prefs``.
        prefs: Nested mapping ``{name: {item: rating}}``. When omitted (or
            ``None``) the notebook-level ``critics`` dict is looked up at call
            time, so re-running the data cell is picked up without having to
            re-run this definition.

    Returns:
        ``0`` when the two entries share no items; otherwise
        ``1 / (1 + d)`` where ``d`` is the Euclidean distance over the shared
        items. The value lies in ``(0, 1]`` (``1.0`` means identical ratings)
        and is NOT rounded.

    Raises:
        KeyError: If ``person1`` or ``person2`` is not a key of ``prefs``.
    """
    from math import sqrt

    if prefs is None:
        # Late binding: a default of ``prefs=critics`` would freeze whichever
        # dict existed when this cell was executed.
        prefs = critics

    ratings1, ratings2 = prefs[person1], prefs[person2]

    # Items rated by both sides, computed once and reused below.
    shared = [item for item in ratings1 if item in ratings2]
    if not shared:
        return 0  # nothing in common -> similarity 0

    # Squared Euclidean distance over the shared items.
    sum_of_squares = sum((ratings1[item] - ratings2[item]) ** 2 for item in shared)

    # Map the distance [0, inf) onto a similarity (0, 1]; larger is more similar.
    return 1 / (1 + sqrt(sum_of_squares))

def sim_pearson(person1, person2, prefs=None):
    """Pearson correlation between two entries of ``prefs`` over shared items.

    Args:
        person1: Key of the first entry in ``prefs``.
        person2: Key of the second entry in ``prefs``.
        prefs: Nested mapping ``{name: {item: rating}}``. When omitted (or
            ``None``) the notebook-level ``critics`` dict is looked up at call
            time, so re-running the data cell is picked up without having to
            re-run this definition.

    Returns:
        ``0`` when there are no shared items or when either side has zero
        variance over the shared items (the correlation is undefined there);
        otherwise the Pearson correlation coefficient:
        about 0 means no linear relationship, values towards 1 mean the
        ratings rise and fall together, values towards -1 mean one rises
        while the other falls.

    Raises:
        KeyError: If ``person1`` or ``person2`` is not a key of ``prefs``.
    """
    from math import sqrt

    if prefs is None:
        # Late binding: a default of ``prefs=critics`` would freeze whichever
        # dict existed when this cell was executed.
        prefs = critics

    ratings1, ratings2 = prefs[person1], prefs[person2]

    # Items rated by both sides.
    shared = [item for item in ratings1 if item in ratings2]
    n = len(shared)
    if n == 0:
        return 0

    xs = [ratings1[item] for item in shared]
    ys = [ratings2[item] for item in shared]

    # Sums of the ratings.
    sum1 = sum(xs)
    sum2 = sum(ys)

    # Sums of the squared ratings.
    sum1_sq = sum(x ** 2 for x in xs)
    sum2_sq = sum(y ** 2 for y in ys)

    # Sum of the pairwise products.
    p_sum = sum(x * y for x, y in zip(xs, ys))

    num = p_sum - (sum1 * sum2 / n)

    # Each term is n * variance and is mathematically >= 0, but floating-point
    # cancellation can leave a tiny negative value. Clamp at 0 so that sqrt()
    # cannot raise "math domain error" and so that a (numerically) constant
    # rater is reported as uncorrelated instead of dividing by rounding noise.
    var1 = max(sum1_sq - sum1 ** 2 / n, 0.0)
    var2 = max(sum2_sq - sum2 ** 2 / n, 0.0)
    den = sqrt(var1 * var2)
    if den == 0:
        return 0

    return num / den

In [20]:
def get_matches(person, simlarity=None, n=5, prefs=None):
    """Return the ``n`` entries of ``prefs`` most similar to ``person``.

    Args:
        person: Key in ``prefs`` to compare everyone else against.
        simlarity: Similarity callable invoked as
            ``simlarity(person1=..., person2=..., prefs=...)``. Defaults to
            ``sim_pearson``. (The parameter name's spelling is kept so existing
            keyword calls continue to work.)
        n: Maximum number of matches to return (default 5).
        prefs: Nested mapping ``{name: {item: rating}}``. Defaults to the
            notebook-level ``critics`` dict.

    Both defaults are resolved at call time rather than at definition time, so
    re-running the cell that defines ``sim_pearson`` or ``critics`` takes
    effect without re-running this cell.

    Returns:
        A list of ``(score, name)`` tuples sorted from highest to lowest;
        equal scores are ordered by ``name`` in descending order because the
        tuples themselves are compared.
    """
    if simlarity is None:
        simlarity = sim_pearson
    if prefs is None:
        prefs = critics

    scores = [
        (simlarity(person1=person, person2=other, prefs=prefs), other)
        for other in prefs
        if other != person  # never compare the entry with itself
    ]
    scores.sort(reverse=True)
    return scores[:n]


In [31]:
# Critics most similar to Toby, using get_matches' default similarity (sim_pearson).
get_matches("Toby")

[(0.9912407071619299, 'Lisa Rose'),
 (0.9244734516419049, 'Mick LaSalle'),
 (0.8934051474415647, 'Claudia Puig'),
 (0.66284898035987, 'Jack Matthews'),
 (0.38124642583151164, 'Gene Seymour')]

In [32]:
# Same ranking for Toby, but scored with the Euclidean-distance similarity.
get_matches("Toby", simlarity=sim_distance)


[(0.4, 'Mick LaSalle'),
 (0.38742588672279304, 'Michael Phillips'),
 (0.3567891723253309, 'Claudia Puig'),
 (0.3483314773547883, 'Lisa Rose'),
 (0.2674788903885893, 'Jack Matthews')]

In [22]:
def get_recommendations(person, simlarity=None, n=5, prefs=None):
    """Recommend items ``person`` has not rated, ranked by a weighted average.

    Every other entry whose similarity to ``person`` is strictly positive
    contributes its ratings, weighted by that similarity. The predicted score
    of an item is ``sum(rating * sim) / sum(sim)`` over the contributors that
    rated it.

    Args:
        person: Key in ``prefs`` to produce recommendations for.
        simlarity: Similarity callable invoked as
            ``simlarity(person1=..., person2=..., prefs=...)``. Defaults to
            ``sim_pearson``. (The parameter name's spelling is kept so existing
            keyword calls continue to work.)
        n: Maximum number of recommendations to return (default 5).
        prefs: Nested mapping ``{name: {item: rating}}``. Defaults to the
            notebook-level ``critics`` dict.

    Both defaults are resolved at call time rather than at definition time, so
    re-running the cell that defines ``sim_pearson`` or ``critics`` takes
    effect without re-running this cell.

    Returns:
        A list of ``(predicted_score, item)`` tuples sorted from highest to
        lowest predicted score, at most ``n`` long.

    Raises:
        KeyError: If ``person`` is not a key of ``prefs``.
    """
    if simlarity is None:
        simlarity = sim_pearson
    if prefs is None:
        prefs = critics

    already_rated = prefs[person]
    weighted_totals = {}  # item -> sum(rating * similarity)
    sim_totals = {}       # item -> sum(similarity)

    for other, ratings in prefs.items():
        # Skip the person we are recommending for.
        if other == person:
            continue

        sim = simlarity(person1=person, person2=other, prefs=prefs)

        # Only positively correlated entries may influence the result.
        if sim <= 0:
            continue

        for item, rating in ratings.items():
            # Only score items the person has not rated yet.
            if item in already_rated:
                continue

            weighted_totals[item] = weighted_totals.get(item, 0) + rating * sim
            sim_totals[item] = sim_totals.get(item, 0) + sim

    # Weighted average per item: sum(rating * weight) / sum(weight).
    # sim_totals[item] is always > 0 because only positive weights are added.
    item_rank = [
        (total / sim_totals[item], item) for item, total in weighted_totals.items()
    ]
    item_rank.sort(reverse=True)

    return item_rank[:n]


In [23]:
# Movies Michael Phillips has not rated, ranked with the default similarity (sim_pearson).
get_recommendations('Michael Phillips')

[(2.8092760065251268, 'Just My Luck'),
 (2.694636703980363, 'You, Me and Dupree')]

In [24]:
# Same recommendations, weighted by the Euclidean-distance similarity instead.
get_recommendations('Michael Phillips', simlarity=sim_distance)

[(2.4825817086405517, 'Just My Luck'),
 (2.453379230569188, 'You, Me and Dupree')]

In [25]:
def transform_prefs(prefs):
    """Swap the two levels of a nested ratings mapping.

    Args:
        prefs: Mapping of the form ``{person: {item: rating}}``.

    Returns:
        A new dict of the form ``{item: {person: rating}}`` holding the same
        ratings. Items appear in the order they are first encountered while
        walking ``prefs``; the input is not modified.
    """
    flipped = {}
    for person, ratings in prefs.items():
        for item, score in ratings.items():
            # Create the per-item bucket on first sight, then file the rating.
            flipped.setdefault(item, {})[person] = score
    return flipped

In [26]:
# Invert critic -> {movie: rating} into movie -> {critic: rating} so the same
# similarity helpers can be applied to movies instead of critics.
movies = transform_prefs(critics)
movies

{'Lady in the Water': {'Lisa Rose': 2.5,
  'Gene Seymour': 3.0,
  'Michael Phillips': 2.5,
  'Mick LaSalle': 3.0,
  'Jack Matthews': 3.0},
 'Snakes on a Plane': {'Lisa Rose': 3.5,
  'Gene Seymour': 3.5,
  'Michael Phillips': 3.0,
  'Claudia Puig': 3.5,
  'Mick LaSalle': 4.0,
  'Jack Matthews': 4.0,
  'Toby': 4.5},
 'Just My Luck': {'Lisa Rose': 3.0,
  'Gene Seymour': 1.5,
  'Claudia Puig': 3.0,
  'Mick LaSalle': 2.0},
 'Superman Returns': {'Lisa Rose': 3.5,
  'Gene Seymour': 5.0,
  'Michael Phillips': 3.5,
  'Claudia Puig': 4.0,
  'Mick LaSalle': 3.0,
  'Jack Matthews': 5.0,
  'Toby': 4.0},
 'You, Me and Dupree': {'Lisa Rose': 2.5,
  'Gene Seymour': 3.5,
  'Claudia Puig': 2.5,
  'Mick LaSalle': 2.0,
  'Jack Matthews': 3.5,
  'Toby': 1.0},
 'The Night Listener': {'Lisa Rose': 3.0,
  'Gene Seymour': 3.0,
  'Michael Phillips': 4.0,
  'Claudia Puig': 4.5,
  'Mick LaSalle': 3.0,
  'Jack Matthews': 3.0}}

In [27]:
# Movies most similar to 'Superman Returns', using the default similarity (sim_pearson)
# on the inverted movie -> critic ratings.
get_matches(person='Superman Returns', prefs=movies)

[(0.6579516949597695, 'You, Me and Dupree'),
 (0.4879500364742689, 'Lady in the Water'),
 (0.11180339887498941, 'Snakes on a Plane'),
 (-0.1798471947990544, 'The Night Listener'),
 (-0.42289003161103106, 'Just My Luck')]

In [28]:
# Same movie-to-movie ranking, scored with the Euclidean-distance similarity.
get_matches(person='Superman Returns', simlarity=sim_distance, prefs=movies)

[(0.3090169943749474, 'Snakes on a Plane'),
 (0.252650308587072, 'The Night Listener'),
 (0.2402530733520421, 'Lady in the Water'),
 (0.20799159651347807, 'Just My Luck'),
 (0.1918253663634734, 'You, Me and Dupree')]