# project 1: recommendation


In [12]:
from math import sqrt
# Sample data set: a nested dictionary mapping critic name -> {movie title -> rating}.
# Not every critic has rated every movie (e.g. 'Toby' has only three ratings);
# the ratings in this sample range from 1.0 to 5.0.
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 3.5,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 3.0,
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Matthews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0,
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    # The person the example calls further down ask recommendations for.
    'Toby': {'Snakes on a Plane': 4.5, 'You, Me and Dupree': 1.0,
             'Superman Returns': 4.0},
}



In [2]:
def sim_distance(prefs, p1, p2):
    """Euclidean-distance based similarity between two entries of ``prefs``.

    Args:
        prefs: nested dict ``{key: {item: rating}}``.
        p1, p2: the two keys of ``prefs`` to compare.

    Returns:
        ``0`` when the two entries share no item; otherwise
        ``1 / (1 + d)`` where ``d`` is the Euclidean distance between the
        ratings of the shared items (``1.0`` means identical ratings).
    """
    # Items rated by both, kept in the iteration order of prefs[p1].
    common = [title for title in prefs[p1] if title in prefs[p2]]
    if not common:
        return 0

    first, second = prefs[p1], prefs[p2]
    squared_gap = 0
    for title in common:
        squared_gap += pow(first[title] - second[title], 2)
    return 1 / (1 + sqrt(squared_gap))

#sim_distance(critics, 'Lisa Rose', 'Gene Seymour')

def sim_pearson(prefs, p1, p2):
    """Return the Pearson correlation coefficient between ``p1`` and ``p2``.

    Only items present in both ``prefs[p1]`` and ``prefs[p2]`` are used.

    Args:
        prefs: nested dict ``{key: {item: rating}}``.
        p1, p2: the two keys of ``prefs`` to compare.

    Returns:
        A value in ``[-1, 1]``. ``0`` is returned when there are no shared
        items or when either side has zero variance over the shared items
        (the coefficient is undefined in that case).
    """
    shared = [item for item in prefs[p1] if item in prefs[p2]]
    n = len(shared)
    if n == 0:
        return 0

    xs = [prefs[p1][item] for item in shared]
    ys = [prefs[p2][item] for item in shared]

    sum1 = sum(xs)
    sum2 = sum(ys)
    sum1Sq = sum([pow(x, 2) for x in xs])
    sum2Sq = sum([pow(y, 2) for y in ys])
    # sum of the products
    pSum = sum([x * y for x, y in zip(xs, ys)])

    num = n * pSum - sum1 * sum2
    # n*sumSq - sum**2 is mathematically >= 0, but floating-point cancellation
    # can make it slightly negative for (near-)constant ratings. Clamp so the
    # sqrt below never receives a negative product.
    var1 = max(0.0, n * sum1Sq - pow(sum1, 2))
    var2 = max(0.0, n * sum2Sq - pow(sum2, 2))
    den = sqrt(var1 * var2)
    if den == 0:
        return 0

    r = num / den
    # Rounding noise can push the ratio marginally outside the valid range.
    return max(-1.0, min(1.0, r))

#sim_pearson(critics, 'Lisa Rose', 'Toby')    
    

In [3]:
from operator import itemgetter
def topMatches(prefs, p1, n = 5, similarity = sim_pearson):
    """Return the ``n`` entries of ``prefs`` most similar to ``p1``.

    Args:
        prefs: nested dict ``{key: {item: rating}}``.
        p1: the key to find matches for; it is excluded from the result.
        n: maximum number of matches to return.
        similarity: callable ``(prefs, a, b) -> score`` used for ranking.

    Returns:
        A list of ``(key, score)`` tuples, highest score first,
        e.g. ``[('Toby', 0.95), ...]``.
    """
    ranked = []
    for other in prefs:
        if other != p1:
            ranked.append((other, similarity(prefs, p1, other)))
    # Stable sort: entries with equal scores keep their order from prefs.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

#topMatches(critics, 'Toby')

def getRecommendations(prefs, p1, similarity = sim_pearson):
    """Recommend items to ``p1`` from a similarity-weighted average of others' ratings.

    Args:
        prefs: nested dict ``{person: {item: rating}}``.
        p1: the person to produce recommendations for.
        similarity: callable ``(prefs, a, b) -> score``.

    Returns:
        A list of ``(item, predicted_rating)`` tuples sorted by predicted
        rating, highest first. Only items ``p1`` has not rated appear, and
        only people with a similarity strictly greater than 0 contribute.
    """
    weighted = {}  # item -> sum of (rating * similarity)
    weights = {}   # item -> sum of similarities
    for other, other_ratings in prefs.items():
        if other == p1:
            continue
        score = similarity(prefs, p1, other)
        if score <= 0:
            continue
        for title, rating in other_ratings.items():
            # Skip anything p1 has already rated.
            if title in prefs[p1]:
                continue
            weighted[title] = weighted.get(title, 0) + rating * score
            weights[title] = weights.get(title, 0) + score

    # Normalise each weighted total by the total similarity behind it.
    ranked = [(title, weighted[title] / weights[title]) for title in weighted]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

#getRecommendations(critics, 'Toby')    
    

# Transform the system from person-based to item-based

In [4]:
def transformPrefs(prefs):
    """Swap the two levels of a nested preference dictionary.

    ``{person: {item: rating}}`` becomes ``{item: {person: rating}}``.
    The input is not modified.
    """
    flipped = {}
    for person, ratings in prefs.items():
        for title, score in ratings.items():
            flipped.setdefault(title, {})[person] = score
    return flipped
   
#transformPrefs(critics)        


In [5]:
def similarItems(prefs, n = 10):
    """Build a table of the items most similar to each item.

    Args:
        prefs: nested dict ``{person: {item: rating}}``.
        n: maximum number of similar items kept per item.

    Returns:
        ``{item: [(other_item, similarity), ...]}`` with each list ordered
        from most to least similar, as scored by ``sim_distance``.
    """
    # Work on the item-centric view of the preferences.
    by_item = transformPrefs(prefs)
    total = len(by_item)
    neighbours = {}
    for count, title in enumerate(by_item, start=1):
        # Progress report every 100 items, useful on large data sets.
        if count % 100 == 0:
            print('%d / %d' %(count, total))
        neighbours[title] = topMatches(by_item, title, n = n, similarity = sim_distance)
    return neighbours

# Precompute the item-to-item similarity table for the sample data
# (up to 10 similar movies per movie, the default of similarItems).
simI = similarItems(critics)

In [6]:
def getRecommendedItems(prefs, simItems, person):
    """Item-based recommendations for ``person``.

    Args:
        prefs: nested dict ``{person: {item: rating}}``.
        simItems: ``{item: [(other_item, similarity), ...]}``, the table
            produced by ``similarItems``.
        person: key of ``prefs`` to recommend for.

    Returns:
        A list of ``(item, predicted_rating)`` tuples for items the person
        has not rated, sorted by predicted rating, highest first. A candidate
        whose similarities sum to 0 is omitted: there is no evidence to
        predict a rating from, and dividing by the sum would raise
        ``ZeroDivisionError``.
    """
    ratings = prefs[person]
    totals = {}   # candidate -> sum of (similarity * person's rating)
    simSums = {}  # candidate -> sum of similarities
    # Loop over the items rated by this person ...
    for item, rating in ratings.items():
        # ... and over the items similar to each of them.
        for item2, simValue in simItems[item]:
            # Only score items the person has not rated yet.
            if item2 in ratings:
                continue
            totals[item2] = totals.get(item2, 0) + simValue * rating
            simSums[item2] = simSums.get(item2, 0) + simValue

    # Normalise; skip candidates with a zero similarity sum (see docstring).
    ranks = [
        (item, total / simSums[item])
        for item, total in totals.items()
        if simSums[item] != 0
    ]
    ranks.sort(key=lambda pair: pair[1], reverse=True)
    return ranks

# Item-based recommendations for 'Toby', using the precomputed table simI.
getRecommendedItems(critics, simI, 'Toby' )

[('The Night Listener', 3.1667425234070894),
 ('Just My Luck', 2.9366294028444346),
 ('Lady in the Water', 2.868767392626467)]

In [8]:
import os
# path = os.getcwd()   # current directory
# Default directory containing the MovieLens 'u.item' and 'u.data' files.
MOVIELENS_DIR = r'C:\Users\Administrator\Desktop\python jeff shen\projects\01-recomments\movielens'


def loadMovieLens(path = MOVIELENS_DIR, encoding = None):
    """Load MovieLens ratings as ``{user_id: {movie_title: rating}}``.

    The files are opened inside the function and closed when it returns, so
    the function can be called repeatedly and no handle is left open.

    Args:
        path: directory containing ``u.item`` and ``u.data``.
        encoding: text encoding passed to ``open``; ``None`` uses the
            platform default. NOTE(review): if decoding ``u.item`` fails,
            try ``encoding='ISO-8859-1'`` -- confirm against the data set.

    Returns:
        A nested dict keyed by user id (a string, as read from the file);
        each value maps movie title to the rating as a ``float``.

    Raises:
        OSError: if either file cannot be opened.
        KeyError: if ``u.data`` references a movie id missing from ``u.item``.
    """
    # Movie id -> title, taken from the first two '|'-separated fields.
    movies = {}
    with open(os.path.join(path, 'u.item'), encoding=encoding) as item_file:
        for line in item_file:
            if not line.strip():
                continue  # tolerate blank lines
            movie_id, title = line.split('|')[0:2]
            movies[movie_id] = title

    # Ratings: tab-separated user id, movie id, rating; later fields are ignored.
    prefs = {}
    with open(os.path.join(path, 'u.data'), encoding=encoding) as data_file:
        for line in data_file:
            if not line.strip():
                continue  # tolerate blank lines
            userId, movieId, rating = line.split('\t')[0:3]
            prefs.setdefault(userId, {})[movies[movieId]] = float(rating)
    return prefs
          
#t = loadMovieLens()
#t['87']


{'2001: A Space Odyssey (1968)': 5.0,
 'Ace Ventura: Pet Detective (1994)': 4.0,
 'Addams Family Values (1993)': 2.0,
 'Addicted to Love (1997)': 4.0,
 'Adventures of Priscilla, Queen of the Desert, The (1994)': 3.0,
 'Adventures of Robin Hood, The (1938)': 5.0,
 'Air Force One (1997)': 3.0,
 'Air Up There, The (1994)': 3.0,
 'Alien (1979)': 4.0,
 'American President, The (1995)': 5.0,
 'Annie Hall (1977)': 4.0,
 'Apocalypse Now (1979)': 4.0,
 'Babe (1995)': 5.0,
 'Baby-Sitters Club, The (1995)': 2.0,
 'Back to the Future (1985)': 5.0,
 'Bad Boys (1995)': 4.0,
 'Bananas (1971)': 5.0,
 'Barcelona (1994)': 3.0,
 'Batman & Robin (1997)': 4.0,
 'Batman (1989)': 3.0,
 'Batman Returns (1992)': 3.0,
 'Big Green, The (1995)': 3.0,
 'Big Squeeze, The (1996)': 2.0,
 'Birdcage, The (1996)': 4.0,
 'Blade Runner (1982)': 4.0,
 'Blues Brothers, The (1980)': 5.0,
 'Boomerang (1992)': 3.0,
 'Boot, Das (1981)': 4.0,
 'Brady Bunch Movie, The (1995)': 2.0,
 'Braveheart (1995)': 4.0,
 'Bridge on the River

SyntaxError: invalid syntax (<ipython-input-33-aa4f2f8dbabf>, line 2)