In [1]:
# Sample ratings used throughout the notebook:
# critic name -> {movie title: score}.
critics = {
    'Lisa Rose': {
        'Lady in the water': 2.5,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
        'The Night Listener': 3.0,
    },
    'Michael Phillips': {
        'Lady in the water': 2.5,
        'Snakes on a Plane': 3.0,
        'Just My Luck': 3.0,
        'Superman Returns': 3.5,
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Lady in the water': 2.5,
        'Snakes on a Plane': 3.0,
        'Just My Luck': 3.5,
        'Superman Returns': 3.5,
        'You, Me and Dupree': 2.5,
        'The Night Listener': 4.0,
    },
    'Mick LaSalle': {
        'Lady in the water': 3.0,
        'Snakes on a Plane': 4.0,
        'Just My Luck': 2.0,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.0,
        'The Night Listener': 4.5,
    },
    'Jack Mathews': {
        'Lady in the water': 3.0,
        'Snakes on a Plane': 4.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
        'The Night Listener': 3.0,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 1.0,
    },
}


In [2]:
# Add a rating to Toby's preferences, then display his ratings.
# NOTE(review): the key 'Snakes on a plane' (lowercase "p") differs in case from
# the existing 'Snakes on a Plane' key, so this inserts a second entry instead of
# updating the existing one -- confirm whether that is intended.
critics['Toby']['Snakes on a plane']=4.5
critics['Toby']

{'Snakes on a Plane': 4.5,
 'Snakes on a plane': 4.5,
 'Superman Returns': 4.0,
 'You, Me and Dupree': 1.0}

In [3]:
from math import sqrt
# Return a distance-based similarity score for person1 and person2
def sim_distance(prefs,person1,person2):
    """Score how closely two people's ratings agree, based on squared distance.

    The score is 1 / (1 + sum of squared rating differences), taken over the
    items that both people have rated. Identical ratings give 1.0; larger
    disagreements push the score towards 0.

    prefs   -- dict mapping each person to a dict of {item: rating}
    person1 -- key of the first person in prefs
    person2 -- key of the second person in prefs

    Returns 0 when the two people have no rated items in common.
    Raises KeyError if either person is not a key of prefs.
    """
    ratings1=prefs[person1]
    ratings2=prefs[person2]

    # Items rated by both people (computed once and reused below)
    shared=[item for item in ratings1 if item in ratings2]

    # If they have no ratings in common, return 0
    if not shared: return 0

    # Add up the squares of all the differences
    sum_of_squares=sum(pow(ratings1[item]-ratings2[item],2) for item in shared)

    # 1.0 forces true division, so integer ratings do not truncate the
    # score to 0 under Python 2's integer "/".
    return 1.0/(1+sum_of_squares)
# Example: distance-based similarity between two of the critics.
sim_distance(critics,'Lisa Rose','Gene Seymour')

0.14814814814814814

In [4]:
# Return the Pearson correlation coefficient for p1 and p2
def sim_pearson(prefs,p1,p2):
    """Pearson correlation of two people's ratings over their shared items.

    prefs -- dict mapping each person to a dict of {item: rating}
    p1    -- key of the first person in prefs
    p2    -- key of the second person in prefs

    Returns 0 when the two people share no rated items, or when either
    person's shared ratings have no variance (the correlation is undefined).
    Raises KeyError if p1 or p2 is not a key of prefs.
    """
    ratings1=prefs[p1]
    ratings2=prefs[p2]

    # Get the list of mutually rated items
    shared=[it for it in ratings1 if it in ratings2]

    # If they have no ratings in common, return 0
    if not shared: return 0

    # Use a float count so integer ratings are not truncated by
    # Python 2's integer "/" in the formulas below.
    n=float(len(shared))

    # Add up all the preferences
    sum1=sum([ratings1[it] for it in shared])
    sum2=sum([ratings2[it] for it in shared])

    # Sum up the squares
    sum1Sq=sum([pow(ratings1[it],2) for it in shared])
    sum2Sq=sum([pow(ratings2[it],2) for it in shared])

    # Sum up the products
    pSum=sum([ratings1[it]*ratings2[it] for it in shared])

    # Calculate the score
    num=pSum-(sum1*sum2/n)
    spread1=sum1Sq-pow(sum1,2)/n
    spread2=sum2Sq-pow(sum2,2)/n

    # A spread of zero means constant ratings. Floating-point round-off can
    # also leave it slightly negative, which would make sqrt() raise
    # ValueError, so both cases are treated as "no correlation".
    if spread1<=0 or spread2<=0: return 0

    return num/sqrt(spread1*spread2)
# Example: Pearson correlation between two of the critics.
sim_pearson(critics,'Lisa Rose','Gene Seymour')

0.39605901719066977

In [5]:
# Return the best matches for person
def TopMatch(prefs,person,n=5,similarity=sim_pearson):
    """Rank every other key in prefs by its similarity to person.

    prefs      -- dict mapping each key to a dict of {item: rating}
    person     -- key in prefs to find matches for
    n          -- maximum number of matches to return
    similarity -- function(prefs, a, b) returning a comparable score

    Returns a list of (score, other) tuples, highest score first,
    truncated to the first n entries.
    """
    ranked=[]
    for candidate in prefs:
        # Never compare the person with themselves
        if candidate==person:
            continue
        ranked.append((similarity(prefs,person,candidate),candidate))

    # Ascending sort, then flipped so the best match comes first
    return sorted(ranked)[::-1][0:n]
# Example: the three critics most similar to Toby (Pearson by default).
TopMatch(critics,'Toby',n=3)

[(0.9912407071619305, 'Mick LaSalle'),
 (0.9912407071619299, 'Lisa Rose'),
 (0.7924058156930613, 'Claudia Puig')]

In [6]:
# Gets recommendations for a person by using a weighted average of every other user's ratings
def getRecommendations(prefs,person,similarity=sim_pearson):
    """Recommend items to person from similarity-weighted ratings of others.

    prefs      -- dict mapping each person to a dict of {item: rating}
    person     -- key in prefs to produce recommendations for
    similarity -- function(prefs, a, b) returning a numeric score

    Only items that person has not rated (or has rated 0) are scored, and
    only other people with a similarity above zero contribute.

    Returns a list of (predicted rating, item) tuples, highest first.
    """
    weighted={}   # item -> sum of (rating * similarity)
    weights={}    # item -> sum of similarities that contributed
    for other in prefs:
        # Don't compare the person with themselves
        if other==person: continue
        sim=similarity(prefs,person,other)

        # Ignore scores of zero or lower
        if sim<=0: continue
        for item,rating in prefs[other].items():
            # Skip anything the person has already given a non-zero rating
            if item in prefs[person] and prefs[person][item]!=0:
                continue
            weighted[item]=weighted.get(item,0)+rating*sim
            weights[item]=weights.get(item,0)+sim

    # Normalise each weighted total by the similarity mass behind it
    rankings=[(weighted[item]/weights[item],item) for item in weighted]

    # Highest predicted rating first
    return sorted(rankings)[::-1]

In [7]:
# Example: user-based recommendations for Toby (Pearson by default).
getRecommendations(critics,'Toby')

[(3.5968256715351132, 'The Night Listener'),
 (2.766476219878039, 'Lady in the water'),
 (2.6302731236047805, 'Just My Luck')]

In [8]:
def transformPrefs(prefs):
    """Invert a {person: {item: rating}} dict into {item: {person: rating}}.

    prefs -- dict mapping each person to a dict of {item: rating}

    Returns a new dict; the input is not modified.
    """
    flipped={}
    for person,ratings in prefs.items():
        for item,rating in ratings.items():
            # Swap the two key levels: item outside, person inside
            flipped.setdefault(item,{})[person]=rating
    return flipped

In [9]:
# Flip the critics dictionary so it is keyed by movie, then rank the movies
# whose ratings are most similar to those of 'Superman Returns'.
movies = transformPrefs(critics)
TopMatch(movies,'Superman Returns')

[(0.86824314212446, 'Lady in the water'),
 (0.6172133998483683, 'You, Me and Dupree'),
 (0.41358509593425646, 'Snakes on a Plane'),
 (0, 'Snakes on a plane'),
 (-0.5281619138643917, 'The Night Listener')]

In [10]:
#------------------------------ITEM BASED FILTERING-------------------------------#
def calculateSimilarItems(prefs,n=10):
    """Build a table of the items most similar to each item.

    prefs -- dict mapping each person to a dict of {item: rating}
    n     -- maximum number of similar items to keep per item

    Returns a dict mapping each item to the list of (score, other item)
    tuples produced by TopMatch with the sim_distance metric.
    Prints a "done / total" progress line after every 100 items.
    """
    result={}
    # Invert the preference matrix to be item-centric
    itemPref=transformPrefs(prefs)
    total=len(itemPref)
    for count,item in enumerate(itemPref,1):
        # Status updates for large datasets. The parenthesised single-argument
        # form prints the same text under both Python 2 and Python 3.
        if count%100==0: print("%d / %d" %(count,total))
        # Find the most similar items to this one
        result[item]=TopMatch(itemPref,item,n=n,similarity=sim_distance)
    return result

In [11]:
def getRecommendedItems(prefs,itemMatch,user):
    """Recommend unrated items to user from a precomputed item-similarity table.

    prefs     -- dict mapping each person to a dict of {item: rating}
    itemMatch -- dict mapping each item to a list of (similarity, other item)
                 tuples
    user      -- key in prefs to produce recommendations for

    Each candidate's predicted rating is the similarity-weighted average of
    the user's ratings for the items it is similar to. Candidates whose
    similarities sum to zero carry no information and are left out.

    Returns a list of (predicted rating, item) tuples, highest first.
    Raises KeyError if user is not in prefs or a rated item is not in itemMatch.
    """
    userRatings=prefs[user]
    scores={}     # candidate item -> sum of (similarity * rating)
    totalSim={}   # candidate item -> sum of similarities

    # Loop over items rated by this user
    for (item,rating) in userRatings.items():

        # Loop over items similar to this one
        for (similarity,item2) in itemMatch[item]:
            # Ignore if this user has already rated this item
            if item2 in userRatings: continue
            # Weighted sum of rating times similarity
            scores[item2]=scores.get(item2,0)+similarity*rating
            # Sum of all the similarities
            totalSim[item2]=totalSim.get(item2,0)+similarity

    # Divide each total score by the total weighting to get an average.
    # A zero total would raise ZeroDivisionError, so those candidates are
    # skipped. float() keeps integer inputs from truncating under Python 2.
    rankings=[(score/float(totalSim[item]),item)
              for item,score in scores.items() if totalSim[item]!=0]

    # Return the rankings from highest to lowest
    rankings.sort(reverse=True)
    return rankings

In [12]:
def loadMovieLens(path='Data'):
    """Load ratings from the u.item and u.data files found under path.

    path -- directory containing the two files

    u.item is read as '|'-separated lines whose first two fields are the
    movie id and title. u.data is read as tab-separated lines of
    user, movie id, rating, timestamp. Blank lines in either file are skipped.

    Returns a dict mapping each user id (string) to {movie title: float rating}.
    Raises KeyError if u.data refers to a movie id that is absent from u.item.
    """
    # Get movie titles
    movies={}
    # "with" closes the file even if a line fails to parse
    with open(path+'/u.item') as itemFile:
        for line in itemFile:
            if not line.strip(): continue
            (movieId,title)=line.split('|')[0:2]
            movies[movieId]=title

    # Load data
    prefs={}
    with open(path+'/u.data') as dataFile:
        for line in dataFile:
            if not line.strip(): continue
            (user,movieid,rating,ts)=line.split('\t')
            prefs.setdefault(user,{})[movies[movieid]]=float(rating)
    return prefs

In [13]:
# Load the ratings from the default 'Data' directory, build the item-similarity
# table (up to 50 neighbours per item), and show the first 30 item-based
# recommendations for user '86'.
prefs=loadMovieLens()
itemsim=calculateSimilarItems(prefs,n=50)
getRecommendedItems(prefs,itemsim,'86')[0:30]

100 / 1664
200 / 1664
300 / 1664
400 / 1664
500 / 1664
600 / 1664
700 / 1664
800 / 1664
900 / 1664
1000 / 1664
1100 / 1664
1200 / 1664
1300 / 1664
1400 / 1664
1500 / 1664
1600 / 1664


[(5.0, "Young Poisoner's Handbook, The (1995)"),
 (5.0, 'Young Guns II (1990)'),
 (5.0, 'Young Guns (1988)'),
 (5.0, 'Yankee Zulu (1994)'),
 (5.0, 'Wyatt Earp (1994)'),
 (5.0, 'Wrong Trousers, The (1993)'),
 (5.0, 'Woman in Question, The (1950)'),
 (5.0, 'Wolf (1994)'),
 (5.0, 'Withnail and I (1987)'),
 (5.0, 'With Honors (1994)'),
 (5.0, 'Winter Guest, The (1997)'),
 (5.0, 'Winnie the Pooh and the Blustery Day (1968)'),
 (5.0, 'Wings of Desire (1987)'),
 (5.0, "William Shakespeare's Romeo and Juliet (1996)"),
 (5.0, 'Wild Bunch, The (1969)'),
 (5.0, "Widows' Peak (1994)"),
 (5.0, 'Whole Wide World, The (1996)'),
 (5.0, 'White Squall (1996)'),
 (5.0, 'When a Man Loves a Woman (1994)'),
 (5.0, "What's Love Got to Do with It (1993)"),
 (5.0, "What's Eating Gilbert Grape (1993)"),
 (5.0, "Wend Kuuni (God's Gift) (1982)"),
 (5.0, 'Waterworld (1995)'),
 (5.0, 'Walkabout (1971)'),
 (5.0, 'Walk in the Clouds, A (1995)'),
 (5.0, 'Virtuosity (1995)'),
 (5.0, 'Vie est belle, La (Life is Rosey) (

In [66]:
# def load_music(path='music'):
#     # Get artists 
#     artists, users = {}, {}
#     for line in open(path+'/artist_data.txt'):
#         try:
#             id, artist = line.split('\t')
#             artists[id.strip()] = artist.strip()
#         except:
#             pass
# #     for line in open(path+'/user_artist_data.txt'):
#     with open(path+'/user_artist_data.txt') as myfile:
#         head = [next(myfile) for x in xrange(200000)]
#     for line in head:
#         try:
#             user, artist, count = line.split(' ')
#             user = user.strip()
#             if user in users.keys():
# #                 print '1'
#                 if artist.strip() in artists:
#                     users[user] = users[user].update({artists[artist.strip()]: count.strip()}) 
# #                 print user, artist, count
#             else:
#                 users[user] = {}
#                 if artist.strip() in artists:
#                     users[user] = users[user].update({artists[artist.strip()]: count.strip()}) 
# #                 print '2'
#         except:
#             pass
#     return users, artists

In [67]:
# pref, artists = load_music()