In [1]:
critics={'Lisa Rose': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.5,
      'Just My Luck': 3.0, 'Superman Returns': 3.5, 'You, Me and Dupree': 2.5,
      'The Night Listener': 3.0},
     'Gene Seymour': {'Lady in the Water': 3.0, 'Snakes on a Plane': 3.5,
      'Just My Luck': 1.5, 'Superman Returns': 5.0, 'The Night Listener': 3.0,
      'You, Me and Dupree': 3.5},
     'Michael Phillips': {'Lady in the Water': 2.5, 'Snakes on a Plane': 3.0,
      'Superman Returns': 3.5, 'The Night Listener': 4.0},
     'Claudia Puig': {'Snakes on a Plane': 3.5, 'Just My Luck': 3.0,
      'The Night Listener': 4.5, 'Superman Returns': 4.0,
      'You, Me and Dupree': 2.5},
     'Mick LaSalle': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
      'Just My Luck': 2.0, 'Superman Returns': 3.0, 'The Night Listener': 3.0,
      'You, Me and Dupree': 2.0},
     'Jack Matthews': {'Lady in the Water': 3.0, 'Snakes on a Plane': 4.0,
      'The Night Listener': 3.0, 'Superman Returns': 5.0, 'You, Me and Dupree': 3.5},
     'Toby': {'Snakes on a Plane':4.5,'You, Me and Dupree':1.0,'Superman Returns':4.0}}

In [2]:
critics['Lisa Rose']

{'Just My Luck': 3.0,
 'Lady in the Water': 2.5,
 'Snakes on a Plane': 3.5,
 'Superman Returns': 3.5,
 'The Night Listener': 3.0,
 'You, Me and Dupree': 2.5}

In [3]:
critics['Lisa Rose']['Lady in the Water']

2.5

In [4]:
from math import sqrt
sqrt(pow(5-4,2) + pow(4-1,2))

3.1622776601683795

In [13]:
1/(1+sqrt(((5-4)**2 + (4-1)**2)))

0.2402530733520421

In [14]:
from math import sqrt
    # Returns a distance-based similarity score for person1 and person2
def sim_distance(prefs,person1,person2): #Eucladian distance -> sum of squares
    # Get the list of shared_items
    si={}
    for item in prefs[person1]:
        if item in prefs[person2]:
            si[item]=1
        # if they have no ratings in common, return 0
    if len(si)==0: return 0
    # Add up the squares of all the differences
    sum_of_squares=sum([pow(prefs[person1][item]-prefs[person2][item],2)
                      for item in prefs[person1] if item in prefs[person2]])
    return 1/(1+sum_of_squares)

In [15]:
sim_distance(critics, 'Lisa Rose', 'Gene Seymour')

0.14814814814814814

In [16]:
sim_distance(critics, 'Lisa Rose', 'Michael Phillips')

0.4444444444444444

In [17]:
# Returns the Pearson correlation coefficient for p1 and p2
def sim_pearson(prefs,p1,p2):
    # Get the list of mutually rated items
    si={}
    for item in prefs[p1]:
        if item in prefs[p2]: si[item]=1
    
    # Find the number of elements
    n=len(si)
    
    # if they are no ratings in common, return 0
    if n==0: return 0
    
    # Add up all the preferences
    sum1=sum([prefs[p1][it] for it in si])
    sum2=sum([prefs[p2][it] for it in si])
    
    # Sum up the squares
    sum1Sq=sum([pow(prefs[p1][it],2) for it in si])
    sum2Sq=sum([pow(prefs[p2][it],2) for it in si])
    
    # Sum up the products
    pSum=sum([prefs[p1][it]*prefs[p2][it] for it in si])
    
    # Calculate Pearson score
    num=pSum-(sum1*sum2/n)
    den=sqrt((sum1Sq-pow(sum1,2)/n)*(sum2Sq-pow(sum2,2)/n))
    if den==0: return 0
    r=num/den
    return r

In [18]:
sim_pearson(critics, 'Lisa Rose', 'Gene Seymour')

0.39605901719066977

In [19]:
sim_distance(critics, 'Lisa Rose', 'Michael Phillips')

0.4444444444444444

In [20]:
# Returns the best matches for person from the prefs dictionary.
# Number of results and similarity function are optional params.
def topMatches(prefs,person,n=5,similarity=sim_pearson):
    scores=[(similarity(prefs,person,other),other)
                   for other in prefs if other!=person]
    # Sort the list so the highest scores appear at the top scores.sort( )
    scores.reverse( )
    return scores[0:n]

In [21]:
topMatches(critics,'Toby', n=3)

[(-1.0, 'Michael Phillips'),
 (0.38124642583151164, 'Gene Seymour'),
 (0.9912407071619299, 'Lisa Rose')]

In [27]:
topMatches(critics,'Toby', n=3)

[(-1.0, 'Michael Phillips'),
 (0.38124642583151164, 'Gene Seymour'),
 (0.9912407071619299, 'Lisa Rose')]

In [28]:
# Gets recommendations for a person by using a weighted average
# of every other user's rankings
def getRecommendations(prefs,person,similarity=sim_pearson):
    totals={}
    simSums={}
    for other in prefs:
        # don't compare me to myself
        if other==person: continue
        sim=similarity(prefs,person,other)
        
        # ignore scores of zero or lower
        if sim<=0: continue
        for item in prefs[other]:
            # only score movies I haven't seen yet
            if item not in prefs[person] or prefs[person][item]==0:
                # Similarity * Score 
                totals.setdefault(item,0) 
                totals[item]+=prefs[other][item]*sim 
                # Sum of similarities 
                simSums.setdefault(item,0) 
                simSums[item]+=sim
    # Create the normalized list
    rankings=[(total/simSums[item],item) for item,total in totals.items()]
    
    # Return the sorted list 
    rankings.sort( ) 
    rankings.reverse( ) 
    return rankings

In [29]:
getRecommendations(critics,'Toby')

[(3.3477895267131013, 'The Night Listener'),
 (2.8325499182641614, 'Lady in the Water'),
 (2.5309807037655645, 'Just My Luck')]

In [31]:
getRecommendations(critics,'Toby',
... similarity=sim_distance)

[(3.5002478401415877, 'The Night Listener'),
 (2.7561242939959363, 'Lady in the Water'),
 (2.461988486074374, 'Just My Luck')]

In [34]:
getRecommendations(critics,'Toby',\
                   similarity=sim_distance)

[(3.5002478401415877, 'The Night Listener'),
 (2.7561242939959363, 'Lady in the Water'),
 (2.461988486074374, 'Just My Luck')]

In [35]:
def transformPrefs(prefs):
    result={}
    for person in prefs:
        for item in prefs[person]:
            result.setdefault(item,{})
            # Flip item and person
            result[item][person]=prefs[person][item]
    return result

In [36]:
movies=transformPrefs(critics)

In [37]:
movies

{'Just My Luck': {'Claudia Puig': 3.0,
  'Gene Seymour': 1.5,
  'Lisa Rose': 3.0,
  'Mick LaSalle': 2.0},
 'Lady in the Water': {'Gene Seymour': 3.0,
  'Jack Matthews': 3.0,
  'Lisa Rose': 2.5,
  'Michael Phillips': 2.5,
  'Mick LaSalle': 3.0},
 'Snakes on a Plane': {'Claudia Puig': 3.5,
  'Gene Seymour': 3.5,
  'Jack Matthews': 4.0,
  'Lisa Rose': 3.5,
  'Michael Phillips': 3.0,
  'Mick LaSalle': 4.0,
  'Toby': 4.5},
 'Superman Returns': {'Claudia Puig': 4.0,
  'Gene Seymour': 5.0,
  'Jack Matthews': 5.0,
  'Lisa Rose': 3.5,
  'Michael Phillips': 3.5,
  'Mick LaSalle': 3.0,
  'Toby': 4.0},
 'The Night Listener': {'Claudia Puig': 4.5,
  'Gene Seymour': 3.0,
  'Jack Matthews': 3.0,
  'Lisa Rose': 3.0,
  'Michael Phillips': 4.0,
  'Mick LaSalle': 3.0},
 'You, Me and Dupree': {'Claudia Puig': 2.5,
  'Gene Seymour': 3.5,
  'Jack Matthews': 3.5,
  'Lisa Rose': 2.5,
  'Mick LaSalle': 2.0,
  'Toby': 1.0}}

In [39]:
topMatches(movies,'Superman Returns')

[(0.6579516949597695, 'You, Me and Dupree'),
 (-0.1798471947990544, 'The Night Listener'),
 (-0.42289003161103106, 'Just My Luck'),
 (0.11180339887498941, 'Snakes on a Plane'),
 (0.4879500364742689, 'Lady in the Water')]

The technique we have used thus far is called user-based collaborative filtering. 
An alternative is known as item-based collaborative filtering. 
In cases with very large datasets, item-based collaborative filtering can give better results, and it allows many of the calculations to be performed in advance so that a user needing recommenda- tions can get them more quickly.

In [40]:
def calculateSimilarItems(prefs,n=10):
    # Create a dictionary of items showing which other items they
    # are most similar to.
    result={}
    # Invert the preference matrix to be item-centric
    itemPrefs=transformPrefs(prefs)
    c=0
    for item in itemPrefs:
    # Status updates for large datasets
        c+=1
        if c%100==0: print "%d / %d" % (c,len(itemPrefs))
        # Find the most similar items to this one
        scores=topMatches(itemPrefs,item,n=n,similarity=sim_distance)
        result[item]=scores
    return result

In [41]:
itemsim=calculateSimilarItems(critics)
itemsim

{'Just My Luck': [(0.18181818181818182, 'You, Me and Dupree'),
  (0.15384615384615385, 'The Night Listener'),
  (0.06451612903225806, 'Superman Returns'),
  (0.10526315789473684, 'Snakes on a Plane'),
  (0.2222222222222222, 'Lady in the Water')],
 'Lady in the Water': [(0.4, 'You, Me and Dupree'),
  (0.2857142857142857, 'The Night Listener'),
  (0.09090909090909091, 'Superman Returns'),
  (0.2222222222222222, 'Just My Luck'),
  (0.2222222222222222, 'Snakes on a Plane')],
 'Snakes on a Plane': [(0.05128205128205128, 'You, Me and Dupree'),
  (0.18181818181818182, 'The Night Listener'),
  (0.16666666666666666, 'Superman Returns'),
  (0.10526315789473684, 'Just My Luck'),
  (0.2222222222222222, 'Lady in the Water')],
 'Superman Returns': [(0.05333333333333334, 'You, Me and Dupree'),
  (0.10256410256410256, 'The Night Listener'),
  (0.06451612903225806, 'Just My Luck'),
  (0.16666666666666666, 'Snakes on a Plane'),
  (0.09090909090909091, 'Lady in the Water')],
 'The Night Listener': [(0.14

In [42]:
def getRecommendedItems(prefs,itemMatch,user):
    userRatings=prefs[user]
    scores={}
    totalSim={}
    # Loop over items rated by this user
    for (item,rating) in userRatings.items():
        # Loop over items similar to this one
        for (similarity,item2) in itemMatch[item]:
            # Ignore if this user has already rated this item
            if item2 in userRatings: continue
            # Weighted sum of rating times similarity
            scores.setdefault(item2,0)
            scores[item2]+=similarity*rating
            # Sum of all the similarities
            totalSim.setdefault(item2,0)
            totalSim[item2]+=similarity
    # Divide each total score by total weighting to get an average 
    rankings=[(score/totalSim[item],item) for item,score in scores.items()]

    # Return the rankings from highest to lowest 
    rankings.sort( )
    rankings.reverse( )
    return rankings


In [43]:
getRecommendedItems(critics,itemsim,'Toby')

[(3.182634730538922, 'The Night Listener'),
 (2.5983318700614575, 'Just My Luck'),
 (2.4730878186968837, 'Lady in the Water')]

In [44]:

def loadMovieLens(path='/Users/jhonasttanregalado/Documents/DataScience/bootcamp7/week11/pyspark_lec_2/ml-100k'):
    # Get movie titles
    movies={}
    for line in open(path+'/u.item'):
        (id,title)=line.split('|')[0:2]
        movies[id]=title
    # Load data
    prefs={}
    for line in open(path+'/u.data'):
        (user,movieid,rating,ts)=line.split('\t')
        prefs.setdefault(user,{})
        prefs[user][movies[movieid]]=float(rating)
    return prefs

In [45]:
prefs=loadMovieLens()

In [47]:
prefs['87']

{'2001: A Space Odyssey (1968)': 5.0,
 'Ace Ventura: Pet Detective (1994)': 4.0,
 'Addams Family Values (1993)': 2.0,
 'Addicted to Love (1997)': 4.0,
 'Adventures of Priscilla, Queen of the Desert, The (1994)': 3.0,
 'Adventures of Robin Hood, The (1938)': 5.0,
 'Air Force One (1997)': 3.0,
 'Air Up There, The (1994)': 3.0,
 'Alien (1979)': 4.0,
 'American President, The (1995)': 5.0,
 'Annie Hall (1977)': 4.0,
 'Apocalypse Now (1979)': 4.0,
 'Babe (1995)': 5.0,
 'Baby-Sitters Club, The (1995)': 2.0,
 'Back to the Future (1985)': 5.0,
 'Bad Boys (1995)': 4.0,
 'Bananas (1971)': 5.0,
 'Barcelona (1994)': 3.0,
 'Batman & Robin (1997)': 4.0,
 'Batman (1989)': 3.0,
 'Batman Returns (1992)': 3.0,
 'Big Green, The (1995)': 3.0,
 'Big Squeeze, The (1996)': 2.0,
 'Birdcage, The (1996)': 4.0,
 'Blade Runner (1982)': 4.0,
 'Blues Brothers, The (1980)': 5.0,
 'Boomerang (1992)': 3.0,
 'Boot, Das (1981)': 4.0,
 'Brady Bunch Movie, The (1995)': 2.0,
 'Braveheart (1995)': 4.0,
 'Bridge on the River

In [48]:
getRecommendations(prefs,'87')[0:30]

[(5.0, 'They Made Me a Criminal (1939)'),
 (5.0, 'Star Kid (1997)'),
 (5.0, 'Santa with Muscles (1996)'),
 (5.0, 'Saint of Fort Washington, The (1993)'),
 (5.0, 'Marlene Dietrich: Shadow and Light (1996) '),
 (5.0, 'Great Day in Harlem, A (1994)'),
 (5.0, 'Entertaining Angels: The Dorothy Day Story (1996)'),
 (5.0, 'Boys, Les (1997)'),
 (4.89884443128923, 'Legal Deceit (1997)'),
 (4.815019082242709, 'Letter From Death Row, A (1998)'),
 (4.7321082983941425, 'Hearts and Minds (1996)'),
 (4.696244466490867, 'Pather Panchali (1955)'),
 (4.652397061026758, 'Lamerica (1994)'),
 (4.538723693474813, 'Leading Man, The (1996)'),
 (4.535081339106103, 'Mrs. Dalloway (1997)'),
 (4.532337612572981, 'Innocents, The (1961)'),
 (4.527998574747079, 'Casablanca (1942)'),
 (4.510270149719864, 'Everest (1998)'),
 (4.493967755428439, 'Dangerous Beauty (1998)'),
 (4.485151301801342, 'Wallace & Gromit: The Best of Aardman Animation (1996)'),
 (4.463287461290222, 'Wrong Trousers, The (1993)'),
 (4.450979436941

In [49]:
itemsim=calculateSimilarItems(prefs,n=50)

100 / 1664
200 / 1664
300 / 1664
400 / 1664
500 / 1664
600 / 1664
700 / 1664
800 / 1664
900 / 1664
1000 / 1664
1100 / 1664
1200 / 1664
1300 / 1664
1400 / 1664
1500 / 1664
1600 / 1664


In [53]:
getRecommendedItems(prefs,itemsim,'87')[0:30]

[(4.044228694714131, 'Normal Life (1996)'),
 (4.037494334925064, 'Get on the Bus (1996)'),
 (4.017067536699203, 'Hearts and Minds (1996)'),
 (3.959801730669257, 'Story of Xinghua, The (1993)'),
 (3.702236413125302, 'Lay of the Land, The (1997)'),
 (3.701587070471748, 'T-Men (1947)'),
 (3.6969158662806736, 'American Dream (1990)'),
 (3.6828664293452955, 'Power 98 (1995)'),
 (3.65609305461326, 'Fall (1997)'),
 (3.623309757231033, 'Gumby: The Movie (1995)'),
 (3.5956826211840056, 'For Ever Mozart (1996)'),
 (3.5719973746575597, 'White Balloon, The (1995)'),
 (3.5604856205573707, 'Graduate, The (1967)'),
 (3.5595238095238098, 'Ripe (1996)'),
 (3.5495417687655633, 'Bastard Out of Carolina (1996)'),
 (3.5466517167334732, 'Some Kind of Wonderful (1987)'),
 (3.536377309029032, 'Tom and Huck (1995)'),
 (3.516754817542454, 'Locusts, The (1997)'),
 (3.5139371527637513, 'Jaws 3-D (1983)'),
 (3.512018271794366, 'East of Eden (1955)'),
 (3.4991618387880368, 'Fled (1996)'),
 (3.484286343032365, 'Solo

In [54]:
from sklearn.cluster import KMeans
clustering_model = KMeans(n_clusters=5)