# First steps following the book

In [1]:
# Sample ratings used throughout the notebook: critic name -> {movie title: score}.
# Not every critic has rated every movie; the similarity functions below only
# compare the movies two critics have in common.
# NOTE(review): the key 'Jack Mattews' is spelled this way on purpose here -- the
# recorded outputs further down use the same spelling, so keep them in sync.
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5, 
        'Snakes on a Plane': 3.5, 
        'Just My Luck': 3.0, 
        'Superman Returns': 3.5, 
        'You, Me and Dupree': 2.5, 
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
        'The Night Listener': 3.0,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0, 
        'Superman Returns': 3.5, 
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0, 
        'Just My Luck': 2.0, 
        'Superman Returns': 3.0, 
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Mattews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0, 
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    # Toby has rated only three movies; he is the target of the recommendation examples.
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    }
}

In [2]:
# Look up a single rating: the score Lisa Rose gave 'Lady in the Water'.
critics['Lisa Rose']['Lady in the Water']

2.5

In [3]:
# Show every rating recorded for Toby.
critics['Toby']

{'Snakes on a Plane': 4.5, 'Superman Returns': 4.0, 'You, Me and Dupree': 1.0}

## Finding similar users

TODO: draw a plot y = movie1, x = movie2 for all users



### Euclidean similarity distance

Returns 0 when the two people have no rated items in common, and 1 when their ratings for the shared items are identical.

In [4]:
def get_shared_items(critics, person1, person2):
    """Return the items rated by both ``person1`` and ``person2``.

    The result is a dict whose keys are the shared items (in the order they
    appear in ``critics[person1]``) and whose values are all ``True``.
    """
    in_both = (item for item in critics[person1] if item in critics[person2])
    return dict.fromkeys(in_both, True)

In [5]:
def similarity_euclides(critics, person1, person2):
    """Distance-based similarity score between two people.

    Sums the squared rating differences over the items both people rated and
    maps that sum ``s`` to ``1 / (1 + s)``, so identical shared ratings give 1
    and larger differences give values closer to 0. Returns 0 when the two
    people share no items.
    """
    common = get_shared_items(critics, person1, person2)
    if not common:
        return 0

    squared_gap = 0
    for item in common:
        squared_gap += (critics[person1][item] - critics[person2][item]) ** 2

    return 1 / (1 + squared_gap)

In [6]:
# Distance-based similarity between Lisa Rose and Gene Seymour.
similarity_euclides(critics, 'Lisa Rose', 'Gene Seymour')

0.14814814814814814

In [7]:
# Sanity check: comparing a person with themselves gives a sum of squared differences of 0, hence a score of 1.
similarity_euclides(critics, 'Mick LaSalle', 'Mick LaSalle')

1.0

### Pearson similarity distance

First, find the items that both critics have rated. Then compute the sum of each critic's ratings, the sum of their squared ratings, and the sum of the products of the paired ratings. Finally, combine those sums to obtain the Pearson correlation coefficient.


Returns values between -1 and 1. 

In [8]:
def similarity_pearson_book(critics, person1, person2):
    """Pearson correlation between two people's ratings of their shared items.

    Args:
        critics: mapping of person -> {item: rating}.
        person1: key of the first person in ``critics``.
        person2: key of the second person in ``critics``.

    Returns:
        The correlation coefficient (nominally between -1 and 1). Returns 0
        when the two people share no items, or when the correlation is
        undefined because one of them gave the same rating to every shared
        item.
    """
    shared_items = get_shared_items(critics, person1, person2)

    elements_count = len(shared_items)
    if elements_count == 0:
        return 0

    # Paired ratings, in the same item order for both people.
    ratings1 = [critics[person1][item] for item in shared_items]
    ratings2 = [critics[person2][item] for item in shared_items]

    sum_person1 = sum(ratings1)
    sum_person2 = sum(ratings2)

    sum_squares_person1 = sum(rating ** 2 for rating in ratings1)
    sum_squares_person2 = sum(rating ** 2 for rating in ratings2)

    sum_products = sum(r1 * r2 for r1, r2 in zip(ratings1, ratings2))

    numerator = sum_products - (sum_person1 * sum_person2 / elements_count)
    radicand = ((sum_squares_person1 - sum_person1 ** 2 / elements_count)
                * (sum_squares_person2 - sum_person2 ** 2 / elements_count))

    # Mathematically the radicand is >= 0, but floating-point rounding can make
    # it slightly negative when one person's ratings are (nearly) constant.
    # Raising a negative float to 0.5 yields a complex number in Python 3, so
    # treat every non-positive radicand as "correlation undefined".
    if radicand <= 0:
        return 0

    return numerator / radicand ** 0.5

In [9]:
# Source http://stackoverflow.com/a/5713856
def pearsonr(x, y):
    """Pearson correlation coefficient of two equally long sequences of numbers.

    Args:
        x: sequence of numbers.
        y: sequence of numbers; assumed to have the same length as ``x``.

    Returns:
        The correlation coefficient (nominally between -1 and 1). Returns 0
        for empty input, or when the correlation is undefined because one of
        the sequences has no variance.
    """
    # Assume len(x) == len(y)
    n = len(x)
    if n == 0:
        # Nothing to correlate; also avoids dividing by zero below.
        return 0
    sum_x = float(sum(x))
    sum_y = float(sum(y))
    sum_x_sq = sum(value ** 2 for value in x)
    sum_y_sq = sum(value ** 2 for value in y)
    psum = sum(a * b for a, b in zip(x, y))
    num = psum - (sum_x * sum_y / n)
    radicand = (sum_x_sq - sum_x ** 2 / n) * (sum_y_sq - sum_y ** 2 / n)
    # Rounding can push the radicand slightly below zero for (nearly) constant
    # input; a negative float raised to 0.5 would produce a complex number.
    if radicand <= 0:
        return 0
    return num / radicand ** 0.5

In [10]:
def similarity_pearson(critics, person1, person2):
    """Pearson correlation between two people's ratings of their shared items.

    Collects the paired ratings for the items both people rated and delegates
    the arithmetic to ``pearsonr``.

    Args:
        critics: mapping of person -> {item: rating}.
        person1: key of the first person in ``critics``.
        person2: key of the second person in ``critics``.

    Returns:
        The value computed by ``pearsonr`` for the paired ratings, or 0 when
        the two people have no items in common.
    """
    shared_items = get_shared_items(critics, person1, person2)
    if not shared_items:
        # With no shared items there is nothing to correlate. Return 0, as
        # similarity_pearson_book and similarity_euclides do, instead of
        # handing empty lists to pearsonr.
        return 0

    x = [critics[person1][item] for item in shared_items]
    y = [critics[person2][item] for item in shared_items]

    return pearsonr(x, y)

In [13]:
# Pearson similarity of Lisa Rose and Gene Seymour, using the book-style implementation.
similarity_pearson_book(critics, 'Lisa Rose', 'Gene Seymour')

0.39605901719066977

In [14]:
# Same pair through the pearsonr-based implementation; both implementations should agree.
similarity_pearson(critics, 'Lisa Rose', 'Gene Seymour')

0.39605901719066977

## Matching similar people

Find the people whose ratings are most similar to those of a given person.



In [15]:
def top_matches(critics, person, n=5, similarity=similarity_pearson):
    """Return the ``n`` entries of ``critics`` most similar to ``person``.

    Every other key in ``critics`` is scored against ``person`` with
    ``similarity(critics, person, other)``. The result is a list of
    ``(score, other)`` tuples ordered from highest to lowest, truncated to at
    most ``n`` entries.
    """
    ranked = []
    for other in critics:
        if other == person:
            continue
        ranked.append((similarity(critics, person, other), other))

    # Ascending sort, then flip so that the best matches come first.
    return sorted(ranked)[::-1][:n]

In [16]:
# The three critics most similar to Toby (Pearson is the default similarity function).
top_matches(critics, 'Toby', n=3)

[(0.9912407071619299, 'Lisa Rose'),
 (0.9244734516419049, 'Mick LaSalle'),
 (0.8934051474415647, 'Claudia Puig')]

## Recommending Items

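Predict a score for each movie the person has not rated yet, using an average of the other critics' ratings weighted by how similar each critic is to that person.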

In [17]:
def get_recommendations(critics, person, similarity=similarity_pearson):
    """Rank the items ``person`` has not rated by a predicted score.

    Each other entry in ``critics`` contributes its ratings weighted by
    ``similarity(critics, person, other)``; entries whose similarity is zero
    or negative are ignored. An item counts as unrated when it is missing from
    ``critics[person]`` or stored there with a score of 0.

    Returns a list of ``(predicted_score, item)`` tuples, highest score first,
    where the predicted score is the similarity-weighted average rating.
    """
    weighted_totals = {}
    weight_sums = {}

    for other, other_ratings in critics.items():
        if other == person:
            continue

        weight = similarity(critics, person, other)
        if weight <= 0:
            continue

        for item, rating in other_ratings.items():
            # Skip anything the person has already scored.
            if item in critics[person] and critics[person][item] != 0:
                continue

            weighted_totals[item] = weighted_totals.get(item, 0) + rating * weight
            weight_sums[item] = weight_sums.get(item, 0) + weight

    # Divide by the total weight so the result is on the original rating scale.
    rankings = sorted(
        (total / weight_sums[item], item)
        for item, total in weighted_totals.items()
    )
    return rankings[::-1]

Returns an ordered list of predicted scores: a guess at how Toby would rate each film he has not watched yet.

In [18]:
# Predicted scores for the movies Toby has not rated, using the default Pearson similarity.
get_recommendations(critics=critics, person='Toby')

[(3.3477895267131017, 'The Night Listener'),
 (2.8325499182641614, 'Lady in the Water'),
 (2.530980703765565, 'Just My Luck')]

In [19]:
# Same recommendations, this time weighted by the distance-based similarity.
get_recommendations(critics=critics, person='Toby', similarity=similarity_euclides)

[(3.5002478401415877, 'The Night Listener'),
 (2.7561242939959363, 'Lady in the Water'),
 (2.461988486074374, 'Just My Luck')]

In [20]:
# Dump the whole ratings dictionary as plain text.
print(critics)

{'Toby': {'You, Me and Dupree': 1.0, 'Superman Returns': 4.0, 'Snakes on a Plane': 4.5}, 'Michael Phillips': {'Lady in the Water': 2.5, 'Superman Returns': 3.5, 'Snakes on a Plane': 3.0, 'The Night Listener': 4.0}, 'Gene Seymour': {'You, Me and Dupree': 3.5, 'Superman Returns': 5.0, 'Lady in the Water': 3.0, 'Just My Luck': 1.5, 'The Night Listener': 3.0, 'Snakes on a Plane': 3.5}, 'Claudia Puig': {'You, Me and Dupree': 2.5, 'Superman Returns': 4.0, 'Just My Luck': 3.0, 'The Night Listener': 4.5, 'Snakes on a Plane': 3.5}, 'Lisa Rose': {'You, Me and Dupree': 2.5, 'Superman Returns': 3.5, 'Lady in the Water': 2.5, 'Just My Luck': 3.0, 'The Night Listener': 3.0, 'Snakes on a Plane': 3.5}, 'Mick LaSalle': {'You, Me and Dupree': 2.0, 'Superman Returns': 3.0, 'Lady in the Water': 3.0, 'Just My Luck': 2.0, 'The Night Listener': 3.0, 'Snakes on a Plane': 4.0}, 'Jack Mattews': {'You, Me and Dupree': 3.5, 'Superman Returns': 5.0, 'Lady in the Water': 3.0, 'The Night Listener': 3.0, 'Snakes on a

In [21]:
# Print only Toby's ratings.
print(critics['Toby'])

{'You, Me and Dupree': 1.0, 'Superman Returns': 4.0, 'Snakes on a Plane': 4.5}


# Finding similar products

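Instead of mapping each person to the movies they rated, invert the data so that each movie maps to the people who rated it. The same similarity functions can then compare movies instead of people.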

In [22]:
def transform_critics(critics):
    """Invert a ``{person: {item: rating}}`` mapping into ``{item: {person: rating}}``.

    The input is not modified; a new nested dict is returned.
    """
    by_item = {}
    for person, ratings in critics.items():
        for item, score in ratings.items():
            by_item.setdefault(item, {})[person] = score
    return by_item

In [23]:
# Invert the ratings so that each movie maps to the critics who rated it.
movies = transform_critics(critics)

In [24]:
# Inspect the movie-centric dictionary.
print(movies)

{'You, Me and Dupree': {'Toby': 1.0, 'Gene Seymour': 3.5, 'Jack Mattews': 3.5, 'Claudia Puig': 2.5, 'Mick LaSalle': 2.0, 'Lisa Rose': 2.5}, 'Superman Returns': {'Toby': 4.0, 'Michael Phillips': 3.5, 'Gene Seymour': 5.0, 'Jack Mattews': 5.0, 'Claudia Puig': 4.0, 'Mick LaSalle': 3.0, 'Lisa Rose': 3.5}, 'Lady in the Water': {'Mick LaSalle': 3.0, 'Lisa Rose': 2.5, 'Michael Phillips': 2.5, 'Gene Seymour': 3.0, 'Jack Mattews': 3.0}, 'Just My Luck': {'Claudia Puig': 3.0, 'Mick LaSalle': 2.0, 'Gene Seymour': 1.5, 'Lisa Rose': 3.0}, 'The Night Listener': {'Michael Phillips': 4.0, 'Gene Seymour': 3.0, 'Jack Mattews': 3.0, 'Claudia Puig': 4.5, 'Mick LaSalle': 3.0, 'Lisa Rose': 3.0}, 'Snakes on a Plane': {'Toby': 4.5, 'Michael Phillips': 3.0, 'Gene Seymour': 3.5, 'Jack Mattews': 4.0, 'Claudia Puig': 3.5, 'Mick LaSalle': 4.0, 'Lisa Rose': 3.5}}


In [26]:
# With the inverted data, top_matches ranks the other movies by how their ratings correlate with 'Superman Returns'.
top_matches(movies, 'Superman Returns')

[(0.6579516949597695, 'You, Me and Dupree'),
 (0.4879500364742689, 'Lady in the Water'),
 (0.11180339887498941, 'Snakes on a Plane'),
 (-0.1798471947990544, 'The Night Listener'),
 (-0.42289003161103106, 'Just My Luck')]

In [27]:
# With the inverted data, this ranks the critics who have not rated 'Just My Luck' by a predicted score.
get_recommendations(movies, 'Just My Luck')

[(4.0, 'Michael Phillips'), (3.0, 'Jack Mattews')]