# First steps following the book

In [8]:
critics = {
    'Lisa Rose': {
        'Lady in the Water': 2.5, 
        'Snakes on a Plane': 3.5, 
        'Just My Luck': 3.0, 
        'Superman Returns': 3.5, 
        'You, Me and Dupree': 2.5, 
        'The Night Listener': 3.0,
    },
    'Gene Seymour': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 3.5,
        'Just My Luck': 1.5,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
        'The Night Listener': 3.0,
    },
    'Michael Phillips': {
        'Lady in the Water': 2.5,
        'Snakes on a Plane': 3.0, 
        'Superman Returns': 3.5, 
        'The Night Listener': 4.0,
    },
    'Claudia Puig': {
        'Snakes on a Plane': 3.5,
        'Just My Luck': 3.0,
        'The Night Listener': 4.5,
        'Superman Returns': 4.0,
        'You, Me and Dupree': 2.5,
    },
    'Mick LaSalle': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0, 
        'Just My Luck': 2.0, 
        'Superman Returns': 3.0, 
        'The Night Listener': 3.0,
        'You, Me and Dupree': 2.0,
    },
    'Jack Mattews': {
        'Lady in the Water': 3.0,
        'Snakes on a Plane': 4.0, 
        'The Night Listener': 3.0,
        'Superman Returns': 5.0,
        'You, Me and Dupree': 3.5,
    },
    'Toby': {
        'Snakes on a Plane': 4.5,
        'You, Me and Dupree': 1.0,
        'Superman Returns': 4.0,
    }
}

In [9]:
critics['Lisa Rose']['Lady in the Water']

2.5

In [10]:
critics['Toby']

{'Snakes on a Plane': 4.5, 'Superman Returns': 4.0, 'You, Me and Dupree': 1.0}

## Finding similar users

TODO: draw a plot y = movie1, x = movie2 for all users



### Euclidean similarity distance

0 if nothing in common, 1 if equal

In [27]:
def get_shared_items(scores, person1, person2):
    shared_items = {}
    for item in scores[person1]:
        if item in scores[person2]:
            shared_items[item] = True
    return shared_items

In [33]:
def similarity_euclides(scores, person1, person2):
    shared_items = get_shared_items(scores, person1, person2)
    if len(shared_items) == 0:
        return 0
    
    sum_of_squares = sum(pow(scores[person1][item] - scores[person2][item], 2)
                        for item in shared_items.keys())
    
    return 1/(1 + sum_of_squares)

In [34]:
similarity_euclides(critics, 'Lisa Rose', 'Gene Seymour')

0.14814814814814814

In [35]:
similarity_euclides(critics, 'Mick LaSalle', 'Mick LaSalle')

1.0

### Pearson similarity distance

First locates items that got scores on both. Then calculates sums and sums of squares, and sum of products. And use those sums to calculate Pearson correlation.


Returns values between -1 and 1. 

In [42]:
def similarity_pearson(scores, person1, person2):
    shared_items = get_shared_items(scores, person1, person2)
    
    elements_count = len(shared_items)
    if elements_count == 0:
        return 0
    
    sum_person1 = sum([scores[person1][item] for item in shared_items.keys()])
    sum_person2 = sum([scores[person2][item] for item in shared_items.keys()])
    
    sum_squares_person1 = sum([pow(scores[person1][item], 2) for item in shared_items.keys()])
    sum_squares_person2 = sum([pow(scores[person2][item], 2) for item in shared_items.keys()])
    
    sum_products = sum([scores[person1][item] * scores[person1][item] for item in shared_items.keys()])
    
    numerator = sum_products - (sum_person1 * sum_person2 / elements_count)
    denominator = pow((sum_squares_person1 - pow(sum_person1, 2) / elements_count) \
                *     (sum_squares_person2 - pow(sum_person2, 2) / elements_count), 0.5)

    if denominator == 0 :
        return 0
    
    return numerator/denominator

In [45]:
# Source http://stackoverflow.com/a/5713856
def pearsonr(x, y):
    # Assume len(x) == len(y)
    n = len(x)
    sum_x = float(sum(x))
    sum_y = float(sum(y))
    sum_x_sq = sum(map(lambda x: pow(x, 2), x))
    sum_y_sq = sum(map(lambda x: pow(x, 2), y))
    psum = sum(map(lambda x, y: x * y, x, y))
    num = psum - (sum_x * sum_y/n)
    den = pow((sum_x_sq - pow(sum_x, 2) / n) * (sum_y_sq - pow(sum_y, 2) / n), 0.5)
    if den == 0: return 0
    return num / den

In [48]:
def similarity_pearson_stackoverflow(scores, person1, person2):
    shared_items = get_shared_items(scores, person1, person2)
    x = [scores[person1][item] for item in shared_items.keys()]
    y = [scores[person2][item] for item in shared_items.keys()]
    
    return pearsonr(x, y)

In [49]:
similarity_pearson(critics, 'Lisa Rose', 'Gene Seymour')

-1.3862065601673441

In [50]:
similarity_pearson_stackoverflow(critics, 'Lisa Rose', 'Gene Seymour')

0.39605901719066977