# Recommender Systems
[original code](https://github.com/joelgrus/data-science-from-scratch/blob/master/code/recommender_systems.py)

In [5]:
from recommender_systems import *

In [4]:
# Interests of each user, one inner list per user. A user's id is their
# position in this list (user 0 is the first row).
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

### User-Based Collaborative Filtering

Given a user, the algorithm recommends interests based on the interests of other users who are similar to that user.
We use cosine_similarity, which measures the angle between two vectors v and w.

    If v and w point in the same direction, it is 1 (the numerator and denominator are equal);
    If their directions are opposite, it is -1;
    If v is 0 wherever w is not (and vice versa), it is 0 (since dot(v, w) is 0).
    

In [2]:
def cosine_similarity(v, w):
    """Return the cosine of the angle between the vectors v and w.

    The result is 1 when v and w point in the same direction, -1 when they
    point in opposite directions, and 0 when their dot product is 0.

    If either vector has zero length the angle is undefined; 0.0 is
    returned in that case instead of raising ZeroDivisionError.
    """
    # Dot products are computed inline so the function is self-contained.
    dot_vw = sum(v_i * w_i for v_i, w_i in zip(v, w))
    dot_vv = sum(v_i * v_i for v_i in v)
    dot_ww = sum(w_i * w_i for w_i in w)

    norm_product = math.sqrt(dot_vv * dot_ww)
    if norm_product == 0:
        # At least one vector is all zeros: treat it as "not similar".
        return 0.0
    return dot_vw / norm_product

Each user's interests are represented as a vector of 0s and 1s, with one entry per unique interest.

In [25]:
# Print one line per user, with that user's vector entries separated by
# spaces. print(...) with a single argument behaves the same on Python 2
# and Python 3, unlike the print statement.
for user_row in user_interest_matrix:
    print(" ".join(map(str, user_row)))

1 0 1 1 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 1 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0
0 1 0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1
0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0
0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
0 0 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [27]:
# Most similar users to user 0
most_similar_users_to(0)

[(9, 0.5669467095138409),
 (1, 0.3380617018914066),
 (8, 0.1889822365046136),
 (13, 0.1690308509457033),
 (5, 0.1543033499620919)]

To make recommendations, for each interest we sum up the similarities of the most similar users who have that interest.

In [30]:
def user_based_suggestions(user_id, include_current_interests=False):
    """Suggest interests for a user based on the interests of similar users.

    Each interest is scored by summing the similarity of every user returned
    by most_similar_users_to(user_id) who lists that interest.

    Returns a list of (interest, score) pairs sorted by descending score.
    Unless include_current_interests is true, interests already present in
    users_interests[user_id] are left out.
    """
    # sum up the similarities
    suggestions = defaultdict(float)
    for other_user_id, similarity in most_similar_users_to(user_id):
        for interest in users_interests[other_user_id]:
            suggestions[interest] += similarity

    # convert them to a list sorted by weight, highest first; indexing the
    # pair works on Python 2 and 3, whereas a tuple-parameter lambda is a
    # SyntaxError on Python 3
    ranked = sorted(suggestions.items(),
                    key=lambda pair: pair[1],
                    reverse=True)

    if include_current_interests:
        return ranked

    # exclude interests the user already has; the set is built once so each
    # membership test does not rescan the user's interest list
    current_interests = set(users_interests[user_id])
    return [(suggestion, weight)
            for suggestion, weight in ranked
            if suggestion not in current_interests]
    
user_based_suggestions(0, False)

[('MapReduce', 0.5669467095138409),
 ('MongoDB', 0.50709255283711),
 ('Postgres', 0.50709255283711),
 ('NoSQL', 0.3380617018914066),
 ('neural networks', 0.1889822365046136),
 ('deep learning', 0.1889822365046136),
 ('artificial intelligence', 0.1889822365046136),
 ('databases', 0.1690308509457033),
 ('MySQL', 0.1690308509457033),
 ('programming languages', 0.1543033499620919),
 ('Python', 0.1543033499620919),
 ('Haskell', 0.1543033499620919),
 ('C++', 0.1543033499620919),
 ('R', 0.1543033499620919)]

This approach doesn't work well when the number of items is very large, because in high-dimensional vector spaces most vectors are very far apart.

### Item-Based Collaborative Filtering

This approach computes similarities between interests directly.
We'll need the transpose of user_interest_matrix, so that rows are interests, and columns are users.

Transpose:

In [31]:
# Transpose of user_interest_matrix: row `column` gathers, across all users,
# the entry found at position `column` of each user's interest vector.
interest_user_matrix = [
    [row[column] for row in user_interest_matrix]
    for column in range(len(unique_interests))
]

In [55]:
# Pairwise similarity matrix: entry [i][j] is the cosine similarity between
# row i and row j of interest_user_matrix.
interest_similarities = [
    [cosine_similarity(row_i, row_j) for row_j in interest_user_matrix]
    for row_i in interest_user_matrix
]

In [52]:
# Print each row of the similarity matrix, values rounded to one significant
# digit and separated by commas. print(...) with a single argument behaves
# the same on Python 2 and Python 3, unlike the print statement.
for us in interest_similarities:
    print(",".join(map("{:1.1g}".format, us)))

1,0,0.4,0.3,0.8,0,0.7,0,0.6,0,0,0,0,0,0,0.6,0.6,0.4,0,0,0.4,0,0,0,0.4,0,0,0,0,0,0,0,0,0,0,0
0,1,0,0,0,0.7,0.4,0,0,0,0,0,0,0.4,0.4,0,0,0.5,0,0,0.5,0,0,0,0,0,0,0.4,0.7,0,0,0,0,0,0,0
0.4,0,1,0.8,0.5,0,0.4,0,0,0.5,0,0.7,0.5,0,0,0.7,0.7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0.3,0,0.8,1,0.4,0,0.3,0,0,0.8,0.6,0.6,0.8,0,0,0.6,0.6,0,0.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0.8,0,0.5,0.4,1,0,0.8,0,0.7,0,0,0,0,0,0,0.7,0.7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0.7,0,0,0,1,0.6,0,0,0,0,0,0,0.5,0.5,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
0.7,0.4,0.4,0.3,0.8,0.6,1,0,0.6,0,0,0,0,0.3,0.3,0.6,0.6,0,0,0,0,0,0,0,0,0,0,0,0.6,0,0,0,0,0,0,0
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.7,0,0.7,0,0,0,0,0,0.7,0,0,0,0,0
0.6,0,0,0,0.7,0,0.6,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0.5,0.8,0,0,0,0,0,1,0.7,0.7,1,0,0,0,0,0,0.7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0.6,0,0,0,0,0,0.7,1,0,0.7,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0.7,0.6,0,0,0,0,0,0.7,0,1,0.7,0,0,0,0,0,0,0,0,0,0,0,0,

In [59]:
def most_similar_interests_to(interest_id):
    """Return the interests most similar to the given one, best match first.

    Reads row interest_id of interest_similarities and returns a list of
    (interest_name, similarity) pairs sorted by descending similarity. The
    interest itself and interests with a similarity of 0 or less are omitted.
    """
    similarities = interest_similarities[interest_id]
    pairs = [(unique_interests[other_interest_id], similarity)
             for other_interest_id, similarity in enumerate(similarities)
             if interest_id != other_interest_id and similarity > 0]
    # Indexing the pair works on Python 2 and 3, whereas a tuple-parameter
    # lambda is a SyntaxError on Python 3.
    return sorted(pairs, key=lambda pair: pair[1], reverse=True)

most_similar_interests_to(0)

[('Hadoop', 0.8164965809277261),
 ('Java', 0.6666666666666666),
 ('MapReduce', 0.5773502691896258),
 ('Spark', 0.5773502691896258),
 ('Storm', 0.5773502691896258),
 ('Cassandra', 0.4082482904638631),
 ('artificial intelligence', 0.4082482904638631),
 ('deep learning', 0.4082482904638631),
 ('neural networks', 0.4082482904638631),
 ('HBase', 0.3333333333333333)]

In [54]:
# Single-argument print(...) works on both Python 2 and Python 3.
print(interest_similarities[4][8])

0.707106781187
