In [21]:
from collections import Counter, defaultdict
import numpy as np

In [22]:
users_interests = [
    ['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra'],
    ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres'],
    ['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas'],
    ['R', 'Python', 'statistics', 'regression', 'probability'],
    ['machine learning', 'regression', 'decision trees', 'libsvm'],
    ['Python', 'R', 'Java', 'C++', 'Haskell', 'programming languages'],
    ['statistics', 'probability', 'mathematics', 'theory'],
    ['machine learning', 'scikit-learn', 'Mahout', 'neural networks'],
    ['neural networks', 'deep learning', 'Big Data', 'artificial intelligence'],
    ['Hadoop', 'Java', 'MapReduce', 'Big Data'],
    ['statistics', 'R', 'statsmodels'],
    ['C++', 'deep learning', 'artificial intelligece', 'probability'],
    ['pandas', 'R', 'Python'],
    ['databases', 'HBase', 'Postgres', 'MySQL', 'MongoDB'],
    ['libsvm', 'regression', 'support vector machines']
]

In [23]:
popular_interests = Counter(interest for user_interests in users_interests 
                            for interest in user_interests).most_common()
popular_interests

[('R', 4),
 ('Python', 4),
 ('regression', 3),
 ('Big Data', 3),
 ('Java', 3),
 ('probability', 3),
 ('HBase', 3),
 ('statistics', 3),
 ('C++', 2),
 ('Cassandra', 2),
 ('neural networks', 2),
 ('scikit-learn', 2),
 ('Postgres', 2),
 ('deep learning', 2),
 ('MongoDB', 2),
 ('Hadoop', 2),
 ('statsmodels', 2),
 ('pandas', 2),
 ('libsvm', 2),
 ('machine learning', 2),
 ('databases', 1),
 ('MapReduce', 1),
 ('numpy', 1),
 ('MySQL', 1),
 ('mathematics', 1),
 ('Haskell', 1),
 ('programming languages', 1),
 ('theory', 1),
 ('NoSQL', 1),
 ('decision trees', 1),
 ('support vector machines', 1),
 ('Spark', 1),
 ('artificial intelligece', 1),
 ('Mahout', 1),
 ('artificial intelligence', 1),
 ('Storm', 1),
 ('scipy', 1)]

In [24]:
def most_popular_new_interests(user_interests, max_results=5):
    suggestions = [(interest, frequency) 
                   for interest, frequency in popular_interests 
                   if interest not in user_interests]
    return suggestions[:max_results]

In [25]:
most_popular_new_interests(users_interests[1], 5)

[('R', 4), ('Python', 4), ('regression', 3), ('Big Data', 3), ('Java', 3)]

In [26]:
def cosine_similiarty(v, w):
    return np.dot(v, w) / np.math.sqrt(np.dot(v, v) * np.dot(w, w))

In [27]:
unique_interests = sorted(list({ interest for user_interests in users_interests
                               for interest in user_interests}))
unique_interests

['Big Data',
 'C++',
 'Cassandra',
 'HBase',
 'Hadoop',
 'Haskell',
 'Java',
 'Mahout',
 'MapReduce',
 'MongoDB',
 'MySQL',
 'NoSQL',
 'Postgres',
 'Python',
 'R',
 'Spark',
 'Storm',
 'artificial intelligece',
 'artificial intelligence',
 'databases',
 'decision trees',
 'deep learning',
 'libsvm',
 'machine learning',
 'mathematics',
 'neural networks',
 'numpy',
 'pandas',
 'probability',
 'programming languages',
 'regression',
 'scikit-learn',
 'scipy',
 'statistics',
 'statsmodels',
 'support vector machines',
 'theory']

In [28]:
def make_user_interest_vector(user_interests):
    """given a list of interests, produce a vector whose ith element is 1 
    if unique_interests[i] is in the list, 0 otherwise"""
    return [1 if interest in user_interests else 0 for interest in unique_interests]

In [29]:
user_interest_matrix = map(make_user_interest_vector, users_interests)

In [30]:
user_similarities = [[cosine_similiarty(interest_vector_i, interest_vector_j)
                     for interest_vector_j in user_interest_matrix]
                    for interest_vector_i in user_interest_matrix]
user_similarities

[[0.33806170189140661,
  0.0,
  0.0,
  0.0,
  0.15430334996209191,
  0.0,
  0.0,
  0.1889822365046136,
  0.56694670951384085,
  0.0,
  0.0,
  0.0,
  0.1690308509457033,
  0.0]]

In [31]:
def most_similar_users_to(user_id):
    pairs = [(other_user_id, similarity)
            for other_user_id, similarity in
                enumerate(user_similarities[user_id])
            if user_id != other_user_id and similarity > 0]
    
    return sorted(pairs, key=lambda (_, similarity): similarity, reverse=True)

SyntaxError: invalid syntax (<ipython-input-31-a928826ec8d8>, line 7)

In [32]:
def user_based_suggestions(user_id, include_current_interests=False):
    # sum up the similarities
    suggestions = defaultdict(float)
    for other_user_id, similarity in most_similar_users_to(user_id):
        for interest in users_interests[other_user_id]:
            suggestions[interest] += similarity
    
    # convert them to a sorted list
    suggestions = sorted(suggestions.items(), key=lambda(_, weight): weight, reverse=True)
    
    # and (maybe) exclude already-interests
    if include_current_interests:
        return suggestions
    else:
        return [(suggestion, weight)
               for suggestion, weight in suggestions 
                   if suggestion not in users_interests[user_id]]

SyntaxError: invalid syntax (<ipython-input-32-3d8b99aa5ef4>, line 9)

In [None]:
user_based_suggestions(0)

# Movie Data

In [None]:
movie_data = [['Superman', 'Walking Dead', 'CSI'], 
              ['Superman', 'Walking Dead', 'CSI']]

In [None]:
unique_movies = sorted(list({ movie for movie_data in movie_data
                               for movie in movie_data}))
unique_movies

In [None]:
def make_user_movie_vector(movie_data):
    """given a list of interests, produce a vector whose ith element is 1 
    if unique_interests[i] is in the list, 0 otherwise"""
    return [1 if interest in movie_data else 0 for interest in unique_movies]

In [None]:
mapped_movie_data = map(make_user_movie_vector, movie_data)

In [None]:
user_similarities = [[cosine_similiarty(interest_vector_i, interest_vector_j)
                     for interest_vector_j in mapped_movie_data]
                    for interest_vector_i in mapped_movie_data]
user_similarities