Recommender System

In [2]:

# Toy data set: one list of interests per user. The position of each inner
# list in the outer list is what the rest of the notebook uses as the user id.
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"],
]

## Recommending what's popular

In [3]:
from collections import Counter

In [4]:
# Count, over the whole data set, how many users list each interest.
popular_interests = Counter(
    topic
    for one_users_topics in users_interests
    for topic in one_users_topics
)

In [5]:
# Ask the counter for the top five directly instead of sorting every entry
# and slicing; the result (including tie order) is the same.
popular_interests.most_common(5)

[('Python', 4), ('R', 4), ('Big Data', 3), ('HBase', 3), ('Java', 3)]

In [6]:
from typing import List, Tuple

In [7]:
def most_popular_new_interestes(
    user_interest: List[str], max_result: int = 5) -> List[str]:
    """Suggest the most popular interests the user does not already have.

    Args:
        user_interest: Interests the user already has; these are left out
            of the suggestions.
        max_result: Maximum number of suggestions to return.

    Returns:
        Up to ``max_result`` interest names, most popular first, taken from
        the module-level ``popular_interests`` counter.
    """
    # most_common() yields (interest, count) pairs; the count is dropped,
    # so the result is a list of names, not (name, count) tuples.
    suggestions = [interest
                   for interest, _ in popular_interests.most_common()
                   if interest not in user_interest]

    return suggestions[:max_result]


In [8]:
# Pass the user's interest list itself. Wrapping it in another list would make
# the "interest not in user_interest" filter compare strings against a list,
# so nothing the user already likes would ever be excluded.
most_popular_new_interestes(users_interests[4])

['Python', 'R', 'Big Data', 'HBase', 'Java']

## User-based collaborative filtering

In [9]:
# Every distinct interest in the data set, in sorted order. The position of an
# interest in this list is its index in the per-user interest vectors.
unique_interests = sorted(set().union(*users_interests))
                            

In [12]:
# Peek at the first three entries of the sorted interest list.
unique_interests[:3]

['Big Data', 'C++', 'Cassandra']

In [39]:
def make_user_interest_vector(user_interests: List[str]) -> List[int]:
    """Encode a user's interests as a 0/1 vector over ``unique_interests``.

    Args:
        user_interests: The interests of a single user.

    Returns:
        A list with one entry per element of the module-level
        ``unique_interests`` list, in the same order: 1 if the user has
        that interest, 0 otherwise.
    """
    return [1 if interest in user_interests else 0
            for interest in unique_interests]


# Show the 0/1 interest vector for the user at index 3.
print(make_user_interest_vector(users_interests[3]))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0]


In [18]:
# One 0/1 interest vector per user, in the same order as users_interests.
users_interest_vectors = list(map(make_user_interest_vector, users_interests))

In [21]:
# Show the interest vectors of the first three users.
print(users_interest_vectors[:3])

[[1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0]]


In [28]:
from numpy import dot
import math

In [29]:
def cosine_similarity(v1, v2) -> float:
    """Return the cosine of the angle between two equal-length vectors.

    Args:
        v1: First vector (a sequence of numbers).
        v2: Second vector, the same length as ``v1``.

    Returns:
        ``dot(v1, v2) / sqrt(dot(v1, v1) * dot(v2, v2))`` as a built-in
        ``float``. If either vector has zero length (all zeros or empty),
        the cosine is undefined and 0.0 is returned instead of NaN.
    """
    norm_product = math.sqrt(dot(v1, v1) * dot(v2, v2))
    if norm_product == 0:
        # A zero vector shares nothing with any other vector; report
        # "no similarity" rather than dividing by zero.
        return 0.0
    # dot() returns a numpy scalar; convert so the result matches the
    # annotated return type.
    return float(dot(v1, v2) / norm_product)

In [35]:
# Pairwise similarity matrix: user_similarities[a][b] is the cosine similarity
# between the interest vectors of users a and b.
user_similarities = [
    [cosine_similarity(column_vector, row_vector)
     for column_vector in users_interest_vectors]
    for row_vector in users_interest_vectors
]

In [63]:
def most_similar_users_to(user_id: int) -> List[Tuple[int, float]]:
    """Rank the other users by their similarity to ``user_id``.

    Reads row ``user_id`` of the module-level ``user_similarities`` matrix.

    Args:
        user_id: Index of the user to compare against.

    Returns:
        ``(other_user_id, similarity)`` pairs for every other user whose
        similarity is strictly positive, ordered from most to least similar.
    """
    candidates = []
    for other_user_id, similarity in enumerate(user_similarities[user_id]):
        if other_user_id == user_id:
            # A user is not a neighbour of themselves.
            continue
        if similarity > 0:
            candidates.append((other_user_id, similarity))

    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates

In [64]:
# Users with a positive similarity to user 0, most similar first.
most_similar_users_to(0)

[(9, 0.5669467095138409),
 (1, 0.3380617018914066),
 (8, 0.1889822365046136),
 (13, 0.1690308509457033),
 (5, 0.1543033499620919)]