<a href="https://colab.research.google.com/github/lkarjun/Data-Science-from-Scratch/blob/master/23%20Recommender%20Systems/recommender_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Recommender Systems

In [15]:
# Each inner list holds the interests of one user; a user's id is the
# index of their list in the outer list.
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"],
]

### Recommending what's popular

In [3]:
from collections import Counter

In [4]:
# Tally how many users list each interest, across every user's list.
popular_interests = Counter(
    topic
    for interests in users_interests
    for topic in interests
)

In [5]:
from typing import Dict, List, Tuple

In [6]:
def most_popular_new_interests(
        user_interests: List[str],
        max_results: int = 5) -> List[Tuple[str, int]]:
    """Suggest the most popular interests the user does not already have.

    Args:
        user_interests: interests the user already has; these are never
            suggested.
        max_results: maximum number of suggestions to return.

    Returns:
        Up to ``max_results`` ``(interest, frequency)`` pairs, in the order
        produced by ``popular_interests.most_common()`` (most frequent first).
    """
    # Build the lookup set once so each membership test below is a hash
    # lookup rather than a scan of the user's list.
    already_known = set(user_interests)

    suggestions = [(interest, frequency)
                   for interest, frequency in popular_interests.most_common()
                   if interest not in already_known]

    return suggestions[:max_results]


In [7]:
# Most popular interests that user 1 does not already have (default: top 5).
most_popular_new_interests(users_interests[1])

[('Python', 4), ('R', 4), ('Big Data', 3), ('Java', 3), ('statistics', 3)]

### User-Based Collaborative Filtering

In [8]:
# Sorted list of every distinct interest. An interest's index in this list
# is its position in the vectors built by make_user_interest_vector.
unique_interests = sorted(set().union(*users_interests))

In [9]:
# Peek at the first four entries of the sorted interest list.
unique_interests[:4]

['Big Data', 'C++', 'Cassandra', 'HBase']

In [10]:
def make_user_interest_vector(user_interests: List[str]) -> List[int]:
    """Encode a list of interests as a 0/1 vector over ``unique_interests``.

    The ith element of the result is 1 if ``unique_interests[i]`` is in
    ``user_interests`` and 0 otherwise, so the result always has
    ``len(unique_interests)`` entries.
    """
    # Build the lookup set once; otherwise every unique interest would
    # trigger a linear scan of the user's list.
    owned = set(user_interests)
    return [1 if interest in owned else 0
            for interest in unique_interests]

In [17]:
# One 0/1 interest vector per user, in the same order as users_interests.
user_interest_vectors = list(map(make_user_interest_vector, users_interests))

In [20]:
from linearalgebra import dot
import math

def cosine_similarity(v1, v2) -> float:
    """Return the cosine of the angle between vectors ``v1`` and ``v2``.

    Computed as ``dot(v1, v2) / sqrt(dot(v1, v1) * dot(v2, v2))``.

    If either vector has zero magnitude the angle is undefined; 0.0 is
    returned in that case instead of raising ``ZeroDivisionError``, so a
    vector with no non-zero entries is simply similar to nothing.
    """
    squared_norm_product = dot(v1, v1) * dot(v2, v2)
    if squared_norm_product == 0:
        return 0.0
    return dot(v1, v2) / math.sqrt(squared_norm_product)


# Pairwise similarity matrix: user_similarities[i][j] is the cosine
# similarity between the interest vectors of users i and j.
user_similarities = [
    [cosine_similarity(row_vector, column_vector)
     for column_vector in user_interest_vectors]
    for row_vector in user_interest_vectors
]

In [21]:
def most_similar_users_to(user_id: int) -> List[Tuple[int, float]]:
    """Rank the other users by their similarity to ``user_id``.

    Reads the row ``user_similarities[user_id]``, drops the user themself
    and anyone whose similarity is not positive, and returns the remaining
    ``(other_user_id, similarity)`` pairs with the highest similarity first.
    """
    ranked = []
    for other_user_id, similarity in enumerate(user_similarities[user_id]):
        # Skip the user's own entry and anyone without a positive similarity.
        if other_user_id == user_id or not similarity > 0:
            continue
        ranked.append((other_user_id, similarity))

    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [22]:
# Users with a positive similarity to user 0, most similar first.
most_similar_users_to(0)

[(9, 0.5669467095138409),
 (1, 0.3380617018914066),
 (8, 0.1889822365046136),
 (13, 0.1690308509457033),
 (5, 0.1543033499620919)]

In [23]:
from collections import defaultdict

In [26]:
def user_based_suggestions(user_id: int,
                           include_current_interests: bool = False
                           ) -> List[Tuple[str, float]]:
    """Suggest interests for ``user_id`` based on what similar users like.

    Every interest held by a user returned from ``most_similar_users_to`` is
    scored by summing the similarities of the users who have it.

    Args:
        user_id: index of the user in ``users_interests``.
        include_current_interests: if False (the default), interests the
            user already has are left out of the result.

    Returns:
        ``(interest, weight)`` pairs sorted by weight, highest first.
    """
    weights: Dict[str, float] = defaultdict(float)

    for other_user_id, similarity in most_similar_users_to(user_id):
        for interest in users_interests[other_user_id]:
            weights[interest] += similarity

    ranked = sorted(weights.items(), key=lambda pair: pair[1], reverse=True)

    if include_current_interests:
        return ranked

    # Build the lookup set once so filtering is a hash lookup per suggestion.
    current_interests = set(users_interests[user_id])
    return [(interest, weight)
            for interest, weight in ranked
            if interest not in current_interests]

In [27]:
# Suggestions for user 0, excluding interests user 0 already has.
user_based_suggestions(0)

[('MapReduce', 0.5669467095138409),
 ('MongoDB', 0.50709255283711),
 ('Postgres', 0.50709255283711),
 ('NoSQL', 0.3380617018914066),
 ('neural networks', 0.1889822365046136),
 ('deep learning', 0.1889822365046136),
 ('artificial intelligence', 0.1889822365046136),
 ('databases', 0.1690308509457033),
 ('MySQL', 0.1690308509457033),
 ('Python', 0.1543033499620919),
 ('R', 0.1543033499620919),
 ('C++', 0.1543033499620919),
 ('Haskell', 0.1543033499620919),
 ('programming languages', 0.1543033499620919)]