# Recommender systems

In [107]:
# Imports
from __future__ import division
from collections import *
from functools import *
from pandas import *
from tabulate import tabulate
import math,re,random
import matplotlib.pyplot as plt
import numpy as np


In [108]:
def dot(v, w):
    """Return the dot product of ``v`` and ``w``.

    Elements are paired with ``zip``, so any extra trailing elements of the
    longer sequence are ignored. Two empty sequences give ``0``.
    """
    total = 0
    for v_i, w_i in zip(v, w):
        total = total + v_i * w_i
    return total
def tableIt(df):
    """Format ``df`` as a text table via ``tabulate``.

    Uses the data's keys as column headers and the "psql" table format.
    """
    table_options = {"headers": "keys", "tablefmt": "psql"}
    return tabulate(df, **table_options)

In [53]:
# Sample data set: one list of interest names per user.
# A user's id is that user's index in this list (user 0 is the first row).
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

## One way is to recommend what's popular

In [109]:
# Recommend what's popular: count how many users list each interest and
# keep the (interest, count) pairs ordered from most to least common.
interest_counts = Counter(
    topic
    for interests in users_interests
    for topic in interests
)
popular_interests = interest_counts.most_common()
popularity_table = DataFrame(popular_interests, columns=['Topic', 'freq'])
print(tableIt(popularity_table))

+----+-------------------------+--------+
|    | Topic                   |   freq |
|----+-------------------------+--------|
|  0 | Python                  |      4 |
|  1 | R                       |      4 |
|  2 | Big Data                |      3 |
|  3 | HBase                   |      3 |
|  4 | Java                    |      3 |
|  5 | statistics              |      3 |
|  6 | regression              |      3 |
|  7 | probability             |      3 |
|  8 | Hadoop                  |      2 |
|  9 | Cassandra               |      2 |
| 10 | MongoDB                 |      2 |
| 11 | Postgres                |      2 |
| 12 | scikit-learn            |      2 |
| 13 | statsmodels             |      2 |
| 14 | pandas                  |      2 |
| 15 | machine learning        |      2 |
| 16 | libsvm                  |      2 |
| 17 | C++                     |      2 |
| 18 | neural networks         |      2 |
| 19 | deep learning           |      2 |
| 20 | artificial intelligence |  

In [17]:
def most_popular_new_interest(user_interest, max_results=5):
    """Suggest the most popular interests the user does not already have.

    Args:
        user_interest: collection of interest names the user already has.
        max_results: maximum number of suggestions to return (default 5).

    Returns:
        A list of ``(interest, frequency)`` pairs taken from
        ``popular_interests`` in its existing order, skipping anything in
        ``user_interest`` and truncated to ``max_results`` entries.
    """
    suggestions = [(interest, frequency)
                   for interest, frequency in popular_interests
                   if interest not in user_interest]
    # Honour the caller's limit; the slice used to be hard-coded to 5,
    # which silently ignored ``max_results``.
    return suggestions[:max_results]
# Show the top five popularity-based suggestions for two sample users.
popular_for_user_1 = most_popular_new_interest(users_interests[1], 5)
popular_for_user_3 = most_popular_new_interest(users_interests[3], 5)
print("Recomendation for user 1", popular_for_user_1)
print("Recomendation for user 3", popular_for_user_3)

Recomendation for user 1 [('Python', 4), ('R', 4), ('Big Data', 3), ('Java', 3), ('statistics', 3)]
Recomendation for user 3 [('Big Data', 3), ('HBase', 3), ('Java', 3), ('Hadoop', 2), ('Cassandra', 2)]


## User-based Collaborative Filtering

In [19]:
def cosine_similarity(v, w):
    """Return the cosine of the angle between vectors ``v`` and ``w``.

    Computed as ``dot(v, w) / sqrt(dot(v, v) * dot(w, w))``.

    Returns:
        The similarity as a float. If either vector has zero magnitude
        (for example a user with no listed interests) the cosine is
        undefined; ``0.0`` is returned instead of raising
        ``ZeroDivisionError``.
    """
    norm_product = math.sqrt(dot(v, v) * dot(w, w))
    if norm_product == 0:
        # An all-zero vector shares nothing with any other vector.
        return 0.0
    return dot(v, w) / norm_product

In [31]:
# Every distinct interest across all users, sorted alphabetically.
# The position of an interest in this list is its column index in the
# user interest vectors built from it.
unique_interests = sorted(set().union(*users_interests))
print(unique_interests)

['Big Data', 'C++', 'Cassandra', 'HBase', 'Hadoop', 'Haskell', 'Java', 'Mahout', 'MapReduce', 'MongoDB', 'MySQL', 'NoSQL', 'Postgres', 'Python', 'R', 'Spark', 'Storm', 'artificial intelligence', 'databases', 'decision trees', 'deep learning', 'libsvm', 'machine learning', 'mathematics', 'neural networks', 'numpy', 'pandas', 'probability', 'programming languages', 'regression', 'scikit-learn', 'scipy', 'statistics', 'statsmodels', 'support vector machines', 'theory']


In [111]:
def make_user_interest_vector(user_interests):
    """Encode a user's interests as a 0/1 vector over ``unique_interests``.

    Position ``i`` of the result is 1 when ``unique_interests[i]`` is in
    ``user_interests`` and 0 otherwise.
    """
    vector = []
    for interest in unique_interests:
        vector.append(int(interest in user_interests))
    return vector
# One 0/1 interest vector per user (rows follow the order of users_interests).
user_interest_matrix = [make_user_interest_vector(interests)
                        for interests in users_interests]

# Pairwise user similarity: entry [i][j] is the cosine similarity between
# the interest vectors of user i and user j.
user_similarities = [
    [cosine_similarity(row_i, row_j) for row_j in user_interest_matrix]
    for row_i in user_interest_matrix
]
user_similarity_frame = DataFrame(user_similarities)
print(user_similarity_frame)

          0         1         2         3         4         5         6   \
0   1.000000  0.338062  0.000000  0.000000  0.000000  0.154303  0.000000   
1   0.338062  1.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
2   0.000000  0.000000  1.000000  0.182574  0.000000  0.166667  0.000000   
3   0.000000  0.000000  0.182574  1.000000  0.223607  0.365148  0.447214   
4   0.000000  0.000000  0.000000  0.223607  1.000000  0.000000  0.000000   
5   0.154303  0.000000  0.166667  0.365148  0.000000  1.000000  0.000000   
6   0.000000  0.000000  0.000000  0.447214  0.000000  0.000000  1.000000   
7   0.000000  0.000000  0.204124  0.000000  0.250000  0.000000  0.000000   
8   0.188982  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
9   0.566947  0.000000  0.000000  0.000000  0.000000  0.204124  0.000000   
10  0.000000  0.000000  0.235702  0.516398  0.000000  0.235702  0.288675   
11  0.000000  0.000000  0.000000  0.223607  0.000000  0.204124  0.250000   
12  0.000000

In [58]:
def most_similar_users_to(user_id):
    """Rank the other users by their similarity to ``user_id``.

    Reads row ``user_id`` of ``user_similarities`` and returns
    ``(other_user_id, similarity)`` pairs, most similar first. The user
    themself and users with a similarity that is not above zero are left out.
    """
    ranked = []
    for candidate_id, score in enumerate(user_similarities[user_id]):
        if candidate_id != user_id and score > 0:
            ranked.append((candidate_id, score))
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked
# Example: the users most similar to user 0, best match first.
most_similar_users_to(0)

[(9, 0.5669467095138409),
 (1, 0.3380617018914066),
 (8, 0.1889822365046136),
 (13, 0.1690308509457033),
 (5, 0.1543033499620919)]

In [81]:
def user_based_suggestions(user_id, include_current_interest=False):
    """Suggest interests to a user based on what similar users like.

    Each interest of every similar user (as returned by
    ``most_similar_users_to``) is credited with that user's similarity;
    the credits are summed per interest.

    Args:
        user_id: index of the user in ``users_interests``.
        include_current_interest: when true, keep interests the user
            already has in the result.

    Returns:
        A list of ``(interest, weight)`` pairs sorted by descending weight.
    """
    weights = defaultdict(float)
    for neighbour_id, score in most_similar_users_to(user_id):
        for topic in users_interests[neighbour_id]:
            weights[topic] += score

    ranked = sorted(weights.items(), key=lambda item: item[1], reverse=True)

    if include_current_interest:
        return ranked

    # Drop anything the user already lists.
    already_known = users_interests[user_id]
    return [(topic, weight)
            for topic, weight in ranked
            if topic not in already_known]

# Display user-based suggestions for the first three users; only the
# first table gets labelled columns.
suggestions_for_user_0 = DataFrame(user_based_suggestions(0),
                                   columns=["uSugges", "uSim"])
print(suggestions_for_user_0)
for sample_user in (1, 2):
    print(DataFrame(user_based_suggestions(sample_user)))
        

                    uSugges      uSim
0                 MapReduce  0.566947
1                   MongoDB  0.507093
2                  Postgres  0.507093
3                     NoSQL  0.338062
4           neural networks  0.188982
5             deep learning  0.188982
6   artificial intelligence  0.188982
7                 databases  0.169031
8                     MySQL  0.169031
9                    Python  0.154303
10                        R  0.154303
11                      C++  0.154303
12                  Haskell  0.154303
13    programming languages  0.154303
           0         1
0  databases  0.600000
1      MySQL  0.600000
2     Hadoop  0.338062
3   Big Data  0.338062
4       Java  0.338062
5      Spark  0.338062
6      Storm  0.338062
                        0         1
0                       R  1.056348
1              statistics  0.418276
2        machine learning  0.204124
3                  Mahout  0.204124
4         neural networks  0.204124
5              regression  0.1

## Item-based Collaborative Filtering

In [67]:
# Transpose of user_interest_matrix: row j holds, for every user, the 0/1
# flag saying whether that user has interest unique_interests[j].
interest_user_matrix = [
    [user_row[column] for user_row in user_interest_matrix]
    for column in range(len(unique_interests))
]
interest_user_frame = DataFrame(interest_user_matrix)
print(interest_user_frame)

    0   1   2   3   4   5   6   7   8   9   10  11  12  13  14
0    1   0   0   0   0   0   0   0   1   1   0   0   0   0   0
1    0   0   0   0   0   1   0   0   0   0   0   1   0   0   0
2    1   1   0   0   0   0   0   0   0   0   0   0   0   0   0
3    1   1   0   0   0   0   0   0   0   0   0   0   0   1   0
4    1   0   0   0   0   0   0   0   0   1   0   0   0   0   0
5    0   0   0   0   0   1   0   0   0   0   0   0   0   0   0
6    1   0   0   0   0   1   0   0   0   1   0   0   0   0   0
7    0   0   0   0   0   0   0   1   0   0   0   0   0   0   0
8    0   0   0   0   0   0   0   0   0   1   0   0   0   0   0
9    0   1   0   0   0   0   0   0   0   0   0   0   0   1   0
10   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0
11   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0
12   0   1   0   0   0   0   0   0   0   0   0   0   0   1   0
13   0   0   1   1   0   1   0   0   0   0   0   0   1   0   0
14   0   0   0   1   0   1   0   0   0   0   1   0   1 

In [72]:
# Interest-to-interest similarity matrix: each entry is the cosine
# similarity between the user vectors of two interests.
interest_similarities = [
    [cosine_similarity(inner_vector, outer_vector)
     for inner_vector in interest_user_matrix]
    for outer_vector in interest_user_matrix
]
# print(DataFrame(interest_similarities))

In [75]:
def most_similar_interest_to(interest_id):
    """Rank the other interests by similarity to interest ``interest_id``.

    Reads row ``interest_id`` of ``interest_similarities`` and returns
    ``(interest_name, similarity)`` pairs, most similar first. The interest
    itself and interests with a similarity that is not above zero are
    left out.
    """
    ranked = []
    for candidate_id, score in enumerate(interest_similarities[interest_id]):
        if candidate_id != interest_id and score > 0:
            ranked.append((unique_interests[candidate_id], score))
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked
# Example: interests most similar to interest 0 (the first entry of unique_interests).
print(DataFrame(most_similar_interest_to(0)))

                         0         1
0                   Hadoop  0.816497
1                     Java  0.666667
2                MapReduce  0.577350
3                    Spark  0.577350
4                    Storm  0.577350
5                Cassandra  0.408248
6  artificial intelligence  0.408248
7            deep learning  0.408248
8          neural networks  0.408248
9                    HBase  0.333333


In [80]:
def item_based_suggestions(user_id, include_current_interests=False):
    """Suggest interests to a user based on interests similar to their own.

    For every interest flagged 1 in the user's row of
    ``user_interest_matrix``, the similar interests returned by
    ``most_similar_interest_to`` are credited with their similarity; the
    credits are summed per interest.

    Args:
        user_id: index of the user in ``user_interest_matrix`` and
            ``users_interests``.
        include_current_interests: when true, keep interests the user
            already has in the result.

    Returns:
        A list of ``(interest, weight)`` pairs sorted by descending weight.
    """
    totals = defaultdict(float)
    for interest_id, flag in enumerate(user_interest_matrix[user_id]):
        if not (flag == 1):
            continue
        for name, score in most_similar_interest_to(interest_id):
            totals[name] += score

    ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)

    if include_current_interests:
        return ranked

    # Drop anything the user already lists.
    owned = users_interests[user_id]
    return [(name, weight) for name, weight in ranked if name not in owned]
# Example: item-based suggestions for user 0, shown with labelled columns.
print(DataFrame(item_based_suggestions(0),columns = ["iSuggest","iSim"]))

                   iSuggest      iSim
0                 MapReduce  1.861807
1                   MongoDB  1.316497
2                  Postgres  1.316497
3                     NoSQL  1.284457
4                     MySQL  0.577350
5                 databases  0.577350
6                   Haskell  0.577350
7     programming languages  0.577350
8   artificial intelligence  0.408248
9             deep learning  0.408248
10          neural networks  0.408248
11                      C++  0.408248
12                   Python  0.288675
13                        R  0.288675


In [114]:
def printing_sugg(id):
    """Print one user's interests, then both kinds of suggestions side by side.

    The user-based and item-based suggestion lists are placed in adjacent
    column pairs of a single table, which is rendered with ``tableIt``.
    """
    print("For user ",id,"With interest",users_interests[id])
    user_frame = DataFrame(user_based_suggestions(id),
                           columns=["user based sugg", "uSim"])
    item_frame = DataFrame(item_based_suggestions(id),
                           columns=["item based sugg", "iSim"])
    side_by_side = concat([user_frame, item_frame], axis=1, sort=False)
    print(tableIt(side_by_side))


In [115]:
# Print the suggestion comparison table for every user in turn.
for i in range(len(users_interests)):
    printing_sugg(i)

For user  0 With interest ['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']
+----+-------------------------+----------+-------------------------+----------+
|    | user based sugg         |     uSim | item based sugg         |     iSim |
|----+-------------------------+----------+-------------------------+----------|
|  0 | MapReduce               | 0.566947 | MapReduce               | 1.86181  |
|  1 | MongoDB                 | 0.507093 | MongoDB                 | 1.3165   |
|  2 | Postgres                | 0.507093 | Postgres                | 1.3165   |
|  3 | NoSQL                   | 0.338062 | NoSQL                   | 1.28446  |
|  4 | neural networks         | 0.188982 | MySQL                   | 0.57735  |
|  5 | deep learning           | 0.188982 | databases               | 0.57735  |
|  6 | artificial intelligence | 0.188982 | Haskell                 | 0.57735  |
|  7 | databases               | 0.169031 | programming languages   | 0.57735  |
|  8 | MySQL