In [74]:
import math


In [75]:
# Each inner list holds the interests of one user; the user's id is the
# position of that list within users_interests.
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]


In [76]:
# Dot product
def dot(v, w):
    """Return the dot product of vectors v and w.

    Elements are paired positionally with zip, so when the inputs differ in
    length the trailing elements of the longer one are ignored.
    """
    products = [left * right for left, right in zip(v, w)]
    return sum(products)

# Magnitude (Euclidean length)
def magnitude(v):
    """Return the length of v, i.e. the square root of dot(v, v)."""
    squared_length = dot(v, v)
    return math.sqrt(squared_length)

# Similarity calculation
def cosine_similarity(v, w):
    """Return the cosine similarity of vectors v and w.

    The result is dot(v, w) divided by the magnitude of each vector.

    Returns 0.0 when either vector has zero magnitude: the cosine is
    undefined there, and the unguarded division would raise
    ZeroDivisionError.
    """
    norm_v = magnitude(v)
    norm_w = magnitude(w)
    if norm_v == 0 or norm_w == 0:
        return 0.0
    return dot(v, w) / norm_v / norm_w


In [77]:
# Merge every user's interests into one set (dropping duplicates), then sort
# the result using the default string ordering.
unique_interests = sorted(set().union(*users_interests))
unique_interests


['Big Data',
 'C++',
 'Cassandra',
 'HBase',
 'Hadoop',
 'Haskell',
 'Java',
 'Mahout',
 'MapReduce',
 'MongoDB',
 'MySQL',
 'NoSQL',
 'Postgres',
 'Python',
 'R',
 'Spark',
 'Storm',
 'artificial intelligence',
 'databases',
 'decision trees',
 'deep learning',
 'libsvm',
 'machine learning',
 'mathematics',
 'neural networks',
 'numpy',
 'pandas',
 'probability',
 'programming languages',
 'regression',
 'scikit-learn',
 'scipy',
 'statistics',
 'statsmodels',
 'support vector machines',
 'theory']

In [78]:
# User-based recommendations
def make_user_vector(user_id):
    """Build the 0/1 interest vector for the user at index user_id.

    The vector has one entry per element of unique_interests, in the same
    order: 1 when the user lists that interest, 0 otherwise.
    """
    owned = users_interests[user_id]
    return [int(candidate in owned) for candidate in unique_interests]

def user_based_suggestions(user_id, suggestion_num=10):
    """Print interest suggestions for a user based on similar users.

    Every other user is compared with the target user by the cosine
    similarity of their interest vectors. Each interest listed by another
    user is scored with the sum of the similarities of the users who list
    it. Interests are printed in descending score order as "rank : interest",
    skipping anything the target user already lists.

    Args:
        user_id: Index of the target user in users_interests.
        suggestion_num: Maximum number of suggestions to print. Fewer are
            printed when not enough new interests exist.

    Returns:
        None. The suggestions are written to standard output.

    Note:
        Users with zero similarity still contribute their interests (with a
        score contribution of 0), so zero-score interests can appear at the
        end of the ranking.
    """
    own_interests = users_interests[user_id]
    user_vector = make_user_vector(user_id)

    similarities = [
        (other_id, cosine_similarity(user_vector, make_user_vector(other_id)))
        for other_id in range(len(users_interests))
        if other_id != user_id
    ]

    # Accumulate a score per interest; dict insertion order is kept so that
    # the stable sort below breaks ties by first appearance.
    scores = {}
    for other_id, similarity in similarities:
        for interest in users_interests[other_id]:
            scores[interest] = scores.get(interest, 0) + similarity

    ranked = sorted(scores.items(), key=lambda key_value: key_value[1], reverse=True)

    # Walk the ranking itself instead of indexing with a free-running
    # counter, so running out of candidates ends the loop rather than
    # raising IndexError.
    rank = 0
    for interest, _score in ranked:
        if rank >= suggestion_num:
            break
        if interest not in own_interests:
            print(rank, ':', interest)
            rank += 1
    

In [79]:
# Print up to 10 user-based suggestions for user 0.
user_based_suggestions(0, 10)


0 : MapReduce
1 : MongoDB
2 : Postgres
3 : NoSQL
4 : neural networks
5 : deep learning
6 : artificial intelligence
7 : databases
8 : MySQL
9 : Python


In [80]:
# Item-based recommendations
def get_column(A, j):
    """Return column j of matrix A as a list (element j of every row)."""
    column = []
    for row in A:
        column.append(row[j])
    return column

# One 0/1 row per user; columns follow the order of unique_interests.
users_interests_matrix = list(map(make_user_vector, range(len(users_interests))))
# Transposed view: one row per interest, one column per user.
interests_users_matrix = [
    get_column(users_interests_matrix, column_index)
    for column_index, _ in enumerate(unique_interests)
]

def make_interest_vector(interest):
    """Return the row of interests_users_matrix for the given interest.

    The row is located by the interest's position in unique_interests;
    list.index raises ValueError when the interest is not present.
    """
    return interests_users_matrix[unique_interests.index(interest)]

def item_based_suggestions(user_id, suggestion_num=10):
    """Print interest suggestions for a user based on similar interests.

    For each interest the user lists, every other interest is compared with
    it by the cosine similarity of their interest vectors, and those
    similarities are summed per candidate interest. Candidates are printed
    in descending score order as "rank : interest", skipping anything the
    user already lists.

    Args:
        user_id: Index of the target user in users_interests.
        suggestion_num: Maximum number of suggestions to print. Fewer are
            printed when not enough new interests exist.

    Returns:
        None. The suggestions are written to standard output.
    """
    own_interests = users_interests[user_id]

    # Accumulate a score per candidate; dict insertion order is kept so that
    # the stable sort below breaks ties by first appearance.
    scores = {}
    for interest in own_interests:
        interest_vector = make_interest_vector(interest)
        for candidate in unique_interests:
            if candidate != interest:
                similarity = cosine_similarity(interest_vector, make_interest_vector(candidate))
                scores[candidate] = scores.get(candidate, 0) + similarity

    ranked = sorted(scores.items(), key=lambda key_value: key_value[1], reverse=True)

    # Walk the ranking itself instead of indexing with a free-running
    # counter, so running out of candidates ends the loop rather than
    # raising IndexError.
    rank = 0
    for candidate, _score in ranked:
        if rank >= suggestion_num:
            break
        if candidate not in own_interests:
            print(rank, ':', candidate)
            rank += 1
    

In [81]:
# Print item-based suggestions for user 0, using the default suggestion count.
item_based_suggestions(0)

0 : MapReduce
1 : MongoDB
2 : Postgres
3 : NoSQL
4 : Haskell
5 : MySQL
6 : databases
7 : programming languages
8 : C++
9 : artificial intelligence
