Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
151 lines (125 sloc) 6.24 KB
#coding:utf-8
from __future__ import division
from collections import Counter
print "---有興趣的有哪些---"
users_interests = [
["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
["R", "Python", "statistics", "regression", "probability"],
["machine learning", "regression", "decision trees", "libsvm"],
["Python", "R", "Java", "C++", "Haskell", "programming languages"],
["statistics", "probability", "mathematics", "theory"],
["machine learning", "scikit-learn", "Mahout", "neural networks"],
["neural networks", "deep learning", "Big Data", "artificial intelligence"],
["Hadoop", "Java", "MapReduce", "Big Data"],
["statistics", "R", "statsmodels"],
["C++", "deep learning", "artificial intelligence", "probability"],
["pandas", "R", "Python"],
["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
["libsvm", "regression", "support vector machines"]
]
popular_interests = Counter(interest
for user_interests in users_interests
for interest in user_interests).most_common()
print "Popular Interests"
print popular_interests
print "---用大眾資料推薦你適合甚麼---"
def most_popular_new_interests(user_interests, max_results=5):
suggestions = [(interest, frequency)
for interest, frequency in popular_interests
if interest not in user_interests]
return suggestions[:max_results]
print "Most Popular New Interests"
print "already like:", ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"]
print most_popular_new_interests(["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"])
print "Me like Python Java Hbase"
print most_popular_new_interests(["Python","Java","Hbase"])
# 用相似的人與你喜歡進行分析,算出哪個跟你最相似
print "---與誰最相似---"
from linear_algebra import dot
import math
unique_interests = sorted(list({ interest
for user_interests in users_interests
for interest in user_interests }))
def make_user_interest_vector(user_interests):
"""given a list of interests, produce a vector whose i-th element is 1
if unique_interests[i] is in the list, 0 otherwise"""
return [1 if interest in user_interests else 0
for interest in unique_interests]
# 兩兩做比對,算出哪個跟你最相近
def cosine_similarity(v, w):
return dot(v, w) / math.sqrt(dot(v, v) * dot(w, w))
user_interest_matrix = map(make_user_interest_vector, users_interests)
user_similarities = [[cosine_similarity(interest_vector_i, interest_vector_j)
for interest_vector_j in user_interest_matrix]
for interest_vector_i in user_interest_matrix]
def most_similar_users_to(user_id):
pairs = [(other_user_id, similarity) # find other
for other_user_id, similarity in # users with
enumerate(user_similarities[user_id]) # nonzero
if user_id != other_user_id and similarity > 0] # similarity
return sorted(pairs, # sort them
key=lambda (_, similarity): similarity, # most similar
reverse=True) # first
print "User based similarity"
print "most similar to 11",most_similar_users_to(11)
print "---以誰為基準進行推薦---"
from collections import defaultdict
def user_based_suggestions(user_id, include_current_interests=False):
# sum up the similarities
suggestions = defaultdict(float)
for other_user_id, similarity in most_similar_users_to(user_id):
for interest in users_interests[other_user_id]:
suggestions[interest] += similarity
# convert them to a sorted list
suggestions = sorted(suggestions.items(),
key=lambda (_, weight): weight,
reverse=True)
# and (maybe) exclude already-interests
if include_current_interests:
return suggestions
else:
return [(suggestion, weight)
for suggestion, weight in suggestions
if suggestion not in users_interests[user_id]]
print "Suggestions for 10"
print user_based_suggestions(10)
print "---以項目為基準進行興趣分析---"
interest_user_matrix = [[user_interest_vector[j]
for user_interest_vector in user_interest_matrix]
for j, _ in enumerate(unique_interests)]
interest_similarities = [[cosine_similarity(user_vector_i, user_vector_j)
for user_vector_j in interest_user_matrix]
for user_vector_i in interest_user_matrix]
def most_similar_interests_to(interest_id):
similarities = interest_similarities[interest_id]
pairs = [(unique_interests[other_interest_id], similarity)
for other_interest_id, similarity in enumerate(similarities)
if interest_id != other_interest_id and similarity > 0]
return sorted(pairs,
key=lambda (_, similarity): similarity,
reverse=True)
print "Item based similarity"
print "most similar to 'Big Data'"
print most_similar_interests_to(0)
print "---以項目為基準進行推薦分析---"
def item_based_suggestions(user_id, include_current_interests=False):
suggestions = defaultdict(float)
user_interest_vector = user_interest_matrix[user_id]
for interest_id, is_interested in enumerate(user_interest_vector):
if is_interested == 1:
similar_interests = most_similar_interests_to(interest_id)
for interest, similarity in similar_interests:
suggestions[interest] += similarity
suggestions = sorted(suggestions.items(),
key=lambda (_, similarity): similarity,
reverse=True)
if include_current_interests:
return suggestions
else:
return [(suggestion, weight)
for suggestion, weight in suggestions
if suggestion not in users_interests[user_id]]
print "suggestions for user 0"
print item_based_suggestions(0)