In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [4]:
#------------------
# LOAD THE DATASET
#------------------

# Raw table read from the CSV. The rest of the notebook relies on it having
# a `user` id column; every other column is treated as one item.
data = pd.read_csv('lastfm.csv')

# Create a new dataframe without the user ids.
# The axis is named explicitly with `columns=`: passing it positionally
# (`drop('user', 1)`) is deprecated since pandas 1.3 and rejected by pandas 2.x.
data_items = data.drop(columns='user')

In [5]:
#------------------------
# ITEM-ITEM CALCULATIONS
#------------------------

# As a first step we normalize the user vectors (the rows of data_items)
# to unit vectors.

# magnitude = sqrt(x2 + y2 + z2 + ...), one value per row.
magnitude = np.sqrt(np.square(data_items).sum(axis=1))

# A row whose entries are all zero has magnitude 0, and dividing it by 0
# would turn the whole row into NaN. Dividing such rows by 1 instead leaves
# them as all-zero rows; rows with a positive magnitude are unaffected.
magnitude = magnitude.where(magnitude > 0, 1.0)

# unitvector = (x / magnitude, y / magnitude, z / magnitude, ...)
# Note: this rebinds data_items, so from here on it holds the normalized
# values rather than the raw ones read from the CSV.
data_items = data_items.divide(magnitude, axis='index')

def calculate_similarity(data_items):
    """Return the pairwise cosine similarity between the columns of a frame.

    The frame is converted to a SciPy CSR matrix and transposed so that each
    original column becomes a row, which is the orientation in which
    `cosine_similarity` compares vectors. The result is a square DataFrame
    whose index and columns are both `data_items.columns`.
    """
    labels = data_items.columns
    columns_as_rows = sparse.csr_matrix(data_items).transpose()
    return pd.DataFrame(
        data=cosine_similarity(columns_as_rows),
        index=labels,
        columns=labels,
    )

# Build the item-item similarity matrix (labelled by item on both axes).
data_matrix = calculate_similarity(data_items)

# Let's get the top 11 similar artists for Beyonce. The first entry is
# 'beyonce' itself (similarity 1.0), which leaves ten other artists.
print(data_matrix.loc['beyonce'].nlargest(11))

beyonce               1.000000
the pussycat dolls    0.351871
rihanna               0.334919
christina aguilera    0.308388
alicia keys           0.297264
justin timberlake     0.279726
britney spears        0.269557
leona lewis           0.259237
maria mena            0.248751
kelly clarkson        0.245713
nelly furtado         0.230789
Name: beyonce, dtype: float64


In [6]:
#------------------------
# USER-ITEM CALCULATIONS
#------------------------

user = 5985 # The id of the user for whom we want to generate recommendations

# Index labels of the rows in the raw frame that belong to this user.
matching_rows = data[data.user == user].index.tolist()
if not matching_rows:
    # Same exception type as the original `[...][0]` lookup, but with a
    # message that says what actually went wrong.
    raise IndexError('user {} not found in the dataset'.format(user))
user_index = matching_rows[0] # Get the frame index

# The user's row of data_items across all items. `user_index` is an index
# *label* taken from `data.index`, so it is looked up with .loc; .iloc is
# positional and only agrees while the index is the default 0..n-1 range.
# data_items was normalized above, so the entries are 0 or the user's
# unit-vector weight, not raw 0/1 flags.
user_rating_vector = data_items.loc[user_index]

# Get the artists the user has liked (the non-zero entries of the row).
known_user_likes = user_rating_vector[user_rating_vector > 0].index.values

# Calculate the score: for every item, the similarity-weighted sum of the
# user's vector, divided by that item's total similarity to all items.
score = data_matrix.dot(user_rating_vector).div(data_matrix.sum(axis=1))

# Remove the known likes from the recommendation.
score = score.drop(known_user_likes)

# Print the known likes and the top 20 recommendations.
print(known_user_likes)
print(score.nlargest(20))

['bob dylan' 'the cure']
joy division           0.026872
the smiths             0.022372
david bowie            0.022337
yann tiersen           0.022326
the rolling stones     0.021417
tom waits              0.019191
eric clapton           0.018236
misfits                0.018096
led zeppelin           0.016528
belle and sebastian    0.015974
elliott smith          0.015651
jimi hendrix           0.015266
the national           0.013065
ramones                0.012192
the beatles            0.012107
mogwai                 0.011506
the doors              0.011391
sufjan stevens         0.011298
bruce springsteen      0.010952
the clash              0.010874
dtype: float64


In [8]:
#------------------------
# USER-ITEM CALCULATIONS(Using Neighbor)
#------------------------

# Construct a new dataframe with the 10 closest neighbours (most similar)
# for each artist.
#
# Layout of the result: one row per item, columns 1..10 holding the *labels*
# of that item's most similar items (the similarity values themselves are
# not stored here):
#
#       1 2 3 4 5 6 7 8 9 10
#     a <labels of the 10 items most similar to a>
#     b
#     c
#     d
#     e
#
# For each column of the similarity matrix, ascending=False sorts from the
# largest similarity to the smallest, so the first 10 index labels are that
# item's nearest neighbours. The rows are collected first and the frame is
# built in one step instead of being filled in row by row.
neighbour_labels = [
    data_matrix.iloc[0:, i].sort_values(ascending=False)[:10].index.tolist()
    for i in range(len(data_matrix.columns))
]
data_neighbours = pd.DataFrame(
    neighbour_labels,
    index=data_matrix.columns,
    columns=range(1, 11),
)
# The id of the user to recommend for, and the index label of that user's
# row in the raw frame.
user = 5985
user_index = data[data.user == user].index.tolist()[0]

# Get the artists the user has played (the non-zero entries of the row).
# `user_index` is an index label, so it is looked up with .loc, matching
# the `data_items.loc[user_index]` lookup used further down in this cell.
known_user_likes = data_items.loc[user_index]
known_user_likes = known_user_likes[known_user_likes >0].index.values


# Construct the neighbourhood from the most similar items to the
# ones our user has already liked.
#
# Select from data_neighbours the rows of the items the user likes, e.g.
#
#       1 2 3 4 5 6 7 8 9 10
#     a 9 8 7 6 5 4 3 2 1 10
#     c 4 3 2 1 7 8 9 5 6 10
#     e 3 2 4 5 6 7 8 1 9 10
most_similar_to_likes = data_neighbours.loc[known_user_likes]

# Flatten those rows into a single sequence of labels and drop duplicates:
#
#     [9 8 7 6 5 4 3 2 1 10 4 3 2 1 7 8 9 5 6 10 3 2 4 5 6 7 8 1 9 10]
#                  ||
#                  vv
#     [9 8 7 6 5 4 3 2 1 10]
#
# pd.unique keeps first-seen order. list(set(...)) would also de-duplicate,
# but the iteration order of a set of strings changes from one interpreter
# run to the next, which makes the row order of everything below vary.
similar_list = pd.unique(most_similar_to_likes.values.ravel()).tolist()

# Cut the similarity matrix down to just the neighbourhood items on both
# axes. This smaller "neighbourhood" matrix is then used exactly as the
# full data_matrix was used in the previous calculation.
neighbourhood = data_matrix.loc[similar_list, similar_list]

# A user vector containing only the neighbourhood items and
# the known user likes.
user_vector = data_items.loc[user_index, similar_list]

# Calculate the score.
score = neighbourhood.dot(user_vector).div(neighbourhood.sum(axis=1))

# Drop the known likes.
score = score.drop(known_user_likes)

# nlargest(20) returns fewer than 20 rows when the neighbourhood, minus the
# known likes, holds fewer than 20 items.
print(known_user_likes)
print(score.nlargest(20))

['bob dylan' 'the cure']
joy division           0.087840
the smiths             0.087004
the rolling stones     0.084162
david bowie            0.081768
tom waits              0.075365
belle and sebastian    0.070918
eric clapton           0.069710
misfits                0.069017
the beatles            0.067101
elliott smith          0.067058
ramones                0.064136
jimi hendrix           0.060558
depeche mode           0.057806
johnny cash            0.055420
the doors              0.047377
dtype: float64


In [9]:
# Display the item-item cosine similarity matrix built by
# calculate_similarity; its index and its columns are both the item columns.
data_matrix

Unnamed: 0,a perfect circle,abba,ac/dc,adam green,aerosmith,afi,air,alanis morissette,alexisonfire,alicia keys,...,timbaland,tom waits,tool,tori amos,travis,trivium,u2,underoath,volbeat,yann tiersen
a perfect circle,1.000000,0.000000,0.009250,0.032188,0.066869,0.000000,0.038886,0.039923,0.000000,0.000000,...,0.029562,0.057362,0.349047,0.106208,0.018213,0.079469,0.017868,0.067862,0.044661,0.000000
abba,0.000000,1.000000,0.024286,0.009154,0.029176,0.000000,0.005186,0.026254,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.020101,0.014697,0.000000,0.055088,0.000000,0.010399,0.000000
ac/dc,0.009250,0.024286,1.000000,0.072087,0.148919,0.058515,0.057011,0.022184,0.000000,0.055834,...,0.019406,0.039212,0.031971,0.017305,0.000000,0.076235,0.084368,0.008948,0.081664,0.000000
adam green,0.032188,0.009154,0.072087,1.000000,0.038816,0.000000,0.107045,0.000000,0.000000,0.013226,...,0.000000,0.111741,0.055024,0.028686,0.093177,0.013753,0.012631,0.000000,0.010993,0.056582
aerosmith,0.066869,0.029176,0.148919,0.038816,1.000000,0.000000,0.073653,0.071938,0.000000,0.032899,...,0.037667,0.017707,0.011698,0.034734,0.038125,0.000000,0.163532,0.000000,0.054068,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
trivium,0.079469,0.000000,0.076235,0.013753,0.000000,0.055084,0.000000,0.000000,0.062955,0.000000,...,0.025153,0.000000,0.042274,0.000000,0.017425,1.000000,0.010879,0.101321,0.028567,0.000000
u2,0.017868,0.055088,0.084368,0.012631,0.163532,0.026011,0.050531,0.076246,0.000000,0.024590,...,0.008318,0.053003,0.037373,0.036630,0.058571,0.010879,1.000000,0.000000,0.010628,0.000000
underoath,0.067862,0.000000,0.008948,0.000000,0.000000,0.120521,0.000000,0.000000,0.204676,0.000000,...,0.013037,0.020995,0.032726,0.014635,0.000000,0.101321,0.000000,1.000000,0.000000,0.000000
volbeat,0.044661,0.010399,0.081664,0.010993,0.054068,0.018697,0.021082,0.000000,0.000000,0.000000,...,0.010321,0.014300,0.047499,0.000000,0.000000,0.028567,0.010628,0.000000,1.000000,0.026184


In [10]:
# Peek at the first rows of the raw frame read from lastfm.csv; it still
# includes the `user` id column.
data.head()

Unnamed: 0,user,a perfect circle,abba,ac/dc,adam green,aerosmith,afi,air,alanis morissette,alexisonfire,...,timbaland,tom waits,tool,tori amos,travis,trivium,u2,underoath,volbeat,yann tiersen
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,33,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,42,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,51,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,62,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Peek at the first rows of data_items. It was rebound to its row-normalized
# form in the item-item cell, so non-zero entries are unit-vector components
# rather than the raw values from the CSV.
data_items.head()

Unnamed: 0,a perfect circle,abba,ac/dc,adam green,aerosmith,afi,air,alanis morissette,alexisonfire,alicia keys,...,timbaland,tom waits,tool,tori amos,travis,trivium,u2,underoath,volbeat,yann tiersen
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.204124,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
