In [1]:
import pandas as pd
import numpy as np
import math

# 1 Data Cleaning

In [2]:
artists = "artists.dat"
tags = "tags.txt"
user_artists = "user_artists.dat"
user_friends = "user_friends.dat"
user_taggedartists_timestamps = "user_taggedartists-timestamps.dat"
user_taggedartists = "user_taggedartists.dat"

In [3]:
total_table = pd.read_table(user_taggedartists)
user_artists_df = pd.read_table(user_artists)
tags_df = pd.read_table(tags,encoding = "utf-8")
user_friends_df = pd.read_table(user_friends)
artists_df = pd.read_table(artists)

In [4]:
user_artists_df.head()
artists_df.columns = ["artistID","name","url","pictureURL"]
total_table = pd.merge(user_artists_df,artists_df,left_on = "artistID",right_on = "artistID")

In [5]:
df = total_table

## Remove noise

- ** Remove users whose frequent listening artists less than 25 **
- ** Data from 92837 to 92533 **

In [6]:
g = df.groupby('userID') 
df = g.filter(lambda x: len(x) > 25)

## Normalizing data
As the listening frequencies of each users differ a lot from zero to 10 thousand, we try to normalize data in two ways:
>**Deal it with bianry data**: Regard all weighted artists as one and unweighted as zero

>**Normalized it with max value**:
>- Weight data shows artists users often listen which means there only artists users like
>- Use freq = weight(i)/max(weight)

In [7]:
df["weight_a"] = df["weight"]
df["weight_a"][df["weight_a"] > 0] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [8]:
unique_ids = sorted(set(df.artistID))
unique_users = sorted(set(df.userID))
artist_ix = {id:ix for (ix,id) in enumerate(unique_ids)}
user_ix = {id:ix for (ix,id) in enumerate(unique_users)}
df['artist_ix'] = df.artistID.map(artist_ix)
df['user_ix'] = df.userID.map(user_ix)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,userID,artistID,weight,name,url,pictureURL,weight_a,artist_ix,user_ix
0,2,51,13883,Duran Duran,http://www.last.fm/music/Duran+Duran,http://userserve-ak.last.fm/serve/252/155668.jpg,1,45,0
1,4,51,228,Duran Duran,http://www.last.fm/music/Duran+Duran,http://userserve-ak.last.fm/serve/252/155668.jpg,1,45,2
2,27,51,85,Duran Duran,http://www.last.fm/music/Duran+Duran,http://userserve-ak.last.fm/serve/252/155668.jpg,1,45,24
3,28,51,10,Duran Duran,http://www.last.fm/music/Duran+Duran,http://userserve-ak.last.fm/serve/252/155668.jpg,1,45,25
4,62,51,528,Duran Duran,http://www.last.fm/music/Duran+Duran,http://userserve-ak.last.fm/serve/252/155668.jpg,1,45,58


In [9]:
def norm(values):
    return values/values.max()
df['freq'] = df.groupby('user_ix').weight.transform(norm)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,userID,artistID,weight,name,url,pictureURL,weight_a,artist_ix,user_ix,freq
0,2,51,13883,Duran Duran,http://www.last.fm/music/Duran+Duran,http://userserve-ak.last.fm/serve/252/155668.jpg,1,45,0,1.0
1,4,51,228,Duran Duran,http://www.last.fm/music/Duran+Duran,http://userserve-ak.last.fm/serve/252/155668.jpg,1,45,2,0.045756
2,27,51,85,Duran Duran,http://www.last.fm/music/Duran+Duran,http://userserve-ak.last.fm/serve/252/155668.jpg,1,45,24,0.208333
3,28,51,10,Duran Duran,http://www.last.fm/music/Duran+Duran,http://userserve-ak.last.fm/serve/252/155668.jpg,1,45,25,0.37037
4,62,51,528,Duran Duran,http://www.last.fm/music/Duran+Duran,http://userserve-ak.last.fm/serve/252/155668.jpg,1,45,58,0.076323


In [10]:
#creat a dicitonary to store artists' information
df2 =df
artist_name = df2.set_index('artistID')['name'].to_dict()

In [11]:
df.to_pickle("user_data.pickle")

# 2 Memory-based model
>Use svds to decompose and convert it to k=20 dimensions.

>Use normalized frequency to predict the probability users would like the artisist ans recommend first 12 artists to user.

In [12]:
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds
def svd(df):
    m = csc_matrix((df.freq,(df.user_ix,df.artist_ix)))
    u,s,vt = svds(m,k=20)
    pred = np.dot(np.dot(u,np.diag(s)),vt)
    return pred

In [13]:
def artist_now(userID):
    artist_ = df[df.userID==userID].sort_values("weight",ascending=False)
    music_now_list = artist_.head(40).name
    return music_now_list.tolist()

In [14]:
def user_recommend1(userID,pred):
    recommended = np.argsort(pred[user_ix.get(userID),:])[-12:]
    recommendations_list = [artist_name.get(r) for r in recommended]
    return recommendations_list

In [15]:
usedID = int(input("Enter a new userID (chose a number below 2100):"))
print("-"*30)
a_now = artist_now(usedID)
print("Artists you like now:")
print(a_now)

pred = svd(df)
recommendation = user_recommend1(usedID,pred)
print("-"*30)
print("We recommend you these artists:")
print(recommendation)

Enter a new userID (chose a number below 2100):1196
------------------------------
Artists you like now:
['Nirvana', 'Engenheiros do Hawaii', 'Puddle of Mudd', 'Foo Fighters', 'Soundgarden', 'Institute', 'Alice in Chains', 'System of a Down', 'Silverchair', 'Living Colour', 'P.O.D.', 'Bush', 'Paramore', 'Pearl Jam', 'Hoobastank', 'Nullset', 'Nickelback', 'Green Day', 'Alter Bridge', 'Rage Against the Machine', 'Evanescence', 'Red Hot Chili Peppers', 'Limp Bizkit', 'Audioslave', 'Meat Puppets', 'The Pretty Reckless', 'Pouca Vogal', 'Metallica', 'Mudhoney', 'The Offspring', '4 Non Blondes', 'Chevelle', 'Stone Temple Pilots', 'Weezer', 'Humberto Gessinger Trio', 'Capital Inicial', 'Pitty', 'Chris Cornell', 'SoundGarden | www.CdsCompletos.net', 'The Cure']
------------------------------
We recommend you these artists:
['U2', 'Red Hot Chili Peppers', 'Yeong-wook Jo', 'Elton John', 'Hevia', 'The Boats', 'Shakira', 'DeVotchKa', 'Kate Nash', 'Tortoise', 'Funeral for a Friend', 'Kings of Leon']

# 3 Tags-based recommendation system(Cosine Similarity)
>Try to find recommendation artists according to tags.

>If user A and user B give 50 tags to 50 artists individually and 49 tags are same, then we assume they have similar taste in music and recommend different set of both favorite artists to each other.

In [20]:
pd.options.display.max_colwidth = 10000

In [21]:
#Setting a total table of merge 2 sub tables
total_table = pd.read_table(user_taggedartists)
user_artists_df = pd.read_table(user_artists)
tags_df = pd.read_table(tags,encoding = "utf-8")
user_friends_df = pd.read_table(user_friends)

total_table.drop(["day","month","year"],axis = 1,inplace = True)
total_table = pd.merge(total_table,tags_df,on = "tagID")

def fun(word):
    word = "%s" % ' '.join(word)
    return word

artist_tags = total_table.groupby("artistID")["tagValue"].apply(fun)

list_word = []
for i in artist_tags[0:]:
    list_word.append(list(set(str.split(i))))
    
new_artist_tags = list(map(" ".join,list_word))
artist_tags_df = pd.DataFrame(artist_tags)
artist_tags_df["tagValue"] = new_artist_tags

In [22]:
artist_tags_df['artistID'] = artist_tags_df.index
total_table = total_table.drop("tagValue",axis = 1)
total_table = pd.merge(artist_tags_df,total_table,how = "left")
total_table = total_table[["userID","artistID","tagID","tagValue"]]

total_table.head()
total_table_c = total_table.drop_duplicates(subset = ["userID","artistID","tagValue"])
if __name__ == '__main__':
    total_table_c.drop("tagID",axis = 1,inplace = True)
total_table_c.head()
total_table = pd.merge(total_table_c,user_artists_df,how="left")
total_table.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,userID,artistID,tagValue,weight
0,681,1,than lady better visual weeabo jrock j-rock gaga gothic kei japanese,
1,1545,1,than lady better visual weeabo jrock j-rock gaga gothic kei japanese,
2,1730,1,than lady better visual weeabo jrock j-rock gaga gothic kei japanese,
3,1929,1,than lady better visual weeabo jrock j-rock gaga gothic kei japanese,
4,1984,1,than lady better visual weeabo jrock j-rock gaga gothic kei japanese,


In [23]:
total_table_c = total_table.fillna(0)
artist_df = pd.read_table(artists)

artist_df["artistID"] = artist_df["id"]
artist_df = artist_df.reindex(columns=['artistID', 'name', 'url', "pictureURL", "id"])
artist_df = artist_df.drop("id",axis = 1)

In [24]:
total_table_c = pd.merge(total_table_c,artist_df,on = "artistID")
unique_artistids = sorted(set(total_table_c.artistID))
unique_users = sorted(set(total_table_c.userID))
artist_ix = {id:ix for (ix,id) in enumerate(unique_artistids)}
user_ix = {id:ix for (ix,id) in enumerate(unique_users)}

total_table_c["artist_ix"] = total_table_c.artistID.map(artist_ix)
total_table_c["user_ix"] = total_table_c.userID.map(user_ix)
total_table_c.head()

list_total_tag = ["electronic jazz folk indie rock pop j-pop j-rock k-pop j-pop female vocalists rock dance 80s piano instrument alsmooth jazz swing saxophone singer-songwriter acoustic alternative alternative rockclassic rockhard rock japanese anime Visual Kei death metal dark heavy industrial classical instrumental experimental russian country blues abstract downtempo idm electronica ambient hip-hop 90s"]
combined_tags = [] 
list_total_tag
for i in list(total_table_c.tagValue):
    for j in list_total_tag:
        combined_tags.append(list(set(i.split()).intersection(set(j.split()))))
        
len(combined_tags)
content = list(map(" ".join,combined_tags))
total_table_c['tags_dealed'] = pd.Series(content)

In [25]:
def fun(word):
    word = "%s" % ' '.join(word)
    return word

total_table_c[total_table_c.userID == 2]
all_tags = total_table_c.groupby("user_ix").tags_dealed.apply(fun)
total_table_d = pd.DataFrame(all_tags)
total_table_d["user_ix"] = total_table_d.index
list_none_repeated = list(map(set,list(map(str.split,total_table_d.tags_dealed))))
S_none_repeated = pd.Series(list(map(list,list_none_repeated)))
total_table_d["dealed"] = pd.Series(list(map(" ".join,S_none_repeated)))

In [26]:
# tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

tf = TfidfVectorizer(ngram_range = (1,3),stop_words = 'english')
dtm = tf.fit_transform(total_table_d.dealed)

In [27]:
#demension reduction
from scipy.sparse.linalg import svds
u1,s1,vt1 = svds(dtm , k = 100)
u1.shape, s1.shape, vt1.shape

((1891, 100), (100,), (100, 2309))

In [28]:
#Similarity
pred = cosine_similarity(u1)

In [29]:
#Type a old user
old_user = input("Choose a old user:0~2000 : ")

index = pred[int(old_user)].argsort()[-2]
#total_table_c[total_table_c["user_ix"] == index]

### Tags they have
words = " ".join(total_table_c[total_table_c["user_ix"] == int(old_user)].tags_dealed)
list(set(words.split()))
 
words = " ".join(total_table_c[total_table_c["user_ix"] == index].tags_dealed)
list(set(words.split()))

#recommend different set of similar user to this old user

Series1 = total_table_c[total_table_c.user_ix == int(old_user)]["name"]
Series2 = total_table_c[total_table_c.user_ix == index]["name"]

difference = (list(set(Series2).difference(set(Series1)))) #Use weight to rank this difference set

total_e = total_table_c[total_table_c.user_ix == int(index)][["name","weight"]]
total_e.sort_values("weight",ascending = False)

difference_df = pd.DataFrame(difference,columns = ["name"])
recommendation_result = difference_df.merge(total_e,on = "name").sort_values("weight",ascending = False)[0:10]

print("recommendation result")
print("-"*30)
print(list(recommendation_result.name))

Choose a old user:0~2000 : 1196
recommendation result
------------------------------
['Marillion', 'Pink Floyd', 'Gary Numan', 'Japan', 'It Bites', 'The Human League', 'RealTobacco', 'Riverside', 'The Stranglers', 'Ladytron']


# 4 Collaborative filtering: User-based model

Use user similarity to find user whose tastes are most similiar:
>We find users who have the same artists as our target-user often listen to.

>We use **euclidean algorithm** to calculate these users' distance and return all distances.

>To remove noises and reduce calculations, we find the top three most similar user, and find four artists they mostly listen to, respectively.

>We do not use the combination of users' listening frequency on artists, as we regard more higher the similarity, more likely two users have same taste. Moreover, what we want is a system recommend for **niche listeners**, if we do a combination, it would be hard to find some artists only listened by one or two users. So we only **recommend a sets of artists with no rank**. 

>Finally, we recommend about 12 artists to user.

In [40]:
def similarity_score(person1,person2):
    item1 = set(df[df.userID==person1].artistID)
    item2 = set(df[df.userID==person2].artistID)
    common_items = set.intersection(*[item1, item2])
    #find whether there are common artists
    if len(common_items)==0:
        return 0
    #seek the freq data in pairs of users of these common artists
    freq1 = [];freq2 = []
    for item in common_items: 
        freq1.append(float(df[(df.userID==person1)&(df.artistID==item)].freq))
        freq2.append(float(df[(df.userID==person2)&(df.artistID==item)].freq))
    return np.linalg.norm(np.array(freq1)-np.array(freq2))

In [41]:
#sort users by euclidean distance
def most_similar_users(person,number_of_users):
    scores_ = {other_person:similarity_score(person,other_person) for other_person in set(df.userID) if other_person != person}
    scores = sorted(scores_.items(), key=lambda d: d[1],reverse=True) 
    return scores[0:number_of_users]

In [42]:
def user_recommend(person):
    recommend=[]
    #find the top 3 of the most similar user
    similar_users = [i for i, j in most_similar_users(person,3)]
    item1 = set(df[df.userID==person].artistID)
    for other_user in similar_users:
        item2_ = set(df[df.userID==other_user].artistID)
        common_items_ = list(set.intersection(*[item1, item2_]))
        top_ = df[df.userID==other_user].sort_values('freq',ascending=False).artistID.tolist()
        top_2 = filter(lambda x: x not in common_items_ ,top_)
        top_2 = list(set(top_).difference(set(common_items_)))
        recommend.extend(top_2[0:4])
    
    recommends = list(set(recommend))
    #recommendataions_list = [artist_name.get(r) for r in recommends]
    return recommends

In [33]:
usedID = int(input("Enter YOUR userID (chose a number below 2100):"))

print("-"*30)
a_now = artist_now(usedID)
print("Artists you like now:")
print(a_now)

print("-"*30)
recommendation = user_recommend(usedID)
print("We recommend you:")
for i in recommendation:
    print(artist_name.get(i))

Enter YOUR userID (chose a number below 2100):1196
------------------------------
Artists you like now:
['Nirvana', 'Engenheiros do Hawaii', 'Puddle of Mudd', 'Foo Fighters', 'Soundgarden', 'Institute', 'Alice in Chains', 'System of a Down', 'Silverchair', 'Living Colour', 'P.O.D.', 'Bush', 'Paramore', 'Pearl Jam', 'Hoobastank', 'Nullset', 'Nickelback', 'Green Day', 'Alter Bridge', 'Rage Against the Machine', 'Evanescence', 'Red Hot Chili Peppers', 'Limp Bizkit', 'Audioslave', 'Meat Puppets', 'The Pretty Reckless', 'Pouca Vogal', 'Metallica', 'Mudhoney', 'The Offspring', '4 Non Blondes', 'Chevelle', 'Stone Temple Pilots', 'Weezer', 'Humberto Gessinger Trio', 'Capital Inicial', 'Pitty', 'Chris Cornell', 'SoundGarden | www.CdsCompletos.net', 'The Cure']
------------------------------
We recommend you:
Deftones
Ill Niño
Korn
Guano Apes
Marilyn Manson
Slipknot
Bad Religion


# 5  Recommendation System for New User
>We create **a dynamic system**, our information would update if users put in their information.

>It is more accurate to recommend artists based on new users' preference.

In [34]:
userID = input("Enter a new userID (chose a number over 3000):")
userID = int(userID)

Enter a new userID (chose a number over 3000):3002


In [35]:
#a function to get artists preferences from new user
def get_information(count):
    new_data = {} 
    while count>0:
        artistID = input("Chose artists you like:")
        freq = input("How many stars do you want to evaluate this artists(1~5 stars is the highest):")
        artistID = int(artistID)
        freq = float(freq)/5
        count = count -1 
        new_data.update({artistID:freq})
    return new_data

In [36]:
new_data = get_information(3)

Chose artists you like:400
How many stars do you want to evaluate this artists(1~5 stars is the highest):5
Chose artists you like:500
How many stars do you want to evaluate this artists(1~5 stars is the highest):5
Chose artists you like:600
How many stars do you want to evaluate this artists(1~5 stars is the highest):5


In [37]:
#### update our data
newDF = pd.DataFrame()
newDF['userID'] = [i for i in [userID]*3]
newDF['artistID'] = [i for i in new_data.keys()]
newDF['freq'] = [i for i in new_data.values()]
df = df.append(newDF,ignore_index=True)

In [39]:
your_recommend = user_recommend(3002)
print("TOP RECOMMENDATIONS FOR YOU:\n")

for r in your_recommend:
    print(artist_name.get(r))

TOP RECOMMENDATIONS FOR YOU:

Enter Shikari
Jay Vaquer
The Devil Wears Prada
Shemales From outta Space of Death
Low Shoulder
Jessie J
Sara Bareilles
Kerli
Tiffany Thornton
Nicki Minaj
Boys Like Girls


# 6 Evaluate

### Model 1

In [16]:
from sklearn.model_selection import train_test_split
np.random.seed(1234)
train,test = train_test_split(df,test_size=0.25)

In [17]:
m_train = csc_matrix((train.freq,(train.user_ix,train.artist_ix)))
u,s,vt = svds(m_train,k=20)
pred_train = np.dot(np.dot(u,np.diag(s)),vt)

In [18]:
test['pred'] = pred_train[test.user_ix,test.artist_ix]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [19]:
math.sqrt(((test.pred - test.freq)**2).mean())

0.265160565021145