# Building a song recommender


# Fire up GraphLab Create

In [2]:
import graphlab

# Load music data

In [3]:
# Load the song listening data from the local 'song_data.gl/' directory into an SFrame.
song_data = graphlab.SFrame('song_data.gl/')

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1555426223.log


This non-commercial license of GraphLab Create for academic use is assigned to [email redacted] and will expire on April 11, 2020.


# Explore data

Music data shows how many times a user listened to a song, as well as the details of the song.

In [4]:
# Preview the first few rows to see which columns are available.
song_data.head()

user_id,song_id,listen_count,title,artist
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOAKIMP12A8C130995,1,The Cove,Jack Johnson
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Paco De Lucia
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOBXHDL12A81C204C0,1,Stronger,Kanye West
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOBYHAJ12A6701BF1D,1,Constellations,Jack Johnson
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SODACBL12A8C13C273,1,Learn To Fly,Foo Fighters
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SODDNQT12A6D4F5F7E,5,Apuesta Por El Rock 'N' Roll ...,Héroes del Silencio
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SODXRTY12AB0180F3B,1,Paper Gangsta,Lady GaGa
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOFGUAY12AB017B0A8,1,Stacked Actors,Foo Fighters
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOFRQTD12A81C233C0,1,Sehr kosmisch,Harmonia
b80344d063b5ccb3212f76538 f3d9e43d87dca9e ...,SOHQWYZ12A6D4FA701,1,Heaven's gonna burn your eyes ...,Thievery Corporation feat. Emiliana Torrini ...

song
The Cove - Jack Johnson
Entre Dos Aguas - Paco De Lucia ...
Stronger - Kanye West
Constellations - Jack Johnson ...
Learn To Fly - Foo Fighters ...
Apuesta Por El Rock 'N' Roll - Héroes del ...
Paper Gangsta - Lady GaGa
Stacked Actors - Foo Fighters ...
Sehr kosmisch - Harmonia
Heaven's gonna burn your eyes - Thievery ...


## Showing the most popular songs in the dataset

In [None]:
# Send GraphLab Canvas visualizations to the 'ipynb' target
# (presumably renders .show() output inline in this notebook -- confirm).
graphlab.canvas.set_target('ipynb')

In [None]:
# Visualize the 'song' column to see which songs appear most often.
song_data['song'].show()

In [None]:
# Total number of rows in the dataset.
len(song_data)

## Count number of unique users in the dataset

In [None]:
# Distinct user ids; reused further down to pick example users for recommendations.
users = song_data['user_id'].unique()

In [None]:
# Number of distinct users.
len(users)

# Create a song recommender

In [15]:
# Split the data into two parts for training and evaluation. random_split is
# called with a fraction of 0.8 and a fixed seed (0) so the split is repeatable.
train_data, test_data = song_data.random_split(0.8, seed=0)

## Simple popularity-based recommender

In [None]:
# Baseline recommender trained on train_data: users are identified by the
# 'user_id' column and items by the combined 'song' (title - artist) column.
popularity_model = graphlab.popularity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song'
)

### Use the popularity model to make some predictions

A popularity model makes the same prediction for all users, so it provides no personalization.

In [None]:
# Recommendations from the popularity model for the first user in `users`.
popularity_model.recommend(users=[users[0]])

In [None]:
# Same query for the second user in `users`, for comparison with the first.
popularity_model.recommend(users=[users[1]])

## Build a song recommender with personalization

We now create a model that allows us to make personalized recommendations to each user. 

In [16]:
# Item-similarity recommender trained on the same train_data, with the same
# user ('user_id') and item ('song') columns as the popularity baseline.
personalized_model = graphlab.item_similarity_recommender.create(
    train_data,
    user_id='user_id',
    item_id='song'
)

### Applying the personalized model to make song recommendations

As you can see, different users get different recommendations now.

In [None]:
# Recommendations from the personalized model for the first user in `users`.
personalized_model.recommend(users=[users[0]])

In [None]:
# Same query for the second user in `users`, for comparison with the first.
personalized_model.recommend(users=[users[1]])

### We can also apply the model to find similar songs to any song in the dataset

In [None]:
# Songs the item-similarity model considers closest to the given song.
# The lookup key uses the 'song' column format: "<title> - <artist>".
personalized_model.get_similar_items(['With Or Without You - U2'])

In [None]:
# Same similar-item lookup for a second example song ("<title> - <artist>" key).
personalized_model.get_similar_items(['Chan Chan (Live) - Buena Vista Social Club'])

# Quantitative comparison between the models

We now formally compare the popularity and the personalized models using precision-recall curves. 

In [None]:
if graphlab.version[:3] >= "1.6":
    model_performance = graphlab.compare(test_data, [popularity_model, personalized_model], user_sample=0.05)
    graphlab.show_comparison(model_performance,[popularity_model, personalized_model])
else:
    %matplotlib inline
    model_performance = graphlab.recommender.util.compare_models(test_data, [popularity_model, personalized_model], user_sample=.05)

The curve shows that the personalized model provides much better performance. 

In [6]:
# Number of distinct users with at least one row for artist 'Kanye West'.
len(
    song_data[song_data['artist'] == 'Kanye West']['user_id'].unique()
)

2522

In [7]:
# Number of distinct users with at least one row for artist 'Foo Fighters'.
len(
    song_data[song_data['artist'] == 'Foo Fighters']['user_id'].unique()
)

2055

In [8]:
# Number of distinct users with at least one row for artist 'Taylor Swift'.
len(
    song_data[song_data['artist'] == 'Taylor Swift']['user_id'].unique()
)

3246

In [9]:
# Number of distinct users with at least one row for artist 'Lady GaGa'.
len(
    song_data[song_data['artist'] == 'Lady GaGa']['user_id'].unique()
)

2928

In [10]:
# Total listen count per artist: group rows by 'artist' and sum 'listen_count'
# into a new 'total_count' column.
agg = song_data.groupby(
    key_columns='artist',
    operations={'total_count': graphlab.aggregate.SUM('listen_count')}
)

In [12]:
# Sort artists by total listen count; the displayed rows are the least-listened artists.
agg.sort('total_count')

artist,total_count
William Tabbert,14
Reel Feelings,24
Beyoncé feat. Bun B and Slim Thug ...,26
Diplo,30
Boggle Karaoke,30
harvey summers,31
Nâdiya,36
Kanye West / Talib Kweli / Q-Tip / Common / ...,38
Aneta Langerova,38
Jody Bernal,38


In [13]:
# Same aggregate in descending order, so the most-listened artists come first.
agg.sort('total_count', ascending=False)

artist,total_count
Kings Of Leon,43218
Dwight Yoakam,40619
Björk,38889
Coldplay,35362
Florence + The Machine,33387
Justin Bieber,29715
Alliance Ethnik,26689
OneRepublic,25754
Train,25402
The Black Keys,22184


In [17]:
# Take the first 10,000 entries of the unique user ids found in the test split,
# which bounds the number of users passed to recommend() below.
subset_test_users = test_data['user_id'].unique()[0:10000]

In [18]:
# Ask the personalized model for a single (k=1) recommendation per selected test user.
recommend = personalized_model.recommend(subset_test_users,k=1)

In [19]:
# Inspect the first few recommendations (columns: user_id, song, score, rank).
recommend.head()

user_id,song,score,rank
c66c10a9567f0d82ff31441a9 fd5063e5cd9dfe8 ...,Cuando Pase El Temblor - Soda Stereo ...,0.0194504536115,1
c067c22072a17d33310d7223d 7b79f819e48cf42 ...,Grind With Me (Explicit Version) - Pretty Ricky ...,0.0459424376488,1
f6c596a519698c97f1591ad89 f540d76f6a04f1a ...,Hey_ Soul Sister - Train,0.0238929539919,1
696787172dd3f5169dc94deef 97e427cee86147d ...,Senza Una Donna (Without A Woman) - Zucchero / ...,0.017026577677,1
3a7111f4cdf3c5a85fd4053e3 cc2333562e1e0cb ...,Heartbreak Warfare - John Mayer ...,0.0298416515191,1
532e98155cbfd1e1a474a28ed 96e59e50f7c5baf ...,Jive Talkin' (Album Version) - Bee Gees ...,0.0118288653237,1
ee43b175ed753b2e2bce806c9 03d4661ad351a91 ...,Ricordati Di Noi - Valerio Scanu ...,0.0305171211561,1
e372c27f6cb071518ae500589 ae02c126954c148 ...,Fall Out - The Police,0.0819672048092,1
83b1428917b47a6b130ed471b 09033820be78a8c ...,Clocks - Coldplay,0.042858839035,1
39487deef9345b1e22881245c abf4e7c53b6cf6e ...,Black Mirror - Arcade Fire ...,0.0417737685717,1


In [20]:
# Count how many times each song appears in the recommendations by grouping
# on 'song' and counting rows per group.
most = recommend.groupby(
    key_columns='song',
    operations={'count': graphlab.aggregate.COUNT()}
)

In [21]:
# Preview the per-song recommendation counts (not yet sorted).
most.head()

song,count
The Climb - Miley Cyrus,3
Hey Daddy (Daddy's Home) - Usher ...,5
I Gotta Feeling - Black Eyed Peas ...,16
Cerdo - Molotov,1
Ich Will - Rammstein,9
Too Deep - Girl Talk,3
Dumpweed - Blink-182,2
Guys Like Me - Eric Church ...,2
I Can't Love You Back - Easton Corbin ...,2
Freedom - Akon,4


In [23]:
# Order songs by how often they were recommended, most-recommended first.
most.sort('count',ascending=False)

song,count
Undo - Björk,430
Secrets - OneRepublic,385
Revelry - Kings Of Leon,228
You're The One - Dwight Yoakam ...,163
Fireflies - Charttraxx Karaoke ...,121
Sehr kosmisch - Harmonia,97
Horn Concerto No. 4 in E flat K495: II. Romance ...,96
Hey_ Soul Sister - Train,90
OMG - Usher featuring will.i.am ...,64
Dog Days Are Over (Radio Edit) - Florence + The ...,45
