In [1]:
%matplotlib inline

import time

import numpy as np
import pandas

# train_test_split lives in sklearn.model_selection on current scikit-learn;
# sklearn.cross_validation was removed in scikit-learn 0.20, so it is only
# used as a fallback for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# joblib is a standalone package; the copy vendored as sklearn.externals.joblib
# was removed in scikit-learn 0.23, so it is only used as a fallback.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Project-local modules (expected to be importable from the notebook's path).
import Recommenders as Recommenders
import Evaluation as Evaluation



In [2]:
# Input files holding the user_id / song_id / listen_count triplets and the
# song metadata. Both are bare file names, so they are resolved against the
# current working directory.
triplets_file = '10000.txt'
songs_metadata_file = 'song_data.csv'

# The triplet file is read as tab-separated text without a header row; the
# column names are attached right after loading.
song_df_1 = pandas.read_csv(triplets_file, sep='\t', header=None)
song_df_1.columns = ['user_id', 'song_id', 'listen_count']

# Song metadata table.
song_df_2 = pandas.read_csv(songs_metadata_file)

# Left-join the metadata onto the triplets. Metadata is reduced to one row per
# song_id first so the join cannot multiply triplet rows.
song_df = song_df_1.merge(
    song_df_2.drop_duplicates(subset=['song_id']),
    how="left",
    on="song_id",
)

In [3]:
# Preview the first five rows of the merged frame.
song_df.head(n=5)

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOAKIMP12A8C130995,1,The Cove,Thicker Than Water,Jack Johnson,0
1,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBBMDR12A8C13253B,2,Entre Dos Aguas,Flamenco Para Niños,Paco De Lucia,1976
2,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBXHDL12A81C204C0,1,Stronger,Graduation,Kanye West,2007
3,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOBYHAJ12A6701BF1D,1,Constellations,In Between Dreams,Jack Johnson,2005
4,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SODACBL12A8C13C273,1,Learn To Fly,There Is Nothing Left To Lose,Foo Fighters,1999


In [4]:
# Number of rows in the merged frame.
len(song_df.index)

2000000

In [5]:
# To experiment on a smaller subset, uncomment the next line:
#song_df = song_df.head(10000)

# Build a combined "<title> - <artist_name>" label column named 'song'.
# Only the title is passed through str(); artist_name is concatenated as-is.
song_df['song'] = song_df['title'].map(str).add(" - ").add(song_df['artist_name'])

In [6]:
# Per-song popularity table. 'listen_count' here becomes the number of rows
# with a non-null listen_count for that song (a count, not a sum of plays).
song_grouped = song_df.groupby(['song'])[['listen_count']].count().reset_index()

# Share of all counted rows that each song accounts for, in percent.
grouped_sum = song_grouped['listen_count'].sum()
song_grouped['percentage'] = song_grouped['listen_count'] / grouped_sum * 100

# Most frequent songs first; ties are broken alphabetically by song label.
song_grouped.sort_values(by=['listen_count', 'song'], ascending=[False, True]).head()

Unnamed: 0,song,listen_count,percentage
7127,Sehr kosmisch - Harmonia,8277,0.41385
9084,Undo - Björk,7032,0.3516
2068,Dog Days Are Over (Radio Edit) - Florence + Th...,6949,0.34745
9880,You're The One - Dwight Yoakam,6412,0.3206
6774,Revelry - Kings Of Leon,6145,0.30725


In [7]:
# Distinct user ids, in order of first appearance, and how many there are.
users = pandas.unique(song_df['user_id'])
len(users)

76353

In [8]:
# Distinct song labels, in order of first appearance, and how many there are.
songs = pandas.unique(song_df['song'])
len(songs)

9953

In [9]:
# Hold out 20% of the rows as test data; the fixed random_state keeps the
# split repeatable between runs.
train_data, test_data = train_test_split(song_df, random_state=0, test_size=0.20)

# Preview the first five training rows.
train_data.head()

Unnamed: 0,user_id,song_id,listen_count,title,release,artist_name,year,song
608812,7b8fbe766a49e5d7618452149dfab920621fc4fb,SOJJYDE12AF729FC16,1,Two Is Better Than One,Love Drunk,Boys Like Girls featuring Taylor Swift,2009,Two Is Better Than One - Boys Like Girls featu...
623729,d24956cd68ff84b6d0271286ae6866ee1c89ff77,SOPQGWI12A8C135DDB,2,Royal Gregory,LP,Holy Fuck,2007,Royal Gregory - Holy Fuck
583106,da7b91b6cab1ca11227ee7720c4d2e03e8c31579,SOCOIIG12A58A7D151,1,Mr Sandman,Original Hits - 50s,The Chordettes,1993,Mr Sandman - The Chordettes
435735,5f633da6ad4845350949c3c76ce6c4ef6f167476,SOQQTBB12AB0182F1D,2,A Days Work (feat. P.O.S),Rádio do Canibal,BK-One,0,A Days Work (feat. P.O.S) - BK-One
1361953,01ad0fabd01af750700a1e80bb0055abcb3edd28,SOVYNVS12AC3DF64AB,2,Rockin' Rollin' Stone,100 Greatest Rockabilly Hits,Andy Starr,2000,Rockin' Rollin' Stone - Andy Starr


In [10]:
# Build the popularity-based recommender from the training split, using
# 'user_id' as the user column and 'song' as the item column.
# NOTE(review): popularity_recommender_py is defined in the project-local
# Recommenders module; its scoring is presumably a per-song row count in
# train_data -- confirm against that module.
pm = Recommenders.popularity_recommender_py()
pm.create(train_data, 'user_id', 'song')
# Use the user at index 5 of the distinct-user array as the example user.
user_id = users[5]
pm.recommend(user_id)

Unnamed: 0,user_id,song,score,Rank
7127,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Sehr kosmisch - Harmonia,6630,1.0
9084,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Undo - Björk,5639,2.0
2068,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Dog Days Are Over (Radio Edit) - Florence + Th...,5592,3.0
9880,4bd88bfb25263a75bbdd467e74018f4ae570e5df,You're The One - Dwight Yoakam,5143,4.0
6774,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Revelry - Kings Of Leon,4938,5.0
7115,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Secrets - OneRepublic,4627,6.0
3613,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Horn Concerto No. 4 in E flat K495: II. Romanc...,4368,7.0
2717,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Fireflies - Charttraxx Karaoke,3835,8.0
3485,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Hey_ Soul Sister - Train,3819,9.0
8847,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Tive Sim - Cartola,3707,10.0


In [11]:
# Build the item-similarity recommender from the same training split and the
# same user/item columns as the popularity model.
# NOTE(review): item_similarity_recommender_py comes from the project-local
# Recommenders module; what create() precomputes is not visible here.
is_model = Recommenders.item_similarity_recommender_py()
is_model.create(train_data, 'user_id', 'song')

In [12]:
# List the songs the example user has in the training data, then ask the
# item-similarity model for recommendations for that same user.
# user_id is re-assigned here so this cell does not depend on an earlier cell
# having set it.
user_id = users[5]
user_items = is_model.get_user_items(user_id)

# Banner before the user's training songs.
print("------------------------------------------------------------------------------------")
print("Training data songs for the user userid: %s:" % user_id)
print("------------------------------------------------------------------------------------")

# One song label per line.
for user_item in user_items:
    print(user_item)

# Banner before the recommendation step; recommend() prints its own progress
# lines (see the cell output) before returning the result table.
print("----------------------------------------------------------------------")
print("Recommendation process going on:")
print("----------------------------------------------------------------------")

# Recommend songs for the user using the personalized (item-similarity) model
is_model.recommend(user_id)

------------------------------------------------------------------------------------
Training data songs for the user userid: 4bd88bfb25263a75bbdd467e74018f4ae570e5df:
------------------------------------------------------------------------------------
The Real Slim Shady - Eminem
Forgive Me - Leona Lewis
Say My Name - Destiny's Child
Speechless - Lady GaGa
Ghosts 'n' Stuff (Original Instrumental Mix) - Deadmau5
Missing You - John Waite
Without Me - Eminem
Somebody To Love - Justin Bieber
Just Lose It - Eminem
----------------------------------------------------------------------
Recommendation process going on:
----------------------------------------------------------------------
No. of unique songs for the user: 9
no. of unique songs in the training set: 9953
Non zero values in cooccurence_matrix :60670


Unnamed: 0,user_id,song,score,rank
0,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Mockingbird - Eminem,0.057775,1
1,4bd88bfb25263a75bbdd467e74018f4ae570e5df,My Name Is - Eminem,0.056529,2
2,4bd88bfb25263a75bbdd467e74018f4ae570e5df,U Smile - Justin Bieber,0.045514,3
3,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Terre Promise - O'Rosko Raricim,0.044706,4
4,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Eenie Meenie - Sean Kingston and Justin Bieber,0.043548,5
5,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Superman - Eminem / Dina Rae,0.04289,6
6,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Hailie's Song - Eminem,0.04137,7
7,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Love Me - Justin Bieber,0.041012,8
8,4bd88bfb25263a75bbdd467e74018f4ae570e5df,Drop The World - Lil Wayne / Eminem,0.040764,9
9,4bd88bfb25263a75bbdd467e74018f4ae570e5df,OMG - Usher featuring will.i.am,0.039818,10


In [15]:
# Look up songs similar to a given list of song labels (here a single song).
# The labels must use the same "<title> - <artist_name>" format as the 'song'
# column the model was created with.
is_model.get_similar_items(['U Smile - Justin Bieber'])

no. of unique songs in the training set: 9953
Non zero values in cooccurence_matrix :7604


Unnamed: 0,user_id,song,score,rank
0,,Love Me - Justin Bieber,0.213793,1
1,,Somebody To Love - Justin Bieber,0.206537,2
2,,Eenie Meenie - Sean Kingston and Justin Bieber,0.203704,3
3,,One Less Lonely Girl - Justin Bieber,0.177065,4
4,,One Time - Justin Bieber,0.170461,5
5,,That Should Be Me - Justin Bieber,0.166469,6
6,,Stuck In The Moment - Justin Bieber,0.161861,7
7,,Down To Earth - Justin Bieber,0.159392,8
8,,Runaway Love - Justin Bieber,0.137355,9
9,,Up - Justin Bieber,0.135096,10


In [None]:
# Wall-clock timing of the whole evaluation run.
start = time.time()

# Fraction of eligible users to sample for the precision/recall calculation
# (0.005 is 0.5%: 341 of 68327 users in the run shown below).
user_sample = 0.005

# Evaluator comparing the popularity model (pm) with the item-similarity
# model (is_model) on the train/test split.
pr = Evaluation.precision_recall_calculator(test_data, train_data, pm, is_model)

# Average precision and recall lists for each of the two models.
pm_avg_precision_list, pm_avg_recall_list, ism_avg_precision_list, ism_avg_recall_list = \
    pr.calculate_measures(user_sample)

# Report elapsed seconds.
end = time.time()
print(end - start)

Length of user_test_and_training:68327
Length of user sample:341
Getting recommendations for user:884f606371c3f88b19b0bc04475290a2d9221d73
No. of unique songs for the user: 17
no. of unique songs in the training set: 9953
Non zero values in cooccurence_matrix :87896
Getting recommendations for user:64c63b761dfe4677a0349287ba49eee9046fdff4
No. of unique songs for the user: 6
no. of unique songs in the training set: 9953
Non zero values in cooccurence_matrix :34372
Getting recommendations for user:31fbcfb45d1543d5a806156698ebef71058cb0fa
No. of unique songs for the user: 12
no. of unique songs in the training set: 9953
Non zero values in cooccurence_matrix :85191
Getting recommendations for user:4a0fa91ab53d0cfde6e5da4e197bce0272e5bb87
No. of unique songs for the user: 12
no. of unique songs in the training set: 9953
Non zero values in cooccurence_matrix :60624
Getting recommendations for user:82105117087e3979c13f0ec25be3ea8656f0d8cc
No. of unique songs for the user: 14
no. of unique son

In [None]:
import pylab as pl

#Method to generate precision and recall curve
def plot_precision_recall(m1_precision_list, m1_recall_list, m1_label, m2_precision_list, m2_recall_list, m2_label,
                          recall_max=0.20, precision_max=0.20):
    """Plot the precision-recall curves of two models on one figure.

    Recall goes on the x axis and precision on the y axis. The current
    pylab figure is cleared first, and the finished figure is shown.

    Parameters
    ----------
    m1_precision_list, m1_recall_list : sequence of float
        Precision and recall values of the first model, pairwise aligned.
    m1_label : str
        Legend label for the first model.
    m2_precision_list, m2_recall_list : sequence of float
        Precision and recall values of the second model, pairwise aligned.
    m2_label : str
        Legend label for the second model.
    recall_max : float, optional
        Upper limit of the x (recall) axis. Defaults to 0.20.
    precision_max : float, optional
        Upper limit of the y (precision) axis. Defaults to 0.20.
    """
    pl.clf()

    curves = (
        (m1_recall_list, m1_precision_list, m1_label),
        (m2_recall_list, m2_precision_list, m2_label),
    )
    for recall_values, precision_values, curve_label in curves:
        pl.plot(recall_values, precision_values, label=curve_label)

    pl.xlabel('Recall')
    pl.ylabel('Precision')
    pl.ylim([0.0, precision_max])
    pl.xlim([0.0, recall_max])
    pl.title('Precision-Recall curve')
    # Anchor the legend's top centre below the axes so it never covers the
    # curves ('upper center' is the named form of location code 9).
    pl.legend(loc='upper center', bbox_to_anchor=(0.5, -0.2))
    pl.show()

In [None]:
print("Plotting precision recall curves.")

# Compare the popularity model (first curve) with the item-similarity model
# (second curve) using the averaged measures from the evaluation cell.
plot_precision_recall(
    m1_precision_list=pm_avg_precision_list,
    m1_recall_list=pm_avg_recall_list,
    m1_label="popularity_model",
    m2_precision_list=ism_avg_precision_list,
    m2_recall_list=ism_avg_recall_list,
    m2_label="item_similarity_model",
)