In [15]:
from datetime import datetime  
import pandas as pd
import numpy as np
import random
from pprint import pprint
import pickle

## Pre-Processing the data files

Here we convert the triplets data into dictionaries

In [2]:
# Column names shared by the triplet files (tab-separated, no header row)
header_list = ["listenerID", "songID", "count"]
main_file = 'D:\\Study\\MDM\\Project\\Reference\\Collaborative-Filtering-Million-Song-Dataset\\xaa.txt'

# Load the main triplets and the first half of the test triplets, then stack them into one frame
df = pd.read_csv(main_file, names=header_list, sep='\t')
df_2 = pd.read_csv(
    'D:\\Study\\MDM\\Project\\Reference\\Collaborative-Filtering-Million-Song-Dataset\\test_triplets_first_half.txt',
    names=header_list,
    sep='\t',
)
df = pd.concat([df, df_2], ignore_index=True)

# Per listener: "size" is the number of triplet rows (not the summed play count),
# "set" is the set of song IDs in those rows
df_group_listeners_count = df.groupby('listenerID')['songID'].agg(
    size=len,
    set=lambda song_ids: set(song_ids),
)
# Per listener: a list of [songID, count] pairs
df_group_listeners = df.groupby('listenerID')[['songID', 'count']].apply(
    lambda listener_rows: listener_rows.values.tolist()
)

In [3]:
def get_dict_from_list(temp):
    """Turn one listener's [songID, count] pairs into a {songID: share} dictionary.

    Args:
        temp: iterable of pairs where item[0] is the song ID and item[1] is the
            play count for that song.

    Returns:
        dict mapping each song ID to its count divided by the sum of all counts.
        A song ID that appears more than once has its counts added together
        (previously the last pair silently overwrote the earlier ones, so the
        shares no longer summed to 1). If the counts sum to zero every share is
        0.0 instead of raising ZeroDivisionError. An empty input gives {}.
    """
    temp = list(temp)  # allow one-shot iterables; we need two passes
    total_count = sum(item[1] for item in temp)

    # Accumulate raw counts first so duplicate song IDs are merged
    counts_by_song = {}
    for item in temp:
        counts_by_song[item[0]] = counts_by_song.get(item[0], 0) + item[1]

    if total_count == 0:
        return {song_id: 0.0 for song_id in counts_by_song}
    return {song_id: song_count / total_count for song_id, song_count in counts_by_song.items()}

In [4]:
# listenersSongs: {listenerID: {songID: share of that listener's plays}}
listenersSongs = df_group_listeners.apply(get_dict_from_list).to_dict()
# listenersTotalPlays: {listenerID: "size"}, i.e. the number of triplet rows for that listener
listenersTotalPlays = df_group_listeners_count.to_dict()['size']

In [5]:
# Listener IDs to be tested: everyone who appears in test_triplets_first_half.txt.
# df_2 already holds that file (it is read when the triplets are loaded), so reuse it
# instead of parsing the same file from disk a second time.
test_listeners = set(df_2['listenerID'].unique())

Have a look at a random sample from the pre-processed data

In [6]:
# One random listener together with that listener's song shares
rand1 = random.choice(list(listenersSongs.keys()))
pprint([rand1, listenersSongs[rand1]])
# One random listener together with that SAME listener's row count
# (previously the value was looked up for a second, unrelated random key)
rand2 = random.choice(list(listenersTotalPlays.keys()))
pprint([rand2, listenersTotalPlays[rand2]])
# random.sample requires a sequence; passing a set raises TypeError on Python >= 3.11.
# Also clamp the sample size so a small test set does not raise ValueError.
test_listener_pool = sorted(test_listeners)
random.sample(test_listener_pool, min(10, len(test_listener_pool)))

['4a1da9245202e5769d9504ebd3cb7d5f2faa9d51',
 {'SOADJQJ12A8C141D38': 0.04113924050632911,
  'SOAERUU12A8C13F302': 0.00949367088607595,
  'SOAWMBL12AF72A8EE3': 0.0031645569620253164,
  'SOAXGDH12A8C13F8A1': 0.0031645569620253164,
  'SOBAHRH12A8C1367B1': 0.0031645569620253164,
  'SOCJHPS12A6D4F8523': 0.00949367088607595,
  'SOCKSGZ12A58A7CA4B': 0.00949367088607595,
  'SOCWAVM12A8C1320D5': 0.00949367088607595,
  'SOCYYYE12A67020E65': 0.0031645569620253164,
  'SODJWHY12A8C142CCE': 0.0031645569620253164,
  'SOEKSGJ12A67AE227E': 0.00949367088607595,
  'SOELHGL12AB017DB4B': 0.022151898734177215,
  'SOELHVX12AB017DB78': 0.00949367088607595,
  'SOEPGZU12A6D4F91E1': 0.0031645569620253164,
  'SOEYWYP12A6D4F5E9D': 0.0031645569620253164,
  'SOFKABN12A8AE476C6': 0.0031645569620253164,
  'SOFRQTD12A81C233C0': 0.0379746835443038,
  'SOFTVGI12AB017DB6D': 0.00949367088607595,
  'SOGAROW12A6D4FBB03': 0.0031645569620253164,
  'SOGQJKF12A8C13729E': 0.0031645569620253164,
  'SOGTOGA12A6D4F953D': 0.012658227

['011ed4402dae54389b9f1f24001b975eaa87d494',
 '00aab92d3ff5ccc7da850520824ce01039f2ef17',
 '004a8a7e06ed67e6bbf11377cefbad127f5891c6',
 '0061902fe653fd807da2edf99cc7a2c5c55cc74c',
 '0054bb08f34f416c9688600b5318efc1fb4158ed',
 '00e36dbf522aecb9a10556a248524175dbd02475',
 '0126bc66b79a8c0c499cab9ff3f96ac7b63d7395',
 '011abe23d43bedd355041dddd994b86ebdb83675',
 '00a92fddbc755f66b6650720b29960c4f4161e81',
 '00e62a101d89b3c782898e536f17869715a9ca3e']

### Functions to generate similarity and scoring

In [6]:
def calc_common_percent(perc1, perc2):
    """Score how much two values for the same song agree.

    The result is ``smaller + (smaller / larger) * (larger - smaller)``: the
    smaller value plus a part of the gap that shrinks as the two values diverge.
    Equal inputs return that value unchanged, and the function is symmetric in
    its arguments.

    Args:
        perc1: value for the first user (expected to be non-negative).
        perc2: value for the second user (expected to be non-negative).

    Returns:
        The combined score. When the larger value is 0 (both inputs are zero for
        non-negative data) the result is 0.0 rather than a ZeroDivisionError.
    """
    smaller = min(perc1, perc2)
    larger = max(perc1, perc2)
    if larger == 0:
        # Nothing was played on either side; avoid dividing by zero
        return 0.0
    return smaller + ((smaller / larger) * (larger - smaller))

In [7]:
def generate_similarity_measure(first_dict, second_dict, user_1_total_plays, user_2_total_plays, common_songs):
    """Accumulate two agreement scores over the songs two users have in common.

    Args:
        first_dict: {songID: value} for the first user.
        second_dict: {songID: value} for the second user.
        user_1_total_plays: multiplier applied to the first user's values.
        user_2_total_plays: multiplier applied to the second user's values.
        common_songs: iterable of song IDs present in both dictionaries.

    Returns:
        A two-element list:
        [sum of calc_common_percent over the raw values,
         sum of calc_common_percent over the values scaled by each user's total].
    """
    raw_score = 0.0
    scaled_score = 0.0

    for song_id in common_songs:
        value_1 = first_dict[song_id]
        value_2 = second_dict[song_id]

        # Agreement on the values as stored in the dictionaries
        raw_score += calc_common_percent(value_1, value_2)
        # Agreement after scaling each value by that user's total
        scaled_score += calc_common_percent(value_1 * user_1_total_plays, value_2 * user_2_total_plays)

    return [raw_score, scaled_score]

Below is the main function used to generate recommendations

In [8]:
def recommend_by_similarity(test_user_ids, user_total_plays_map, user_song_percentage_map, limiting_factor, power=1):
    """Recommend unheard songs to each test user from the listeners most similar to them.

    For every test user, each other listener that shares at least one song is scored
    with generate_similarity_measure. The two partial scores are rescaled by their
    maximum over all neighbours of that user, added together and raised to ``power``.
    Neighbours are then visited from most to least similar, and every song the test
    user does not have accumulates ``neighbour's value for the song * similarity``.

    Args:
        test_user_ids: iterable of user IDs to produce recommendations for.
        user_total_plays_map: {user ID: total} passed through to
            generate_similarity_measure.
        user_song_percentage_map: {user ID: {songID: value}} for all users.
        limiting_factor: once at least as many songs as the test user already has
            were collected, neighbours whose similarity is below
            ``limiting_factor * best similarity`` are no longer visited.
        power: exponent applied to each combined similarity score.

    Returns:
        {user ID: [songID, ...]} with songs ordered by descending accumulated score.
        A test user that shares no song with any other listener, or that is missing
        from ``user_song_percentage_map``, gets an empty list (previously this raised
        ValueError from ``max()`` on an empty sequence, or KeyError).
    """
    recommendation_set = {}
    tot_test_ids = len(test_user_ids)

    for count, user_1 in enumerate(test_user_ids, start=1):
        # Progress is reported for the 1st, 31st, 61st, ... listener only
        show_progress = count % 30 == 1
        if show_progress:
            print("Test listener ", count, " of ", tot_test_ids)

        # Looked up once per test user instead of once per candidate neighbour
        user_1_songs = user_song_percentage_map.get(user_1, {})

        similarity_score_map = {}
        for user_2, user_2_songs_perc_map in user_song_percentage_map.items():
            if user_1 == user_2:
                continue
            # Songs present for both the current user and this candidate neighbour
            common_songs = user_1_songs.keys() & user_2_songs_perc_map.keys()
            if common_songs:
                similarity_score_map[user_2] = generate_similarity_measure(user_1_songs,
                                                                           user_2_songs_perc_map,
                                                                           user_total_plays_map[user_1],
                                                                           user_total_plays_map[user_2],
                                                                           common_songs)

        if not similarity_score_map:
            # No overlapping listener: nothing to base a recommendation on
            recommendation_set[user_1] = []
            continue

        # Bring both partial scores onto a 0..1 scale relative to the best neighbour
        rescaling_factors = []
        for i in range(2):
            max_val = max(scores[i] for scores in similarity_score_map.values())
            rescaling_factors.append(1.0 / max_val if max_val != 0 else 0.0)

        # Collapse the two rescaled partial scores into a single similarity per neighbour
        similarity_score_map = {
            user_2: (scores[0] * rescaling_factors[0] + scores[1] * rescaling_factors[1]) ** power
            for user_2, scores in similarity_score_map.items()
        }

        numRecsRequired = len(user_1_songs)
        similarity_cutoff = limiting_factor * max(similarity_score_map.values())

        if show_progress:
            print("Recommending for listener, ", count)

        recommendations = {}
        # Most similar neighbours first
        ranked_neighbours = sorted(similarity_score_map.items(), key=lambda x: x[1], reverse=True)
        for user_2, similarity in ranked_neighbours:
            if len(recommendations) >= numRecsRequired and similarity < similarity_cutoff:
                break
            # Songs the neighbour has but the current user does not
            for song_id, perc_played in user_song_percentage_map[user_2].items():
                if song_id in user_1_songs:
                    continue
                recommendations[song_id] = recommendations.get(song_id, 0.0) + perc_played * similarity

        recommendation_set[user_1] = sorted(recommendations, key=recommendations.get, reverse=True)

    print('Finished recommending')
    return recommendation_set

In [11]:
def check_hit_count_score(actual_listened_songs, predicted_songs, threshold=10):
    """Score one user's ranked predictions against the songs actually listened to.

    Only the first ``threshold`` predictions are considered. Each prediction that is
    in ``actual_listened_songs`` (and did not already appear earlier in the ranking)
    contributes ``hits so far / rank``; the sum is divided by
    ``min(len(actual_listened_songs), threshold)``. This has the shape of an
    average-precision-at-k style metric.

    Args:
        actual_listened_songs: collection of song IDs the user really listened to.
        predicted_songs: sequence of recommended song IDs, best first. Song IDs
            must be hashable.
        threshold: number of leading predictions to evaluate.

    Returns:
        The score as a float; 0.0 when there are no actual songs or when
        ``threshold`` is not positive (the latter used to divide by zero).
    """
    if not actual_listened_songs or threshold <= 0:
        return 0.0

    hit_score = 0.0
    number_of_matched_songs = 0.0
    # Track what was already ranked in a set instead of re-scanning a slice per item
    already_ranked = set()
    for rank, song in enumerate(predicted_songs[:threshold], start=1):
        if song in already_ranked:
            continue  # a repeated prediction never counts twice
        already_ranked.add(song)
        if song in actual_listened_songs:
            number_of_matched_songs += 1.0
            hit_score += number_of_matched_songs / rank
    return hit_score / min(len(actual_listened_songs), threshold)

In [10]:
def get_average_recommendation_score(actual_list, predicted_list, threshold=10):
    """Average check_hit_count_score over paired actual/predicted song collections.

    Args:
        actual_list: one collection of actually-listened songs per user.
        predicted_list: one ranked prediction list per user, in the same order.
        threshold: forwarded to check_hit_count_score.

    Returns:
        np.mean of the per-user scores (pairs are formed with zip, so extra
        entries in the longer input are ignored).
    """
    per_user_scores = []
    for actual_songs, predicted_songs in zip(actual_list, predicted_list):
        per_user_scores.append(check_hit_count_score(actual_songs, predicted_songs, threshold))
    return np.mean(per_user_scores)

In [9]:
def calculate_whole_mean_score(recommended_songs_map, actual_songs_map, threshold=500):
    """Convert the per-user dictionaries to aligned lists and return the mean score.

    Args:
        recommended_songs_map: {user ID: ranked list of recommended song IDs}.
        actual_songs_map: {user ID: collection of songs actually listened to}.
            Its keys decide which users are evaluated.
        threshold: number of leading recommendations evaluated per user
            (defaults to 500, the value that used to be hard-coded).

    Returns:
        The value returned by get_average_recommendation_score for the aligned lists.
        A user that has actual songs but no entry in ``recommended_songs_map`` is
        evaluated with an empty recommendation list instead of raising KeyError.
    """
    actual_songs_list = []
    recommended_songs_list = []
    for user_id, actual_songs in actual_songs_map.items():
        actual_songs_list.append(actual_songs)
        recommended_songs_list.append(recommended_songs_map.get(user_id, []))
    return get_average_recommendation_score(actual_songs_list, recommended_songs_list, threshold)


### Performing analysis on the test set

Test set is taken separately and split into two halves. One half of the data will be used to make predictions and the other half will be used to verify if the recommendations match the actual listened songs or not.

One file is test_triplets_first_half.txt and other is test_triplets_second_half.txt

In [12]:
# All the user IDs to be tested: the listeners in test_triplets_first_half.txt.
# The songs these users listened to (first half) are already part of the training
# dictionaries, because df_2 (that same file) was concatenated into df when the
# triplets were loaded. Reuse df_2 rather than parsing the file again.
test_listeners = set(df_2['listenerID'].unique())

In [13]:
# Calling the main method to generate recommendations for every test listener.
# The last positional argument (.4) is the limiting_factor parameter.
listenersRecs = recommend_by_similarity(test_listeners, listenersTotalPlays, listenersSongs, .4)

Test listener  1  of  526
Recommending for listener,  1
Test listener  31  of  526
Recommending for listener,  31
Test listener  61  of  526
Recommending for listener,  61
Test listener  91  of  526
Recommending for listener,  91
Test listener  121  of  526
Recommending for listener,  121
Test listener  151  of  526
Recommending for listener,  151
Test listener  181  of  526
Recommending for listener,  181
Test listener  211  of  526
Recommending for listener,  211
Test listener  241  of  526
Recommending for listener,  241
Test listener  271  of  526
Recommending for listener,  271
Test listener  301  of  526
Recommending for listener,  301
Test listener  331  of  526
Recommending for listener,  331
Test listener  361  of  526
Recommending for listener,  361
Test listener  391  of  526
Recommending for listener,  391
Test listener  421  of  526
Recommending for listener,  421
Test listener  451  of  526
Recommending for listener,  451
Test listener  481  of  526
Recommending for liste

In [7]:
# Ground truth: the songs each listener has in the second half of the test set
header_list = ["listenerID", "songID", "count"]
df_answers = pd.read_csv('test_triplets_second_half.txt', names=header_list, sep='\t')
# {listenerID: set of song IDs}
actual_listened_songs_map = (
    df_answers
    .groupby('listenerID')['songID']
    .agg(lambda song_ids: set(song_ids))
    .to_dict()
)

In [14]:
# Score the recommendations against the songs from test_triplets_second_half.txt
# and print the mean score over all listeners in actual_listened_songs_map
print(calculate_whole_mean_score(listenersRecs, actual_listened_songs_map))

0.10311862633478829


In [None]:
# Persist the recommendations. pickle.dump needs both the object and the file;
# the original call passed only the file handle and raised TypeError.
# The with-block also guarantees the file is flushed and closed.
with open("listenersRecs.pkl", "wb") as recs_file:
    pickle.dump(listenersRecs, recs_file)

The score is around 10–12% (0.10–0.12) for this dataset. This is almost 5 times better than recommending randomly or recommending popular songs. Still, as this is a basic type of recommendation system, the accuracy can be considered low. ALS methods, which will be done separately, will have better accuracy than this method.

### Getting Song names from recommendations

In [2]:
# NOTE: pickle.load can execute arbitrary code - only load a file this notebook wrote itself.
# The with-block closes the file handle, which the bare open() call left open.
with open("listenersRecs.pkl", "rb") as recs_file:
    listenersRecs = pickle.load(recs_file)

In [68]:
# Pick one listener that has recommendations and display the ID
rand2 = random.choice(list(listenersRecs))
rand2
# To inspect a fixed listener instead, use:
# rand2 = '00ab1b0140bbc682342526b017be0f14cca42653'

'01146c36fd0738038720f8362cbe71e79999ca25'

In [63]:
# Song IDs that the chosen listener has in the second half of the test set
df_answers = pd.read_csv('test_triplets_second_half.txt', names=header_list, sep='\t')
answers = df_answers.loc[df_answers['listenerID'] == rand2, 'songID']

In [64]:
# First ten entries of the chosen listener's recommendation list
suggested_songs = listenersRecs[rand2][:10]

In [65]:
# Track metadata used to translate song IDs into titles/artists.
# error_bad_lines was deprecated in pandas 1.3 and removed in 2.0; on_bad_lines='warn'
# keeps the previous behaviour of dropping malformed rows while emitting a warning.
df_meta = pd.read_csv('D:\\Study\\MDM\\Project\\DataSet\\DataSet\\track_metadata.csv', on_bad_lines='warn')

#### Actual listened songs

In [66]:
# Title / artist / year of the songs the listener actually played
df_meta.loc[df_meta['song_id'].isin(answers), ['title', 'artist_name', 'year']]

Unnamed: 0,title,artist_name,year
316198,Midlife Crisis,Faith No More,1992
320960,Not Me,Datarock,2007
440898,Myth Takes,!!!,2007
534342,Last Cup Of Sorrow,Faith No More,1997
621186,Bodhisattva,Steely Dan,1973
716826,Wicked Garden (LP Version),Stone Temple Pilots,2003


#### Recommendations

In [67]:
# Title / artist / year of the recommended songs (rows come back in df_meta order,
# not in recommendation order)
df_meta.loc[df_meta['song_id'].isin(suggested_songs), ['title', 'artist_name', 'year']]

Unnamed: 0,title,artist_name,year
5998,The Maestro,Beastie Boys,1992
195655,Représente,Alliance Ethnik,1999
299147,Jamaica Roots II(Agora E Sempre),Natiruts,0
550400,Glamour y Violencia,Once Tiros,2005
669775,Better Man,Pearl Jam,1994
763712,Eleanor Put Your Boots On,Franz Ferdinand,2005
804312,Kids In America,Bloodhound Gang,1995
879955,3 Rounds and a Sound,Blind Pilot,2008
907597,Eleanor Put Your Boots On,Franz Ferdinand,2005
933652,Forever,Drake / Kanye West / Lil Wayne / Eminem,0
