# Computational tools for data science: An Efficient Recommendation System for MillionSongsDataset

In [15]:
# Imports
import numpy as np
import pandas as pd

## Read the data

In [24]:
# Read the songs dataset

from zipfile import ZipFile
from io import BytesIO

# Load out.csv from the songs archive into a DataFrame.
# The archive handle is deliberately NOT called `zip`: binding that name at
# notebook level would shadow the builtin zip() for every later cell.
with ZipFile('songs_cleaned.zip', 'r') as archive:
    data = archive.read('out.csv')

songs_cleaned = pd.read_csv(BytesIO(data))

In [25]:
# And pre-process the artist_terms feature

def ConvertStringtoList(string):
    """Parse the textual form of a term list into a list of term strings.

    Square brackets, single quotes and ALL spaces are stripped (so a
    multi-word term such as 'pop rock' becomes 'poprock'), then the remainder
    is split on commas.

    Parameters
    ----------
    string : str
        Text such as "['pop rock', 'blues-rock']".

    Returns
    -------
    list of str
        The cleaned terms. Empty fragments are dropped, so "[]" and ""
        yield [] instead of [''] (which would otherwise introduce a
        spurious empty term and hide genuinely empty term lists).
    """
    # Delete the four noise characters in a single pass.
    cleaned = string.translate(str.maketrans('', '', "[]' "))
    return [term for term in cleaned.split(',') if term]

# Replace the whole artist_terms column in one assignment instead of writing
# cell by cell through chained indexing (songs_cleaned['artist_terms'][i] = ...),
# which raises SettingWithCopyWarning and does not modify the frame at all
# when pandas copy-on-write is enabled. It also no longer assumes the index
# is 0..n-1.
# Values that are already lists are left untouched so the cell can be re-run
# safely; anything else is passed to ConvertStringtoList as before.
songs_cleaned['artist_terms'] = songs_cleaned['artist_terms'].map(
    lambda terms: terms if isinstance(terms, list) else ConvertStringtoList(terms)
)

In [27]:
# Display the songs frame to inspect the artist_terms column after pre-processing.
songs_cleaned

Unnamed: 0,artist_id,song_id,artist_terms
0,ARMJAGH1187FB546F3,SOCIWDW12A8C13D406,"[blue-eyedsoul, poprock, blues-rock, beachmusi..."
1,ARXR32B1187FB57099,SOFSOCN12A8C143F5D,"[poppunk, skapunk, breakcore, alternativemetal..."
2,AR10USD1187B99F3F1,SOHKNRJ12A6701D1F8,"[post-hardcore, screamo, emo, hardcore, punkre..."
3,ARC43071187B990240,SOKEJEJ12A8C13E0D0,"[ccm, religiousmusic, losangeles, christianroc..."
4,ARL7K851187B99ACD2,SOMUYGI12AB0188633,"[bachata, merengue, reggaeton, latinpop, spani..."
...,...,...,...
3190,ARUUP4L1187B9B72EB,SOILDRV12A8C13EB77,"[nowave, experimentalrock, instrumentalrock, d..."
3191,ARI4S0E1187B9B06C0,SOBUUYV12A58A7DA27,"[celtic, futurejazz, downtempo, easylistening,..."
3192,ARQ91R31187FB38A88,SOUWMIW12AB0184748,"[bluegrass, classiccountry, countrygospel, cou..."
3193,AR4C6V01187FB3BAF4,SOLXXPY12A67ADABA0,"[symphoblackmetal, blackmetal, heavymetal, dea..."


In [43]:
# Read the user tastes' dataset

# Load out.csv from the users archive into a DataFrame.
# The archive handle is deliberately NOT called `zip`: binding that name at
# notebook level would shadow the builtin zip() for every later cell.
with ZipFile('users_cleaned.zip', 'r') as archive:
    data = archive.read('out.csv')

users_cleaned = pd.read_csv(BytesIO(data))
users_cleaned

Unnamed: 0,userID,songID,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOWEZSI12A81C21CE6,1
1,4bd88bfb25263a75bbdd467e74018f4ae570e5df,SODCXXY12AB0187452,2
2,b64cdd1a0bd907e5e00b39e345194768e330d652,SOLXDDC12A6701FBFD,1
3,b64cdd1a0bd907e5e00b39e345194768e330d652,SONQBUB12A6D4F8ED0,2
4,5a905f000fc1ff3df7ca807d57edb608863db05d,SOFKTPP12A8C1385CA,1
...,...,...,...
697059,8305c896f42308824da7d4386f4b9ee584281412,SOIZLKI12A6D4F7B61,1
697060,8305c896f42308824da7d4386f4b9ee584281412,SOQHWMN12A6701E2D9,1
697061,8305c896f42308824da7d4386f4b9ee584281412,SOSQIHH12A8C13370B,1
697062,8305c896f42308824da7d4386f4b9ee584281412,SOUCKDH12A8C138FF5,2


## Content based recommendation

### Represent songs as vectors

We use the `artist_terms` feature and apply one-hot (multi-hot) encoding: every distinct term becomes its own binary column, set to 1 if the term appears among a song's terms and to 0 otherwise.

In [34]:
# First, extract the artist_terms

# NOTE(review): `df_songs` was never assigned in any earlier cell, so this
# cell raised NameError on a fresh kernel. It is presumably the pre-processed
# songs frame (it is read through the same 'song_id' / 'artist_terms'
# columns) -- confirm that no filtering step is missing here.
df_songs = songs_cleaned

# Collect every distinct term. Iterating the column directly avoids label
# lookups with range(len(...)), which only work on a default 0..n-1 index.
unique_terms = set()
for song_terms in df_songs['artist_terms']:
    if len(song_terms) == 0:
        print('vacia')  # flags a song that has no terms at all
    unique_terms.update(song_terms)

# Sort so that the position of each term in the vectors is identical on every
# run; the iteration order of a set of strings changes between kernels.
all_terms = sorted(unique_terms)

d = len(all_terms) # dimension of the vectors we are representing
d

2294

In [39]:
# Now, we intend to create a binary vector (length = d) that represents a song, 
# with 1s if the song has this term and 0s if it hasn't.

def vectorize(song):
    """Return the binary term vector of one song.

    Reads the module-level `df_songs` (columns 'song_id' and 'artist_terms')
    and `all_terms` (the ordered list of terms).

    Parameters
    ----------
    song : str
        A value of the 'song_id' column. If several rows share it, the first
        matching row is used.

    Returns
    -------
    numpy.ndarray
        float64 vector of length len(all_terms); entry i is 1.0 when
        all_terms[i] is one of the song's terms and 0.0 otherwise.

    Raises
    ------
    IndexError
        If `song` does not occur in df_songs['song_id'].
    """
    matches = df_songs.loc[df_songs['song_id'] == song, 'artist_terms']
    if matches.empty:
        raise IndexError(f"song_id {song!r} not found in df_songs")

    # A set makes each membership test O(1) instead of scanning the song's
    # term list once per entry of all_terms.
    song_terms = set(matches.iloc[0])
    return np.array(
        [1.0 if term in song_terms else 0.0 for term in all_terms],
        dtype=float,
    )

In [40]:
# Vectorize ALL the songs and save it into a dictionary

# Build the lookup table song_id -> binary term vector for every song in
# df_songs (a repeated song_id keeps the vector computed last).
vector_representation = {
    song_id: vectorize(song_id)
    for song_id in df_songs['song_id']
}

#vector_representation # uncomment to inspect the resulting dictionary

### Get the user profile

In [44]:
# Second name for the same users frame (plain assignment: no copy is made).
user_plays = users_cleaned

In [52]:
# Get the User profile: a vector with one entry per term, computed as the
# play-count-weighted average of the vectors of the songs the user played.

# NOTE(review): every `user = ...` line was commented out, so this cell raised
# NameError on a fresh kernel. The id restored below is the one that was
# marked with `###`; presumably it produced the recorded output -- confirm.
user = '00498f4bab2bfeb17680113c7d9525ad5b0ad401'
# Other users that were tried:
#user = '5a905f000fc1ff3df7ca807d57edb608863db05d'
#user = 'b80344d063b5ccb3212f76538f3d9e43d87dca9e'
#user = '3800316358cefa31ef7047a14134b5729654a761'
#user = '5d5e0142e54c3bb7b69f548c2ee55066c90700eb'

user_rows = user_plays[user_plays['userID'] == user]
song_counts = list(user_rows['play_count'])
song_list = list(user_rows['songID'])

# Build the membership structure once instead of re-creating a list of all
# song ids on every loop iteration.
known_songs = set(df_songs['song_id'])

# Keep the positions (within song_list) of the user's songs that exist in the
# song dataset. enumerate() gives the true position; song_list.index(song)
# always returned the FIRST occurrence, so a song listed twice would reuse the
# first row's play count.
indices = []
for position, song in enumerate(song_list):
    if song in known_songs:
        print()
        print('yes', song, vector_representation[song])
        print(df_songs.loc[df_songs['song_id']==song]['artist_terms'])
        indices.append(position)
        print(indices)

# Weights (play counts) and vectors of the matched songs, in the same order.
a = [song_counts[i] for i in indices]
b = [vector_representation[song_list[i]] for i in indices]

# Without any matched song (or with a zero total play count) the average is
# undefined; fail loudly instead of silently producing a vector of NaNs.
if sum(a) == 0:
    raise ValueError(
        f"cannot build a profile for user {user!r}: "
        "none of their played songs is present in df_songs"
    )

user_profile = np.average(np.vstack(b), axis=0, weights=a)
user_profile


yes SOTNWCI12AAF3B2028 [0. 0. 0. ... 0. 0. 0.]
1261    [country, pop, contemporarycountry, countrypop...
Name: artist_terms, dtype: object
[0]


array([0., 0., 0., ..., 0., 0., 0.])