In [1]:
# to handle  data retrieval
import urllib3
from urllib3 import request
# to handle certificate verification
import certifi
# to manage json data
import json
# for pandas dataframes
import pandas as pd
# uncomment below if installation needed (not necessary in Colab)
#!pip install certifi

## Calling our API to get data

In [2]:
# handle certificate verification and SSL warnings
# https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl
httplistens = urllib3.PoolManager(
       cert_reqs='CERT_REQUIRED',
       ca_certs=certifi.where())

In [3]:
# get data from the API
url = 'https://2ljbg519b0.execute-api.us-east-1.amazonaws.com/dev/user_artists'
r = httplistens.request('GET', url)
r.status

200

In [4]:
# decode json data into a dict object
data = json.loads(r.data.decode('utf-8'))
data

[{'userID': 2, 'artistID': 51, 'weight': 13883},
 {'userID': 2, 'artistID': 52, 'weight': 11690},
 {'userID': 2, 'artistID': 53, 'weight': 11351},
 {'userID': 2, 'artistID': 54, 'weight': 10300},
 {'userID': 2, 'artistID': 55, 'weight': 8983},
 {'userID': 2, 'artistID': 56, 'weight': 6152},
 {'userID': 2, 'artistID': 57, 'weight': 5955},
 {'userID': 2, 'artistID': 58, 'weight': 4616},
 {'userID': 2, 'artistID': 59, 'weight': 4337},
 {'userID': 2, 'artistID': 60, 'weight': 4147},
 {'userID': 2, 'artistID': 61, 'weight': 3923},
 {'userID': 2, 'artistID': 62, 'weight': 3782},
 {'userID': 2, 'artistID': 63, 'weight': 3735},
 {'userID': 2, 'artistID': 64, 'weight': 3644},
 {'userID': 2, 'artistID': 65, 'weight': 3579},
 {'userID': 2, 'artistID': 66, 'weight': 3312},
 {'userID': 2, 'artistID': 67, 'weight': 3301},
 {'userID': 2, 'artistID': 68, 'weight': 2927},
 {'userID': 2, 'artistID': 69, 'weight': 2720},
 {'userID': 2, 'artistID': 70, 'weight': 2686},
 {'userID': 2, 'artistID': 71, 'weig

In [5]:
# in this dataset, the data to extract is under 'features'
listens = pd.json_normalize(data)
listens.head()

Unnamed: 0,userID,artistID,weight
0,2,51,13883
1,2,52,11690
2,2,53,11351
3,2,54,10300
4,2,55,8983


In [6]:
# handle certificate verification and SSL warnings
# https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl
httpartist = urllib3.PoolManager(
       cert_reqs='CERT_REQUIRED',
       ca_certs=certifi.where())

In [7]:
# get data from the API
url = 'https://2ljbg519b0.execute-api.us-east-1.amazonaws.com/dev/artists'
r = httpartist.request('GET', url)
r.status

200

In [8]:
# decode json data into a dict object
artistdata = json.loads(r.data.decode('utf-8'))
artistdata

[{'id': 1,
  'name': 'MALICE MIZER',
  'url': 'http://www.last.fm/music/MALICE+MIZER',
  'pictureURL': 'http://userserve-ak.last.fm/serve/252/10808.jpg'},
 {'id': 2,
  'name': 'Diary of Dreams',
  'url': 'http://www.last.fm/music/Diary+of+Dreams',
  'pictureURL': 'http://userserve-ak.last.fm/serve/252/3052066.jpg'},
 {'id': 3,
  'name': 'Carpathian Forest',
  'url': 'http://www.last.fm/music/Carpathian+Forest',
  'pictureURL': 'http://userserve-ak.last.fm/serve/252/40222717.jpg'},
 {'id': 4,
  'name': 'Moi dix Mois',
  'url': 'http://www.last.fm/music/Moi+dix+Mois',
  'pictureURL': 'http://userserve-ak.last.fm/serve/252/54697835.png'},
 {'id': 5,
  'name': 'Bella Morte',
  'url': 'http://www.last.fm/music/Bella+Morte',
  'pictureURL': 'http://userserve-ak.last.fm/serve/252/14789013.jpg'},
 {'id': 6,
  'name': 'Moonspell',
  'url': 'http://www.last.fm/music/Moonspell',
  'pictureURL': 'http://userserve-ak.last.fm/serve/252/2181591.jpg'},
 {'id': 7,
  'name': 'Marilyn Manson',
  'url': '

In [9]:
# in this dataset, the data to extract is under 'features'
artistDFall = pd.json_normalize(artistdata)
artistDFall.head()

Unnamed: 0,id,name,url,pictureURL
0,1,MALICE MIZER,http://www.last.fm/music/MALICE+MIZER,http://userserve-ak.last.fm/serve/252/10808.jpg
1,2,Diary of Dreams,http://www.last.fm/music/Diary+of+Dreams,http://userserve-ak.last.fm/serve/252/3052066.jpg
2,3,Carpathian Forest,http://www.last.fm/music/Carpathian+Forest,http://userserve-ak.last.fm/serve/252/40222717...
3,4,Moi dix Mois,http://www.last.fm/music/Moi+dix+Mois,http://userserve-ak.last.fm/serve/252/54697835...
4,5,Bella Morte,http://www.last.fm/music/Bella+Morte,http://userserve-ak.last.fm/serve/252/14789013...


In [10]:
artistDF= artistDFall.drop(columns=['url','pictureURL'])
artistDF.head()

Unnamed: 0,id,name
0,1,MALICE MIZER
1,2,Diary of Dreams
2,3,Carpathian Forest
3,4,Moi dix Mois
4,5,Bella Morte


In [11]:
listens = listens.merge(artistDF, left_on='artistID', right_on='id')
listens = listens.drop(columns='id')
listens = listens.rename(columns = 
                         {'name':'artist',
                          'weight':'user_listens'
                         })
listens.head()

Unnamed: 0,userID,artistID,user_listens,artist
0,2,51,13883,Duran Duran
1,4,51,228,Duran Duran
2,27,51,85,Duran Duran
3,28,51,10,Duran Duran
4,62,51,528,Duran Duran


In [12]:
userTotals = listens.groupby(['userID']).sum()
userTotals = userTotals.drop(columns='artistID')
userTotals = userTotals.rename(columns = {'user_listens':'total_listens'})
userTotals.head()

Unnamed: 0_level_0,total_listens
userID,Unnamed: 1_level_1
2,168737
3,20501
4,26930
5,13159
6,1011


In [13]:
normalListens = listens.merge(userTotals, left_on = 'userID', right_on='userID')
normalListens

Unnamed: 0,userID,artistID,user_listens,artist,total_listens
0,2,51,13883,Duran Duran,168737
1,2,52,11690,Morcheeba,168737
2,2,53,11351,Air,168737
3,2,54,10300,Hooverphonic,168737
4,2,55,8983,Kylie Minogue,168737
...,...,...,...,...,...
92829,1893,17465,1,Matt Lewis Band,4
92830,1893,17466,1,Soul is You,4
92831,1893,17467,1,Bluesoul Pharmacy,4
92832,1893,17468,1,K-Precise,4


## Weighted Listens

In [14]:
normalListens['weighted_listens'] = normalListens['user_listens'] / normalListens['total_listens']
normalListens

Unnamed: 0,userID,artistID,user_listens,artist,total_listens,weighted_listens
0,2,51,13883,Duran Duran,168737,0.082276
1,2,52,11690,Morcheeba,168737,0.069279
2,2,53,11351,Air,168737,0.067270
3,2,54,10300,Hooverphonic,168737,0.061042
4,2,55,8983,Kylie Minogue,168737,0.053237
...,...,...,...,...,...,...
92829,1893,17465,1,Matt Lewis Band,4,0.250000
92830,1893,17466,1,Soul is You,4,0.250000
92831,1893,17467,1,Bluesoul Pharmacy,4,0.250000
92832,1893,17468,1,K-Precise,4,0.250000


In [15]:
mins = normalListens.groupby('userID')['user_listens'].min()
maxs = normalListens.groupby('userID')['user_listens'].max()

In [16]:
normalListens = normalListens.merge(mins, on='userID')
normalListens = normalListens.rename(columns={
    'user_listens_x' : 'user_listens', 
    'user_listens_y' : 'min'})
normalListens

Unnamed: 0,userID,artistID,user_listens,artist,total_listens,weighted_listens,min
0,2,51,13883,Duran Duran,168737,0.082276,1315
1,2,52,11690,Morcheeba,168737,0.069279,1315
2,2,53,11351,Air,168737,0.067270,1315
3,2,54,10300,Hooverphonic,168737,0.061042,1315
4,2,55,8983,Kylie Minogue,168737,0.053237,1315
...,...,...,...,...,...,...,...
92829,1893,17465,1,Matt Lewis Band,4,0.250000,1
92830,1893,17466,1,Soul is You,4,0.250000,1
92831,1893,17467,1,Bluesoul Pharmacy,4,0.250000,1
92832,1893,17468,1,K-Precise,4,0.250000,1


In [17]:
normalListens = normalListens.merge(maxs, on='userID')
normalListens = normalListens.rename(columns={
    'user_listens_x' : 'user_listens', 
    'user_listens_y' : 'max'})

## Scaled Listens

In [18]:
normalListens['scaled'] = (normalListens['user_listens'] - normalListens['min']) / (normalListens['max'] - normalListens['min'])
normalListens.head()

Unnamed: 0,userID,artistID,user_listens,artist,total_listens,weighted_listens,min,max,scaled
0,2,51,13883,Duran Duran,168737,0.082276,1315,13883,1.000000
1,2,52,11690,Morcheeba,168737,0.069279,1315,13883,0.825509
2,2,53,11351,Air,168737,0.067270,1315,13883,0.798536
3,2,54,10300,Hooverphonic,168737,0.061042,1315,13883,0.714911
4,2,55,8983,Kylie Minogue,168737,0.053237,1315,13883,0.610121
...,...,...,...,...,...,...,...,...,...
92829,1893,17465,1,Matt Lewis Band,4,0.250000,1,1,
92830,1893,17466,1,Soul is You,4,0.250000,1,1,
92831,1893,17467,1,Bluesoul Pharmacy,4,0.250000,1,1,
92832,1893,17468,1,K-Precise,4,0.250000,1,1,


## Remove anything scaled to 5% or less

In [19]:
trainingDF = normalListens[normalListens['scaled'] > 0.05]
trainingDF = trainingDF.drop(columns=['user_listens','min','max','total_listens'])
trainingDF.head()

Unnamed: 0,userID,artistID,artist,weighted_listens,scaled
0,2,51,Duran Duran,0.082276,1.000000
1,2,52,Morcheeba,0.069279,0.825509
2,2,53,Air,0.067270,0.798536
3,2,54,Hooverphonic,0.061042,0.714911
4,2,55,Kylie Minogue,0.053237,0.610121
...,...,...,...,...,...
92778,1499,14989,Wayne Kerr,0.016231,0.055202
92779,1499,14990,Dawson Cowals,0.016049,0.054302
92780,1499,14991,Integrity Music,0.015202,0.050125
92823,1266,13563,Tristan Feldbauer,0.638889,1.000000


In [20]:
originalArtists = len(normalListens['artist'].unique())
reducedArtists = len(trainingDF['artist'].unique())

print(f'Reduced artist list from {originalArtists} to {reducedArtists}.')

Reduced artist list from 17631 to 10294.


In [21]:
topArtistsDF = artistDF[artistDF['name'].isin(trainingDF['artist'].unique())]
topArtistsDicts = topArtistsDF.to_dict('records')

In [22]:
userSet = trainingDF.drop(columns=['artist', 'weighted_listens']).sort_values('scaled', ascending=False)
userSet = userSet.to_dict('records')

In [23]:
from collections import defaultdict

userIDs_eachArtist = defaultdict(set)
artistIDs_eachUser = defaultdict(set)


for d in userSet:
    user, artist = d['userID'], d['artistID']
    userIDs_eachArtist[artist].add(user)
    artistIDs_eachUser[user].add(artist)

## Jaccard Index Functions

In [24]:
def Jaccard(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union

In [25]:
def recommendDF(iD, n):
    similarArtists, match, request = [],[],[]
    users = userIDs_eachArtist[iD]
    
    for i2 in userIDs_eachArtist:
        if i2 == iD: continue
        sim = Jaccard(users, userIDs_eachArtist[i2])
        similarArtists.append(i2)
        match.append(sim)
        request.append(iD)
        
    
    similarDF = pd.DataFrame({
        'Requested_artist':request,
        'Suggested_artist':similarArtists,
        'Match_strength':match        
    })
    
    return similarDF.sort_values('Match_strength', ascending=False).head(5)

In [30]:
limitedArtists = topArtistsDF['name'].tolist()

## Run Jaccard function for every artist

In [39]:
suggestionDF = pd.DataFrame({
        'Requested_artist':[],
        'Suggested_artist':[],
        'Match_strength':[]        
    })


for artist in range(len(limitedArtists)):
    b = recommendDF(topArtistsDF['id'].tolist()[artist],5)
    suggestionDF = suggestionDF.append(b)

In [46]:
suggestionDF = suggestionDF.reset_index()

In [54]:
suggestionDF = suggestionDF.drop(columns='index')
suggestionDF.head()

Unnamed: 0,Requested_artist,Suggested_artist,Match_strength
0,1.0,5078.0,1.000000
1,1.0,5077.0,1.000000
2,1.0,5076.0,0.333333
3,1.0,390.0,0.200000
4,1.0,1910.0,0.125000
...,...,...,...
51470,18736.0,18735.0,1.000000
51471,18736.0,18734.0,1.000000
51472,18736.0,876.0,0.500000
51473,18736.0,2472.0,0.333333


In [63]:
suggestionDF.to_json('suggestion_json.js', orient = 'records')
