In [3]:
from numpy import array
from math import sqrt
import matplotlib.pyplot as plt
import numpy as np
% matplotlib inline

from pyspark import SparkContext
from pyspark.mllib.clustering import KMeansModel
from pyspark.mllib.recommendation import MatrixFactorizationModel

In [4]:
# Reuse the SparkContext that is already running in this kernel, if any.
# A bare SparkContext() raises when this cell is executed a second time,
# because only one context may be active per process.
sc = SparkContext.getOrCreate()

### Import the data

In [25]:
data = sc.textFile('./lastfm-dataset-360k-small/merged-subset2.csv')
# The file is tab-separated despite its .csv extension: split every line
# into a list of string fields.
data = data.map(lambda x: x.split('\t'))
# The first record is the header row; keep it so it can be filtered out.
header = data.first()

data = data.filter(lambda line: line != header)
# Keep only rows where field 4 is exactly one character long and field 5 is
# longer than one character (they are later used as the gender code and a
# numeric feature). Rows with fewer than 7 fields are dropped as well, since
# fields 4-6 are all indexed by the clustering step and a short row would
# otherwise raise IndexError inside a Spark task.
data = data.filter(lambda x: len(x) > 6 and len(x[4]) == 1 and len(x[5]) > 1)

# Lookup tables that turn the categorical columns into integer codes.
genderdict = dict(m=0, f=1)

# A country's integer code is its position in this list.
countrylist = [
    'United States', 'Germany', 'United Kingdom', 'Poland', 'Russian Federation',
    'Brazil', 'Sweden', 'Spain', 'Finland', 'Netherlands',
    'Italy', 'France', 'Canada', 'Australia', 'Turkey',
    'Norway', 'Czech Republic', 'Ukraine', 'Japan', 'Belgium',
    'Mexico', 'Argentina', 'Switzerland', 'Austria', 'Romania',
    'Portugal', 'Bulgaria', 'Chile', 'Denmark', 'Colombia',
    'Greece', 'Hungary', 'Latvia', 'Slovakia', 'Croatia',
    'Serbia', 'Lithuania', 'Estonia', 'Ireland', 'New Zealand',
    'Belarus', 'Israel', 'India', 'Venezuela', 'Indonesia',
    'Singapore', 'Slovenia', 'Korea, Republic of', 'China', 'South Africa',
    'Malaysia', 'Philippines', 'Peru', 'Thailand', 'Moldova',
    'Costa Rica', 'Iceland', 'Taiwan', 'Paraguay', 'Bosnia and Herzegovina',
    'Antarctica', 'Puerto Rico', 'Georgia', 'Macedonia', 'Uruguay',
    'Honduras', 'Barbados', 'Kazakhstan', 'Andorra', 'Saudi Arabia',
    'United States Minor Outlying Islands', 'Djibouti', 'Cocos (Keeling) Islands',
    'Tunisia', 'Egypt', 'Bolivia', 'Panama', 'Brunei Darussalam',
    'Iran, Islamic Republic of', 'Dominican Republic',
    'El Salvador', 'Haiti', 'Ecuador', 'Guatemala', 'Morocco',
    'Pakistan', 'Burkina Faso', 'Azerbaijan', 'Cambodia', 'Hong Kong',
    'Viet Nam', 'United Arab Emirates', 'Jamaica', 'Faroe Islands', 'Somalia',
    'Guinea-Bissau', 'Micronesia, Federated States of', 'Tuvalu',
    "Cote D'Ivoire", 'Libyan Arab Jamahiriya',
    'Nicaragua', 'Kyrgyzstan', 'Malta', 'Bermuda', 'Luxembourg',
    'Kuwait', 'Cyprus', 'Heard Island and Mcdonald Islands',
    'Christmas Island', 'Cuba',
    'Niue', 'Aruba', 'Vanuatu', 'Dominica', 'Holy See (Vatican City State)',
    'Uzbekistan', 'Bhutan', 'Montenegro', 'Reunion', 'Fiji',
    'Netherlands Antilles', 'Lebanon', 'Liechtenstein',
]
countrydict = {name: code for code, name in enumerate(countrylist)}

# Encoding helpers for the categorical columns
def mapr1(key):
    """Return the integer code stored in ``genderdict`` for a gender label.

    Raises KeyError when ``key`` has no entry in ``genderdict``.
    """
    code = genderdict[key]
    return code

def mapr2(key):
    """Return the integer code stored in ``countrydict`` for a country name.

    Raises KeyError when ``key`` has no entry in ``countrydict``.
    """
    code = countrydict[key]
    return code

### Loading the models

In [8]:
# Restore the two previously saved models from their directories on disk.
als = MatrixFactorizationModel.load(sc, path="./model/als")
kmeans = KMeansModel.load(sc, path="./model/clustering1")

## Determine top 20 from Clustering

In [26]:
# Pair every row with the k-means cluster predicted for it, giving elements
# of the form [row, cluster]. The feature vector is: the gender code of
# field 4, field 5 as a float, and the country code of field 6.
# NOTE(review): mapr1/mapr2 raise KeyError for labels missing from their
# dictionaries, which would fail the Spark task - confirm the data only
# contains known labels.
# The result is cached because several separate Spark actions in this
# notebook read it; without caching each one re-reads the file and re-runs
# the prediction for every row.
data2 = data.map(
    lambda x: [x, kmeans.predict([mapr1(x[4]), float(x[5]), mapr2(x[6])])]
).cache()

def cluster_extractor(user_id):
    """Return the cluster number assigned to ``user_id``.

    The user is looked up in the global ``data2`` RDD, whose elements are
    ``[row, cluster]`` pairs with the user id stored in ``row[0]``. The
    cluster of the first matching row is returned.

    Raises:
        ValueError: if no row in ``data2`` belongs to ``user_id``.
    """
    matches = (data2
               .filter(lambda x: x[0][0] == user_id)
               .map(lambda x: x[1])
               .take(1))
    if not matches:
        # RDD.first() would also raise ValueError here, but with an opaque
        # "RDD is empty" message that does not say which user was missing.
        raise ValueError("No rows found for user_id {!r}".format(user_id))
    return matches[0]

In [27]:
# User to produce recommendations for
user_id = '03b9c645bc4f578ea1dcb6a975e7ea71fab79da2'

cluster_no = cluster_extractor(user_id)  # cluster this user was assigned to

# For the rows of that cluster, accumulate per artist id (row[1]) both the
# sum of row[3] (presumably the play count - confirm) and the number of rows,
# in a single pass. This replaces two scans of data2 followed by a join.
totals = (data2
          .filter(lambda x: x[1] == cluster_no)
          .map(lambda x: (x[0][1], (float(x[0][3]), 1)))
          .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])))
count = totals.mapValues(lambda t: t[1])           # (artist, number of rows)
average = totals.mapValues(lambda t: t[0] / t[1])  # (artist, mean of row[3])

# Top 20 artists for the user's cluster, highest average first.
# takeOrdered returns the smallest keys, so the score is negated; this avoids
# sorting the whole RDD into one partition just to keep 20 elements.
top20 = average.takeOrdered(20, key=lambda x: -x[1])

In [28]:
# Print one line per entry: "<rank>) <artist id> (<average score>)"
for rank, (artist, score) in enumerate(top20, start=1):
    print("{}) {} ({})".format(rank, artist, score))

1) 8613cfbc-0914-49ec-b86a-cd22a7639b3c (8.768539325842697)
2) 258797f8-89ee-4d5e-9fd6-7aaac97341cf (8.732962447844228)
3) ecf9ac46-dbb5-424c-87a5-be2fce8661c8 (5.963657678780773)
4) c80f38a6-9980-485d-997c-5c1a9cbd0d64 (3.117363344051447)
5) 95c8b6b7-1e24-4987-804c-024f6b05cd91 (2.7027863777089784)
6) b58a2815-9c15-4fc8-b023-9bea7207e877 (2.3704280155642024)
7) 22fa6038-d14c-4aab-a057-d397132e9191 (2.1096045197740114)
8) a0b2f210-cd3a-453d-937d-e4f2658d17c7 (2.0754189944134076)
9) 61ed9c9c-79eb-4e8f-8015-bd599ac0ab49 (2.0582959641255605)
10) ac15222f-fb8a-4d2b-b4da-bde1c19f0a9f (1.9716312056737588)
11) 8a0e10a9-80c7-40e0-bf5b-89f1d57f9537 (1.7780713342140027)
12) 2fddb92d-24b2-46a5-bf28-3aed46f4684c (1.7687237866986218)
13) 149e6720-4e4a-41a4-afca-6d29083fc091 (1.6735496558505407)
14) 214ffc4e-0b3c-4be1-970f-90893365f3b4 (1.4220921726408193)
15) 80b3cf5e-18fe-4c59-98c7-e5bb87210710 (1.407128169979438)
16) 5dd0da33-5161-4378-8fca-aecf5840097e (1.3738140417457305)
17) 4f005856-fa1e-4525

## Determine top 20 from ALS

In [31]:
# Assign an integer index to every distinct user id (field 0 of a row).
users = data.map(lambda x: x[0]).distinct().zipWithIndex()
# Assign an integer index to every distinct artist id (field 1 of a row).
# This must read from `data`: in `data2` element [1] is the cluster number,
# so indexing data2 here would enumerate clusters instead of artists.
artists = data.map(lambda x: x[1]).distinct().zipWithIndex()
# Index of the selected user; first() raises ValueError if the user is absent.
# NOTE(review): the order produced by distinct().zipWithIndex() is assumed to
# match the user indices the ALS model was trained with - confirm.
userIndex = users.filter(lambda x: x[0] == user_id).map(lambda x: x[1]).first()

In [40]:
# Ask the ALS model for 20 product recommendations for this user index and
# display them as the cell output.
top20 = als.recommendProducts(user=userIndex, num=20)
list(top20)

[Rating(user=3747, product=1334, rating=2771.6630891955137),
 Rating(user=3747, product=1938, rating=2222.6612352582015),
 Rating(user=3747, product=4184, rating=2209.5433625711016),
 Rating(user=3747, product=822, rating=1880.537098058055),
 Rating(user=3747, product=3803, rating=1376.646465824883),
 Rating(user=3747, product=2487, rating=1328.1250885405311),
 Rating(user=3747, product=4861, rating=1214.666309541602),
 Rating(user=3747, product=573, rating=1182.4156547110183),
 Rating(user=3747, product=3037, rating=1053.8261828692325),
 Rating(user=3747, product=1707, rating=1025.907616879555),
 Rating(user=3747, product=1784, rating=982.2686873411668),
 Rating(user=3747, product=5214, rating=979.2264717714163),
 Rating(user=3747, product=2812, rating=964.7943759376794),
 Rating(user=3747, product=1780, rating=932.0814450548323),
 Rating(user=3747, product=632, rating=809.8830378330626),
 Rating(user=3747, product=3342, rating=781.5913862855366),
 Rating(user=3747, product=4461, rati