In [1]:
import math
import re
import numpy as np
import itertools
import datetime

from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [2]:
sc = SparkContext()

In [7]:
#Select the desired columns
data = sc.textFile('./lastfm-dataset-360k-small/merged-subset2.csv')
# header = data.take(1)
# data.take(5)

In [8]:
data = data.map(lambda x: x.split('\t'))
header = data.first()
print(header)

['userId', 'artistId', 'artist', 'plays', 'gender', 'age', 'country', 'signupDate']


In [5]:
# data.take(1)
# data.filter(lambda x : x != header).take(1)

In [9]:
#Remove header
data2 = data.filter(lambda line: line!=header)
data2 = data2.map(lambda x : [x[i] for i in [0,1,3]])
print ("length of uncleaned data -",data2.count())
# data2.map(lambda x : len(x[1])).collect()
# data2 = data2.filter(lambda x : len(x[1]) == 36) #Clean data - remove artists without artistId

def isNumber(s):
    try:
        float(s)
        return True
    except ValueError:
        pass
 
    return False

data2 = data2.filter(lambda x: isNumber(x[2])) # Remove faulty rows
data2 = data2.map(lambda x: [x[0], x[1], float(x[2])]) #Change plays into float

#Filter out values with more than 500 plays (for this sake of simplicity)
print ("length of cleaned data -",data2.count())
# data2 = data2.filter(lambda x : x[2] <= 500)
# print (data2.take(2))
print ("length of filtered data -",data2.count())

length of uncleaned data - 10000
length of cleaned data - 10000
length of filtered data - 10000


In [10]:
#Convert strings into integers
users = data2.map(lambda x: x[0]).distinct().zipWithIndex()
artists = data2.map(lambda x: x[1]).distinct().zipWithIndex()
# int_user = users.map(lambda u: (u[1], u[0]))
# int_artist = artists.map(lambda i: (i[1], i[0]))
# users.collect()
# artists.collect()

In [None]:
# data2 = data2.map(lambda r: (r[0], (r[1], r[2]))).join(users).map(lambda r: (r[1][1], r[1][0][0], r[1][0][1]))
# data2.collect()

In [11]:
# Substitutes the ObjectIDs in the ratings RDD with the corresponding int values
data2 = data2.map(lambda r: (r[0], (r[1], r[2]))).join(users).map(lambda r: (r[1][1], r[1][0][0], r[1][0][1]))
data2 = data2.map(lambda r: (r[1], (r[0], r[2]))).join(artists).map(lambda r: (r[1][0][0], r[1][1], r[1][0][1]))

In [12]:
# data2.filter(lambda x: x[0] == 12).collect()
plays = data2.map(lambda x: x[2]).collect()
# data2.collect()

In [None]:
# summation = 0
# for i in plays:
#     summation += i**2
# print (np.mean(plays)**2)
# summation / len(plays)

In [13]:
# Use 'Rating' function to get the values in the right format
data2 = data2.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
data2.count()

10000

In [17]:
# Use randomsplit to split the data into train, validation and testing sets

training_RDD, validation_RDD, test_RDD = data2.randomSplit([6, 2, 2],seed=2)
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

In [15]:
# Define computeRMSE

def computeRMSE(model,data):
    
    """ Takes ALS models and testing data as input and returns RMSE value """
    
    data_for_predict = data.map(lambda x: (x[0], x[1]))
    
    predictions = model.predictAll(data_for_predict).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = data.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    
    return error

In [None]:
# # Train ALS

# # Parameters
# seed = 5
# iterations = [15]
# regularization_parameter = [0.1]
# ranks = [20]
# # errors = [0, 0, 0,0,0]
# tolerance = 0.02
# alpha = 0.01

# #other variables initialized
# min_error = float('inf')
# bestModel = None
# bestValidationRmse = float("inf")
# bestRank = 0
# bestLambda = -1.0
# bestNumIter = -1
# err = 0

# # Train - Validation loop
# for rank, lambda_, iteration in itertools.product(ranks, regularization_parameter, iterations):
#     print (rank,lambda_,iteration)

#     model = ALS.trainImplicit(training_RDD, rank, seed=seed, iterations=iteration,
#                       lambda_=lambda_,alpha=alpha)

    
    
# #     predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
# #     rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
# #     error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
# #     errors[err] = error
# #     err += 1
# #     print ('For rank %s the RMSE is %s' % (rank, error))

#     RMSE = computeRMSE(model,validation_RDD)
    
#     if RMSE < bestValidationRmse:
#         bestValidationRmse = error
#         bestModel = model
#         bestRank = rank
#         bestLambda = lambda_
#         bestNumIter = iteration
        
#     # Test RMSE
#     testRMSE = computeRMSE(model,test_RDD)
    
    
# # evaluate the best model on the test set
# print ("The best model was trained with rank = %d and lambda = %.1f, " % (bestRank, bestLambda) \
#   + "and numIter = %d, and its RMSE on the test set is %f." % (bestNumIter, testRMSE))

# print ("Average of all ratings -", validation_RDD.map(lambda x : x[2]).mean())

# # print ('The best model was trained with rank %s' % best_rank)

In [20]:
seed = 5
iterations = 10
regularization_parameter = 0.1
ranks = [5,10,15]
errors = [0, 0, 0]
err = 0
tolerance = 0.02
alpha = 0.01

min_error = float('inf')
best_rank = -1
best_iteration = -1
for rank in ranks:
    e = 0
    for i in range(5):
        # Split the data
        training_RDD, validation_RDD, test_RDD = data2.randomSplit([6, 2, 2])
        validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
        test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))
        
        model = ALS.trainImplicit(training_RDD, rank, seed=seed, iterations=iterations,
                          lambda_=regularization_parameter,alpha=alpha)
    #     predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
    #     rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    #     error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
        e += computeRMSE(model,validation_RDD)
    
    error = e/5
    errors[err] = error
    err += 1
    print ('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank

print ('The best model was trained with rank %s' % best_rank)

For rank 5 the RMSE is 0.16467579508178784
For rank 10 the RMSE is 0.1646758809725997
For rank 15 the RMSE is 0.16467539176613402
The best model was trained with rank 15


In [None]:
# print (bestRank,bestNumIter,bestLambda)

In [None]:
# Final Model

model = ALS.train(data2, 15, seed=seed, iterations= iterations,
                      lambda_=regularization_parameter)
predictions = model.predictAll(validation_for_predict_RDD)

In [None]:
print (validation_RDD.count())
print (predictions.count())
computeRMSE(model,data2)

In [None]:
recos = model.recommendProducts(3503,20)
# recos

In [None]:
scores = []
for i in recos:
     scores.append(float(i[2]))

In [None]:
predictions.filter(lambda x : x[0] == 3503).collect()

In [None]:
# Save and load model
model.save(sc, "./model/als")