In [85]:
import math
import re
import numpy as np

from pyspark import SparkContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [2]:
sc = SparkContext()

In [9]:
#Select the desired columns
data = sc.textFile('./lastfm-dataset-360k-small/merged-subset.csv').map(lambda x : [x.split(',')[i] for i in [1,2,4]])#
header = data.take(1)

In [11]:
# data.collect()

In [117]:
#Remove header
data2 = data.filter(lambda line: line!=header)
print ("length of uncleaned data -",len(data2.collect()))
# data2.map(lambda x : len(x[1])).collect()
data2 = data2.filter(lambda x : len(x[1]) == 36) #Clean data - remove artists without artistId

# Remove unclean rows
def isNumber(inputString):
    """ This return True if the string is pure number, False otherwise """
    return bool(re.search(r'\D', inputString))

data2 = data2.filter(lambda x: not isNumber(x[2])) # Remove faulty rows
data2 = data2.map(lambda x: [x[0], x[1], int(x[2])]) #Change plays into integer

#Filter out values with more than 500 plays (for this sake of simplicity)
print ("length of cleaned data -",len(data2.collect()))
data2 = data2.filter(lambda x : x[2] <= 500)
# print (data2.take(2))
print ("length of filtered data -",len(data2.collect()))

length of uncleaned data - 10001
length of cleaned data - 9858
length of filtered data - 8929


In [118]:
#Convert strings into integers
users = data2.map(lambda x: x[0]).distinct().zipWithIndex()
artists = data2.map(lambda x: x[1]).distinct().zipWithIndex()
# int_user = users.map(lambda u: (u[1], u[0]))
# int_artist = artists.map(lambda i: (i[1], i[0]))
# users.collect()
# artists.collect()

In [66]:
# data2 = data2.map(lambda r: (r[0], (r[1], r[2]))).join(users).map(lambda r: (r[1][1], r[1][0][0], r[1][0][1]))
# data2.collect()

In [119]:
# Substitutes the ObjectIDs in the ratings RDD with the corresponding int values
data2 = data2.map(lambda r: (r[0], (r[1], r[2]))).join(users).map(lambda r: (r[1][1], r[1][0][0], r[1][0][1]))
data2 = data2.map(lambda r: (r[1], (r[0], r[2]))).join(artists).map(lambda r: (r[1][0][0], r[1][1], r[1][0][1]))

In [120]:
# data2.filter(lambda x: x[0] == 12).collect()
plays = data2.map(lambda x: x[2]).collect()
data2.collect()

[(12, 11, 168),
 (18, 23, 178),
 (26, 29, 17),
 (5571, 29, 38),
 (31, 31, 59),
 (37, 42, 26),
 (4039, 42, 66),
 (4371, 42, 483),
 (50, 55, 228),
 (6089, 55, 98),
 (53, 39, 14),
 (8481, 39, 103),
 (34, 39, 262),
 (1830, 39, 44),
 (2937, 39, 100),
 (5063, 39, 39),
 (6031, 39, 203),
 (8443, 39, 5),
 (65, 75, 48),
 (68, 77, 5),
 (1167, 77, 124),
 (2262, 77, 45),
 (8076, 77, 3),
 (5358, 77, 51),
 (94, 100, 172),
 (97, 101, 49),
 (409, 101, 81),
 (2988, 101, 151),
 (3253, 101, 132),
 (3652, 101, 378),
 (3739, 101, 216),
 (4202, 101, 387),
 (4305, 101, 104),
 (4744, 101, 254),
 (4987, 101, 235),
 (5177, 101, 462),
 (5861, 101, 351),
 (6472, 101, 483),
 (8291, 101, 284),
 (458, 101, 164),
 (684, 101, 297),
 (2946, 101, 341),
 (3405, 101, 172),
 (3670, 101, 109),
 (3730, 101, 313),
 (4512, 101, 135),
 (4520, 101, 324),
 (4530, 101, 219),
 (5839, 101, 111),
 (6367, 101, 268),
 (6742, 101, 65),
 (6812, 101, 180),
 (7407, 101, 369),
 (7523, 101, 253),
 (7557, 101, 271),
 (8196, 101, 163),
 (107, 1

In [121]:
summation = 0
for i in plays:
    summation += i**2
print (np.mean(plays)**2)
summation / len(plays)

14359.3343839


27803.982528838616

In [122]:
# Use 'Rating' function to get the values in the right format
data2 = data2.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
len(data2.collect())

8929

In [123]:
# Use randomsplit to split the data into train, validation and testing sets

training_RDD, validation_RDD, test_RDD = data2.randomSplit([6, 2, 2], seed=0)
validation_for_predict_RDD = validation_RDD.map(lambda x: (x[0], x[1]))
test_for_predict_RDD = test_RDD.map(lambda x: (x[0], x[1]))

In [124]:
training_RDD.collect()

[Rating(user=12, product=11, rating=168.0),
 Rating(user=18, product=23, rating=178.0),
 Rating(user=31, product=31, rating=59.0),
 Rating(user=6089, product=55, rating=98.0),
 Rating(user=1830, product=39, rating=44.0),
 Rating(user=2937, product=39, rating=100.0),
 Rating(user=5063, product=39, rating=39.0),
 Rating(user=68, product=77, rating=5.0),
 Rating(user=2262, product=77, rating=45.0),
 Rating(user=5358, product=77, rating=51.0),
 Rating(user=94, product=100, rating=172.0),
 Rating(user=409, product=101, rating=81.0),
 Rating(user=3652, product=101, rating=378.0),
 Rating(user=3739, product=101, rating=216.0),
 Rating(user=4305, product=101, rating=104.0),
 Rating(user=4744, product=101, rating=254.0),
 Rating(user=5177, product=101, rating=462.0),
 Rating(user=5861, product=101, rating=351.0),
 Rating(user=6472, product=101, rating=483.0),
 Rating(user=684, product=101, rating=297.0),
 Rating(user=2946, product=101, rating=341.0),
 Rating(user=3405, product=101, rating=172.0

In [92]:
# Train ALS

# Parameters
seed = 5
iterations = 10
regularization_parameter = 0.1
ranks = [10,11,12,13,14]
errors = [0, 0, 0,0,0]
tolerance = 0.02
alpha = 0.01

#other variables initialized
min_error = float('inf')
best_rank = -1
best_iteration = -1
err = 0

# Train - Validation loop

for rank in ranks:
    model = ALS.trainImplicit(training_RDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter,alpha=alpha)
    predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors[err] = error
    err += 1
    print ('For rank %s the RMSE is %s' % (rank, error))
    if error < min_error:
        min_error = error
        best_rank = rank

print ('The best model was trained with rank %s' % best_rank)

For rank 10 the RMSE is 1257.7542227681429
For rank 11 the RMSE is 1257.7496258119024
For rank 12 the RMSE is 1257.7507140954563
For rank 13 the RMSE is 1257.7556182633493
For rank 14 the RMSE is 1257.7542959534878
The best model was trained with rank 11


In [125]:
# Final Model
rank = 11

model = ALS.trainImplicit(training_RDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter,alpha=alpha)
predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = validation_RDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())

print ('For rank %s the RMSE is %s' % (rank, error))

For rank 11 the RMSE is 153.0455153224657


In [135]:
print(len(data2.collect()))
values = data2.map(lambda x: (x[0], x[1]))
print(len(values.collect()))
predictions = model.predictAll(values)
print(len(predictions.collect()))

8929
8929


5401

In [129]:
predictions.filter(lambda x : x[1] == 745).collect()

[Rating(user=5360, product=745, rating=1.2508372957566913e-17)]

In [128]:
data2.filter(lambda x : x[0] == 964).collect()

[Rating(user=964, product=3189, rating=136.0)]