In [6]:
import sys, operator, time, os
from collections import defaultdict
import numpy as np

from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.sql import SQLContext

%matplotlib inline
print os.environ["PYSPARK_SUBMIT_ARGS"]
print sc._conf.get('spark.driver.memory')
print sc._conf.get('spark.executor.memory')
print sc._conf.getAll()

# sc.set('spark.driver.memory', '5g')
sqlContext = SQLContext(sc)

base_dir = 'ml-10M100K/'
ratings_dir = base_dir + 'ratings.dat'
movies_dir = base_dir + 'movies.dat'
tags_dir = base_dir + 'tags.dat'


--master local[*] --driver-memory 5g --executor-memory 5g  pyspark-shell
5g
None
[(u'spark.rdd.compress', u'True'), (u'spark.serializer.objectStreamReset', u'100'), (u'spark.master', u'local[*]'), (u'spark.submit.deployMode', u'client'), (u'spark.driver.memory', u'5g'), (u'spark.app.name', u'pyspark-shell')]


In [156]:
# Load the '::'-delimited data files as RDDs of field lists.
SAMPLE_FRACTION = 0.1  # share of the rating lines kept for experimentation
SAMPLE_SEED = 42       # fixed seed so a re-run draws the same sample


def parse_rating(line):
    """Split one ratings line on '::' and convert the field at index 2 to float.

    Lines with fewer than three fields are returned split but unconverted.
    """
    fields = line.split('::')
    if len(fields) > 2:
        fields[2] = float(fields[2])
    return fields


# Sample the raw lines first so only the retained ones are parsed.
ratings_rdd = sc.textFile(ratings_dir)\
    .sample(False, SAMPLE_FRACTION, SAMPLE_SEED)\
    .map(parse_rating)
movies_rdd = sc.textFile(movies_dir).map(lambda r: r.split('::'))
tags_rdd = sc.textFile(tags_dir).map(lambda r: r.split('::'))

In [3]:
# user_count = ratings_rdd.map(lambda r: r[0]).countByValue()
# movie_count = ratings_rdd.map(lambda r: r[1]).countByValue()

In [5]:
# rating_mean = ratings_rdd.map(lambda r: float(r[2])).mean()
# print rating_mean

3.51254659056


In [223]:
# for testing
# test_rdd = sc.parallelize(ratings_rdd.take(1000))

In [226]:
def get_mean(r):
    """Return the mean of the field at index 2 over a group of rows.

    r: iterable of rows (e.g. the values of one groupBy key); each row must
       have a number or numeric string at index 2. Other fields are ignored,
       so rows may differ in length.
    Returns a numpy float64. An empty group yields nan (numpy emits a
    RuntimeWarning) instead of raising.
    """
    # Convert only the rating column; building an array of whole rows would
    # coerce every field to a string and require equally long rows.
    ratings = np.array([float(row[2]) for row in r], dtype=float)
    return ratings.mean()

def remove_sample_bias(r, user_means, movie_means, global_mean):
    """Return a copy of row r whose rating has the user and movie bias removed.

    r:           row with user id at index 0, movie id at index 1 and a
                 numeric rating at index 2; it is not modified.
    user_means:  dict mapping user id -> that user's mean rating.
    movie_means: dict mapping movie id -> that movie's mean rating.
    global_mean: mean rating over the whole data set.

    The bias of a user (movie) is its mean minus the global mean. Both biases
    are subtracted from the rating; ids missing from a table contribute no
    bias. The result is clamped to [0, 5] and rounded to two decimals.
    """
    user_bias = user_means.get(r[0], global_mean) - global_mean
    movie_bias = movie_means.get(r[1], global_mean) - global_mean
    # Subtract, not add: adding the offsets would amplify the very bias
    # this function is meant to remove.
    debiased = r[2] - user_bias - movie_bias

    adjusted = list(r)  # copy so the caller's row is left untouched
    adjusted[2] = round(min(max(debiased, 0), 5), 2)  # bound and round rating
    return adjusted

def remove_bias(data):
    """Adjust every rating in data for its user's and movie's mean rating.

    data: RDD of rows with the user id at index 0, the movie id at index 1
          and a numeric rating at index 2.
    Returns a new RDD in which each row has been passed through
    remove_sample_bias together with the per-user means, the per-movie means
    and the global mean. The means are computed eagerly and collected to the
    driver; the returned RDD itself is evaluated lazily.
    """
    def mean_by_field(index):
        # Combine (sum, count) pairs per key with reduceByKey so that only one
        # small tuple per key and partition is shuffled, instead of grouping
        # every row by key first.
        totals = data.map(lambda r: (r[index], (float(r[2]), 1)))\
            .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
        return totals.mapValues(lambda t: t[0] / t[1]).collectAsMap()

    print('Getting means')
    # global mean
    global_mean = data.map(lambda r: r[2]).mean()
    print(global_mean)

    # user means, keyed by the user id field
    user_means = mean_by_field(0)

    # movie means, keyed by the movie id field
    movie_means = mean_by_field(1)

    print('Removing bias')
    return data.map(lambda r: remove_sample_bias(r, user_means, movie_means, global_mean))

In [230]:
# Build the bias-adjusted ratings RDD that the ALS cells below train on.
ratings_rdd_unbiased = remove_bias(ratings_rdd)

In [233]:
def train_als(data, rank=10, num_iterations=10, seed=None):
    """Train an ALS matrix-factorization model on rating rows.

    data:           RDD of rows with user id, movie id and rating at indices
                    0, 1 and 2; they are converted to int, int and float.
    rank:           rank passed to ALS.train (default 10, as before).
    num_iterations: iteration count passed to ALS.train (default 10, as before).
    seed:           optional seed forwarded to ALS.train; None leaves the
                    choice to Spark, which matches the previous behaviour.

    Returns (model, ratings) where ratings is the RDD of Rating objects the
    model was trained on.
    """
    # map ratings into Rating objects comprised of [user, movie, rating]
    ratings = data.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

    model = ALS.train(ratings, rank, num_iterations, seed=seed)
    return model, ratings

In [234]:
def evaluate(data, model):
    """Print and return the mean squared error of model on data.

    data:  RDD of (user, product, rating) records, indexable by position.
    model: object whose predictAll(pairs) returns records indexable as
           (user, product, predicted rating).

    The model is asked to predict every (user, product) pair in data; the
    predictions are joined back to the actual ratings on that pair and the
    squared differences are averaged. Returns the MSE so callers can compare
    runs; it is also printed, as before.
    """
    user_product_pairs = data.map(lambda p: (p[0], p[1]))

    predictions = model.predictAll(user_product_pairs).map(lambda r: ((r[0], r[1]), r[2]))
    # join yields ((user, product), (actual, predicted))
    ratesAndPreds = data.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    print("Mean Squared Error = " + str(MSE))
    return MSE

In [11]:
# Fit ALS on the bias-adjusted ratings, then score the model on the same
# ratings it was trained on.
print('Training als')
model, data = train_als(ratings_rdd_unbiased)

print('Evaluating')
evaluate(data, model)

In [None]:
# Same train/evaluate sequence as the preceding cell (this copy has no
# recorded execution count).
print('Training als')
model, data = train_als(ratings_rdd_unbiased)

print('Evaluating')
evaluate(data, model)