In [1]:

# URL of the Book-Crossing CSV dump (a zip archive) that the next cell downloads.
book_url = 'http://www2.informatik.uni-freiburg.de/~cziegler/BX/BX-CSV-Dump.zip'
# Directory the archive is extracted into; the CSV files are read from here later.
data_path = '/dbfs/FileStore/data'

In [2]:
import urllib
# Download the archive to the exact location the extraction cell opens
# ('/dbfs/FileStore/book_crossing.zip'). Saving to a bare relative filename
# put the file in the driver's working directory, where the later
# zipfile.ZipFile(...) call could not find it on a fresh run.
# urlretrieve returns a (local_filename, headers) tuple.
book_crossing = urllib.urlretrieve(book_url, '/dbfs/FileStore/book_crossing.zip')

In [3]:
import zipfile

# Unpack every member of the downloaded archive into data_path.
archive_path = '/dbfs/FileStore/book_crossing.zip'
with zipfile.ZipFile(archive_path, mode="r") as z:
    z.extractall(path=data_path)

In [4]:
import os
# Show the entries of data_path to confirm what the extraction produced.
os.listdir(data_path)

In [5]:
import pandas as pd
# Only these three columns of the books file are loaded.
colu = ["ISBN","Book-Title","Book-Author"]

# Both files are ';'-separated with a header row; lines that cannot be parsed
# are skipped (error_bad_lines=False) instead of aborting the load.
ratings_df = pd.read_csv(
    "/dbfs/FileStore/data/BX-Book-Ratings.csv",
    sep=';',
    header=0,
    error_bad_lines=False,
)
books_df = pd.read_csv(
    "/dbfs/FileStore/data/BX-Books.csv",
    sep=';',
    header=0,
    usecols=colu,
    error_bad_lines=False,
)

In [6]:
# Inner-join ratings with book metadata on the columns the two frames share.
complete_df = pd.merge(ratings_df, books_df)

# Give every distinct ISBN an integer code and store it as the item identifier.
isbn_as_category = complete_df['ISBN'].astype('category')
complete_df['itemID'] = isbn_as_category.cat.codes

In [7]:
# Display the column labels of the merged frame.
complete_df.columns

In [8]:
# A Book-Rating of exactly 0 is the marker this notebook treats as implicit.
is_zero_rating = complete_df['Book-Rating'] == 0

# Rows with a non-zero rating / rows with a zero rating, respectively.
without_implicit_df = complete_df[~is_zero_rating]
without_explicit_df = complete_df[is_zero_rating]

In [9]:
# Build an RDD of (user, item, rating) tuples from the non-zero ratings.
# Columns are selected and read by name rather than by position, so the
# result no longer depends on the column order the merge happens to produce,
# and the unused string columns are not shipped to Spark.
rating_columns = ['User-ID', 'itemID', 'Book-Rating']
uire = sqlContext.createDataFrame(without_implicit_df[rating_columns]).rdd.map(
    lambda x: (x['User-ID'], x['itemID'], x['Book-Rating']))

In [10]:
# Peek at the first three tuples of the ratings RDD.
uire.take(3)

In [11]:
#ratings_rdd = sqlContext.createDataFrame(pandas_df).rdd.map(tuple)
#books_rdd = sqlContext.createDataFrame(pandas_df).rdd.map(tuple)

In [12]:
# Weighted random split (6:2:2) of the ratings RDD with a fixed seed.
split_weights = [6.0, 2.0, 2.0]
training, validation, test = uire.randomSplit(split_weights, seed=26)


def _without_rating(record):
    """Return the first two fields of a record, dropping the rating."""
    return (record[0], record[1])


# Rating-free copies of the hold-out sets, used as prediction queries.
validation_blank = validation.map(_without_rating)
test_blank = test.map(_without_rating)


In [13]:
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
# Sweep over candidate ranks with the RDD-based ALS imported at the top of
# this cell, recording the validation MSE for each rank in MSElist.
ranks = [4,7,10,13,16,19,22,28,32]
numIterations = 10
MSElist = []

# The keyed validation ratings do not depend on the rank, so build them once.
validation_keyed = validation.map(lambda r: ((r[0], r[1]), r[2]))

for rank in ranks:
  # Seed the factor initialisation (same seed as the data split) so the
  # per-rank errors are reproducible between runs.
  model = ALS.train(training, rank, numIterations, seed=26)
  # Re-key predictions as ((user, item), predicted_rating) for the join.
  predictions = model.predictAll(validation_blank).map(lambda z: ((z[0],z[1]),z[2]))
  # Pairs without a prediction are dropped by the inner join.
  rates_and_predictions = validation_keyed.join(predictions)
  MSE = rates_and_predictions.map(lambda r: (r[1][0] - r[1][1])**2).mean()
  MSElist.append(MSE)

In [14]:
# Print the RMSE (square root of each stored MSE), in the order of `ranks`.
# Uses the call form of print, as the other cells do, so the cell runs
# under both Python 2 and Python 3.
for MSE in MSElist:
  print(MSE ** .5)

In [15]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

# DataFrame-based ALS estimator wired to the column names of the pandas frame:
# users in "User-ID", items in "itemID", ratings in "Book-Rating".
# A fixed seed makes the fitted factors reproducible between runs.
als = ALS(maxIter=10, regParam=0.01, userCol="User-ID", itemCol="itemID",
          ratingCol="Book-Rating", seed=26)

In [16]:
# Fit ALS on the non-zero ratings and report RMSE on a 20% hold-out.
uire_df = spark.createDataFrame(without_implicit_df)
# Seed the split (same seed as the RDD split earlier) so the reported RMSE
# is reproducible between runs.
(training_df, test_df) = uire_df.randomSplit([0.8, 0.2], seed=26)
model_2 = als.fit(training_df)
predictions_2 = model_2.transform(test_df)
# Rows with a missing prediction are removed before scoring.
predictions_2 = predictions_2.na.drop(subset=['prediction'])
evaluator = RegressionEvaluator(metricName="rmse", labelCol="Book-Rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions_2)
print("Root-mean-square error = " + str(rmse))

In [17]:
# Inspect the first three scored rows.
predictions_2.take(3)

In [18]:
from surprise import SVD
from surprise import Dataset
from surprise import evaluate, print_perf
from surprise import Reader

In [19]:
# Keep only rows whose Book-Rating is strictly positive.
has_positive_rating = complete_df['Book-Rating'] > 0
explicit_df = complete_df[has_positive_rating]

In [20]:
# Label-based slice: keep rows whose index label is <= 1.3 * len(explicit_df).
# `.ix` is deprecated (and removed in later pandas), and the bound was a float;
# `.loc` with an integer bound selects the same rows on an integer index.
# NOTE(review): the factor 1.3 looks like a typo (a fraction such as 0.3 or
# a division by 3 would be expected for a "small" subset) -- confirm the
# intended subset size; the current selection is preserved as-is.
explicit_small_df = explicit_df.loc[:int(len(explicit_df) * 1.3)]

In [21]:
# Project to (user, item, rating) and persist as a ';'-separated CSV with a
# header row and no index column.
svd_columns = ['User-ID', 'itemID', 'Book-Rating']
data_for_svd = explicit_small_df[svd_columns]
data_for_svd.to_csv(
    '/dbfs/FileStore/data/ratings_for_svd.csv',
    index=False,
    sep=';',
)

In [22]:
# Reader for the CSV written above: 'user item rating' fields separated by
# ';', ratings on a 1-10 scale, first (header) line skipped.
lol = Reader(
    line_format='user item rating',
    sep=';',
    rating_scale=(1, 10),
    skip_lines=1,
)
data = Dataset.load_from_file(
    '/dbfs/FileStore/data/ratings_for_svd.csv',
    reader=lol,
)

In [23]:
# Split the loaded dataset into 3 folds; `data` is passed to evaluate() below.
data.split(n_folds=3)

In [24]:
# surprise's SVD algorithm, constructed with no arguments (library defaults).
algo = SVD()

In [25]:
# Evaluate `algo` on `data`, requesting the RMSE and MAE measures; the result is kept in `perf`.
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])

In [26]:
# Overwrite every 0 rating with 9, in place on complete_df.
# NOTE(review): this mutates the frame earlier cells filtered on (re-running
# them afterwards yields different subsets), and the value 9 is unexplained.
complete_df['Book-Rating'] = complete_df['Book-Rating'].replace(to_replace=0, value=9)

In [27]:
# Convert the pandas frame (0 ratings already replaced by 9 in the previous cell) to a Spark DataFrame.
complete_adjusted_df = spark.createDataFrame(complete_df)

In [28]:
# Repeat the ALS fit/evaluation on the full data set, in which the 0 ratings
# have been replaced by 9. Both the estimator and the 80/20 split are seeded
# so the reported RMSE is reproducible between runs.
als = ALS(maxIter=10, regParam=0.01, userCol="User-ID", itemCol="itemID",
          ratingCol="Book-Rating", seed=26)
(training_df, test_df) = complete_adjusted_df.randomSplit([0.8, 0.2], seed=26)
model_3 = als.fit(training_df)
predictions_3 = model_3.transform(test_df)
# Rows with a missing prediction are removed before scoring.
predictions_3 = predictions_3.na.drop(subset=['prediction'])
evaluator = RegressionEvaluator(metricName="rmse", labelCol="Book-Rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions_3)
print("Root-mean-square error = " + str(rmse))

In [29]:
One experiment to perform is to bring in the users' location data and create relational increments based on the distance between users, using any standard distance metric. These distances could be used to form user neighborhoods, and their inverses could be used to create better serendipity in the system. (I believe relevant variety can be found across physical distance and implicit cultural divides, while the same instinctive human likes and dislikes are retained.)

In [30]:
If online evaluation were possible, I could measure the success of the different algorithms by their click-through rate (CTR) and conversion rate (CR). If, for example, the CTR were high but the CR low, I might need to find more ways to increase serendipity or diversity.