<a href="https://colab.research.google.com/github/livingMabhijit/Recommendation_repo/blob/master/Recommend_1_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pyspark as ps
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator,ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.sql import Row
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import udf,col,when
import numpy as np


In [0]:
#pip install pyspark

In [0]:
from IPython.display import Image
from IPython.display import display


#spark session creation
spark = ps.sql.SparkSession.builder \
        .master('local') \
        .appName('book') \
        .getOrCreate()
sc= spark.sparkContext
sqlContext = SQLContext(sc)

In [6]:
rating_df = spark.read.csv('/content/drive/My Drive/datasets/ratings.csv',header=True,inferSchema=True)
rating_df.printSchema()

root
 |-- book_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- rating: integer (nullable = true)



In [7]:
rating_df.show(5)

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
+-------+-------+------+
only showing top 5 rows



In [8]:
#book data
book_df =spark.read.csv('/content/drive/My Drive/datasets/books.csv',header=True,inferSchema=True)
book_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- book_id: integer (nullable = true)
 |-- best_book_id: integer (nullable = true)
 |-- work_id: integer (nullable = true)
 |-- books_count: integer (nullable = true)
 |-- isbn: string (nullable = true)
 |-- isbn13: double (nullable = true)
 |-- authors: string (nullable = true)
 |-- original_publication_year: double (nullable = true)
 |-- original_title: string (nullable = true)
 |-- title: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- ratings_count: string (nullable = true)
 |-- work_ratings_count: string (nullable = true)
 |-- work_text_reviews_count: string (nullable = true)
 |-- ratings_1: double (nullable = true)
 |-- ratings_2: integer (nullable = true)
 |-- ratings_3: integer (nullable = true)
 |-- ratings_4: integer (nullable = true)
 |-- ratings_5: integer (nullable = true)
 |-- image_url: string (nullable = true)
 |-- small_image_url: string (nullable = true)


In [9]:
book_df.show(2)

+---+-------+------------+-------+-----------+---------+----------------+--------------------+-------------------------+--------------------+--------------------+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------+--------------------+--------------------+
| id|book_id|best_book_id|work_id|books_count|     isbn|          isbn13|             authors|original_publication_year|      original_title|               title|language_code|average_rating|ratings_count|work_ratings_count|work_text_reviews_count|ratings_1|ratings_2|ratings_3|ratings_4|ratings_5|           image_url|     small_image_url|
+---+-------+------------+-------+-----------+---------+----------------+--------------------+-------------------------+--------------------+--------------------+-------------+--------------+-------------+------------------+-----------------------+---------+---------+---------+---------+---------+--------------------

In [0]:
#splitting train and test data
train_df,val_df = rating_df.randomSplit([0.8,0.2])
#parameter init
iterations = 12
reg_param = 0.1 #lambda
rank = 5 #rank matrix
errors = []
err = 0

In [11]:
#model build
als = ALS(rank=rank,maxIter=iterations,regParam=reg_param,userCol='user_id',itemCol='book_id',ratingCol='rating')
model  = als.fit(train_df)
pred = model.transform(val_df)
new_pred = pred.filter(col('prediction')!=np.nan)
evaluator = RegressionEvaluator(metricName='rmse',labelCol='rating',predictionCol='prediction')
rmse = evaluator.evaluate(new_pred)
print(rmse)

0.8969607880591141


In [4]:
 ###in case you get error for java, here is the code
import os #importing os to set environment variable
def install_java():
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64" #set environment variable
  !java -version #check java version
  # !apt-get install openjdk-8-jdk-headless -qq > /dev/null
  # import os
  # os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
  # !update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
  # !java -version
install_java()

openjdk version "1.8.0_252"
OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)
OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)


In [1]:
!java -version

openjdk version "1.8.0_252"
OpenJDK Runtime Environment (build 1.8.0_252-8u252-b09-1~18.04-b09)
OpenJDK 64-Bit Server VM (build 25.252-b09, mixed mode)


Let's see Cross validation in action.

In [0]:
als_cv = ALS(rank=rank,maxIter=iterations,regParam=reg_param,userCol='user_id',itemCol='book_id',ratingCol='rating')
ParamGrid = ParamGridBuilder() \
    .addGrid(als_cv.regParam,[0.01,0.1,1.0]) \
    .addGrid(als_cv.rank,[4,5,6,7]) \
    .build()
evaluator = RegressionEvaluator(metricName='rmse',labelCol='rating',predictionCol='prediction')
crossval = CrossValidator(
    estimator = als_cv,
    estimatorParamMaps = ParamGrid,
    evaluator = evaluator,
    numFolds = 4
)
cvmodel = crossval.fit(train_df)

In [12]:
prediction = model.transform(val_df)
prediction.show(5)

+-------+-------+------+----------+
|book_id|user_id|rating|prediction|
+-------+-------+------+----------+
|    148|   3087|     3| 3.5487852|
|    148|  26629|     4| 3.8445916|
|    148|   9165|     3|  3.721115|
|    148|  22164|     3|  3.319882|
|    148|  24326|     5|  3.479077|
+-------+-------+------+----------+
only showing top 5 rows



In [13]:
prediction.join(book_df,'book_id').select('user_id','title','prediction').show(5)

+-------+----------+----------+
|user_id|     title|prediction|
+-------+----------+----------+
|  15161|Lysistrata| 3.5407095|
|   2331|Lysistrata| 3.3007188|
|  10136|Lysistrata|  3.657051|
|  51663|Lysistrata|  3.840766|
|  36623|Lysistrata| 3.9733436|
+-------+----------+----------+
only showing top 5 rows



In [14]:
one_user = prediction.filter(col('user_id')==12014).join(book_df,'book_id').select('user_id','title','image_url','prediction').show(5)

+-------+---------------+--------------------+----------+
|user_id|          title|           image_url|prediction|
+-------+---------------+--------------------+----------+
|  12014|Never Let Me Go|https://images.gr...| 3.6355517|
+-------+---------------+--------------------+----------+



In [15]:
for book in one_user.take(5):
  print(book.title)
  display(Image(url = book.image_url))


AttributeError: ignored

In [0]:
#user recommendation for all books (4 user each)
user_recom = model.recommendForAllUsers(4)
#book recommendation for all users (4 user each)
book_recom = model.recommendForAllItems(4)

In [17]:
user_recom.printSchema()

root
 |-- user_id: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- book_id: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)



In [18]:
user_recom.select('user_id','recommendations.book_id').show(4,False)

+-------+------------------------+
|user_id|book_id                 |
+-------+------------------------+
|148    |[4868, 7593, 6751, 8362]|
|463    |[5088, 9806, 4778, 6980]|
|471    |[8854, 8120, 642, 2236] |
|496    |[1338, 7242, 7499, 5325]|
+-------+------------------------+
only showing top 4 rows



In [19]:
book_recom.select('book_id','recommendations.user_id').show(4,False)

+-------+----------------------------+
|book_id|user_id                     |
+-------+----------------------------+
|1580   |[7287, 30699, 37073, 3523]  |
|4900   |[46796, 50062, 48324, 36411]|
|5300   |[52593, 51361, 22259, 10482]|
|6620   |[30699, 34848, 34547, 33118]|
+-------+----------------------------+
only showing top 4 rows



In [20]:
#for number of users not for all
users = rating_df.select('user_id').distinct().limit(2).show()

+-------+
|user_id|
+-------+
|  32592|
|  19984|
+-------+



In [0]:
#recommending 5 books for given user ids
#userSubsetRecs = model.recommendForUserSubset(users, 5)
# user_recom_sub.show()

In [24]:
book_subset = book_df.select('book_id').distinct().limit(5).show()

+--------+
| book_id|
+--------+
|17802724|
|    2122|
|    4900|
|  121749|
|   33722|
+--------+



In [0]:
#book_sub_rec = model.recommendForItemSubset(book_subset, 10)