In [None]:
#111111111222222222233333333334444444444555555555566666666667777777777888888888899999999990000000000

In [2]:
# create a spark session
import pyspark
spark_context = pyspark.SparkContext()
spark_session = pyspark.sql.SparkSession(spark_context)

In [3]:
# load csv file into dataframe
df = spark_session.read.csv("ratings.csv", header=True,\
                    sep=",", inferSchema=True)
df.show()

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
|      1|   2077|     4|
|      1|   2487|     4|
|      1|   2900|     5|
|      1|   3662|     4|
|      1|   3922|     5|
|      1|   5379|     5|
|      1|   5461|     3|
|      1|   5885|     5|
|      1|   6630|     5|
|      1|   7563|     3|
|      1|   9246|     1|
|      1|  10140|     4|
|      1|  10146|     5|
|      1|  10246|     4|
|      1|  10335|     4|
+-------+-------+------+
only showing top 20 rows



In [4]:
# Print the schema to get the column names
df.printSchema()

root
 |-- book_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- rating: integer (nullable = true)



In [5]:
# Split data into training and test dataframe
df_train, df_test = df.randomSplit([0.8, 0.2],seed=5)

In [6]:
# Train via collaborative filtering using ALS modeling
from pyspark.ml.recommendation import ALS, ALSModel
als = ALS(maxIter=1, \
          userCol="user_id", itemCol="book_id", \
          ratingCol="rating", coldStartStrategy="drop")
model = als.fit(df_train)

In [7]:
# Predict ratings on the test set
from pyspark.ml.evaluation import RegressionEvaluator
predictions = model.transform(df_test)
predictions.show()

+-------+-------+------+------------+
|book_id|user_id|rating|  prediction|
+-------+-------+------+------------+
|    148|  35982|     3| -0.31140882|
|    148|    588|     4|  0.92572826|
|    148|   6630|     3|   0.4292102|
|    148|  32055|     3|0.0058147907|
|    148|   5461|     4|   3.0499282|
|    148|  29703|     4| 0.041774243|
|    148|   8440|     3|    2.459359|
|    148|  29031|     3|   0.6628299|
|    148|   8510|     3|   2.2513313|
|    148|   8579|     3|    2.166421|
|    148|  51166|     4|   2.3413692|
|    148|  25840|     3|  0.36833537|
|    148|  27027|     5|   2.1993814|
|    148|  19191|     4|   2.2397602|
|    148|   9246|     3|   0.7226782|
|    148|  30681|     2| 0.055203795|
|    148|   3005|     5|   4.8201756|
|    148|  11945|     4|   1.7858065|
|    148|  23612|     4|  0.64917624|
|    148|  37834|     3|   1.6805291|
+-------+-------+------+------------+
only showing top 20 rows



In [8]:
# Evaluate the predictions using RMSE
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",\
        predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 3.973465155014813


In [9]:
# Train via collaborative filtering using ALS modeling
from pyspark.ml.recommendation import ALS, ALSModel
als = ALS(maxIter=2, \
          userCol="user_id", itemCol="book_id", \
          ratingCol="rating", coldStartStrategy="drop")
model = als.fit(df_train)

In [10]:
# Predict ratings on the test set
from pyspark.ml.evaluation import RegressionEvaluator
predictions = model.transform(df_test)
predictions.show()

+-------+-------+------+----------+
|book_id|user_id|rating|prediction|
+-------+-------+------+----------+
|    148|  35982|     3|  2.446461|
|    148|    588|     4| 2.9079287|
|    148|   6630|     3| 3.0344589|
|    148|  32055|     3| 2.7123644|
|    148|   5461|     4| 3.7742112|
|    148|  29703|     4|  3.292892|
|    148|   8440|     3| 2.8392575|
|    148|  29031|     3| 2.9192214|
|    148|   8510|     3|  2.819971|
|    148|   8579|     3| 2.8995452|
|    148|  51166|     4| 3.6051712|
|    148|  25840|     3|  3.071086|
|    148|  27027|     5| 2.5246549|
|    148|  19191|     4|  3.070326|
|    148|   9246|     3| 2.7046995|
|    148|  30681|     2|  2.723502|
|    148|   3005|     5| 3.4284286|
|    148|  11945|     4| 3.3822477|
|    148|  23612|     4| 3.1713557|
|    148|  37834|     3| 3.5440595|
+-------+-------+------+----------+
only showing top 20 rows



In [11]:
# Evaluate the predictions using RMSE
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",\
        predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 2.305931931499659


In [12]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.show()

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|    148|[[3033, 7.394059]...|
|    463|[[9529, 8.791248]...|
|    471|[[3491, 4.8327417...|
|    496|[[9529, 8.452917]...|
|    833|[[5354, 6.7187777...|
|   1088|[[7728, 5.5623875...|
|   1238|[[1618, 3.9393585...|
|   1342|[[3172, 7.2099924...|
|   1580|[[5205, 7.797331]...|
|   1591|[[9769, 5.4881206...|
|   1645|[[9096, 5.158281]...|
|   1829|[[2586, 9.483122]...|
|   1959|[[9345, 2.1017418...|
|   2122|[[7281, 5.4459467...|
|   2142|[[9096, 8.911084]...|
|   2366|[[1495, 8.855446]...|
|   2659|[[8859, 6.805735]...|
|   2866|[[4291, 9.066534]...|
|   3175|[[9479, 7.6684184...|
|   3749|[[3604, 7.6168547...|
+-------+--------------------+
only showing top 20 rows



In [13]:
# Load the books CSV
books_df = spark_session.read.csv("books.csv", header=True,\
                    sep=",", inferSchema=True)

# Grab the first user's recommendations, and get the book names
recs = userRecs.first().recommendations

# List the details of the top 10 recommendations for this user
from pyspark.sql.functions import col
for pair in recs:
    book_id = pair.book_id
    rating = pair.rating
    title = books_df[ books_df.id == book_id ].select( col("title"))
    print( "rating=%1.1f, %s" % ( rating, title.first().title ) )

rating=7.4, The Right Stuff
rating=6.8, The Dispossessed
rating=6.8, You're Never Weird on the Internet (Almost)
rating=6.8, The Intelligent Investor (Collins Business Essentials)
rating=6.8, Born to Run: A Hidden Tribe, Superathletes, and the Greatest Race the World Has Never Seen
rating=6.7, Watchmen
rating=6.7, V for Vendetta
rating=6.6, One True Thing
rating=6.6, Strength in What Remains: A Journey of Remembrance and Forgiveness
rating=6.6, The Hobbit: Graphic Novel


In [14]:
# Train via collaborative filtering using ALS modeling
from pyspark.ml.recommendation import ALS, ALSModel
als = ALS(maxIter=4, \
          userCol="user_id", itemCol="book_id", \
          ratingCol="rating", coldStartStrategy="drop")
model = als.fit(df_train)

# Predict ratings on the test set
from pyspark.ml.evaluation import RegressionEvaluator
predictions = model.transform(df_test)
predictions.show(10)

# Evaluate the predictions using RMSE
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",\
        predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

+-------+-------+------+----------+
|book_id|user_id|rating|prediction|
+-------+-------+------+----------+
|    148|  35982|     3| 2.9441757|
|    148|    588|     4| 3.2421308|
|    148|   6630|     3|   3.47851|
|    148|  32055|     3| 3.1640284|
|    148|   5461|     4| 3.7821462|
|    148|  29703|     4|  3.990831|
|    148|   8440|     3| 3.0593615|
|    148|  29031|     3| 3.5942922|
|    148|   8510|     3|  3.276093|
|    148|   8579|     3| 3.0420232|
+-------+-------+------+----------+
only showing top 10 rows

Root-mean-square error = 1.0619829028186363


In [15]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
userRecs.show(1)

# Grab the first user's recommendations, and get the book names
recs = userRecs.first().recommendations

# List the details of the top 10 recommendations for this user
from pyspark.sql.functions import col
for pair in recs:
    title = books_df[ books_df.id == pair.book_id ].select( col("title"))
    print( "rating=%1.1f, %s" % ( pair.rating, title.first().title ) )

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|    148|[[2865, 4.6972713...|
+-------+--------------------+
only showing top 1 row

rating=4.7, The Gashlycrumb Tinies (The Vinegar Works, #1)
rating=4.7, The Complete Anne of Green Gables Boxed Set (Anne of Green Gables, #1-8)
rating=4.7, The Power of One (The Power of One, #1)
rating=4.7, Harry Potter Collection (Harry Potter, #1-6)
rating=4.6, Lamb: The Gospel According to Biff, Christ's Childhood Pal
rating=4.6, The Complete Calvin and Hobbes
rating=4.6, Between the World and Me
rating=4.5, Words of Radiance (The Stormlight Archive, #2)
rating=4.5, The Warmth of Other Suns: The Epic Story of America's Great Migration
rating=4.5, The Night Before Christmas
