In [None]:
from utils import *
import numpy as np

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, when, from_unixtime, length, year, col, array_size, udf, avg, collect_list, row_number
from pyspark.sql.types import ArrayType, FloatType, StructType, StructField, IntegerType, StringType
from pyspark.sql.window import Window

from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, BucketedRandomProjectionLSH
from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector

from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import Tokenizer, StopWordsCleaner, LemmatizerModel, SentenceDetectorDLModel, NorvigSweetingModel, BertSentenceEmbeddings

In [6]:
# Create the notebook's Spark session, or reuse one that is already running.
# The Spark NLP artifact is requested through `spark.jars.packages`.
# NOTE(review): if a session already exists, this setting presumably cannot
# load the package retroactively - restart the kernel after changing it.
_session_builder = SparkSession.builder.appName("Books")
_session_builder = _session_builder.config(
    "spark.jars.packages",
    "com.johnsnowlabs.nlp:spark-nlp_2.12:6.0.1",
)
spark = _session_builder.getOrCreate()

In [None]:
"""
Bigger dataset: big_only_fiction (6750 x 135), big_full (6750 x 135)
Trial dataset: trial_full (800 x 16)
"""
# Name of the dataset variant to analyse; later cells print it next to
# their results.
chosen_data = "big_only_fiction"

# `load_data` is not defined in this notebook (presumably provided by the
# `utils` star import). It yields two values: the reviews frame first and
# the descriptions frame second.
_loaded_frames = load_data(spark, chosen_data)
df_reviews, df_descriptions = _loaded_frames

In [None]:
# One row per distinct `book_title` with its number of reviews; the next
# cell lists this frame.
books_number = df_reviews.groupBy("book_title").count()

# Total number of review rows, then the number of distinct titles
# (i.e. the row count of the grouped frame).
reviews_number = df_reviews.count()
_distinct_titles = books_number.count()

print(f"Number of rows (reviews): {reviews_number}\nNumber of books: {_distinct_titles}")

In [None]:
# Print up to 150 titles with their review counts, in ascending title
# order, without truncating long cell values.
_titles_sorted = books_number.orderBy("book_title")
_titles_sorted.show(n=150, truncate=False)

In [None]:
# Review-based similarity: embed the reviews, group the resulting vectors,
# then compute similarities. The three helpers are not defined in this
# notebook (presumably provided by the `utils` star import).
embedded_reviews = embedding_pipeline(df_reviews)
grouped_reviews = group_vectors(embedded_reviews)
_review_similarity = compute_similarity(grouped_reviews)

# Cached because later cells filter this frame again for individual lookups.
similar_reviews = _review_similarity.cache()

_divider = "\n" + "-" * 50
_heading = f"\nSHOWING SIMILAR BOOKS BASED ON USERS REVIEWS\nChosen dataset: {chosen_data}\n"
print(_divider)
print(_heading)

# Preview the first five rows with full (untruncated) cell contents.
similar_reviews.show(n=5, truncate=False)

In [None]:
# Description-based similarity: embed the book descriptions, group the
# resulting vectors, then compute similarities. The three helpers are not
# defined in this notebook (presumably provided by the `utils` star import).
embedded_descriptions = embedding_pipeline(df_descriptions)
grouped_descriptions = group_vectors(embedded_descriptions)
_description_similarity = compute_similarity(grouped_descriptions)

# Cached because a later cell filters this frame again for a lookup.
similar_descriptions = _description_similarity.cache()

_divider = "\n" + "-" * 50
_heading = f"\nSHOWING SIMILAR BOOKS BASED ON BOOK DESCRIPTION\nChosen dataset: {chosen_data}\n"
print(_divider)
print(_heading)

# Preview the first five rows with full (untruncated) cell contents.
similar_descriptions.show(n=5, truncate=False)

In [None]:
# Title to look up; it must match the `book1` value exactly.
chosen_book = "Little Women"

# Keep only the similarity rows whose first title is the requested book.
_is_chosen_book = col("book1") == chosen_book
books_list_rev = similar_reviews.where(_is_chosen_book)

# NOTE(review): show(3) prints the first three matching rows as they come;
# presumably compute_similarity already orders rows by similarity - confirm,
# otherwise "Top 3" is not guaranteed.
if not books_list_rev.isEmpty():
    print(f"Top 3 most similar books to {chosen_book}, according to user reviews:\n")
    books_list_rev.show(n=3, truncate=False)
else:
    print("Couldn't find the book. Try another one!")

In [None]:
# Title to look up; it must match the `book1` value exactly.
chosen_book = "Little Women"

# Keep only the similarity rows whose first title is the requested book.
_is_chosen_book = col("book1") == chosen_book
books_list_descr = similar_descriptions.where(_is_chosen_book)

# NOTE(review): show(3) prints the first three matching rows as they come;
# presumably compute_similarity already orders rows by similarity - confirm,
# otherwise "Top 3" is not guaranteed.
if not books_list_descr.isEmpty():
    print(f"Top 3 most similar books to {chosen_book}, according to book description:\n")
    books_list_descr.show(n=3, truncate=False)
else:
    print("Couldn't find the book. Try another one!")

In [None]:
# The two titles to compare; both must match the stored values exactly.
first_book = "Little Women"
second_book = "The Picture of Dorian Gray"

# Select the row(s) where `book1` is the first title and `book2` the second.
# NOTE(review): only this orientation is matched; if compute_similarity
# stores each pair once, swapping the titles may find nothing - confirm.
_first_matches = col("book1") == first_book
_second_matches = col("book2") == second_book
books_rev_1on1 = similar_reviews.where(_first_matches & _second_matches)

if not books_rev_1on1.isEmpty():
    print(f"Cosine similarity between {first_book} and {second_book}, based on user reviews:\n")
    books_rev_1on1.show(truncate=False)
else:
    print("Couldn't find the books. Try other ones!")