# Imports

In [None]:
from utils import *

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import umap.umap_ as umap

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, when, from_unixtime, length, year, col, array_size, udf, avg, collect_list, row_number, expr, collect_list, concat_ws
from pyspark.sql.types import ArrayType, FloatType, StructType, StructField, IntegerType, StringType
from pyspark.sql.window import Window

from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, BucketedRandomProjectionLSH, Normalizer, HashingTF, MinHashLSH
from pyspark.ml.linalg import Vectors, VectorUDT, DenseVector

from sparknlp.pretrained import PretrainedPipeline
from sparknlp.base import DocumentAssembler, Finisher
from sparknlp.annotator import Tokenizer, StopWordsCleaner, LemmatizerModel, SentenceDetectorDLModel, NorvigSweetingModel, BertSentenceEmbeddings

# Choose data

In [6]:
# Create (or reuse) the Spark session for this notebook; the Spark NLP
# artifact is requested through the `spark.jars.packages` setting.
spark_nlp_package = "com.johnsnowlabs.nlp:spark-nlp_2.12:6.0.1"
spark = (
    SparkSession.builder.appName("Books")
    .config("spark.jars.packages", spark_nlp_package)
    .getOrCreate()
)

In [None]:
# Available datasets (name and size as recorded by the notebook author):
#   Bigger dataset: big_only_fiction (6750 x 135), big_full (6750 x 135)
#   Trial dataset:  trial_full (800 x 16)
chosen_data = "big_only_fiction"

# load_data (from utils) returns the reviews and the descriptions as a pair.
loaded_frames = load_data(spark, chosen_data)
df_reviews, df_descriptions = loaded_frames

In [None]:
# One row per distinct book title, with the number of reviews for that title.
books_number = df_reviews.groupBy("book_title").count()

# Total number of review rows, then the number of distinct titles.
reviews_number = df_reviews.count()
distinct_books = books_number.count()
print(f"Number of rows (reviews): {reviews_number}\nNumber of books: {distinct_books}")

In [None]:
# List the per-book review counts alphabetically by title (up to 150 rows,
# without truncating long titles). Ascending order is the default.
books_sorted_by_title = books_number.orderBy("book_title")
books_sorted_by_title.show(n=150, truncate=False)

# Check Jaccard similarity of the books


In [None]:
# Jaccard similarity on user reviews, chaining the utils helpers:
# group_text -> custom_pipeline -> compute_jaccard_similarity.
customized_reviews_grouped = group_text(df_reviews)
customized_reviews = custom_pipeline(customized_reviews_grouped)
similar_reviews_jaccard = compute_jaccard_similarity(customized_reviews)
# Cache the result so the actions below and in later cells can reuse it.
similar_reviews_jaccard = similar_reviews_jaccard.cache()

# Banner, then a preview of the first rows.
reviews_jaccard_header = f"\nSHOWING SIMILAR BOOKS BASED ON USERS REVIEWS - METRIC: JACCARD SIMILARITY\nChosen dataset: {chosen_data}\n"
print("\n" + "-" * 70)
print(reviews_jaccard_header)
similar_reviews_jaccard.show(n=5, truncate=False)

In [None]:
# Jaccard similarity on book descriptions, chaining the utils helpers:
# group_text -> custom_pipeline -> compute_jaccard_similarity.
customized_descriptions_grouped = group_text(df_descriptions)
customized_descriptions = custom_pipeline(customized_descriptions_grouped)
# Cache the result so the preview below and any later cell can reuse it.
similar_descriptions_jaccard = compute_jaccard_similarity(customized_descriptions).cache()

print("\n" + "-" * 70)
# The banner must name the metric actually computed in this cell (Jaccard);
# it previously said "COSINE SIMILARITY", copied from the cosine section.
print(f"\nSHOWING SIMILAR BOOKS BASED ON BOOK DESCRIPTION - METRIC: JACCARD SIMILARITY\nChosen dataset: {chosen_data}\n")
similar_descriptions_jaccard.show(5, truncate = False)

# Check cosine similarity of the books

In [None]:
# Cosine similarity on user reviews, chaining the utils helpers:
# pretrained_pipeline -> group_vectors -> compute_cosine_similarity.
embedded_reviews = pretrained_pipeline(df_reviews)
embedded_reviews_grouped = group_vectors(embedded_reviews)
similar_reviews_cosine = compute_cosine_similarity(embedded_reviews_grouped)
# Cache the result: the lookup cells further down query it repeatedly.
similar_reviews_cosine = similar_reviews_cosine.cache()

# Banner, then a preview of the first rows.
reviews_cosine_header = f"\nSHOWING SIMILAR BOOKS BASED ON USERS REVIEWS - METRIC: COSINE SIMILARITY\nChosen dataset: {chosen_data}\n"
print("\n" + "-" * 70)
print(reviews_cosine_header)
similar_reviews_cosine.show(n=5, truncate=False)

In [None]:
# Cosine similarity on book descriptions, chaining the utils helpers:
# pretrained_pipeline -> group_vectors -> compute_cosine_similarity.
embedded_descriptions = pretrained_pipeline(df_descriptions)
embedded_descriptions_grouped = group_vectors(embedded_descriptions)
similar_descriptions_cosine = compute_cosine_similarity(embedded_descriptions_grouped)
# Cache the result: the lookup cell further down queries it again.
similar_descriptions_cosine = similar_descriptions_cosine.cache()

# Banner, then a preview of the first rows.
descriptions_cosine_header = f"\nSHOWING SIMILAR BOOKS BASED ON BOOK DESCRIPTION - METRIC: COSINE SIMILARITY\nChosen dataset: {chosen_data}\n"
print("\n" + "-" * 70)
print(descriptions_cosine_header)
similar_descriptions_cosine.show(n=5, truncate=False)

In [None]:
# Look up the books most similar to `chosen_book` according to user reviews.
chosen_book = "Little Women"

# Keep only the pairs whose first book is the chosen one.
# NOTE(review): show(3) prints the first three rows in their existing order;
# this presumably relies on compute_cosine_similarity returning rows sorted by
# decreasing similarity — confirm in utils.
books_list_rev = similar_reviews_cosine.filter(col("book1") == chosen_book)

if books_list_rev.isEmpty():
    print("Couldn't find the book. Try another one!")
else:
    print(f"Top 3 most similar books to {chosen_book}, according to user reviews:\n")
    books_list_rev.show(3, truncate = False)

    # Only meaningful when the book was found: otherwise the "not found"
    # message would be followed by a contradictory "less than 3" note.
    # limit(3) avoids counting every matching row just to compare with 3.
    if books_list_rev.limit(3).count() < 3:
        print("\nLess than 3 books are showing, since the similarity with other books is too low.")

In [None]:
# Look up the books most similar to `chosen_book` according to book descriptions.
chosen_book = "Little Women"

# Keep only the pairs whose first book is the chosen one.
# NOTE(review): show(3) prints the first three rows in their existing order;
# this presumably relies on compute_cosine_similarity returning rows sorted by
# decreasing similarity — confirm in utils.
books_list_descr = similar_descriptions_cosine.filter(col("book1") == chosen_book)

if books_list_descr.isEmpty():
    print("Couldn't find the book. Try another one!")
else:
    print(f"Top 3 most similar books to {chosen_book}, according to book description:\n")
    books_list_descr.show(3, truncate = False)

    # Only meaningful when the book was found: otherwise the "not found"
    # message would be followed by a contradictory "less than 3" note.
    # limit(3) avoids counting every matching row just to compare with 3.
    if books_list_descr.limit(3).count() < 3:
        print("\nLess than 3 books are showing, since the similarity with other books is too low.")

In [None]:
# Cosine similarity (user reviews) for one specific pair of books.
first_book = "Little Women"
second_book = "Plainsong"

# Select the row where book1/book2 match the requested pair, in this order.
# NOTE(review): if the similarity table stores each pair only once, the pair
# may exist with the titles swapped — verify against compute_cosine_similarity.
is_requested_pair = (col("book1") == first_book) & (col("book2") == second_book)
books_rev_1on1 = similar_reviews_cosine.where(is_requested_pair)

if not books_rev_1on1.isEmpty():
    print(f"Cosine similarity between {first_book} and {second_book}, based on user reviews:\n")
    books_rev_1on1.show(truncate=False)
else:
    print("Couldn't find the books. Try other ones!")

# Visualizations

In [None]:
# Bring the per-book review embeddings to the driver as a pandas DataFrame.
# NOTE(review): this cell used to read `grouped_reviews`, a name that is never
# assigned in this notebook (NameError on a fresh run). The grouped review
# embeddings built in the cosine-similarity section are used instead; this
# assumes that DataFrame carries the `book_title` and `norm_embedding`
# columns read below — TODO confirm against utils.group_vectors.
grouped_reviews_pd = embedded_reviews_grouped.toPandas()

# Stack the per-book vectors into one 2-D array (one row per book), then
# standardize each feature before the dimensionality reduction.
norm_embeddings = np.vstack(grouped_reviews_pd["norm_embedding"].values)
norm_embeddings = StandardScaler().fit_transform(norm_embeddings)

# Titles in the same row order as the embeddings; used as plot labels.
book_titles = grouped_reviews_pd["book_title"].values

In [None]:
# Reduce the standardized embeddings with UMAP for plotting; the seed is
# fixed so that reruns use the same random state.
umap_settings = {"n_neighbors": 15, "min_dist": 0.1, "random_state": 42}
umap_model = umap.UMAP(**umap_settings)
norm_embeddings_2d = umap_model.fit_transform(norm_embeddings)

In [None]:
# Scatter plot of the projected book embeddings, one point per book.
plt.figure(figsize=(10, 8))

xs = norm_embeddings_2d[:, 0]
ys = norm_embeddings_2d[:, 1]
# Colour by title; the legend is disabled and text labels are drawn instead.
sns.scatterplot(x=xs, y=ys, hue=book_titles, legend=False)

# Annotate each point with the first 15 characters of its title.
for x_pos, y_pos, title in zip(xs, ys, book_titles):
    plt.text(x_pos, y_pos, title[:15], fontsize=8)

plt.title("2D visualization of book embeddings\n")
plt.xlabel("\nDimension 1")
plt.ylabel("Dimension 2\n")
plt.show()