In [1]:
import os
import re
import sys
from calendar import month_name
from getpass import getpass

import findspark
# findspark.init() has to run before the pyspark imports below.
findspark.init()

import pyspark
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, VectorAssembler
from pyspark.ml.linalg import DenseVector
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp, col, trim, split, regexp_replace, regexp_extract, udf, when, explode, lower, from_json, collect_list, collect_set
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [2]:
# Create (or reuse) the Spark session shared by every cell in this notebook.
# The settings are collected in one mapping so they are easy to scan and tune.
spark_options = {
    'spark.sql.execution.arrow.pyspark.enabled': True,
    'spark.sql.session.timeZone': 'UTC',
    'spark.driver.memory': '32G',
    'spark.ui.showConsoleProgress': True,
    'spark.sql.repl.eagerEval.enabled': True,
}

session_builder = SparkSession.builder.appName('app_name').master('local[*]')
for option_name, option_value in spark_options.items():
    session_builder = session_builder.config(option_name, option_value)

spark = session_builder.getOrCreate()

In [3]:
# Load the raw books catalogue. The file has a header row, column types are
# inferred, and a double quote is used both as the quote and the escape
# character.
books_csv_reader = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("quote", "\"")
    .option("escape", "\"")
)
books_data_df = books_csv_reader.load("DE_CaseStudy_Dataset/Sales_Data/books_data.csv")

# books_data_df.count()

In [4]:
# Preview the raw books data (first 20 rows, long cell values truncated).
books_data_df.show(n=20, truncate=True)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+------------+
|               Title|         description|             authors|               image|         previewLink|           publisher|publishedDate|            infoLink|          categories|ratingsCount|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+------------+
|Its Only Art If I...|                NULL|    ['Julie Strain']|http://books.goog...|http://books.goog...|                NULL|         1996|http://books.goog...|['Comics & Graphi...|        NULL|
|Dr. Seuss: Americ...|Philip Nel takes ...|      ['Philip Nel']|http://books.goog...|http://books.goog...|           A&C Black|   2005-01-01|http://books.goog...|['Biography & Aut...|        NULL|
|Wonderful Wors

In [5]:
# books_data_df.describe().show()

In [6]:
# Source column -> snake_case output column. Every value is whitespace-trimmed.
books_text_columns = [
    ("Title", "title"),
    ("description", "description"),
    ("authors", "authors"),
    ("image", "image"),
    ("previewLink", "preview_link"),
    ("publisher", "publisher"),
    ("publishedDate", "published_date"),
    ("infoLink", "info_link"),
    ("categories", "categories"),
]
trimmed_book_columns = [
    trim(col(source_name)).alias(target_name)
    for source_name, target_name in books_text_columns
]

# ratingsCount is the only column that is additionally cast to FLOAT.
ratings_count_column = trim(col("ratingsCount")).cast("FLOAT").alias("ratings_count")

books_mapped_df = books_data_df.select(*trimmed_book_columns, ratings_count_column)

books_mapped_df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+
|               title|         description|             authors|               image|        preview_link|           publisher|published_date|           info_link|          categories|ratings_count|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+
|Its Only Art If I...|                NULL|    ['Julie Strain']|http://books.goog...|http://books.goog...|                NULL|          1996|http://books.goog...|['Comics & Graphi...|         NULL|
|Dr. Seuss: Americ...|Philip Nel takes ...|      ['Philip Nel']|http://books.goog...|http://books.goog...|           A&C Black|    2005-01-01|http://books.goog...|['Biography & Aut...|         NULL|
|Wond

In [7]:
# List every column of the mapped books frame together with its Spark type.
column_types = books_mapped_df.dtypes
print(column_types)

[('title', 'string'), ('description', 'string'), ('authors', 'string'), ('image', 'string'), ('preview_link', 'string'), ('publisher', 'string'), ('published_date', 'string'), ('info_link', 'string'), ('categories', 'string'), ('ratings_count', 'float')]


In [8]:
# Parse the stringified list in "categories" into a real array<string> column.
categories_as_array = from_json(col("categories"), "array<string>")
books_mapped_df = books_mapped_df.withColumn("categories", categories_as_array)

books_mapped_df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+
|               title|         description|             authors|               image|        preview_link|           publisher|published_date|           info_link|          categories|ratings_count|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+
|Its Only Art If I...|                NULL|    ['Julie Strain']|http://books.goog...|http://books.goog...|                NULL|          1996|http://books.goog...|[Comics & Graphic...|         NULL|
|Dr. Seuss: Americ...|Philip Nel takes ...|      ['Philip Nel']|http://books.goog...|http://books.goog...|           A&C Black|    2005-01-01|http://books.goog...|[Biography & Auto...|         NULL|
|Wond

In [9]:
# Re-inspect the column types of the mapped books frame.
column_types = books_mapped_df.dtypes
print(column_types)

[('title', 'string'), ('description', 'string'), ('authors', 'string'), ('image', 'string'), ('preview_link', 'string'), ('publisher', 'string'), ('published_date', 'string'), ('info_link', 'string'), ('categories', 'array<string>'), ('ratings_count', 'float')]


In [10]:
# Produce one row per (book, category) pair.
# NOTE(review): explode() emits no row when the array is NULL or empty, so
# books without categories disappear here; explode_outer would keep them.
# Confirm that dropping them is intended.
exploded_category = explode(col("categories"))
books_mapped_df = books_mapped_df.withColumn("categories", exploded_category)

# Normalise each category name: lowercase it, then strip surrounding whitespace.
normalized_category = trim(lower(col("categories")))
books_mapped_df = books_mapped_df.withColumn("categories", normalized_category)

books_mapped_df.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+
|               title|         description|             authors|               image|        preview_link|           publisher|published_date|           info_link|          categories|ratings_count|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------+--------------------+--------------------+-------------+
|Its Only Art If I...|                NULL|    ['Julie Strain']|http://books.goog...|http://books.goog...|                NULL|          1996|http://books.goog...|comics & graphic ...|         NULL|
|Dr. Seuss: Americ...|Philip Nel takes ...|      ['Philip Nel']|http://books.goog...|http://books.goog...|           A&C Black|    2005-01-01|http://books.goog...|biography & autob...|         NULL|
|Wond

In [11]:
# Print the schema of the books frame as it stands after the category
# transformations above ("categories" is a plain string column again).
books_mapped_df.printSchema()

root
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- authors: string (nullable = true)
 |-- image: string (nullable = true)
 |-- preview_link: string (nullable = true)
 |-- publisher: string (nullable = true)
 |-- published_date: string (nullable = true)
 |-- info_link: string (nullable = true)
 |-- categories: string (nullable = true)
 |-- ratings_count: float (nullable = true)



In [12]:
# Load the raw book reviews. Same CSV settings as the books catalogue: header
# row, inferred column types, double quote as both quote and escape character.
ratings_csv_reader = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("quote", "\"")
    .option("escape", "\"")
)
Books_rating_data_df = ratings_csv_reader.load("DE_CaseStudy_Dataset/Sales_Data/Books_rating.csv")

# Books_rating_data_df.count()

In [13]:
# Preview the raw reviews data (first 20 rows, long cell values truncated).
Books_rating_data_df.show(n=20, truncate=True)

+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|        Id|               Title|Price|       User_id|         profileName|review/helpfulness|review/score|review/time|      review/summary|         review/text|
+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|1882931173|Its Only Art If I...| NULL| AVCGYZL8FQQTD|Jim of Oz "jim-of...|               7/7|         4.0|  940636800|Nice collection o...|This is only for ...|
|0826414346|Dr. Seuss: Americ...| NULL|A30TK6U7DNS82R|       Kevin Killian|             10/10|         5.0| 1095724800|   Really Enjoyed It|I don't care much...|
|0826414346|Dr. Seuss: Americ...| NULL|A3UH4UZ4RSVO82|        John Granger|             10/11|         5.0| 1078790400|Essential for eve...|If people become ...|
|0826414346|Dr. Seuss: Ameri

In [14]:
# Books_rating_data_df.describe().show()

In [15]:
# Source column -> snake_case output column. Every value is whitespace-trimmed,
# which also means every output column is a string.
review_column_renames = [
    ("Id", "id"),
    ("Title", "title"),
    ("Price", "price"),
    ("User_id", "user_id"),
    ("profileName", "profile_name"),
    ("review/helpfulness", "review_helpfulness"),
    ("review/score", "review_score"),
    ("review/time", "review_time"),
    ("review/summary", "review_summary"),
    ("review/text", "review_text"),
]
trimmed_review_columns = [
    trim(col(source_name)).alias(target_name)
    for source_name, target_name in review_column_renames
]

book_reviews_mapped_df = Books_rating_data_df.select(*trimmed_review_columns)

book_reviews_mapped_df.show()

+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|        id|               title|price|       user_id|        profile_name|review_helpfulness|review_score|review_time|      review_summary|         review_text|
+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+
|1882931173|Its Only Art If I...| NULL| AVCGYZL8FQQTD|Jim of Oz "jim-of...|               7/7|         4.0|  940636800|Nice collection o...|This is only for ...|
|0826414346|Dr. Seuss: Americ...| NULL|A30TK6U7DNS82R|       Kevin Killian|             10/10|         5.0| 1095724800|   Really Enjoyed It|I don't care much...|
|0826414346|Dr. Seuss: Americ...| NULL|A3UH4UZ4RSVO82|        John Granger|             10/11|         5.0| 1078790400|Essential for eve...|If people become ...|
|0826414346|Dr. Seuss: Ameri

In [16]:
# Work on a small sample: keep only 1000 review rows for the modelling steps.
# The 70% / 30% train/test split itself is done later with randomSplit.
# NOTE(review): limit() is applied without an explicit ordering, so which
# rows are kept is presumably not guaranteed to be stable between runs - confirm.
book_reviews_mapped_df_test = book_reviews_mapped_df.limit(1000)

In [17]:
# Get the mode (most frequent non-null value) of the 'review_summary' column.
# NULLs are excluded first: otherwise NULL itself could win the count and the
# imputation below would replace NULL with NULL. Ties are broken alphabetically
# so the chosen value is deterministic.
summary_counts = (
    book_reviews_mapped_df_test
    .filter(col('review_summary').isNotNull())
    .groupBy('review_summary')
    .count()
    .orderBy(col('count').desc(), col('review_summary').asc())
)
mode_row = summary_counts.first()  # None when there is no non-null summary
mode_value = mode_row[0] if mode_row is not None else None
print(f"Mode value for column 'review_summary': {mode_value}")

# Impute null values in the 'review_summary' column with the mode value.
# Skipped when no mode exists (empty frame or all summaries NULL).
if mode_value is not None:
    book_reviews_mapped_df_test = book_reviews_mapped_df_test.fillna({'review_summary': mode_value})

Mode value for column 'review_summary': Cruel and Unusual


In [18]:
# Binary sentiment label: 1 when the review score is at least 3, else 0.
# review_score is a string column at this point (it was produced by trim()),
# so cast it to double explicitly instead of relying on Spark's implicit
# string-to-number coercion when comparing against the integer 3.
review_score_numeric = col("review_score").cast("double")
book_reviews_mapped_df_test = book_reviews_mapped_df_test.withColumn(
    "label", (review_score_numeric >= 3).cast("integer")
)

book_reviews_mapped_df_test.show()

+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+-----+
|        id|               title|price|       user_id|        profile_name|review_helpfulness|review_score|review_time|      review_summary|         review_text|label|
+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+-----+
|1882931173|Its Only Art If I...| NULL| AVCGYZL8FQQTD|Jim of Oz "jim-of...|               7/7|         4.0|  940636800|Nice collection o...|This is only for ...|    1|
|0826414346|Dr. Seuss: Americ...| NULL|A30TK6U7DNS82R|       Kevin Killian|             10/10|         5.0| 1095724800|   Really Enjoyed It|I don't care much...|    1|
|0826414346|Dr. Seuss: Americ...| NULL|A3UH4UZ4RSVO82|        John Granger|             10/11|         5.0| 1078790400|Essential for eve...|If people become ...

In [19]:
# Split the sample 70% / 30% into training and testing sets.
# A fixed seed makes the split, and therefore the trained model and the
# reported accuracy, repeatable across runs of the notebook.
SPLIT_SEED = 42
dividedData = book_reviews_mapped_df_test.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
trainingData = dividedData[0]
testingData = dividedData[1]

# Row counts of each split (each count triggers a Spark job).
train_rows = trainingData.count()
test_rows = testingData.count()
# print ("Training data rows:", train_rows, "; Testing data rows:", test_rows)

In [20]:
# Split each review summary into a list of word tokens.
tokenizer = Tokenizer(
    inputCol="review_summary",
    outputCol="review_summary_words",
)
tokenizedTrain = tokenizer.transform(trainingData)
tokenizedTrain.show(5)

+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+-----+--------------------+
|        id|               title|price|       user_id|        profile_name|review_helpfulness|review_score|review_time|      review_summary|         review_text|label|review_summary_words|
+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+-----+--------------------+
|0007106823|   Paolo Di Canio Hb| NULL|A1UPBP7ED9DTTY|Gavin O'brien "go...|               6/8|         5.0| 1108252800|There's Only One ...|As a Celtic and W...|    1|[there's, only, o...|
|0026204207|Foreignisms: A di...| NULL| AB1Z88ZD6NCB1|            CClio333|               0/0|         4.0| 1359244800|Good reference ma...|This is a great b...|    1|[good, reference,...|
|006000486X|Tess and the High...| NULL|          NULL| 

In [21]:
# Drop stop words from the token lists produced by the tokenizer.
tokens_column = tokenizer.getOutputCol()
swr = StopWordsRemover(
    inputCol=tokens_column,
    outputCol="review_summary_insightful_words",
)
SwRemovedTrain = swr.transform(tokenizedTrain)
SwRemovedTrain.show(5)

+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+-----+--------------------+-------------------------------+
|        id|               title|price|       user_id|        profile_name|review_helpfulness|review_score|review_time|      review_summary|         review_text|label|review_summary_words|review_summary_insightful_words|
+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+-----+--------------------+-------------------------------+
|0007106823|   Paolo Di Canio Hb| NULL|A1UPBP7ED9DTTY|Gavin O'brien "go...|               6/8|         5.0| 1108252800|There's Only One ...|As a Celtic and W...|    1|[there's, only, o...|                   [one, paolo]|
|0026204207|Foreignisms: A di...| NULL| AB1Z88ZD6NCB1|            CClio333|               0/0|         4.0| 13592448

In [22]:
# Hash the remaining words into a term-frequency vector column named "features".
filtered_words_column = swr.getOutputCol()
hashTF = HashingTF(
    inputCol=filtered_words_column,
    outputCol="features",
)
numericTrainData = hashTF.transform(SwRemovedTrain)
numericTrainData.show(3)

+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+-----+--------------------+-------------------------------+--------------------+
|        id|               title|price|       user_id|        profile_name|review_helpfulness|review_score|review_time|      review_summary|         review_text|label|review_summary_words|review_summary_insightful_words|            features|
+----------+--------------------+-----+--------------+--------------------+------------------+------------+-----------+--------------------+--------------------+-----+--------------------+-------------------------------+--------------------+
|0007106823|   Paolo Di Canio Hb| NULL|A1UPBP7ED9DTTY|Gavin O'brien "go...|               6/8|         5.0| 1108252800|There's Only One ...|As a Celtic and W...|    1|[there's, only, o...|                   [one, paolo]|(262144,[7720,218...|
|0026204207|Foreignisms: A di...

In [23]:
# Cache the featurised training data before fitting.
numericTrainData = numericTrainData.cache()

# Logistic regression on the hashed features.
lr = LogisticRegression(
    labelCol="label",
    featuresCol="features",
    maxIter=10,
    regParam=0.01,
)

model = lr.fit(numericTrainData)
print("Training done")

Training done


In [24]:
# Push the test set through the same three feature stages as the training
# set, keeping every intermediate frame under its own name.
test_stage_outputs = []
current_test_frame = testingData
for feature_stage in (tokenizer, swr, hashTF):
    current_test_frame = feature_stage.transform(current_test_frame)
    test_stage_outputs.append(current_test_frame)

tokenizedTest, SwRemovedTest, numericTest = test_stage_outputs
numericTest.show(2)

+----------+--------------------+-----+--------------+-------------+------------------+------------+-----------+--------------------+--------------------+-----+--------------------+-------------------------------+--------------------+
|        id|               title|price|       user_id| profile_name|review_helpfulness|review_score|review_time|      review_summary|         review_text|label|review_summary_words|review_summary_insightful_words|            features|
+----------+--------------------+-----+--------------+-------------+------------------+------------+-----------+--------------------+--------------------+-----+--------------------+-------------------------------+--------------------+
|006000486X|Tess and the High...| NULL|A1IQK80SXVPAWW|atlantis_girl|               4/4|         5.0| 1089849600|Best Avon True Ro...|I loved everythin...|    1|[best, avon, true...|           [best, avon, true...|(262144,[117068,1...|
|006000486X|Tess and the High...| NULL|A1NZ6D6SD7B1AP|  C. M

In [25]:
# Score the featurised test set with the trained model.
prediction = model.transform(numericTest)

# Compact view: the cleaned words, the predicted class and the true label.
summary_columns = ["review_summary_insightful_words", "prediction", "Label"]
predictionFinal = prediction.select(*summary_columns)
predictionFinal.show(4, False)

prediction.show(4)

+-------------------------------+----------+-----+
|review_summary_insightful_words|prediction|Label|
+-------------------------------+----------+-----+
|[best, avon, true, romance.]   |1.0       |1    |
|[good, romance, teens]         |1.0       |1    |
|[wonderful, fun!]              |1.0       |1    |
|[good, little, critter, book]  |1.0       |0    |
+-------------------------------+----------+-----+
only showing top 4 rows

+----------+--------------------+-----+--------------+-------------+------------------+------------+-----------+--------------------+--------------------+-----+--------------------+-------------------------------+--------------------+--------------------+--------------------+----------+
|        id|               title|price|       user_id| profile_name|review_helpfulness|review_score|review_time|      review_summary|         review_text|label|review_summary_words|review_summary_insightful_words|            features|       rawPrediction|         probability|pre

In [26]:
# Accuracy = rows whose predicted class equals the true label / all test rows.
matches_label = predictionFinal['prediction'] == predictionFinal['Label']
correctPrediction = predictionFinal.filter(matches_label).count()
totalData = predictionFinal.count()
accuracy_ratio = correctPrediction/totalData
print("correct prediction:", correctPrediction, ", total data:", totalData,
      ", accuracy:", accuracy_ratio)

correct prediction: 258 , total data: 282 , accuracy: 0.9148936170212766


In [27]:
# Print the schema of the scored test frame, including the columns the model
# added (rawPrediction, probability, prediction).
prediction.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- price: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- profile_name: string (nullable = true)
 |-- review_helpfulness: string (nullable = true)
 |-- review_score: string (nullable = true)
 |-- review_time: string (nullable = true)
 |-- review_summary: string (nullable = true)
 |-- review_text: string (nullable = true)
 |-- label: integer (nullable = true)
 |-- review_summary_words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- review_summary_insightful_words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [29]:
# Columns exported to the reviews table: the original review fields followed
# by the label, the token columns and the model's predicted class.
review_source_columns = [
    "id",
    "title",
    "price",
    "user_id",
    "profile_name",
    "review_helpfulness",
    "review_score",
    "review_time",
    "review_summary",
    "review_text",
]
model_output_columns = [
    "label",
    "review_summary_words",
    "review_summary_insightful_words",
    "prediction",
]
selected_columns = review_source_columns + model_output_columns


In [30]:
# Convert the string-typed numeric review fields to proper numeric types
# before export.
numeric_casts = (
    ("price", DoubleType()),
    ("review_score", DoubleType()),
    ("review_time", IntegerType()),
)

Books_rating_prediction = prediction
for column_name, target_type in numeric_casts:
    Books_rating_prediction = Books_rating_prediction.withColumn(
        column_name, col(column_name).cast(target_type)
    )

In [31]:
# PostgreSQL connection settings.
# Credentials are no longer hard-coded in the notebook: the URL and user can
# be overridden through environment variables (defaulting to the local
# development values), and the password is read from POSTGRES_PASSWORD or,
# when that is unset/empty, requested interactively without echoing it.
jdbc_url = os.environ.get("POSTGRES_JDBC_URL", "jdbc:postgresql://localhost:5432/postgres")
Books_table_name = "ecomm.books"
Books_rating_table_name = "ecomm.book_reviews"
properties = {
    "user": os.environ.get("POSTGRES_USER", "postgres"),
    "password": os.environ.get("POSTGRES_PASSWORD") or getpass("PostgreSQL password: "),
    "driver": "org.postgresql.Driver"
}

# Write to PostgreSQL. mode="overwrite" replaces any existing contents of the
# target tables.
books_mapped_df.write.jdbc(url=jdbc_url, table=Books_table_name, mode="overwrite", properties=properties)
Books_rating_prediction.select(selected_columns).write.jdbc(url=jdbc_url, table=Books_rating_table_name, mode="overwrite", properties=properties)