In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Create (or reuse) the Spark session for this notebook, requesting 2g of
# memory for both the driver and the executors.
# NOTE(review): the app name "Proprocessing" looks like a typo for
# "Preprocessing"; left unchanged because it is a runtime string.
spark = (
    SparkSession.builder
    .appName("Proprocessing")
    .config("spark.driver.memory", "2g")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

25/08/02 19:04:20 WARN Utils: Your hostname, vaibhavi-HP-Laptop-15-fd0xxx resolves to a loopback address: 127.0.1.1; using 192.168.0.128 instead (on interface wlo1)
25/08/02 19:04:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/02 19:04:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/08/02 19:04:21 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Load the final books dataset from its Parquet directory.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df = spark.read.parquet("/home/vaibhavi/spark-ml-venv/ml_project/hybrid_book_recommender/data/final_books_data")

In [4]:
# Inspect the column names and types of the freshly loaded books data.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- reviews_count: string (nullable = true)
 |-- slug: string (nullable = true)
 |-- users_count: string (nullable = true)
 |-- pages: string (nullable = true)
 |-- description: string (nullable = true)
 |-- book_category_id: string (nullable = true)
 |-- author_name: string (nullable = true)
 |-- weighted_rating_value: double (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [5]:
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, DoubleType, ArrayType, StringType,LongType

# Convert the string-typed columns to their numeric types, then expose the
# weighted rating under the shorter name "rating".
# NOTE(review): this cell rebinds `df`, so running it a second time fails:
# after the rename there is no "weighted_rating_value" column left to cast.
df = (
    df
    # Identifier and numeric metadata arrive as strings in the Parquet data.
    .withColumn("id", col("id").cast(LongType()))
    .withColumn("release_year", col("release_year").cast(IntegerType()))
    .withColumn("reviews_count", col("reviews_count").cast(IntegerType()))
    .withColumn("users_count", col("users_count").cast(IntegerType()))
    .withColumn("pages", col("pages").cast(IntegerType()))
    .withColumn("book_category_id", col("book_category_id").cast(IntegerType()))
    # These two casts pin the expected types explicitly.
    .withColumn("weighted_rating_value", col("weighted_rating_value").cast(DoubleType()))
    .withColumn("genres", col("genres").cast(ArrayType(StringType())))
    .withColumnRenamed("weighted_rating_value", "rating")
)


In [6]:
# Confirm that the casts and the weighted_rating_value -> rating rename took effect.
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- release_year: integer (nullable = true)
 |-- reviews_count: integer (nullable = true)
 |-- slug: string (nullable = true)
 |-- users_count: integer (nullable = true)
 |-- pages: integer (nullable = true)
 |-- description: string (nullable = true)
 |-- book_category_id: integer (nullable = true)
 |-- author_name: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [7]:
from pyspark.sql.functions import col, sum

def check_null(df):
    """Print the number of null values in every column of ``df``.

    For each column a 0/1 "is null" flag is summed, and the resulting
    single-row table (one count per column, labelled with the column name)
    is displayed with ``DataFrame.show()``.

    Args:
        df: Spark DataFrame to inspect.

    Returns:
        None. The counts are only printed.
    """
    # Import under an alias inside the function so the aggregate is always
    # Spark's `sum`, whether or not the notebook-level name `sum` currently
    # refers to the Python builtin.
    from pyspark.sql import functions as F

    null_counts = [
        F.sum(F.col(name).isNull().cast("int")).alias(name)
        for name in df.columns
    ]
    df.select(null_counts).show()


In [8]:
# Count nulls per column in the books DataFrame after the type casts.
check_null(df)

                                                                                

+---+-----+------------+------------+-------------+----+-----------+-----+-----------+----------------+-----------+------+------+
| id|title|release_date|release_year|reviews_count|slug|users_count|pages|description|book_category_id|author_name|rating|genres|
+---+-----+------------+------------+-------------+----+-----------+-----+-----------+----------------+-----------+------+------+
|  0|    0|           0|           0|            0|   0|          0|    0|          0|               0|          0|     0|     0|
+---+-----+------------+------------+-------------+----+-----------+-----+-----------+----------------+-----------+------+------+



In [9]:
# Load the cleaned books Parquet output; it carries extra columns
# (e.g. image_url, image_color) that are merged into df further down.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df2 = spark.read.parquet("/home/vaibhavi/spark-ml-venv/ml_project/data/data_modelling/output/clean_books.parquet")

In [10]:
# Inspect the column names and types of the cleaned books data.
df2.printSchema()

root
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- ratings_count: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- reviews_count: string (nullable = true)
 |-- slug: string (nullable = true)
 |-- users_count: string (nullable = true)
 |-- pages: string (nullable = true)
 |-- description: string (nullable = true)
 |-- book_category_id: string (nullable = true)
 |-- links: string (nullable = true)
 |-- author_name: string (nullable = true)
 |-- rating_count: long (nullable = true)
 |-- rating_value: double (nullable = true)
 |-- image_url: string (nullable = true)
 |-- image_color: string (nullable = true)
 |-- genre_tag: string (nullable = true)



In [39]:
# Columns that exist in df2 but are missing from df (order follows df2.columns).
diff_cols = [name for name in df2.columns if name not in df.columns]
print("Columns in df2 but not in df:", diff_cols)


Columns in df2 but not in df: ['ratings_count', 'links', 'rating_count', 'rating_value', 'image_url', 'image_color', 'genre_tag']


In [23]:
# Keep only the join key (id) and the two image columns from df2.
df3 = df2.select("id","image_url","image_color")

In [24]:
# Check the schema of the image subset; id is still a string here.
df3.printSchema()

root
 |-- id: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- image_color: string (nullable = true)



In [25]:
# Cast id to long so it has the same type as df.id, which it is joined against later.
df3 = df3.withColumn("id",col("id").cast(LongType()))

In [26]:
# Confirm that id is now a long.
df3.printSchema()

root
 |-- id: long (nullable = true)
 |-- image_url: string (nullable = true)
 |-- image_color: string (nullable = true)



In [15]:
# Count nulls per column in the image subset after the id cast.
check_null(df3)



+----+---------+-----------+
|  id|image_url|image_color|
+----+---------+-----------+
|1131|        0|          0|
+----+---------+-----------+



                                                                                

In [27]:
# Number of rows in the image subset.
df3.count()

3257144

In [28]:
# Number of rows in the books DataFrame, for comparison with df3.
df.count()

1302995

In [31]:
# Drop rows containing a null in any of the three columns (id, image_url, image_color).
df3 = df3.dropna()

In [32]:
# Row count after removing rows with nulls.
df3.count()

3256013

In [33]:
# Keep a single row per id so the later join on id cannot multiply rows of df.
# NOTE(review): which of the duplicate rows survives is not controlled here;
# confirm that any image_url / image_color per id is acceptable.
df3 = df3.select("id", "image_url", "image_color").dropDuplicates(["id"])


In [34]:
# Number of distinct ids left in the image subset.
df3.count()

                                                                                

948358

In [35]:
# Left join: keep every row of df and attach image_url / image_color from df3 by id.
df4 = df.join(df3, on="id", how="left")

In [36]:
# Row count after the join; it should equal df.count() because df3 has one row per id.
df4.count()

1302995

In [37]:
# Count nulls per column in the joined data; ids without a match in df3
# would show up as nulls in image_url / image_color.
check_null(df4)

[Stage 42:>                                                       (0 + 12) / 13]

+---+-----+------------+------------+-------------+----+-----------+-----+-----------+----------------+-----------+------+------+---------+-----------+
| id|title|release_date|release_year|reviews_count|slug|users_count|pages|description|book_category_id|author_name|rating|genres|image_url|image_color|
+---+-----+------------+------------+-------------+----+-----------+-----+-----------+----------------+-----------+------+------+---------+-----------+
|  0|    0|           0|           0|            0|   0|          0|    0|          0|               0|          0|     0|     0|        0|          0|
+---+-----+------------+------------+-------------+----+-----------+-----+-----------+----------------+-----------+------+------+---------+-----------+



                                                                                

In [38]:
# Persist the joined books data as Parquet; mode("overwrite") replaces any
# existing output at this location.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df4.write.mode("overwrite").parquet("/home/vaibhavi/spark-ml-venv/ml_project/hybrid_book_recommender/data/books_df")


                                                                                

In [4]:
# Load the Parquet dataset holding the PCA feature vectors.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
pca = spark.read.parquet("/home/vaibhavi/spark-ml-venv/ml_project/hybrid_book_recommender/data/pca_vectorized_df")

                                                                                

In [5]:
# Inspect the column names and types of the PCA dataset.
pca.printSchema()

root
 |-- id: long (nullable = true)
 |-- title: string (nullable = true)
 |-- pca_features: vector (nullable = true)

