# Clean Data Script

In [1]:
spark

# Import necessary packages and specify bucket and folder path

In [2]:
!pip install handyspark
!pip install gcsfs
!pip install openpyxl

Collecting handyspark
  Downloading handyspark-0.2.2a1-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting findspark (from handyspark)
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading handyspark-0.2.2a1-py2.py3-none-any.whl (39 kB)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark, handyspark
Successfully installed findspark-2.0.1 handyspark-0.2.2a1


[0mCollecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
[0m

In [3]:
import numpy as np
import gcsfs
from google.cloud import storage
import os
from pyspark.sql.functions import to_date, rand, col, sum, when, to_timestamp, trim
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, LongType, TimestampType, ArrayType
np.bool = np.bool_

In [4]:
from handyspark import *

In [5]:
# GCS bucket and folder
bucket_name = "goodreads_bucket"
folder_path = "landing/"
save_path = "cleaned/"
SEED = 645
SAMPLE_FRAC=0.01

# Create schemas before reading the data

In [6]:
# Reviews schema

reviews_schema = StructType([
    StructField("book_id", StringType(), nullable=True),
    StructField("date_added", StringType(), nullable=True),
    StructField("date_updated", StringType(), nullable=True),
    StructField("n_comments", IntegerType(), nullable=True),
    StructField("n_votes", IntegerType(), nullable=True),
    StructField("rating", IntegerType(), nullable=True),
    StructField("read_at", StringType(), nullable=True),
    StructField("review_id", StringType(), nullable=True),
    StructField("review_text", StringType(), nullable=True),
    StructField("started_at", StringType(), nullable=True),
    StructField("user_id", StringType(), nullable=True)
])

In [7]:
# Books schema

# Define the schema for the nested structure
authors_schema = StructType([
    StructField("author_id", StringType(), nullable=True),
    StructField("role", StringType(), nullable=True)
])

popular_shelves_schema = StructType([
    StructField("count", StringType(), nullable=True),
    StructField("name", StringType(), nullable=True)
])

# Main schema
books_schema = StructType([
    StructField("asin", StringType(), nullable=True),
    StructField("authors", ArrayType(authors_schema), nullable=True),
    StructField("average_rating", StringType(), nullable=True),
    StructField("book_id", StringType(), nullable=True),
    StructField("country_code", StringType(), nullable=True),
    StructField("description", StringType(), nullable=True),
    StructField("edition_information", StringType(), nullable=True),
    StructField("format", StringType(), nullable=True),
    StructField("image_url", StringType(), nullable=True),
    StructField("is_ebook", IntegerType(), nullable=True),
    StructField("isbn", StringType(), nullable=True),
    StructField("isbn13", StringType(), nullable=True),
    StructField("kindle_asin", StringType(), nullable=True),
    StructField("language_code", StringType(), nullable=True),
    StructField("link", StringType(), nullable=True),
    StructField("num_pages", StringType(), nullable=True),
    StructField("popular_shelves", ArrayType(popular_shelves_schema), nullable=True),
    StructField("publication_day", StringType(), nullable=True),
    StructField("publication_month", StringType(), nullable=True),
    StructField("publication_year", StringType(), nullable=True),
    StructField("publisher", StringType(), nullable=True),
    StructField("ratings_count", StringType(), nullable=True),
    StructField("series", ArrayType(StringType(), containsNull=True), nullable=True),
    StructField("similar_books", ArrayType(StringType(), containsNull=True), nullable=True),
    StructField("text_reviews_count", StringType(), nullable=True),
    StructField("title", StringType(), nullable=True),
    StructField("title_without_series", StringType(), nullable=True),
    StructField("url", StringType(), nullable=True),
    StructField("work_id", StringType(), nullable=True)
])

In [8]:
# Interactions schema

interactions_schema = StructType([
    StructField("user_id", StringType(), nullable=True),
    StructField("book_id", IntegerType(), nullable=True),
    StructField("is_read", IntegerType(), nullable=True),
    StructField("rating", IntegerType(), nullable=True),
    StructField("is_reviewed", IntegerType(), nullable=True)
])

In [9]:
# Author schema

author_schema = StructType([
    StructField("author_id", StringType(), nullable=True), 
    StructField("average_rating", StringType(), nullable=True), 
    StructField("name", StringType(), nullable=True), 
    StructField("ratings_count", StringType(), nullable=True),
    StructField("text_reviews_count", StringType(), nullable=True)
])

In [10]:
# Genre schema

genre_schema = StructType([
    StructField("book_id", StringType(), True),  # Book ID, nullable
    StructField("genres", StructType([  # Nested struct for genres
        StructField("children", LongType(), True),
        StructField("comics, graphic", LongType(), True),
        StructField("fantasy, paranormal", LongType(), True),
        StructField("fiction", LongType(), True),
        StructField("history, historical fiction, biography", LongType(), True),
        StructField("mystery, thriller, crime", LongType(), True),
        StructField("non-fiction", LongType(), True),
        StructField("poetry", LongType(), True),
        StructField("romance", LongType(), True),
        StructField("young-adult", LongType(), True)
    ]), True)  # Allow the genres struct to be nullable
])


# Read all our data into spark dataframes
#### Files we want to get into datframes from the landing folder of our bucket
 - goodreads_book_authors.json
 - goodreads_book_genres_initial.json
 - goodreads_reviews_dedup.json
 - goodreads_books.json
 - goodreads_interactions.csv

In [11]:
# goodreads_book_authors.json

file_name = "goodreads_book_authors.json"

# Path to the JSON files in the bucket
gcs_path = f"gs://{bucket_name}/{folder_path}/{file_name}"

# Read JSON files from GCS
authors_df = spark.read.schema(author_schema).json(gcs_path)#.sample(withReplacement=False, fraction=SAMPLE_FRAC, seed=SEED)

# Replace empty strings with nulls in all columns
authors_df = authors_df.replace("", None)

# Show the data
authors_df.show(10)

# Print schema of the JSON
authors_df.printSchema()

                                                                                

+---------+--------------+-------------------+-------------+------------------+
|author_id|average_rating|               name|ratings_count|text_reviews_count|
+---------+--------------+-------------------+-------------+------------------+
|   604031|          3.98|   Ronald J. Fields|           49|                 7|
|   626222|          4.08|      Anita Diamant|       546796|             28716|
|    10333|          3.92|     Barbara Hambly|       122118|              5075|
|     9212|          3.68|    Jennifer Weiner|       888522|             36262|
|   149918|          3.82|      Nigel Pennick|         1740|                96|
|  3041852|          3.89|   Alfred J. Church|          947|                85|
|   215594|          4.17| Michael Halberstam|           23|                 6|
|    19158|          4.18|     Rachel Roberts|        13677|               486|
|  5807700|          3.99|         V.L. Locey|         3130|               986|
|  2983296|          3.48|Anton Szandor 

In [12]:
# goodreads_book_genres_initial.json

file_name = "goodreads_book_genres_initial.json"

# Path to the JSON files in the bucket
gcs_path = f"gs://{bucket_name}/{folder_path}/{file_name}"

# Read JSON files from GCS
genre_df = spark.read.schema(genre_schema).json(gcs_path)#.sample(withReplacement=False, fraction=SAMPLE_FRAC, seed=SEED)

# Replace empty strings with nulls in all columns
genre_df = genre_df.replace("", None)

# Show the data
genre_df.show(10)

# Print schema of the JSON
genre_df.printSchema()

+--------+--------------------+
| book_id|              genres|
+--------+--------------------+
| 5333265|{NULL, NULL, NULL...|
| 1333909|{NULL, NULL, NULL...|
| 7327624|{NULL, NULL, 31, ...|
| 6066819|{NULL, NULL, NULL...|
|  287140|{NULL, NULL, NULL...|
|  287141|{6, NULL, 1, 1, 9...|
|  378460|{NULL, NULL, NULL...|
| 6066812|{16, NULL, 32, 7,...|
|34883016|{NULL, NULL, NULL...|
|  287149|{NULL, NULL, NULL...|
+--------+--------------------+
only showing top 10 rows

root
 |-- book_id: string (nullable = true)
 |-- genres: struct (nullable = true)
 |    |-- children: long (nullable = true)
 |    |-- comics, graphic: long (nullable = true)
 |    |-- fantasy, paranormal: long (nullable = true)
 |    |-- fiction: long (nullable = true)
 |    |-- history, historical fiction, biography: long (nullable = true)
 |    |-- mystery, thriller, crime: long (nullable = true)
 |    |-- non-fiction: long (nullable = true)
 |    |-- poetry: long (nullable = true)
 |    |-- romance: long (nullable = 

In [13]:
# goodreads_reviews_dedup.json

file_name = "goodreads_reviews_dedup.json"

# Path to the JSON files in the bucket
gcs_path = f"gs://{bucket_name}/{folder_path}/{file_name}"

# Read JSON files from GCS
reviews_df = spark.read.schema(reviews_schema).json(gcs_path)#.sample(withReplacement=False, fraction=SAMPLE_FRAC, seed=SEED)

# Replace empty strings with nulls in all columns
reviews_df = reviews_df.replace("", None)

# Show the data
reviews_df.show(10)

# Print schema of the JSON
reviews_df.printSchema()

+--------+--------------------+--------------------+----------+-------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
| book_id|          date_added|        date_updated|n_comments|n_votes|rating|             read_at|           review_id|         review_text|          started_at|             user_id|
+--------+--------------------+--------------------+----------+-------+------+--------------------+--------------------+--------------------+--------------------+--------------------+
|24375664|Fri Aug 25 13:55:...|Mon Oct 09 08:55:...|         0|     16|     5|Sat Oct 07 00:00:...|5cd416f3efc3f944f...|Mind blowingly co...|Sat Aug 26 00:00:...|8842281e1d1347389...|
|18245960|Sun Jul 30 07:44:...|Wed Aug 30 00:00:...|         1|     28|     5|Sat Aug 26 12:05:...|dfdbb7b0eb5a7e4c2...|This is a special...|Tue Aug 15 13:23:...|8842281e1d1347389...|
| 6392944|Mon Jul 24 02:48:...|Sun Jul 30 09:28:...|         0|      6|     3|Tu

In [14]:
# goodreads_books.json

file_name = "goodreads_books.json"

# Path to the JSON files in the bucket
gcs_path = f"gs://{bucket_name}/{folder_path}/{file_name}"

# Read JSON files from GCS
books_df = spark.read.schema(books_schema).json(gcs_path)#.sample(withReplacement=False, fraction=SAMPLE_FRAC, seed=SEED)

# Replace empty strings with nulls in all columns
books_df = books_df.replace("", None)

# Print schema of the JSON
books_df.printSchema()

root
 |-- asin: string (nullable = true)
 |-- authors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- author_id: string (nullable = true)
 |    |    |-- role: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- edition_information: string (nullable = true)
 |-- format: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- is_ebook: integer (nullable = true)
 |-- isbn: string (nullable = true)
 |-- isbn13: string (nullable = true)
 |-- kindle_asin: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- link: string (nullable = true)
 |-- num_pages: string (nullable = true)
 |-- popular_shelves: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- count: string (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- pu

In [15]:
# goodreads_interactions.csv

file_name = "goodreads_interactions.csv"

# Path to the JSON files in the bucket
gcs_path = f"gs://{bucket_name}/{folder_path}/{file_name}"

# Read CSV file from GCS with header option
interactions_df = spark.read.option("header", "true").csv(gcs_path)#.sample(withReplacement=False, fraction=SAMPLE_FRAC, seed=SEED)

# Replace empty strings with nulls in all columns
interactions_df = interactions_df.replace("", None)

# Show the data
interactions_df.orderBy(rand()).limit(10).show(10)

# Print schema
interactions_df.printSchema()



+-------+-------+-------+------+-----------+
|user_id|book_id|is_read|rating|is_reviewed|
+-------+-------+-------+------+-----------+
| 437231| 487488|      0|     0|          0|
| 516185|    670|      1|     5|          0|
| 153514|    577|      1|     5|          0|
|  54780| 151816|      0|     0|          0|
| 389284| 529441|      0|     0|          0|
| 356244|  15268|      1|     5|          0|
| 153342|   2766|      0|     0|          0|
| 361960| 201881|      0|     0|          0|
|  81753| 113006|      1|     3|          0|
| 132349| 633124|      1|     4|          0|
+-------+-------+-------+------+-----------+

root
 |-- user_id: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- is_read: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- is_reviewed: string (nullable = true)



                                                                                

# Functions to assist in cleaning process

In [16]:
#function takes in a spark dataframe, then drops nulls in ALL rows. Accepts a list of columns to skip in this process
#This function is meant for dfs with a large amount of columns with small amount of nulls in each column.
#
#BEFORE USING THIS FUNCTION, DROP COLUMNS THAT HAVE A HIGH PERCENTAGE OF NULLS
#OTHERWISE THE WHOLE DF WILL PRACTICALLY BE DROPPED.

def drop_nulls_in_rows(df, list_of_columns_to_ignore=None):
    #get a extremely small sample just so we can find the rows we are dealing with using pandas
    sample_sdf = df.limit(1)
    pandas_df = sample_sdf.toPandas()
 
    #Now we get our list of columns that we will drop nulls in
    columns = pandas_df.columns
    
    #If any value of the list_of_columns_to_ignore is in the columns, we take that column out.
    if list_of_columns_to_ignore is not None:
        # Filter out the columns that are to be ignored
        columns = [col for col in pandas_df.columns if col not in list_of_columns_to_ignore]
    
    
    #With the columns extracted, loop through every column and filter it so we dont get nulls.
    for column in columns:
        print("Removing nulls in "+column+"...")
        df = df.filter(column+" is not NULL")
    
    return df

In [17]:
#function that drops a list of columns in a pyspark df
def drop_columns(df, list_of_columns):
    for column in list_of_columns:
        print("Removing column "+column+"...")
        df = df.drop(column)
    
    return df

# Checking NULLS

In [18]:
# Small function we will use to count nulls
def count_nulls(sdf):
    hsdf = HandyFrame(sdf)
    print(hsdf.isnull())

In [19]:
# Count nulls in each row, so we can eventually drop them
count_nulls(reviews_df)



book_id               0
date_added            0
date_updated          0
n_comments            0
n_votes               0
rating                0
read_at         2766813
review_id             0
review_text        6938
started_at      6712475
user_id               0
Name: missing, dtype: int64


                                                                                

In [20]:
# Count rows 
reviews_df_count = reviews_df.count()
print("Count of reviews_df: "+str(reviews_df_count))



Count of reviews_df: 15739967


                                                                                

In [21]:
# Count nulls in each row, so we can eventually drop them
count_nulls(books_df)

# Columns that must be transformed
#
# book_id

24/12/10 09:54:23 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

asin                    1891138
authors                       0
average_rating              524
book_id                       0
country_code                490
description              412233
edition_information     2142642
format                   646754
image_url                   490
is_ebook                2360655
isbn                     983373
isbn13                   780263
kindle_asin             1345725
language_code           1060153
link                        524
num_pages                764133
popular_shelves               0
publication_day         1024429
publication_month        882945
publication_year         599625
publisher                654362
ratings_count               524
series                        0
similar_books                 0
text_reviews_count          524
title                         7
title_without_series          7
url                         524
work_id                     524
Name: missing, dtype: int64


                                                                                

In [22]:
# Count rows 
books_df_count = books_df.count()
print("Count of books_df: "+str(books_df_count))



Count of books_df: 2360655


                                                                                

In [23]:
# Count nulls in each row, so we can eventually drop them
count_nulls(interactions_df)

# Columns that must be transformed
#
# None!



user_id        0
book_id        0
is_read        0
rating         0
is_reviewed    0
Name: missing, dtype: int64


                                                                                

In [24]:
# Count rows 
interactions_df_count = interactions_df.count()
print("Count of interactions_df: "+str(interactions_df_count))



Count of interactions_df: 228648342


                                                                                

In [25]:
count_nulls(authors_df)



author_id             0
average_rating        0
name                  5
ratings_count         0
text_reviews_count    0
Name: missing, dtype: int64




In [26]:
# Count rows 
authors_df_count = authors_df.count()
print("Count of authors_df: "+str(authors_df_count))

Count of authors_df: 829529




In [27]:
count_nulls(genre_df)



book_id    0
genres     0
Name: missing, dtype: int64


                                                                                

In [28]:
# Count rows 
genre_df_count = genre_df.count()
print("Count of genre_df: "+str(genre_df_count))

[Stage 32:>                                                       (0 + 12) / 12]

Count of genre_df: 2360655


                                                                                

# Reformatting

In [29]:
# Changes a list of columns to a certain datatype. This is for PySpark dataframes.
# You give it... 
#  1. The dataframe you want to modify
#  2. The list of strings for the name of columns you want to be modified
#  3. The datetype you want to modify them to

def change_cols_to_type(df, list_of_columns, data_type):
    for column in list_of_columns:
        df = df.withColumn(column, df[column].cast(data_type))
    return df

In [30]:
# For books_df, change all these columns below to their respective datatypes.
#
# average_rating : float         
# book_id : integer
# is_ebook : integer
# isbin : integer
# isbin13 : integer
# num_pages : integer
# publication_day : integer
# publication_month : integer
# publication_year : integer
# ratings_count : integer
# text_reviews_count : integer

# For reviews_df, change all these columns below to their respective datatypes.
#
# book_id : integer

In [31]:
#Change book cols to a more appropriate datatype

#Columns to change
books_cols_to_int = ["average_rating", "book_id", "isbn", "num_pages", "publication_day",
                    "publication_month","publication_year","ratings_count","text_reviews_count"]
books_cols_to_long = ["isbn13"]
books_cols_to_float = ["average_rating"]

#Change columns
books_df = change_cols_to_type(books_df, books_cols_to_int, "integer")
books_df = change_cols_to_type(books_df, books_cols_to_long, "long")
books_df = change_cols_to_type(books_df, books_cols_to_float, "float")

#Count nulls to make sure nothing wrong happened
count_nulls(books_df)

#Show certain book columns after changing their datatypes
books_df.select("isbn","isbn13", "is_ebook").limit(10).show(truncate=False)

                                                                                

asin                    1891138
authors                       0
average_rating              524
book_id                       0
country_code                490
description              412233
edition_information     2142642
format                   646754
image_url                   490
is_ebook                2360655
isbn                    1307129
isbn13                   783792
kindle_asin             1345725
language_code           1060153
link                        524
num_pages                764133
popular_shelves               0
publication_day         1024429
publication_month        882945
publication_year         599625
publisher                654362
ratings_count               524
series                        0
similar_books                 0
text_reviews_count          524
title                         7
title_without_series          7
url                         524
work_id                     524
Name: missing, dtype: int64
+----------+-------------+--------+
|isbn   

In [32]:
#Change review cols to a more appropriate datatype

#Columns to change
reviews_cols_to_int = ["book_id"]

#Change columns
reviews_df = change_cols_to_type(reviews_df, reviews_cols_to_int, "integer")

#Count nulls to make sure nothing wrong happened
count_nulls(reviews_df)



book_id               0
date_added            0
date_updated          0
n_comments            0
n_votes               0
rating                0
read_at         2766813
review_id             0
review_text        6938
started_at      6712475
user_id               0
Name: missing, dtype: int64


                                                                                

In [33]:
# int float string int int
# |-- author_id: string (nullable = true)
# |-- average_rating: string (nullable = true)
# |-- name: string (nullable = true)
# |-- ratings_count: string (nullable = true)
# |-- text_reviews_count: string (nullable = true)

auth_cols_int = ["author_id", "ratings_count", "text_reviews_count"]
auth_cols_float = ["average_rating"]

authors_df = change_cols_to_type(authors_df, auth_cols_int, "integer")
authors_df = change_cols_to_type(authors_df, auth_cols_float, "float")

In [34]:
# Flattening the genres dataframe
flat = genre_df.select(
    col("book_id"),
    col("genres.children").alias("children"),
    col("genres.comics, graphic").alias("comics_graphic"),
    col("genres.fantasy, paranormal").alias("fantasy_paranormal"),
    col("genres.fiction").alias("fiction"),
    col("genres.history, historical fiction, biography").alias("history"),
    col("genres.mystery, thriller, crime").alias("mystery"),
    col("genres.non-fiction").alias("non_fiction"),
    col("genres.poetry").alias("poetry"),
    col("genres.romance").alias("romance"),
    col("genres.young-adult").alias("young_adult")
).fillna(0)
flat.show()
genre_df = flat
del flat

+--------+--------+--------------+------------------+-------+-------+-------+-----------+------+-------+-----------+
| book_id|children|comics_graphic|fantasy_paranormal|fiction|history|mystery|non_fiction|poetry|romance|young_adult|
+--------+--------+--------------+------------------+-------+-------+-------+-----------+------+-------+-----------+
| 5333265|       0|             0|                 0|      0|      1|      0|          0|     0|      0|          0|
| 1333909|       0|             0|                 0|    219|      5|      0|          0|     0|      0|          0|
| 7327624|       0|             0|                31|      8|      0|      1|          0|     1|      0|          0|
| 6066819|       0|             0|                 0|    555|      0|     10|          0|     0|     23|          0|
|  287140|       0|             0|                 0|      0|      0|      0|          3|     0|      0|          0|
|  287141|       6|             0|                 1|      1|   

# Begin cleaning process

# Change/Create columns to datetime

In [35]:
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

# Define the format of the date strings
date_format = "EEE MMM dd HH:mm:ss Z yyyy"

# Apply transformations directly to `reviews_df`
reviews_df = reviews_df \
    .withColumn("date_added_fix", to_timestamp(col("date_added"), date_format)) \
    .withColumn("date_updated_fix", to_timestamp(col("date_updated"), date_format)) \
    .withColumn("started_at_fix", to_timestamp(col("started_at"), date_format)) \
    .withColumn("read_at_fix", to_timestamp(col("read_at"), date_format))

# Show old columns
reviews_df.select("date_added", "date_updated", "started_at", "read_at").limit(10).show(truncate=False)

+------------------------------+------------------------------+------------------------------+------------------------------+
|date_added                    |date_updated                  |started_at                    |read_at                       |
+------------------------------+------------------------------+------------------------------+------------------------------+
|Fri Aug 25 13:55:02 -0700 2017|Mon Oct 09 08:55:59 -0700 2017|Sat Aug 26 00:00:00 -0700 2017|Sat Oct 07 00:00:00 -0700 2017|
|Sun Jul 30 07:44:10 -0700 2017|Wed Aug 30 00:00:26 -0700 2017|Tue Aug 15 13:23:18 -0700 2017|Sat Aug 26 12:05:52 -0700 2017|
|Mon Jul 24 02:48:17 -0700 2017|Sun Jul 30 09:28:03 -0700 2017|Mon Jul 24 00:00:00 -0700 2017|Tue Jul 25 00:00:00 -0700 2017|
|Mon Jul 24 02:33:09 -0700 2017|Sun Jul 30 10:23:54 -0700 2017|Tue Jul 25 00:00:00 -0700 2017|Sun Jul 30 15:42:05 -0700 2017|
|Mon Jul 24 02:28:14 -0700 2017|Thu Aug 24 00:07:20 -0700 2017|Sun Jul 30 00:00:00 -0700 2017|Sat Aug 05 00:00:00 -070

In [36]:
reviews_df = reviews_df.drop("date_added", "date_updated", "started_at", "read_at")
reviews_df = reviews_df.withColumnRenamed("date_added_fix", "date_added")
reviews_df = reviews_df.withColumnRenamed("date_updated_fix", "date_updated")
reviews_df = reviews_df.withColumnRenamed("started_at_fix", "started_at")
reviews_df = reviews_df.withColumnRenamed("read_at_fix", "read_at")

In [37]:
# Show the updated columns
reviews_df.select("date_added", "date_updated", "started_at", "read_at").limit(10).show(truncate=False)

+-------------------+-------------------+-------------------+-------------------+
|date_added         |date_updated       |started_at         |read_at            |
+-------------------+-------------------+-------------------+-------------------+
|2017-08-25 20:55:02|2017-10-09 15:55:59|2017-08-26 07:00:00|2017-10-07 07:00:00|
|2017-07-30 14:44:10|2017-08-30 07:00:26|2017-08-15 20:23:18|2017-08-26 19:05:52|
|2017-07-24 09:48:17|2017-07-30 16:28:03|2017-07-24 07:00:00|2017-07-25 07:00:00|
|2017-07-24 09:33:09|2017-07-30 17:23:54|2017-07-25 07:00:00|2017-07-30 22:42:05|
|2017-07-24 09:28:14|2017-08-24 07:07:20|2017-07-30 07:00:00|2017-08-05 07:00:00|
|2017-03-06 15:14:44|2017-03-06 15:15:21|NULL               |NULL               |
|2017-03-01 01:52:08|2017-05-26 19:04:23|2017-05-05 07:00:00|2017-05-26 22:17:45|
|2017-01-08 23:51:29|2017-03-07 09:09:18|2017-01-13 08:00:00|2017-02-20 08:00:00|
|2016-12-15 18:51:26|2017-03-13 06:33:51|2017-03-01 01:55:35|2017-03-09 23:34:06|
|2016-12-12 00:1

# Save Data

In [38]:
def write_parquet_to_gcs(df, folder_name):
    
    client = storage.Client()

    # Specify the path to the "cleaned" folder in the bucket
    _folder_path = f"{save_path}{folder_name}"

    # Get the bucket and check if the folder already exists
    bucket = client.get_bucket(bucket_name)
    blobs = bucket.list_blobs(prefix=_folder_path)

    # Check if folder exists by listing the files
    folder_exists = any(blob.name.startswith(_folder_path) for blob in blobs)

    if not folder_exists:
        print(f"Folder {_folder_path} does not exist. Creating and writing Parquet.")
    else:
        print(f"Folder {_folder_path} already exists. Overwriting content.")

    # Define the GCS path where the Parquet files will be written
    gcs_path = f"gs://{bucket_name}/{_folder_path}/"

    # Write the DataFrame to Parquet in the specified folder
    df.write.parquet(gcs_path, mode='overwrite')  # Use 'overwrite' to write or replace existing content

    print(f"Data written to Parquet in folder: {_folder_path}")

In [42]:
#This is for the next few cells. If this option is True, then we will enable saving.
#This allows for when we 'reset and run all' and want the option to not save when debugging.
ENABLE_SAVE = True

In [47]:
if ENABLE_SAVE:
    write_parquet_to_gcs(reviews_df, "reviews")

Folder cleaned/reviews already exists. Overwriting content.


24/12/10 10:16:20 WARN TaskSetManager: Lost task 123.0 in stage 52.0 (TID 1382) (cluster-20241210d-highperformance-deletefast-w-0.us-central1-f.c.cis4400-individual-project.internal executor 20): org.apache.spark.SparkException: [TASK_WRITE_FAILED] Task failed while writing rows to gs://goodreads_bucket/cleaned/reviews.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.taskFailedWhileWritingRowsError(QueryExecutionErrors.scala:774)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:493)
	at org.apache.spark.sql.execution.datasources.WriteFilesExec.$anonfun$doExecuteWrite$1(WriteFiles.scala:100)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(

Py4JJavaError: An error occurred while calling o925.parquet.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 123 in stage 52.0 failed 4 times, most recent failure: Lost task 123.3 in stage 52.0 (TID 1386) (cluster-20241210d-highperformance-deletefast-w-2.us-central1-f.c.cis4400-individual-project.internal executor 19): org.apache.spark.SparkException: [TASK_WRITE_FAILED] Task failed while writing rows to gs://goodreads_bucket/cleaned/reviews.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.taskFailedWhileWritingRowsError(QueryExecutionErrors.scala:774)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:493)
	at org.apache.spark.sql.execution.datasources.WriteFilesExec.$anonfun$doExecuteWrite$1(WriteFiles.scala:100)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:96)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkUpgradeException: [INCONSISTENT_BEHAVIOR_CROSS_VERSION.WRITE_ANCIENT_DATETIME] You may get a different result due to the upgrading to Spark >= 3.0:
writing dates before 1582-10-15 or timestamps before 1900-01-01T00:00:00Z
into Parquet INT96 files can be dangerous, as the files may be read by Spark 2.x
or legacy versions of Hive later, which uses a legacy hybrid calendar that
is different from Spark 3.0+'s Proleptic Gregorian calendar. See more
details in SPARK-31404. You can set "spark.sql.parquet.int96RebaseModeInWrite" to "LEGACY" to rebase the
datetime values w.r.t. the calendar difference during writing, to get maximum
interoperability. Or set the config to "CORRECTED" to write the datetime
values as it is, if you are sure that the written files will only be read by
Spark 3.0+ or other systems that use Proleptic Gregorian calendar.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.sparkUpgradeInWritingDatesError(QueryExecutionErrors.scala:759)
	at org.apache.spark.sql.execution.datasources.DataSourceUtils$.newRebaseExceptionInWrite(DataSourceUtils.scala:187)
	at org.apache.spark.sql.execution.datasources.DataSourceUtils$.$anonfun$createTimestampRebaseFuncInWrite$1(DataSourceUtils.scala:232)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport.$anonfun$makeWriter$10(ParquetWriteSupport.scala:211)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport.$anonfun$makeWriter$10$adapted(ParquetWriteSupport.scala:210)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport.$anonfun$writeFields$1(ParquetWriteSupport.scala:161)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport.consumeField(ParquetWriteSupport.scala:483)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport.writeFields(ParquetWriteSupport.scala:161)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport.$anonfun$write$1(ParquetWriteSupport.scala:151)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport.consumeMessage(ParquetWriteSupport.scala:471)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport.write(ParquetWriteSupport.scala:151)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport.write(ParquetWriteSupport.scala:53)
	at org.apache.parquet.hadoop.InternalParquetRecordWriter.write(InternalParquetRecordWriter.java:138)
	at org.apache.parquet.hadoop.ParquetRecordWriter.write(ParquetRecordWriter.java:181)
	at org.apache.parquet.hadoop.ParquetRecordWriter.write(ParquetRecordWriter.java:43)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.write(ParquetOutputWriter.scala:39)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.write(FileFormatDataWriter.scala:175)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.writeWithMetrics(FileFormatDataWriter.scala:85)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.writeWithIterator(FileFormatDataWriter.scala:92)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:476)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1399)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:483)
	... 17 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2457)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeWrite$4(FileFormatWriter.scala:380)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:329)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(FileFormatWriter.scala:377)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:197)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:190)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:473)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:473)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:449)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:859)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:388)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:361)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:240)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:792)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: org.apache.spark.SparkException: [TASK_WRITE_FAILED] Task failed while writing rows to gs://goodreads_bucket/cleaned/reviews.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.taskFailedWhileWritingRowsError(QueryExecutionErrors.scala:774)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:493)
	at org.apache.spark.sql.execution.datasources.WriteFilesExec.$anonfun$doExecuteWrite$1(WriteFiles.scala:100)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:96)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more
Caused by: org.apache.spark.SparkUpgradeException: [INCONSISTENT_BEHAVIOR_CROSS_VERSION.WRITE_ANCIENT_DATETIME] You may get a different result due to the upgrading to Spark >= 3.0:
writing dates before 1582-10-15 or timestamps before 1900-01-01T00:00:00Z
into Parquet INT96 files can be dangerous, as the files may be read by Spark 2.x
or legacy versions of Hive later, which uses a legacy hybrid calendar that
is different from Spark 3.0+'s Proleptic Gregorian calendar. See more
details in SPARK-31404. You can set "spark.sql.parquet.int96RebaseModeInWrite" to "LEGACY" to rebase the
datetime values w.r.t. the calendar difference during writing, to get maximum
interoperability. Or set the config to "CORRECTED" to write the datetime
values as it is, if you are sure that the written files will only be read by
Spark 3.0+ or other systems that use Proleptic Gregorian calendar.
	at org.apache.spark.sql.errors.QueryExecutionErrors$.sparkUpgradeInWritingDatesError(QueryExecutionErrors.scala:759)
	at org.apache.spark.sql.execution.datasources.DataSourceUtils$.newRebaseExceptionInWrite(DataSourceUtils.scala:187)
	at org.apache.spark.sql.execution.datasources.DataSourceUtils$.$anonfun$createTimestampRebaseFuncInWrite$1(DataSourceUtils.scala:232)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport.$anonfun$makeWriter$10(ParquetWriteSupport.scala:211)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport.$anonfun$makeWriter$10$adapted(ParquetWriteSupport.scala:210)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport.$anonfun$writeFields$1(ParquetWriteSupport.scala:161)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport.consumeField(ParquetWriteSupport.scala:483)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport.writeFields(ParquetWriteSupport.scala:161)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport.$anonfun$write$1(ParquetWriteSupport.scala:151)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport.consumeMessage(ParquetWriteSupport.scala:471)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport.write(ParquetWriteSupport.scala:151)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport.write(ParquetWriteSupport.scala:53)
	at org.apache.parquet.hadoop.InternalParquetRecordWriter.write(InternalParquetRecordWriter.java:138)
	at org.apache.parquet.hadoop.ParquetRecordWriter.write(ParquetRecordWriter.java:181)
	at org.apache.parquet.hadoop.ParquetRecordWriter.write(ParquetRecordWriter.java:43)
	at org.apache.spark.sql.execution.datasources.parquet.ParquetOutputWriter.write(ParquetOutputWriter.scala:39)
	at org.apache.spark.sql.execution.datasources.SingleDirectoryDataWriter.write(FileFormatDataWriter.scala:175)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.writeWithMetrics(FileFormatDataWriter.scala:85)
	at org.apache.spark.sql.execution.datasources.FileFormatDataWriter.writeWithIterator(FileFormatDataWriter.scala:92)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeTask$1(FileFormatWriter.scala:476)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1399)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:483)
	... 17 more


In [43]:
if ENABLE_SAVE:
    write_parquet_to_gcs(books_df, "books")

Folder cleaned/books does not exist. Creating and writing Parquet.


                                                                                

Data written to Parquet in folder: cleaned/books


In [44]:
if ENABLE_SAVE:
    write_parquet_to_gcs(interactions_df, "interactions")

Folder cleaned/interactions does not exist. Creating and writing Parquet.


                                                                                

Data written to Parquet in folder: cleaned/interactions


In [45]:
if ENABLE_SAVE:
    write_parquet_to_gcs(authors_df, "authors")

Folder cleaned/authors does not exist. Creating and writing Parquet.


                                                                                

Data written to Parquet in folder: cleaned/authors


In [46]:
if ENABLE_SAVE:
    write_parquet_to_gcs(genre_df, "genre")

Folder cleaned/genre does not exist. Creating and writing Parquet.


                                                                                

Data written to Parquet in folder: cleaned/genre
