# Extract data and load it into bucket'
- Much of the code here comes from https://github.com/MengtingWan/goodreads/blob/master/download.ipynb. The code here has been modified to fit the purpose of this project
- Credit goes to Mengting Wan from https://github.com/MengtingWan for providing the data and reference jupyter notebook files to examine and download the data

In [1]:
import pandas as pd
import gzip
import requests
from google.cloud import storage
import os
import shutil

**Specify your directory and bucket here:**

In [2]:
DIR = './home'
bucket_name = 'goodreads_bucket'

**Load data types and names**

In [13]:
# Define the data as a list of tuples (type, name)
data = [
    ("complete", "goodreads_book_works.json.gz"),
    ("complete", "goodreads_book_authors.json.gz"),
    ("complete", "goodreads_book_series.json.gz"),
    ("complete", "goodreads_books.json.gz"),
    ("complete", "goodreads_book_genres_initial.json.gz"),
    ("byGenre", "goodreads_books_children.json.gz"),
    ("byGenre", "goodreads_books_comics_graphic.json.gz"),
    ("byGenre", "goodreads_books_fantasy_paranormal.json.gz"),
    ("byGenre", "goodreads_books_history_biography.json.gz"),
    ("byGenre", "goodreads_books_mystery_thriller_crime.json.gz"),
    ("byGenre", "goodreads_books_poetry.json.gz"),
    ("byGenre", "goodreads_books_romance.json.gz"),
    ("byGenre", "goodreads_books_young_adult.json.gz"),
    ("byGenre", "goodreads_interactions_children.json.gz"),
    ("byGenre", "goodreads_interactions_comics_graphic.json.gz"),
    ("byGenre", "goodreads_interactions_fantasy_paranormal.json.gz"),
    ("byGenre", "goodreads_interactions_history_biography.json.gz"),
    ("byGenre", "goodreads_interactions_mystery_thriller_crime.json.gz"),
    ("byGenre", "goodreads_interactions_poetry.json.gz"),
    ("byGenre", "goodreads_interactions_romance.json.gz"),
    ("byGenre", "goodreads_interactions_young_adult.json.gz"),
    ("byGenre", "goodreads_reviews_children.json.gz"),
    ("byGenre", "goodreads_reviews_comics_graphic.json.gz"),
    ("byGenre", "goodreads_reviews_fantasy_paranormal.json.gz"),
    ("byGenre", "goodreads_reviews_history_biography.json.gz"),
    ("byGenre", "goodreads_reviews_mystery_thriller_crime.json.gz"),
    ("byGenre", "goodreads_reviews_poetry.json.gz"),
    ("byGenre", "goodreads_reviews_romance.json.gz"),
    ("byGenre", "goodreads_reviews_young_adult.json.gz"),
    ("complete", "book_id_map.csv"),
    ("complete", "user_id_map.csv"),
    ("complete", "goodreads_interactions.csv"),
    ("complete", "goodreads_reviews_dedup.json.gz"),
    ("complete", "goodreads_reviews_spoiler.json.gz"),
    ("complete", "goodreads_reviews_spoiler_raw.json.gz"),|
    ("complete", "goodreads_interactions_dedup.json.gz")
]

# Create the DataFrame
file_names = pd.DataFrame(data, columns=["type", "name"])

# Display the DataFrame
display(file_names)

SyntaxError: invalid syntax (2627832139.py, line 37)

**Now we can construct the urls to download files by name**

In [14]:
file_name_type_mapping = dict(zip(file_names['name'].values, file_names['type'].values))
file_name_url_mapping = {}

for fname in file_name_type_mapping:
    ftype = file_name_type_mapping[fname]
    if ftype == "complete":
        url = 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/'+fname
        file_name_url_mapping[fname] = url
    elif ftype == "byGenre":
        url = 'https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/goodreads/byGenre/'+fname
        file_name_url_mapping[fname] = url

In [15]:
def download_and_upload(fname, local_filename, gcs_blob_name):
    if fname in file_name_url_mapping:
        url = file_name_url_mapping[fname]
        # Download the file locally
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        print('Dataset', fname, 'has been downloaded!')

        # Check if the file is a .gz file
        if local_filename.endswith('.gz'):
            extracted_filename = local_filename[:-3]  # Remove .gz extension
            try:
                # Extract the .gz file
                with gzip.open(local_filename, 'rb') as gz_file:
                    with open(extracted_filename, 'wb') as extracted_file:
                        shutil.copyfileobj(gz_file, extracted_file)
                print(f'File {local_filename} has been extracted to {extracted_filename}.')
                
                # Update local filename to the extracted file
                local_filename = extracted_filename
                # Update the GCS blob name to reflect the extracted file
                gcs_blob_name = gcs_blob_name[:-3]
            except Exception as e:
                print(f'Failed to extract .gz file: {e}')
                return

        # Upload the file to Google Cloud Storage in the "landing" folder
        try:
            storage_client = storage.Client()
            bucket = storage_client.bucket(bucket_name)
            blob = bucket.blob(f'landing/{gcs_blob_name}')
            blob.upload_from_filename(local_filename)
            print(f'File {local_filename} uploaded to bucket {bucket_name} under folder landing as landing/{gcs_blob_name}.')
        except Exception as e:
            print(f'Failed to upload file to GCS: {e}')
    else:
        print('Dataset', fname, 'cannot be found!')

**Here we go!**

In [16]:
# Specify the directory we want to save the data to locally, and also the name of the file
OUT_DIR = './extracted_data_from_goodreads'

In [6]:
# We want these files
# - goodreads_reviews_dedup.json.gz
# - goodreads_books.json.gz
# - goodreads_interactions.csv

file_name = 'goodreads_reviews_dedup.json.gz'


if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)

output_path = os.path.join(OUT_DIR, file_name)
download_and_upload(file_name, output_path, file_name)

Dataset goodreads_reviews_dedup.json.gz has been downloaded!
File ./extracted_data_from_goodreads/goodreads_reviews_dedup.json.gz has been extracted to ./extracted_data_from_goodreads/goodreads_reviews_dedup.json.
File ./extracted_data_from_goodreads/goodreads_reviews_dedup.json uploaded to bucket goodreads_bucket under folder landing as landing/goodreads_reviews_dedup.json.


In [7]:
# Specify the name of the file
file_name = 'goodreads_books.json.gz'


if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)

output_path = os.path.join(OUT_DIR, file_name)
download_and_upload(file_name, output_path, file_name)

Dataset goodreads_books.json.gz has been downloaded!
File ./extracted_data_from_goodreads/goodreads_books.json.gz has been extracted to ./extracted_data_from_goodreads/goodreads_books.json.
File ./extracted_data_from_goodreads/goodreads_books.json uploaded to bucket goodreads_bucket under folder landing as landing/goodreads_books.json.


In [8]:
# Specify the name of the file
file_name = 'goodreads_interactions.csv'


if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)

output_path = os.path.join(OUT_DIR, file_name)
download_and_upload(file_name, output_path, file_name)

Dataset goodreads_interactions.csv has been downloaded!
File ./extracted_data_from_goodreads/goodreads_interactions.csv uploaded to bucket goodreads_bucket under folder landing as landing/goodreads_interactions.csv.


In [10]:
# Specify the name of the file
file_name = 'goodreads_book_genres_initial.json.gz'


if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)

output_path = os.path.join(OUT_DIR, file_name)
download_and_upload(file_name, output_path, file_name)

Dataset goodreads_book_genres_initial.json.gz has been downloaded!
File ./extracted_data_from_goodreads/goodreads_book_genres_initial.json.gz has been extracted to ./extracted_data_from_goodreads/goodreads_book_genres_initial.json.
File ./extracted_data_from_goodreads/goodreads_book_genres_initial.json uploaded to bucket goodreads_bucket under folder landing as landing/goodreads_book_genres_initial.json.


In [11]:
# Specify the name of the file
file_name = 'goodreads_book_authors.json.gz'


if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)

output_path = os.path.join(OUT_DIR, file_name)
download_and_upload(file_name, output_path, file_name)

Dataset goodreads_book_authors.json.gz has been downloaded!
File ./extracted_data_from_goodreads/goodreads_book_authors.json.gz has been extracted to ./extracted_data_from_goodreads/goodreads_book_authors.json.
File ./extracted_data_from_goodreads/goodreads_book_authors.json uploaded to bucket goodreads_bucket under folder landing as landing/goodreads_book_authors.json.
