In [2]:
import pandas as pd
import requests
import os

In [3]:
# Base directory in which the notebook looks for 'dataset_names.csv'.
DIR = './'

In [4]:
# Load the catalogue of dataset files and show it for a quick visual check.
dataset_names_csv = os.path.join(DIR, 'dataset_names.csv')
file_names = pd.read_csv(dataset_names_csv)
display(file_names)


Unnamed: 0,type,name
0,complete,goodreads_book_works.json.gz
1,complete,goodreads_book_authors.json.gz
2,complete,goodreads_book_series.json.gz
3,complete,goodreads_books.json.gz
4,complete,goodreads_book_genres_initial.json.gz
5,byGenre,goodreads_books_children.json.gz
6,byGenre,goodreads_books_comics_graphic.json.gz
7,byGenre,goodreads_books_fantasy_paranormal.json.gz
8,byGenre,goodreads_books_history_biography.json.gz
9,byGenre,goodreads_books_mystery_thriller_crime.json.gz


In [5]:
# Map every dataset file name to its type ("complete" or "byGenre").
file_name_type_mapping = dict(zip(file_names['name'].values, file_names['type'].values))

# Map every dataset file name of a known type to its download URL.
# Files of any other type get no URL entry.
file_name_url_mapping = {}

for fname, ftype in file_name_type_mapping.items():
    if ftype not in ("complete", "byGenre"):
        continue
    # "byGenre" files live in a sub-folder of the same remote directory.
    subdir = 'byGenre/' if ftype == "byGenre" else ''
    url = 'https://mcauleylab.ucsd.edu/public_datasets/gdrive/goodreads/' + subdir + fname
    file_name_url_mapping[fname] = url

In [6]:
def download_by_name(fname, local_filename, timeout=60):
    """Download the dataset registered under ``fname`` to ``local_filename``.

    The URL is looked up in the module-level ``file_name_url_mapping``. Data is
    streamed into a temporary ``<local_filename>.part`` file and only renamed to
    ``local_filename`` once the transfer has finished, so an interrupted
    download never leaves a truncated file at the final path (which a later
    "file already exists" check would mistake for a complete download).

    Args:
        fname: Dataset file name, used as key into ``file_name_url_mapping``.
        local_filename: Destination path. A bare file name (no directory
            component) is allowed.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``; may
            also be a ``(connect, read)`` tuple. Prevents the call from
            hanging indefinitely on an unresponsive server.

    Returns:
        True if the file was downloaded, False if ``fname`` has no known URL
        or the HTTP request failed. Non-request errors (e.g. disk errors)
        propagate to the caller after the temporary file has been removed.
    """
    if fname not in file_name_url_mapping:
        print(f"Dataset '{fname}' URL not found in mapping!")
        return False  # Indicate failure

    # os.path.dirname() is '' for a bare file name and os.makedirs('') raises,
    # so only create the parent directory when there is one.
    parent_dir = os.path.dirname(local_filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    url = file_name_url_mapping[fname]
    partial_filename = f"{local_filename}.part"
    print(f"Attempting to download {fname} from {url} to {local_filename}...")
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()  # Raises HTTPError for bad responses (4XX or 5XX)
            with open(partial_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Publish the file under its final name only after a complete transfer.
        os.replace(partial_filename, local_filename)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {fname}: {e}")
        return False  # Indicate failure
    finally:
        # Runs on every exit path (including interrupts): drop any leftover
        # partial data. After a successful os.replace() nothing is left to remove.
        if os.path.exists(partial_filename):
            os.remove(partial_filename)

    print(f"Dataset '{fname}' has been successfully downloaded to '{local_filename}'!")
    return True  # Indicate success

In [13]:
def get_json_gz_fields(filepath):
    """Return the keys of the first valid JSON line of a ``.json.gz`` file.

    Blank lines are ignored and lines that are not valid JSON are skipped with
    a printed warning.

    Args:
        filepath: Path to a gzip-compressed file with one JSON value per line.

    Returns:
        A list with the keys of the first line that parses as JSON, or None
        when the path is empty/missing, no line parses, or any other error
        occurs while opening or reading the file (a message is printed).
    """
    if not (filepath and os.path.exists(filepath)):
        print(f"File non trovato o non scaricato: {filepath}")
        return None

    try:
        with gzip.open(filepath, 'rt', encoding='utf-8') as stream:
            for raw_line in stream:
                # Drop surrounding whitespace; nothing to parse on blank lines.
                candidate = raw_line.strip()
                if not candidate:
                    continue
                try:
                    record = json.loads(candidate)
                except json.JSONDecodeError:
                    print(f"Attenzione: Riga non JSON valida saltata nel file {filepath}")
                    continue  # Try the next line
                return list(record.keys())
            # The whole file was scanned without finding a parseable line.
            print(f"Nessuna riga JSON valida trovata in {filepath}")
            return None
    except Exception as e:
        print(f"Errore durante l'apertura o lettura di {filepath}: {e}")
        return None

def get_csv_fields(filepath):
    """Return the column names of a CSV file.

    Args:
        filepath: Path to the CSV file.

    Returns:
        A list of column names, or None when the path is empty/missing or
        pandas fails to read the file (a message is printed in both cases).
    """
    if not (filepath and os.path.exists(filepath)):
        print(f"File non trovato o non scaricato: {filepath}")
        return None

    try:
        # nrows=1 limits parsing to the header plus a single data row.
        return pd.read_csv(filepath, nrows=1).columns.tolist()
    except Exception as e:
        print(f"Errore durante la lettura del CSV {filepath}: {e}")
        return None

#-------------------------------------------------------------------------------

def get_json_gz_sample(filepath):
    """
    Extract headers (keys) and one sample JSON object from a .json.gz file.

    The file is read line by line; the first line holding a JSON *object* is
    used as the sample. Blank lines, lines that are not valid JSON and JSON
    values that are not objects (lists, numbers, ...) are skipped instead of
    aborting the scan, matching how get_json_gz_fields treats invalid lines.

    Args:
        filepath: Path to a gzip-compressed file with one JSON value per line.

    Returns:
        A ``(headers, sample_json)`` tuple where ``headers`` is the list of the
        sample's keys and ``sample_json`` the decoded dict, or ``(None, None)``
        when the file contains no JSON object.

    Raises:
        OSError (and other gzip/decoding errors) if the file cannot be opened
        or decompressed; these are left to the caller.
    """
    import gzip
    import json

    skipped = 0  # non-blank lines that did not yield a JSON object
    headers, sample_json = None, None

    with gzip.open(filepath, 'rt', encoding='utf-8') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue
            try:
                candidate = json.loads(stripped)
            except json.JSONDecodeError:
                skipped += 1
                continue
            # Only objects have keys; other JSON values cannot provide headers.
            if not isinstance(candidate, dict):
                skipped += 1
                continue
            headers, sample_json = list(candidate.keys()), candidate
            break

    if skipped:
        # Make skipped input visible rather than dropping it silently.
        print(f"Warning: skipped {skipped} line(s) without a valid JSON object in {filepath}")
    return headers, sample_json

def get_csv_sample(filepath):
    """
    Extract headers and one sample line (as dict) from a CSV file.

    Args:
        filepath: Path to a UTF-8 encoded CSV file (with or without a BOM).

    Returns:
        A ``(headers, sample_line)`` tuple. ``headers`` is the list of column
        names from the first row, or None for an empty file. ``sample_line``
        maps column names to the values of the first data row, or is None when
        the file has no data rows.

    Raises:
        OSError if the file cannot be opened; UnicodeDecodeError if it is not
        valid UTF-8.
    """
    import csv

    # newline='' is required by the csv module so that line breaks embedded in
    # quoted fields are passed through unchanged. 'utf-8-sig' strips a leading
    # byte-order mark, which would otherwise become part of the first header.
    with open(filepath, 'r', encoding='utf-8-sig', newline='') as f:
        reader = csv.DictReader(f)
        headers = reader.fieldnames
        sample_line = next(reader, None)
        return headers, sample_line



In [14]:
import os
import pandas as pd
import requests
import gzip  # <--- Add this line
import json

# Define base output directories
OUT_DIR_BASE = './downloaded_datasets'
OUT_DIR_COMPLETE = os.path.join(OUT_DIR_BASE, 'complete')
OUT_DIR_BYGENRE = os.path.join(OUT_DIR_BASE, 'byGenre')

# Create directories if they don't exist
os.makedirs(OUT_DIR_COMPLETE, exist_ok=True)
os.makedirs(OUT_DIR_BYGENRE, exist_ok=True)

print("Output directories ensured:")
print(f"- {OUT_DIR_COMPLETE}")
print(f"- {OUT_DIR_BYGENRE}")
print("-" * 30)

# Containers for the results
downloaded_files_paths = {}  # fname -> intended local path (recorded even if the download fails)
extracted_headers = {}       # fname -> list of headers, only for files where extraction worked
files_available = []         # names of files that are present locally (pre-existing or just downloaded)

# Loop through all files in the mapping
for fname, ftype in file_name_type_mapping.items():
    print(f"\nProcessing file: {fname} (type: {ftype})")

    # Determine the target directory
    if ftype == "complete":
        target_dir = OUT_DIR_COMPLETE
    elif ftype == "byGenre":
        target_dir = OUT_DIR_BYGENRE
    else:
        print(f"Warning: Unknown file type '{ftype}' for {fname}. Skipping.")
        continue

    # Construct the full local path for the download
    local_filepath = os.path.join(target_dir, fname)
    downloaded_files_paths[fname] = local_filepath  # Store path regardless of success for tracking

    # Only hit the network when the file is not already on disk
    if os.path.exists(local_filepath):
        print(f"File already exists: '{local_filepath}'. Skipping download.")
        download_success = True  # Treat existing file as success for header extraction logic
    else:
        download_success = download_by_name(fname, local_filepath)

    if not download_success:
        # Only reached if the file didn't exist AND the download failed
        print(f"Skipping header extraction for {fname} due to download failure.")
        continue

    files_available.append(fname)

    # The file is available locally: attempt to extract headers and one sample record
    print(f"Attempting to extract headers from '{local_filepath}'...")
    headers = None
    sample_line = None
    try:
        if fname.endswith('.json.gz'):
            headers, sample_line = get_json_gz_sample(local_filepath)
        elif fname.endswith('.csv'):
            headers, sample_line = get_csv_sample(local_filepath)
        else:
            print(f"Unsupported file extension for header extraction: {fname}")

        if headers:
            extracted_headers[fname] = headers
            print(f"Successfully extracted headers for {fname}:\n{headers}")
            print(f"Sample line for {fname}:\n{sample_line}")
        else:
            print(f"Could not extract headers for {fname} (file might be empty, invalid, or unsupported).")

    except Exception as e:
        # Best effort: a problem with one file must not stop the remaining ones.
        print(f"An unexpected error occurred during header extraction for {fname}: {e}")

print("\n" + "="*40)
print("Processing Complete.")
print(f"Total files listed: {len(file_name_type_mapping)}")
# Count only files that are really available; downloaded_files_paths also holds failed downloads.
print(f"Files processed for headers (existing or downloaded): {len(files_available)}")
print(f"Files for which headers were successfully extracted: {len(extracted_headers)}")
print("="*40)



Output directories ensured:
- ./downloaded_datasets/complete
- ./downloaded_datasets/byGenre
------------------------------

Processing file: goodreads_book_works.json.gz (type: complete)
File already exists: './downloaded_datasets/complete/goodreads_book_works.json.gz'. Skipping download.
Attempting to extract headers from './downloaded_datasets/complete/goodreads_book_works.json.gz'...
Successfully extracted headers for goodreads_book_works.json.gz:
['books_count', 'reviews_count', 'original_publication_month', 'default_description_language_code', 'text_reviews_count', 'best_book_id', 'original_publication_year', 'original_title', 'rating_dist', 'default_chaptering_book_id', 'original_publication_day', 'original_language_id', 'ratings_count', 'media_type', 'ratings_sum', 'work_id']
Sample line for goodreads_book_works.json.gz:
{'books_count': '1', 'reviews_count': '6', 'original_publication_month': '8', 'default_description_language_code': '', 'text_reviews_count': '1', 'best_book_id