In [None]:
import os
import requests
import polars as pl
from datasets import load_dataset
import json

# Directory where every parquet file used by this notebook is stored.
# Defaults to the original author's local path so existing runs are unchanged;
# set the GUTENBERG_DATA_DIR environment variable to relocate it on another machine.
data_dir = os.environ.get(
    "GUTENBERG_DATA_DIR",
    "/Users/tommasofurlanello/Documents/Dev/MarketInference/data",
)
os.makedirs(data_dir, exist_ok=True)

def _stream_to_file(url, dest_path, timeout):
    """Stream the body of ``url`` into ``dest_path`` without buffering it in memory.

    The payload is written to ``<dest_path>.part`` first and only moved into
    place once the transfer has completed, so an interrupted download never
    leaves a truncated file under the final name.

    Raises whatever ``requests`` or the filesystem raises; the caller decides
    how to report it.
    """
    tmp_path = f"{dest_path}.part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    if chunk:  # skip keep-alive chunks
                        f.write(chunk)
        os.replace(tmp_path, dest_path)
    finally:
        # Only still present if the transfer failed before os.replace ran.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def download_gutenberg_dataset(timeout=60.0):
    """
    Download the Gutenberg English dataset from Hugging Face and save to /data folder

    Two strategies are tried in order:

    1. ``datasets.load_dataset``; every split it returns is written to
       ``<data_dir>/gutenberg_<split>.parquet``.
    2. If that raises, a direct HTTP download of one parquet file per split
       name in ``train``/``test``/``validation``.

    Args:
        timeout: Seconds ``requests`` waits for the server to connect/send
            data during the fallback download before giving up. It is not a
            cap on the total download time.

    Returns:
        True if the dataset (or, in the fallback, at least one split) was
        saved; False if nothing could be downloaded. Errors are printed, not
        raised.
    """
    print("Downloading Gutenberg English dataset...")

    # Method 1: Using the datasets library
    try:
        dataset = load_dataset("sedthh/gutenberg_english")
        # Save each split to parquet files for faster loading
        for split in dataset:
            output_path = os.path.join(data_dir, f"gutenberg_{split}.parquet")
            print(f"Saving {split} split to {output_path}")
            dataset[split].to_parquet(output_path)
        return True
    except Exception as e:
        print(f"Error using datasets library: {e}")
        print("Trying alternative download method...")

    # Method 2: Manual download if datasets library fails.
    # The dataset_infos.json request that used to precede this loop was dropped:
    # its result was never used, yet a failure there aborted the whole fallback.
    # NOTE(review): the URL pattern assumes one unsharded parquet per split at the
    # repository root -- confirm against the actual repository layout.
    saved_any = False
    for split in ["train", "test", "validation"]:
        split_url = f"https://huggingface.co/datasets/sedthh/gutenberg_english/resolve/main/{split}-00000-of-00001.parquet"
        output_path = os.path.join(data_dir, f"gutenberg_{split}.parquet")

        print(f"Downloading {split} split from {split_url}")
        try:
            _stream_to_file(split_url, output_path, timeout)
        except Exception as e:
            # Broad on purpose: this function reports failures instead of raising.
            # A split that is missing upstream must not discard the ones that
            # exist; load_as_polars already tolerates absent split files.
            print(f"Error in alternative download method: {e}")
            continue
        print(f"Saved to {output_path}")
        saved_any = True

    return saved_any

def load_as_polars():
    """
    Read every available Gutenberg split from ``data_dir`` into one Polars DataFrame.

    Looks for ``gutenberg_<split>.parquet`` for the train, test and validation
    splits. Missing files are reported with a warning and skipped. Each loaded
    frame gets a literal ``split`` column naming the split it came from before
    the frames are concatenated.

    Returns:
        The concatenated DataFrame (its shape and schema are printed first).

    Raises:
        FileNotFoundError: if none of the split files exist.
    """
    frames = []

    for split_name in ("train", "test", "validation"):
        parquet_path = os.path.join(data_dir, f"gutenberg_{split_name}.parquet")

        # Guard clause: a missing split is only a warning, not an error.
        if not os.path.exists(parquet_path):
            print(f"Warning: {parquet_path} not found")
            continue

        print(f"Loading {parquet_path}...")
        frames.append(
            pl.read_parquet(parquet_path).with_columns(
                pl.lit(split_name).alias("split")  # remember which split each row came from
            )
        )

    if len(frames) == 0:
        raise FileNotFoundError(f"No dataset files found in {data_dir} directory")

    merged = pl.concat(frames)
    print(f"Combined DataFrame shape: {merged.shape}")
    print(f"Combined DataFrame schema:\n{merged.schema}")

    return merged

In [8]:
# Fetch the dataset and write one parquet file per split into data_dir.
# The returned bool (True on success) is displayed as the cell output.
download_gutenberg_dataset()

Downloading Gutenberg English dataset...


Downloading data: 100%|██████████| 37/37 [03:56<00:00,  6.39s/files]
Generating train split: 100%|██████████| 48284/48284 [00:16<00:00, 3006.82 examples/s]


Saving train split to /Users/tommasofurlanello/Documents/Dev/MarketInference/data/gutenberg_train.parquet


Creating parquet from Arrow format: 100%|██████████| 49/49 [00:41<00:00,  1.19ba/s]


True

In [9]:
# Load every split file present in data_dir into a single Polars DataFrame;
# the loader adds a "split" column naming the split each row came from.
df = load_as_polars()

Loading /Users/tommasofurlanello/Documents/Dev/MarketInference/data/gutenberg_train.parquet...
Combined DataFrame shape: (48284, 4)
Combined DataFrame schema:
Schema({'TEXT': String, 'SOURCE': String, 'METADATA': String, 'split': String})


In [14]:
# METADATA holds a JSON document stored as a string: decode it into a struct,
# then unnest the struct so each metadata field becomes its own column.
unnested_df = (
    df
    .with_columns(pl.col("METADATA").str.json_decode())
    .unnest("METADATA")
)

In [17]:
# Spot-check the decoded "subjects" field of one row (index 52) to see its format:
# a single string of subject headings separated by "; ".
unnested_df["subjects"][52]

'Historical fiction; War stories; United States -- History -- Civil War, 1861-1865 -- Fiction; Virginia -- History -- Civil War, 1861-1865 -- Fiction; Chancellorsville, Battle of, Chancellorsville, Va., 1863 -- Fiction'

In [20]:
# Keep only works whose subject string mentions fiction.
# Subject headings mix lower-case genre labels ("Historical fiction") with
# capitalised subdivisions ("... -- Fiction"), so the regex is made
# case-insensitive with the (?i) flag; a case-sensitive "fiction" would drop
# works tagged only with the capitalised form.
novels = unnested_df.filter(pl.col("subjects").str.contains("(?i)fiction"))

In [30]:
# Persist the fiction subset alongside the raw split files. The path is built
# from data_dir (defined in the first cell) instead of repeating the absolute
# path, so the output always follows the configured data directory.
novels.write_parquet(os.path.join(data_dir, "gutenberg_en_novels.parquet"))

In [29]:
# Preview a slice (characters 114 to 2500) of the first selected work's text.
# NOTE(review): the start offset 114 presumably skips a leading title/header
# specific to this book -- confirm before reusing it for other rows.
novels["TEXT"][0][114:2500]

'CHAPTER I.Down the Rabbit-Hole\r\n\r\n\r\n CHAPTER II.The Pool of Tears\r\n\r\n\r\n CHAPTER III.A Caucus-Race and a Long Tale\r\n\r\n\r\n CHAPTER IV.The Rabbit Sends in a Little Bill\r\n\r\n\r\n CHAPTER V.Advice from a Caterpillar\r\n\r\n\r\n CHAPTER VI.Pig and Pepper\r\n\r\n\r\n CHAPTER VII.A Mad Tea-Party\r\n\r\n\r\n CHAPTER VIII.The Queen’s Croquet-Ground\r\n\r\n\r\n CHAPTER IX.The Mock Turtle’s Story\r\n\r\n\r\n CHAPTER X.The Lobster Quadrille\r\n\r\n\r\n CHAPTER XI.Who Stole the Tarts?\r\n\r\n\r\n CHAPTER XII.Alice’s Evidence\r\n\r\n\r\nCHAPTER I.\r\n\r\nDown the Rabbit-Hole\r\n\r\n\r\nAlice was beginning to get very tired of sitting by her sister on the bank, and\r\n\r\nof having nothing to do: once or twice she had peeped into the book her sister\r\n\r\nwas reading, but it had no pictures or conversations in it, “and what is\r\n\r\nthe use of a book,” thought Alice “without pictures or\r\n\r\nconversations?”\r\n\r\n\r\nSo she was considering in her own mind (as well as she coul