## Download dataset from remote server

In [3]:
import datasets

dataset_id = "alturing/gutenberg-texts"

# Load the dataset by name
dataset = datasets.load_dataset(dataset_id)

# Download the dataset files
print(dataset)

""" 
DatasetDict({
    train: Dataset({
        features: ['title', 'author', 'text', 'language'],
        num_rows: 2951
    })
})

"""

DatasetDict({
    train: Dataset({
        features: ['title', 'author', 'text', 'language'],
        num_rows: 2951
    })
})


" \nDatasetDict({\n    train: Dataset({\n        features: ['title', 'author', 'text', 'language'],\n        num_rows: 2951\n    })\n})\n\n"

In [4]:
# Save the dataset to a local folder
local_folder_path = r"C:\Users\kinla\Documents\All_github_repo\datasets\gutenberg"
dataset.save_to_disk(local_folder_path)


Saving the dataset (0/2 shards):   0%|          | 0/2951 [00:00<?, ? examples/s]

In [5]:
# View the set of files in that folder

import os

local_folder_path = r"C:\Users\kinla\Documents\All_github_repo\datasets\gutenberg"

def list_files_and_folders(directory_path, indent=""):
    print(f"{indent}+ {os.path.basename(directory_path)}")
    indent += "    "
    
    # Separate different directories for clarity
    sline = "-" * 30
    print(f"{indent}{sline}")
    
    for filename in os.listdir(directory_path):
        filepath = os.path.join(directory_path, filename)
        if os.path.isdir(filepath):
            list_files_and_folders(filepath, indent)
        else:
            print(f"{indent}- {filename}")

    print(f"{indent}{sline}")

# Call the function to list files in the specified directory
list_files_and_folders(local_folder_path)


+ gutenberg
    ------------------------------
    - dataset_dict.json
    + train
        ------------------------------
        - data-00000-of-00002.arrow
        - data-00001-of-00002.arrow
        - dataset_info.json
        - state.json
        ------------------------------
    ------------------------------


## Open PyArrow files

In [6]:
from datasets import load_dataset

# Path to the directory containing the dataset
dataset_path = local_folder_path

# Load dataset from arrow shards
dataset = load_dataset("arrow", data_files=[f"{dataset_path}/train/data-00000-of-00002.arrow", 
                                            f"{dataset_path}/train/data-00001-of-00002.arrow"])

# Access dataset elements
print(dataset)  # Access the first data point in the dataset

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['title', 'author', 'text', 'language'],
        num_rows: 2951
    })
})


## Explore PyArrow dataset

In [7]:
print(dataset) # it returns DatasetDict, because the directory contains multiple folders
print(dataset['train']) # it returns Dataset, because we also indicate the name of the folder
print(dataset['train']['title'])
print(dataset['train']['text'][0])

DatasetDict({
    train: Dataset({
        features: ['title', 'author', 'text', 'language'],
        num_rows: 2951
    })
})
Dataset({
    features: ['title', 'author', 'text', 'language'],
    num_rows: 2951
})
["Memoirs of a griffin         Or, A cadet's first year in India    ", 'Maggie Lee!         Bad spelling, Diamonds, The answered prayer    ', 'The unwelcome child         Or, The crime of an undesigned and undesired maternity    ', 'The Nightingale    ', 'Re-creations    ', 'Le supplice de PhÃ¨dre    ', 'Gypsy folk-tales    ', "Father Duffy's story    ", 'Home education         Home Education Series (Vol. 1 of 6)    ', 'The outcast    ', 'Satuja         Lukemisia Suomen rahvaalle Pietarista II    ', 'Anna Hollmannin hÃ¤viÃ¶         Kertomus    ', 'The rag pickers         and other stories    ', 'Little maid Marigold    ', 'Her country    ', 'Seetrien alla         Romaani    ', 'Satuja ja tarinoita VII    ', 'Natalika    ', 'Mere mortals         Medico-historical essays    ', 

## Explore dataset with pandas

In [14]:
import pandas as pd

df = pd.DataFrame.from_dict(dataset['train'])

def objprint(object):
    """
    This function takes an object as input, prints the name of the object
    (in this case, the type or name of the function) and then prints the
    output of that object when called. 

    Parameters:
    object (callable): An object (function, class, etc.) that can be called
                       to produce an output.
    """
    func_name = str(object)  # Get the name of the object as a string
    func_output = object   # Call the object to get its output
    
    print(f" The object name is: {func_name}")         # Print the name of the object
    print(f" The object output is: {func_output}")     # Print the output of the object

objprint(df.head)
objprint(df.describe())
objprint(df.info())

 The object name is: <bound method NDFrame.head of                                                   title  \
0     Memoirs of a griffin         Or, A cadet's fir...   
1     Maggie Lee!         Bad spelling, Diamonds, Th...   
2     The unwelcome child         Or, The crime of a...   
3                                   The Nightingale       
4                                      Re-creations       
...                                                 ...   
2946                               Exile From Venus       
2947                          Into the Frozen South       
2948                                     A tragikum       
2949  Die KringhÃ¤usler         Drama in drei Akten       
2950                                                      

                        author  \
0        Francis John  Bellew    
1              Mary J. Holmes    
2             Henry C. Wright    
3     Hans Christian Andersen    
4       Grace Livingston Hill    
...                        ...   
29

In [33]:
df_flattened = pd.json_normalize(dataset['train'], sep='_')
print(df_flattened.head())

                                               title  \
0  Memoirs of a griffin         Or, A cadet's fir...   
1  Maggie Lee!         Bad spelling, Diamonds, Th...   
2  The unwelcome child         Or, The crime of a...   
3                                The Nightingale       
4                                   Re-creations       

                     author  \
0     Francis John  Bellew    
1           Mary J. Holmes    
2          Henry C. Wright    
3  Hans Christian Andersen    
4    Grace Livingston Hill    

                                                text  language  
0  ï»¿The Project Gutenberg eBook of Memoirs of a...  English   
1  ï»¿The Project Gutenberg eBook of Maggie Lee!,...  English   
2  ï»¿The Project Gutenberg eBook of The unwelcom...  English   
3  ï»¿The Project Gutenberg eBook of The Nighting...  English   
4  ï»¿The Project Gutenberg eBook of Re-creations...  English   
