In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
import os
from datasets import load_dataset
import configparser

def read_config(config_file='config.ini'):
    """
    Read an INI configuration file and return its settings as a nested dictionary.

    Args:
        config_file (str): Path to the configuration file.

    Returns:
        dict: Maps each section name to a dict of that section's option names
            and their string values.

    Raises:
        FileNotFoundError: If ``config_file`` does not exist or is not a file.
        OSError: If the file exists but cannot be opened.
        configparser.Error: If the file content cannot be parsed.
    """
    # isfile() rather than exists(): a directory "exists", but it cannot be parsed
    # and ConfigParser.read() would silently skip it, yielding an empty config.
    if not os.path.isfile(config_file):
        reason = "is not a file" if os.path.exists(config_file) else "does not exist"
        raise FileNotFoundError(f"The configuration file '{config_file}' {reason}.")

    config = configparser.ConfigParser()
    # Open the file explicitly so I/O errors propagate instead of being swallowed,
    # which is what ConfigParser.read() does for paths it fails to open.
    with open(config_file) as handle:
        config.read_file(handle)

    # Convert config sections to a plain dictionary of dictionaries.
    return {section: dict(config.items(section)) for section in config.sections()}

class DataLoader:
    """Load a dataset by name, keeping a copy on local disk after the first download."""

    def __init__(self, config):
        """
        Initialize the DataLoader from a parsed configuration.

        :param config: Mapping with a "dataset" section that provides
            'dataset_name' (the name passed to ``load_dataset``) and
            'local_data_path' (where the local copy is stored).
        """
        dataset_cfg = config["dataset"]
        self.dataset_name = dataset_cfg['dataset_name']
        self.local_data_path = dataset_cfg['local_data_path']

    def load_data(self):
        """
        Load data from local storage if available, otherwise download it.

        When the local path does not exist, the dataset is downloaded and then
        saved to that path so the next call can load it locally.

        :return: Dataset object containing the loaded data
        """
        if os.path.exists(self.local_data_path):
            print(f"Loading data from {self.local_data_path}...")
            return self._load_local_data()

        print(f"Data not found locally. Downloading {self.dataset_name}...")
        dataset = self._download_data()
        self._save_local_data(dataset)
        return dataset

    def _load_local_data(self):
        """
        Load the dataset previously written by ``_save_local_data``.

        :return: Dataset object containing the loaded data
        """
        # The local copy is written with save_to_disk(), and that format is read
        # back with load_from_disk(), not load_dataset(). Imported here because the
        # notebook's import cell only brings load_dataset into scope.
        from datasets import load_from_disk

        return load_from_disk(self.local_data_path)

    def _download_data(self):
        """
        Download the dataset identified by ``self.dataset_name``.

        :return: Dataset object containing the downloaded data
        """
        return load_dataset(self.dataset_name)

    def _save_local_data(self, dataset):
        """
        Save the dataset to local storage.

        :param dataset: Dataset object to save
        """
        # Written with save_to_disk(); _load_local_data() reads this same format.
        dataset.save_to_disk(self.local_data_path)
        print(f"Data downloaded and saved to {self.local_data_path}.")




In [None]:
# Read settings from the default 'config.ini' path.
config = read_config()

# Build the loader from the "dataset" section of the config.
data_loader = DataLoader(config)

# Load the local copy if the configured path exists; otherwise download and save it.
dataset = data_loader.load_data()

# Print the dataset object's summary rather than individual rows.
print(dataset)

Data not found locally. Downloading ccdv/arxiv-summarization...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.96k [00:00<?, ?B/s]

train-00000-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

train-00001-of-00015.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

train-00002-of-00015.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

train-00003-of-00015.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

train-00004-of-00015.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

train-00005-of-00015.parquet:   0%|          | 0.00/227M [00:00<?, ?B/s]

train-00006-of-00015.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

train-00007-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

train-00008-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

train-00009-of-00015.parquet:   0%|          | 0.00/228M [00:00<?, ?B/s]

train-00010-of-00015.parquet:   0%|          | 0.00/229M [00:00<?, ?B/s]

train-00011-of-00015.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

train-00012-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

train-00013-of-00015.parquet:   0%|          | 0.00/230M [00:00<?, ?B/s]

train-00014-of-00015.parquet:   0%|          | 0.00/235M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/105M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/105M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/203037 [00:00<?, ? examples/s]

In [None]:
# Inspect the loaded splits with their column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 203037
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6440
    })
})

In [None]:
# NOTE: set_format changes `dataset` itself, so later cells that index it also
# receive pandas objects.
dataset.set_format("pandas")
# [:] selects every row of the train split (203037 rows per the summary above).
# NOTE(review): this holds the whole split in memory — confirm it fits the runtime.
df = dataset["train"][:]

In [None]:
# Reuse the frame materialized in the previous cell instead of converting the whole
# train split to pandas a second time; the displayed table is the same.
df

Unnamed: 0,article,abstract
0,additive models @xcite provide an important fa...,additive models play an important role in semi...
1,the leptonic decays of a charged pseudoscalar ...,"we have studied the leptonic decay @xmath0 , v..."
2,the transport properties of nonlinear non - eq...,"in 84 , 258 ( 2000 ) , mateos conjectured that..."
3,studies of laser beams propagating through tur...,the effect of a random phase diffuser on fluct...
4,the so - called `` nucleon spin crisis '' rais...,with a special intention of clarifying the und...
...,...,...
203032,"e. rasmusen , _ games and information : an int...",effects of a corrupt source on the dynamics of...
203033,"the magnetocaloric effect , _ \n i.e. _ , a te...",we compute the entropy of antiferromagnetic qu...
203034,"as expected , the most interesting combinatori...",as a generalization of orbit - polynomial and ...
203035,by numerical study we find that the branch cut...,"within the lowest - order born approximation ,..."
