In [10]:
!pip install pandas pyarrow scikit-learn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m

In [11]:
from sklearn.model_selection import train_test_split
import pandas as pd
import glob

def parquet_to_jsonl(parquet_files_pattern, jsonl_file_path, split=False, splitted_output_path=None,
                     test_size=0.1, random_state=42):
    """Combine the Parquet files matching a glob pattern and write them as JSON Lines.

    Parameters
    ----------
    parquet_files_pattern : str
        Glob pattern selecting the Parquet files to read.
    jsonl_file_path : str
        Output path for the JSON Lines file. When ``split`` is true, this file
        receives the first (training) partition from ``train_test_split``.
    split : bool, default False
        If true, partition the combined rows with ``train_test_split`` and
        write the two partitions to separate files.
    splitted_output_path : str, optional
        Output path for the second (held-out) partition. Required when
        ``split`` is true.
    test_size : float or int, default 0.1
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` as ``random_state``.

    Raises
    ------
    ValueError
        If ``split`` is true but ``splitted_output_path`` is not provided.
    FileNotFoundError
        If no file matches ``parquet_files_pattern``.
    """
    # Validate before doing any I/O: DataFrame.to_json(None) returns the JSON
    # text instead of writing it, so the held-out partition would be lost silently.
    if split and splitted_output_path is None:
        raise ValueError("splitted_output_path is required when split=True")

    # glob returns matches in filesystem order; sorting keeps the row order
    # (and therefore the seeded split) the same from one machine to another.
    parquet_files = sorted(glob.glob(parquet_files_pattern))
    if not parquet_files:
        raise FileNotFoundError(f"No parquet files match pattern: {parquet_files_pattern!r}")

    # Read every shard first and concatenate once, rather than re-copying the
    # accumulated frame on each loop iteration.
    combined_df = pd.concat(
        [pd.read_parquet(path) for path in parquet_files],
        ignore_index=True,
    )

    if split:
        train_df, held_out_df = train_test_split(
            combined_df, test_size=test_size, random_state=random_state
        )
        train_df.to_json(jsonl_file_path, orient='records', lines=True)
        held_out_df.to_json(splitted_output_path, orient='records', lines=True)
    else:
        combined_df.to_json(jsonl_file_path, orient='records', lines=True)

In [12]:
# Convert the agnews training shards to JSONL; split=True sends the held-out
# portion to splitted_output_path and the remainder to jsonl_file_path.
parquet_files_pattern = '../data/agnews/train-*.parquet'
jsonl_file_path, splitted_output_path = 'train.jsonl', 'valid.jsonl'
parquet_to_jsonl(
    parquet_files_pattern=parquet_files_pattern,
    jsonl_file_path=jsonl_file_path,
    split=True,
    splitted_output_path=splitted_output_path,
)

In [13]:
# Convert the agnews test shards to a single JSONL file (no hold-out split).
parquet_files_pattern = '../data/agnews/test-*.parquet'
jsonl_file_path = 'test.jsonl'
parquet_to_jsonl(
    parquet_files_pattern=parquet_files_pattern,
    jsonl_file_path=jsonl_file_path,
)

In [15]:
from datasets import Dataset, DatasetDict, load_dataset

In [19]:
# Load the three JSONL files as the named splits of one dataset object.
dataset = load_dataset(
    "json",
    data_files={
        "train": "train.jsonl",
        "valid": "valid.jsonl",
        "test": "test.jsonl",
    },
)

Generating train split: 108000 examples [00:00, 1955859.28 examples/s]
Generating valid split: 12000 examples [00:00, 1731156.63 examples/s]
Generating test split: 7600 examples [00:00, 1353690.78 examples/s]


In [29]:
# Upload every split to the Hub dataset repository; the call is left as the
# cell's last expression so its return value is shown as the cell output.
dataset.push_to_hub(repo_id="ICKD/agnews")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|          | 0/108 [00:00<?, ?ba/s][A
Creating parquet from Arrow format: 100%|██████████| 108/108 [00:00<00:00, 877.72ba/s][A
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.88s/it]
Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 12/12 [00:00<00:00, 827.80ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.09s/it]
Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 8/8 [00:00<00:00, 880.69ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:02<00:00,  2.00s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/ICKD/agnews/commit/428a388be4f3792fd724314d662071152990fb7f', commit_message='Upload dataset', commit_description='', oid='428a388be4f3792fd724314d662071152990fb7f', pr_url=None, pr_revision=None, pr_num=None)