## Generate raw dataset

In [1]:
!pip install pandas pyarrow scikit-learn

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m

In [20]:
from sklearn.model_selection import train_test_split
import pandas as pd
import glob

def parquet_to_jsonl(
    jsonpath_or_parquet_pattern,
    jsonl_file_path,
    is_parquet=True,
    split=False,
    splitted_output_path=None,
    test_size=0.1,
    random_state=42,
):
    """Convert parquet shards (or a single JSON Lines file) to JSON Lines.

    Args:
        jsonpath_or_parquet_pattern: Glob pattern matching the parquet files to
            combine when ``is_parquet`` is true; otherwise the path of one
            JSON Lines file.
        jsonl_file_path: Destination JSON Lines file. Receives every row when
            ``split`` is false, or the larger part of the split otherwise.
        is_parquet: Whether the input is a parquet glob pattern (``True``) or a
            JSON Lines path (``False``).
        split: When true, hold out part of the rows and write them to
            ``splitted_output_path``.
        splitted_output_path: Destination JSON Lines file for the held-out
            rows. Required when ``split`` is true.
        test_size: Passed to ``train_test_split``; size of the held-out part.
        random_state: Passed to ``train_test_split``; seed for the shuffle.

    Raises:
        ValueError: ``split`` is true but ``splitted_output_path`` is ``None``.
        FileNotFoundError: ``is_parquet`` is true and the pattern matches no
            files.
    """
    # Fail before doing any I/O: DataFrame.to_json(None) returns the JSON as a
    # string instead of writing a file, so the held-out rows would otherwise be
    # dropped without any error.
    if split and splitted_output_path is None:
        raise ValueError("splitted_output_path is required when split=True")

    if is_parquet:
        # glob returns matches in arbitrary order; sorting keeps the row order
        # (and therefore the seeded split) reproducible across machines.
        parquet_files = sorted(glob.glob(jsonpath_or_parquet_pattern))
        if not parquet_files:
            raise FileNotFoundError(
                f"No parquet files match pattern: {jsonpath_or_parquet_pattern!r}"
            )

        # Read every shard first and concatenate once; concatenating inside the
        # loop re-copies all previously loaded rows on each iteration.
        combined_df = pd.concat(
            [pd.read_parquet(path) for path in parquet_files],
            ignore_index=True,
        )
    else:
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        with open(jsonpath_or_parquet_pattern, encoding="utf-8") as f:
            combined_df = pd.read_json(f, lines=True)

    # Write the result as JSON Lines: one record per line.
    if split:
        kept_df, held_out_df = train_test_split(
            combined_df, test_size=test_size, random_state=random_state
        )
        kept_df.to_json(jsonl_file_path, orient='records', lines=True)
        held_out_df.to_json(splitted_output_path, orient='records', lines=True)
    else:
        combined_df.to_json(jsonl_file_path, orient='records', lines=True)

`jsonpath_or_parquet_pattern`: a glob pattern matching one or more `parquet` files, or the path to a single `jsonl` file
- set `is_parquet=False` when providing a `jsonl` file path

`jsonl_file_path`: path to the output `jsonl` file

`split`: set to `True` to hold out 10% of the rows as a separate set (use this when the dataset has no validation set of its own)

`splitted_output_path`: path to the output `jsonl` file for the held-out rows; only used when `split=True`

### If using parquet file:

In [21]:
# Example: combine the Yelp training shards into train.jsonl and hold out
# part of the rows as valid.jsonl.
jsonpath_or_parquet_pattern = '../data/yelp/train-*.parquet'  # glob over the training shards
jsonl_file_path = 'train.jsonl'
splitted_output_path = 'valid.jsonl'
parquet_to_jsonl(
    jsonpath_or_parquet_pattern,
    jsonl_file_path,
    is_parquet=True,
    split=True,
    splitted_output_path=splitted_output_path,
)

In [22]:
# Example: combine the Yelp test shards into test.jsonl (no split).
jsonpath_or_parquet_pattern = '../data/yelp/test-*.parquet'  # glob over the test shards
jsonl_file_path = 'test.jsonl'
parquet_to_jsonl(
    jsonpath_or_parquet_pattern,
    jsonl_file_path,
    is_parquet=True,
)

### If using jsonl file:

In [10]:
# Example: start from an existing JSON Lines file and split it into
# train.jsonl and valid.jsonl.
jsonpath_or_parquet_pattern = '../data/trec/trec_train.jsonl'
jsonl_file_path = 'train.jsonl'
splitted_output_path = 'valid.jsonl'
parquet_to_jsonl(
    jsonpath_or_parquet_pattern,
    jsonl_file_path,
    is_parquet=False,
    split=True,
    splitted_output_path=splitted_output_path,
)

### Push to hub

In [24]:
# Hub repository that the splits are pushed to.
dataset_name = "ICKD/yelp-raw"

# Local JSON Lines file for each split.
train_path, val_path, test_path = (
    "../data/yelp/train.jsonl",
    "../data/yelp/valid.jsonl",
    "../data/yelp/test.jsonl",
)

In [25]:
from datasets import Dataset, DatasetDict, load_dataset

# Load the three JSON Lines files as the named splits of one dataset.
dataset = load_dataset(
    "json",
    data_files={
        "train": train_path,
        "valid": val_path,
        "test": test_path,
    },
)

In [26]:
# Display the loaded splits with their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 585000
    })
    valid: Dataset({
        features: ['label', 'text'],
        num_rows: 65000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [27]:
# Keep only the rows whose text is at most 1000 characters long. The same
# predicate is applied to each split, in the order test, train, valid.
filtered_dataset_test, filtered_dataset_train, filtered_dataset_valid = (
    dataset[split_name].filter(lambda example: len(example['text']) <= 1000)
    for split_name in ('test', 'train', 'valid')
)

# Reassemble the filtered splits and upload them to the Hub repository.
filtered_dataset = DatasetDict({
    'train': filtered_dataset_train,
    'test': filtered_dataset_test,
    'valid': filtered_dataset_valid,
})
filtered_dataset.push_to_hub(dataset_name)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|          | 0/449 [00:00<?, ?ba/s][A
Creating parquet from Arrow format:   1%|▏         | 6/449 [00:00<00:07, 59.78ba/s][A
Creating parquet from Arrow format:   3%|▎         | 13/449 [00:00<00:07, 61.19ba/s][A
Creating parquet from Arrow format:   4%|▍         | 20/449 [00:00<00:06, 61.56ba/s][A
Creating parquet from Arrow format:   6%|▌         | 27/449 [00:00<00:06, 61.77ba/s][A
Creating parquet from Arrow format:   8%|▊         | 34/449 [00:00<00:06, 61.72ba/s][A
Creating parquet from Arrow format:   9%|▉         | 41/449 [00:00<00:06, 61.18ba/s][A
Creating parquet from Arrow format:  11%|█         | 48/449 [00:00<00:06, 61.52ba/s][A
Creating parquet from Arrow format:  12%|█▏        | 55/449 [00:00<00:06, 61.63ba/s][A
Creating parquet from Arrow format:  14%|█▍        | 62/449 [00:01<00:06, 61.00ba/s][A
Creating parquet from Arrow format:  15%|█▌        | 69/449 [0

CommitInfo(commit_url='https://huggingface.co/datasets/ICKD/yelpreview-raw/commit/5e89bd91c673e1ed971a0360f9ff2c0828da17be', commit_message='Upload dataset', commit_description='', oid='5e89bd91c673e1ed971a0360f9ff2c0828da17be', pr_url=None, pr_revision=None, pr_num=None)