## Generate raw dataset

In [None]:
!pip install pandas pyarrow scikit-learn

In [9]:
from sklearn.model_selection import train_test_split
import pandas as pd
import glob

def parquet_to_jsonl(
    jsonpath_or_parquet_pattern: str,
    jsonl_file_path: str,
    is_parquet: bool = True,
    split: bool = False,
    splitted_output_path: "str | None" = None,
) -> None:
    """Convert parquet shards or a JSON Lines file into JSON Lines output.

    Args:
        jsonpath_or_parquet_pattern: Glob pattern matching the input parquet
            file(s) when ``is_parquet`` is true, otherwise the path to a
            single JSON Lines file.
        jsonl_file_path: Path of the JSON Lines file to write. When ``split``
            is true this receives the 90% portion of the rows.
        is_parquet: Whether the input is parquet (``True``) or JSON Lines
            (``False``).
        split: When true, hold out 10% of the rows (fixed seed 42) and write
            them to ``splitted_output_path``.
        splitted_output_path: Path of the JSON Lines file for the held-out
            10%. Required when ``split`` is true; unused otherwise.

    Raises:
        ValueError: If ``split`` is true but ``splitted_output_path`` is None.
        FileNotFoundError: If ``is_parquet`` is true and the pattern matches
            no files.
    """
    # ``DataFrame.to_json(None)`` returns the JSON as a string instead of
    # writing it, so a missing path would silently drop the held-out rows.
    # Fail up front, before any file is read or written.
    if split and splitted_output_path is None:
        raise ValueError("splitted_output_path must be provided when split=True")

    if is_parquet:
        # glob returns matches in arbitrary (filesystem) order; sorting keeps
        # the row order, and therefore the seeded split, reproducible.
        parquet_files = sorted(glob.glob(jsonpath_or_parquet_pattern))
        if not parquet_files:
            raise FileNotFoundError(
                f"No parquet files match pattern: {jsonpath_or_parquet_pattern!r}"
            )

        # Concatenate once; growing a DataFrame inside the loop re-copies all
        # previously loaded rows on every iteration.
        combined_df = pd.concat(
            [pd.read_parquet(path) for path in parquet_files],
            ignore_index=True,
        )
    else:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(jsonpath_or_parquet_pattern, encoding="utf-8") as f:
            combined_df = pd.read_json(f, lines=True)

    # Write the result as JSON Lines: one record per line.
    if split:
        kept_df, held_out_df = train_test_split(
            combined_df, test_size=0.1, random_state=42
        )
        kept_df.to_json(jsonl_file_path, orient='records', lines=True)
        held_out_df.to_json(splitted_output_path, orient='records', lines=True)
    else:
        combined_df.to_json(jsonl_file_path, orient='records', lines=True)

`jsonpath_or_parquet_pattern`: glob pattern matching the input `parquet` file(s), or the path to a single input `jsonl` file
- set `is_parquet=False` when providing a `jsonl` file path

`jsonl_file_path`: path to the output `jsonl` file

`split`: set to `True` to hold out 10% of the rows as a validation set (use this when the dataset has no validation set of its own)

`splitted_output_path`: path to the output `jsonl` file for the held-out validation rows (only used when `split=True`)

### If using parquet file:

In [12]:
# Example: convert the train parquet shards and hold out a validation file.
jsonpath_or_parquet_pattern = '../data/agnews/train-*.parquet'  # glob over the parquet shards
jsonl_file_path = 'train.jsonl'
splitted_output_path = 'valid.jsonl'
parquet_to_jsonl(
    jsonpath_or_parquet_pattern,
    jsonl_file_path,
    is_parquet=True,
    split=True,
    splitted_output_path=splitted_output_path,
)

In [13]:
# Example: convert the test parquet shards as-is (no split).
jsonpath_or_parquet_pattern = '../data/agnews/test-*.parquet'  # glob over the parquet shards
jsonl_file_path = 'test.jsonl'
parquet_to_jsonl(
    jsonpath_or_parquet_pattern,
    jsonl_file_path,
    is_parquet=True,
)

### If using jsonl file:

In [10]:
# Example: start from an existing jsonl file and hold out a validation file.
jsonpath_or_parquet_pattern = '../data/trec/trec_train.jsonl'
jsonl_file_path = 'train.jsonl'
splitted_output_path = 'valid.jsonl'
parquet_to_jsonl(
    jsonpath_or_parquet_pattern,
    jsonl_file_path,
    is_parquet=False,
    split=True,
    splitted_output_path=splitted_output_path,
)

### Push to hub

In [11]:
# Fill these in before running the upload cell: paths to the jsonl files
# that become the "train", "valid" and "test" splits respectively.
train_path = ""
val_path = ""
test_path = ""

# Target dataset repository on the Hub; replace the <...> placeholder.
dataset_name = "ICKD/<replace-with-dataset-name>-raw"

In [None]:
from datasets import Dataset, DatasetDict, load_dataset

# Load the three jsonl files as named splits, then upload them to the Hub
# as a private dataset under `dataset_name`.
dataset = load_dataset(
    "json",
    data_files={
        "train": train_path,
        "valid": val_path,
        "test": test_path,
    },
)
dataset.push_to_hub(dataset_name, private=True)