In [1]:
#split data into train, test, and validation

import os
import random
import json
import uuid

data_path = "/home/marthee/advise-gpt-1/itanong/dataset"

# Load all files in jsonl format
all_data = []
print("Reading all data for dataset train and test split")
for file in os.listdir(data_path):
    if file.endswith(".jsonl"):
        print(f"Reading {file}")
        with open(os.path.join(data_path, file), "r") as f:
            data = f.readlines()
            all_data.extend(data)

print(f"Total data: {len(all_data)}")
print("Reading all data for dataset validation split")


Reading all data for dataset train and test split
Reading llama3-formatted-data0.jsonl
Reading llama3-formatted-data4.jsonl
Reading llama3-formatted-data3.jsonl
Reading llama3-formatted-data1.jsonl
Reading llama3-formatted-data6.jsonl
Reading llama3-formatted-data10.jsonl
Reading llama3-formatted-data8.jsonl
Reading llama3-formatted-data2.jsonl
Reading llama3-formatted-data7.jsonl
Reading llama3-formatted-data9.jsonl
Reading llama3-formatted-data5.jsonl
Total data: 10779
Reading all data for dataset validation split


In [2]:
TRAIN_PCT, TEST_PCT = 0.9, 0.1

random.seed(42)

random.shuffle(all_data)

# random split for new data
train_data = all_data[:int(len(all_data)*TRAIN_PCT)]

val_data = all_data[int(len(all_data)*TRAIN_PCT):]

#Shuffle data with seed
random.shuffle(train_data)
random.shuffle(val_data)

In [3]:
len(train_data), len(val_data)

(9701, 1078)

In [4]:
def save_jsonl(data, path):
    datapoints_list = []
    with open(path, "w") as f:
        counter = 0
        for line1 in data:
            datapoint = {}
            try:
                line = json.loads(line1)
            except Exception as e:
                line = line1

            datapoint["question"] = line["question"]
            datapoint["response"] = line["response"]
                
        
            #do not save if duplicate
            if datapoint["question"] in datapoints_list:
                print(f"Duplicate datapoint: {datapoint}")

            else:
                
                datapoints_list.append(datapoint["question"])

                to_write = {}
                to_write["id"] = uuid.uuid4().hex
                to_write["question"] = datapoint["question"]
                to_write["response"] = datapoint["response"]
            
                if isinstance(line, dict):
                    f.write(json.dumps(to_write) + "\n")
                elif isinstance(line, str):
                    try:
                        json_object = json.loads(to_write)
                        f.write(json.dumps(json_object) + "\n")
                    except json.JSONDecodeError:
                        print(f"Invalid JSON: {to_write}")
                else:
                    print(f"Invalid data type: {type(to_write)}")
                counter += 1



In [6]:
save_jsonl(train_data, f"{data_path}/hf_dataset/train.jsonl")
# save_jsonl(test_data, f"{data_path}/hf_dataset/test.jsonl")
save_jsonl(val_data, f"{data_path}/hf_dataset/val.jsonl")

Duplicate datapoint: {'question': 'How much customs duty do I need to pay for 2000 coconuts from Laguna to Iloilo?', 'response': {'product_description': 'Coconuts, Fresh', 'task': 'duty_rate'}}
Duplicate datapoint: {'question': 'What is the duty rate for shelled pine nuts exported from Cagayan Valley to Davao City?', 'response': {'product_description': 'Pine Nuts, Shelled', 'task': 'duty_rate'}}
Duplicate datapoint: {'question': 'What is the duty rate for quinces exported from Iloilo to Cebu?', 'response': {'product_description': 'Quinces', 'task': 'duty_rate'}}
Duplicate datapoint: {'question': 'If I export 4000kg of clementines from Bukidnon to Cagayan de Oro, what is the customs duty?', 'response': {'product_description': 'Clementines', 'task': 'duty_rate'}}
Duplicate datapoint: {'question': 'If I export 1000kg of fresh mangoes from Guimaras to Manila, how much dutiable value/freight value/customs duty should I pay?', 'response': {'product_description': 'Mangoes, fresh', 'task': 'du

In [7]:
from datasets import load_dataset, Features, Value

new_data_path = "/home/marthee/advise-gpt-1/itanong/dataset/hf_dataset"


data_files = {"train": f"{new_data_path}/train.jsonl",
            # "test": f"{new_data_path}/test.jsonl",
            "val": f"{new_data_path}/val.jsonl"}
# Define the data schema/
features = Features({
    'id': Value('string'),
    'question': Value('int32'),
})

dataset = load_dataset("json", data_files=data_files)

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 9555 examples [00:00, 1370982.99 examples/s]
Generating val split: 1072 examples [00:00, 177824.56 examples/s]


In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'response'],
        num_rows: 9555
    })
    val: Dataset({
        features: ['id', 'question', 'response'],
        num_rows: 1072
    })
})

In [9]:
!huggingface-cli login --token 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
!huggingface-cli whoami

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/marthee/.cache/huggingface/token
Login successful
ninyx


In [10]:
dataset.push_to_hub("itanong_dataset-v0.1")

Creating parquet from Arrow format: 100%|██████████| 10/10 [00:00<00:00, 1522.43ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.18s/it]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 1885.50ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.27s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/ninyx/itanong_dataset-v0.1/commit/290da1919e961b4eed2c3480250146d11830ce79', commit_message='Upload dataset', commit_description='', oid='290da1919e961b4eed2c3480250146d11830ce79', pr_url=None, pr_revision=None, pr_num=None)