In [1]:
from datasets import Dataset, load_dataset
import os

In [3]:
# Collect the raw baomoi parquet shards.
# - The listing is sorted because os.listdir returns entries in arbitrary
#   order, which would otherwise make the row order differ between runs.
# - baomoi_combined.parquet (the merged file this notebook writes into the same
#   directory further down) is skipped so that re-running this cell does not
#   feed the previous result back in as input.
baomoi_shards = sorted(
    file for file in os.listdir("../data/newssapo")
    if file.startswith("baomoi")
    and file.endswith(".parquet")
    and file != "baomoi_combined.parquet"
)

data = load_dataset(path="../data/newssapo",
                    data_files=baomoi_shards,
                    split="train",
                    num_proc=64)

from transformers import AutoTokenizer
# NOTE(review): this tokenizer is only referenced by the commented-out
# `_map_length` helper below; nothing else visible in this notebook uses it.
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")

def _filter_empty(sample):
    if (sample['title'] is None) or (sample['sapo'] is None) or (sample['body_text'] is None):
        return False
    if len(sample['body_text'].split()) >= 16384:
        return False
    return True

# def _map_length(sample):
#     sample['length'] = len(tokenizer(sample['body_text'])['input_ids'])
#     return sample

# Keep only the rows accepted by _filter_empty, using 64 worker processes.
filtered_data = data.filter(
    function=_filter_empty,
    num_proc=64,
)
# filtered_data = filtered_data.map(_map_length, num_proc=64)

Resolving data files:   0%|          | 0/65 [00:00<?, ?it/s]

Filter (num_proc=64):   0%|          | 0/31728183 [00:00<?, ? examples/s]

In [4]:
# Convert the filtered dataset to a pandas DataFrame; the following cells use
# pandas to drop duplicated rows and then map the survivors back to the dataset.
filtered_data_pd = filtered_data.to_pandas()

In [5]:
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence. The surviving rows keep their original index labels.
filtered_data_pd = filtered_data_pd.drop_duplicates(keep='first')

In [6]:
# Select from the filtered dataset the rows whose index labels survived the
# pandas de-duplication.
# NOTE(review): relies on the DataFrame index being the original row positions
# of `filtered_data` — confirm `to_pandas()` yields a default 0..n-1 index.
final_data = filtered_data.select(indices=filtered_data_pd.index.values)

In [8]:
# Show the summary (feature names and row count) of the de-duplicated dataset.
final_data

Dataset({
    features: ['sapo', 'title', 'body_text'],
    num_rows: 23005054
})

In [7]:
# Persist the de-duplicated dataset as a single parquet file.
# NOTE(review): the file lives in the directory the raw shards are loaded from
# and its name matches the `baomoi*.parquet` pattern, so any loader that lists
# that directory must exclude it or it will be read back in as input.
final_data.to_parquet("../data/newssapo/baomoi_combined.parquet")

Creating parquet from Arrow format:   0%|          | 0/23006 [00:00<?, ?ba/s]

86933855327

### Dedup

In [1]:
from datasets import load_dataset
import json
import pandas as pd
import os

In [2]:
# Load every .arrow shard found in the baomoi_dedup directory as one train split.
dedup_dir = "../data/newssapo/baomoi_dedup"
arrow_shards = [name for name in os.listdir(dedup_dir) if name.endswith(".arrow")]

dedup_dataset = load_dataset(
    "arrow",
    data_dir=dedup_dir,
    data_files=arrow_shards,
    split="train",
    num_proc=32,
)

Resolving data files:   0%|          | 0/141 [00:00<?, ?it/s]

In [6]:
# Show the summary (feature names and row count) of the loaded dedup dataset.
dedup_dataset

Dataset({
    features: ['sapo', 'title', 'body_text'],
    num_rows: 18640933
})

In [4]:
import random

def create_train_data(sample):
    """Turn one article into a (query, positive) training pair.

    Pair selection:
      * empty `sapo`       -> query is `title`, positive is `body_text`
      * empty `body_text`  -> query is `title`, positive is `sapo`
      * otherwise          -> one of (title, sapo), (title, body_text) or
                              (sapo, body_text), picked pseudo-randomly.

    The pseudo-random pick comes from a generator seeded with the article's own
    title and sapo rather than from the global `random` state. The same article
    therefore always yields the same pair, whatever the number of worker
    processes or the processing order, which makes the export reproducible.

    Assumes rows where `sapo` and `body_text` are both empty strings were
    removed upstream (that check is not part of this function).

    Args:
        sample: mapping with the string fields `title`, `sapo` and `body_text`.

    Returns:
        dict with `query` (str) and `pos` (single-element list of str).
    """
    if sample['sapo'] == '':
        query_column, positive_column = "title", "body_text"
    elif sample['body_text'] == '':
        query_column, positive_column = "title", "sapo"
    else:
        # String seeds are hashed by `random` itself, so the result does not
        # depend on PYTHONHASHSEED or on which worker handles the row.
        sample_rng = random.Random(f"{sample['title']}\n{sample['sapo']}")
        query_column, positive_column = sample_rng.choice(
            [("title", "sapo"), ("title", "body_text"), ("sapo", "body_text")]
        )

    return {
        "query": sample[query_column],
        "pos": [sample[positive_column]]
    }

# Build the (query, pos) pairs with 64 worker processes and drop the source
# text columns so only the generated fields remain.
train_dataset = dedup_dataset.map(
    function=create_train_data,
    remove_columns=["title", "sapo", "body_text"],
    num_proc=64,
)

In [5]:
# Export the training pairs as JSON Lines (one record per line).
# NOTE(review): presumably the pandas default `force_ascii=True` applies here
# and escapes non-ASCII characters as \uXXXX — confirm, and consider passing
# `force_ascii=False` to shrink the file.
train_dataset.to_json(
    path_or_buf="../data/final/train/generic/newssapo.jsonl",
    orient="records",
    lines=True,
    num_proc=64,
)

Creating json from Arrow format:   0%|          | 0/18641 [00:00<?, ?ba/s]

74569759224

In [3]:
from datasets import load_dataset
import os

data_dir = "../data/final/train/generic/newssapo_splitted"

# Rewrite every .jsonl split so that it only keeps the `query` and `pos` columns.
for data_file in sorted(os.listdir(data_dir)):
    if not data_file.endswith(".jsonl"):
        continue

    split_path = os.path.join(data_dir, data_file)

    # A single file is a single shard, so `datasets` would reset any num_proc
    # to 1 (and log a warning) when loading it; none is passed.
    dataset = load_dataset("json", data_files=split_path, split="train").select_columns(["query", "pos"])

    # Export to a temporary sibling file and swap it in afterwards, so an
    # interrupted export cannot leave a truncated split in place of the source.
    # The ".tmp" suffix keeps leftovers out of the ".jsonl" filter above.
    tmp_path = split_path + ".tmp"
    dataset.to_json(tmp_path, orient="records", lines=True, num_proc=32)
    os.replace(tmp_path, split_path)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Setting num_proc from 32 back to 1 for the train split to disable multiprocessing as it only contains one shard.


Generating train split: 0 examples [00:00, ? examples/s]

Creating json from Arrow format:   0%|          | 0/881 [00:00<?, ?ba/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Setting num_proc from 32 back to 1 for the train split to disable multiprocessing as it only contains one shard.


Generating train split: 0 examples [00:00, ? examples/s]

Creating json from Arrow format:   0%|          | 0/77 [00:00<?, ?ba/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Setting num_proc from 32 back to 1 for the train split to disable multiprocessing as it only contains one shard.


Generating train split: 0 examples [00:00, ? examples/s]

Creating json from Arrow format:   0%|          | 0/38 [00:00<?, ?ba/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Setting num_proc from 32 back to 1 for the train split to disable multiprocessing as it only contains one shard.


Generating train split: 0 examples [00:00, ? examples/s]

Creating json from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Setting num_proc from 32 back to 1 for the train split to disable multiprocessing as it only contains one shard.


Generating train split: 0 examples [00:00, ? examples/s]

Creating json from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Setting num_proc from 32 back to 1 for the train split to disable multiprocessing as it only contains one shard.


Generating train split: 0 examples [00:00, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Setting num_proc from 32 back to 1 for the train split to disable multiprocessing as it only contains one shard.


Generating train split: 0 examples [00:00, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Setting num_proc from 32 back to 1 for the train split to disable multiprocessing as it only contains one shard.


Generating train split: 0 examples [00:00, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Setting num_proc from 32 back to 1 for the train split to disable multiprocessing as it only contains one shard.


Generating train split: 0 examples [00:00, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [2]:
import os

from huggingface_hub import HfApi

# Upload the split files to the dataset repository on the Hugging Face Hub.
#
# SECURITY: the access token is read from the HF_TOKEN environment variable and
# is never written into the notebook. When the variable is unset, `None` is
# passed and huggingface_hub falls back to the locally saved login.
# The token that was previously hard-coded in this cell has been exposed and
# must be revoked in the Hugging Face account settings.
api = HfApi()
api.upload_folder(
    folder_path="../data/final/train/generic/newssapo_splitted",
    path_in_repo="train_generic/newssapo_filtered_splitted",
    repo_id="nntoan209/GenericTraining",
    repo_type="dataset",
    token=os.environ.get("HF_TOKEN"),
)

baomoi_perplexity_filtered_len-0-512.jsonl:   0%|          | 0.00/760M [00:00<?, ?B/s]

baomoi_perplexity_filtered_len-2048-3072.jsonl:   0%|          | 0.00/111M [00:00<?, ?B/s]

.split_log:   0%|          | 0.00/462 [00:00<?, ?B/s]

baomoi_perplexity_filtered_len-3072-4096.jsonl:   0%|          | 0.00/38.8M [00:00<?, ?B/s]

baomoi_perplexity_filtered_len-1024-2048.jsonl:   0%|          | 0.00/439M [00:00<?, ?B/s]

Upload 10 LFS files:   0%|          | 0/10 [00:00<?, ?it/s]

baomoi_perplexity_filtered_len-4096-5120.jsonl:   0%|          | 0.00/19.4M [00:00<?, ?B/s]

baomoi_perplexity_filtered_len-512-1024.jsonl:   0%|          | 0.00/472M [00:00<?, ?B/s]

baomoi_perplexity_filtered_len-5120-6144.jsonl:   0%|          | 0.00/9.27M [00:00<?, ?B/s]

baomoi_perplexity_filtered_len-6144-7168.jsonl:   0%|          | 0.00/5.25M [00:00<?, ?B/s]

baomoi_perplexity_filtered_len-7168-inf.jsonl:   0%|          | 0.00/6.35M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/nntoan209/GenericTraining/commit/3405234b9042fb040bf98a99341c14cb2a9f0db3', commit_message='Upload folder using huggingface_hub', commit_description='', oid='3405234b9042fb040bf98a99341c14cb2a9f0db3', pr_url=None, pr_revision=None, pr_num=None)