In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import sys
current_dir = os.getcwd()
sys.path.append(os.path.join(os.getcwd(), '..'))

import random

import torch
# Check if a GPU is available
if torch.cuda.is_available():
    # Get the current device index (default is 0 if no other device is specified)
    current_device = torch.cuda.current_device()
    
    # Get the name of the GPU at this device index
    gpu_name = torch.cuda.get_device_name(current_device)
    print(f"Current GPU: {gpu_name}")
else:
    print("No GPU available.")

import pandas as pd

from tqdm import tqdm
from datasets import Dataset, DatasetDict, load_dataset, concatenate_datasets
from utils.helper import convert_to_llama_format

print("Import Successful!")

Current GPU: Tesla P40
Import Successful!


In [3]:
# Create an empty dataset using a pandas DataFrame
empty_df = pd.DataFrame(columns=["conversations", "tag"])  # Define your schema here
empty_dataset = Dataset.from_pandas(empty_df)

### aixsatoshi/cosmopedia-japanese-100k

In [4]:
repo_id = "aixsatoshi/cosmopedia-japanese-100k"
dataset = load_dataset(repo_id)
dataset

DatasetDict({
    train: Dataset({
        features: ['format', 'japanese', 'english', 'audience', 'seed_data'],
        num_rows: 99866
    })
})

In [5]:
# View available dataset splits
print("Available Splits:", dataset.keys())

# Load specific split (e.g., 'train') and inspect the first few rows
train_data = dataset["train"]

sample = train_data[random.choice(range(train_data.num_rows))] #train_data[0]

messages = [
    {"role": "system", "content": "You are a translator, your job is translate English to Japanese"},
    {"role": "user", "content" : "Translate the given content to Japanese"},
    {"role": "user", "content" : sample['english']},
    {"role": "assistant", "content" : sample['japanese']},
     ]
messages

Available Splits: dict_keys(['train'])


[{'role': 'system',
  'content': 'You are a translator, your job is translate English to Japanese'},
 {'role': 'user', 'content': 'Translate the given content to Japanese'},
 {'role': 'user',
  'content': " The world of education is vast and diverse, encompassing everything from kindergarten classrooms to doctoral programs. But when it comes to the internet, there's one category of domains that is strictly reserved for a specific type of educational institution: the .edu domain. According to the extract provided, this domain is only available to post-secondary institutions that are accredited at the institutional level by agencies recognized by the US Department of Education. This means that elementary schools, high schools, middle schools, and even charter schools don't qualify for this prestigious designation.\n\nBut what does it really mean to be an accredited institution? And why is the .edu domain so important? Let's dive deeper into these questions and explore some of the nuances

In [6]:
# Loop through the source dataset and append data to the empty dataset
new_data = {"conversations": [], "tag": []}  # Initialize dictionary to store new data
for row in tqdm(train_data):
    # Create messages
    messages = [
    {"role": "system", "content": "You are a translator, your job is translate English to Japanese"},
    {"role": "user", "content" : "Translate the given content to Japanese"},
    {"role": "user", "content" : sample['english']},
    {"role": "assistant", "content" : sample['japanese']},
     ]
    
    new_data["conversations"].append(messages)
    new_data["tag"].append(repo_id)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 99866/99866 [00:02<00:00, 34178.08it/s]


In [7]:
# Add the collected data to the empty dataset
updated_dataset = Dataset.from_dict(new_data)

# Concatenate them
empty_dataset = concatenate_datasets([empty_dataset, updated_dataset])

# Print the resulting dataset
print(empty_dataset)

Dataset({
    features: ['conversations', 'tag'],
    num_rows: 99866
})


### shi3z/Japanese_wikipedia_conversation_100K

In [8]:
repo_id = "shi3z/Japanese_wikipedia_conversation_100K"
dataset = load_dataset(repo_id)
dataset

DatasetDict({
    train: Dataset({
        features: ['conversations'],
        num_rows: 110322
    })
})

In [9]:
# View available dataset splits
print("Available Splits:", dataset.keys())

# Load specific split (e.g., 'train') and inspect the first few rows
train_data = dataset["train"]

sample = train_data[random.choice(range(train_data.num_rows))] #train_data[0]

messages = convert_to_llama_format(sample['conversations'])

messages

Available Splits: dict_keys(['train'])


[{'role': 'user', 'content': 'オルデンドルフはどんな場所ですか？'},
 {'role': 'assistant', 'content': 'オルデンドルフは都市や町の名前ではありません。おそらく誤字です。'},
 {'role': 'user', 'content': 'オルデンドルフとは何のことですか？'},
 {'role': 'assistant',
  'content': 'オルデンドルフは人名の一つです。詳細な情報がないため、具体的な人物を特定することはできません。'},
 {'role': 'user', 'content': 'オルデンドルフってどんな名前ですか？'},
 {'role': 'assistant', 'content': 'オルデンドルフは、ドイツ語の姓の一例です。一般的な名前ではありません。'}]

In [10]:
# Loop through the source dataset and append data to the empty dataset
new_data = {"conversations": [], "tag": []}  # Initialize dictionary to store new data
for row in tqdm(train_data):
    # Create messages
    messages = convert_to_llama_format(row['conversations'])    
    new_data["conversations"].append(messages)
    new_data["tag"].append(repo_id)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110322/110322 [00:04<00:00, 26711.05it/s]


In [11]:
# Add the collected data to the empty dataset
updated_dataset = Dataset.from_dict(new_data)

# Concatenate them
empty_dataset = concatenate_datasets([empty_dataset, updated_dataset])

# Print the resulting dataset
print(empty_dataset)

Dataset({
    features: ['conversations', 'tag'],
    num_rows: 210188
})


### FreedomIntelligence/alpaca-gpt4-japanese

In [12]:
repo_id = "FreedomIntelligence/alpaca-gpt4-japanese"
dataset = load_dataset(repo_id)
dataset

DatasetDict({
    train: Dataset({
        features: ['conversations', 'id'],
        num_rows: 49969
    })
})

In [13]:
# View available dataset splits
print("Available Splits:", dataset.keys())

# Load specific split (e.g., 'train') and inspect the first few rows
train_data = dataset["train"]
sample = train_data[random.choice(range(train_data.num_rows))] #train_data[0]

messages = convert_to_llama_format(sample['conversations'])

messages

Available Splits: dict_keys(['train'])


[{'role': 'user', 'content': '次の文を前置詞句を追加して、より詳細な文に編集してください。\n私はオフィスに入りました。'},
 {'role': 'assistant', 'content': '私はガラスのドアを通ってオフィスに入りました。'}]

In [14]:
# Loop through the source dataset and append data to the empty dataset
new_data = {"conversations": [], "tag": []}  # Initialize dictionary to store new data
for row in tqdm(train_data):
    # Create messages
    messages = convert_to_llama_format(row['conversations'])    
    new_data["conversations"].append(messages)
    new_data["tag"].append(repo_id)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49969/49969 [00:01<00:00, 39086.15it/s]


In [15]:
# Add the collected data to the empty dataset
updated_dataset = Dataset.from_dict(new_data)

# Concatenate them
empty_dataset = concatenate_datasets([empty_dataset, updated_dataset])

# Print the resulting dataset
print(empty_dataset)

Dataset({
    features: ['conversations', 'tag'],
    num_rows: 260157
})


### FreedomIntelligence/evol-instruct-japanese

In [16]:
repo_id = "FreedomIntelligence/evol-instruct-japanese"
dataset = load_dataset(repo_id)
dataset

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['conversations', 'id'],
        num_rows: 59022
    })
})

In [17]:
# View available dataset splits
print("Available Splits:", dataset.keys())

# Load specific split (e.g., 'train') and inspect the first few rows
train_data = dataset["train"]
sample = train_data[random.choice(range(train_data.num_rows))] #train_data[0]

messages = convert_to_llama_format(sample['conversations'])
messages

Available Splits: dict_keys(['train'])


[{'role': 'user',
  'content': 'MATLABコードを使用して、さまざまな種類の再生可能エネルギー源の効率を分析することはできますか？調査結果の概要を提供してください。\n以下は始めるためのサンプルコードです。\nenergySources = ["solar", "wind", "hydro"];\nefficiency = [0.18、0.3、0.9];\nbar(efficiency)\nxticklabels(energySources)\nylabel("Efficiency")\ntitle("再生可能エネルギー源の比較")\n'},
 {'role': 'assistant',
  'content': 'はい、MATLABコードを使用して、さまざまな種類の再生可能エネルギー源の効率を分析することができます。上記のサンプルコードは、太陽光、風力、水力の三つの再生可能エネルギー源の効率を比較するためのものです。バーを使用してグラフを描画し、xticklabelsでX軸に各エネルギー源の名前を表示し、ylabelでY軸に効率を表示し、titleでグラフのタイトルを設定しています。\n\nこのコードを応用することで、さまざまな効率を持つ再生可能エネルギー源を比較することができます。また、データを増やすことで、より詳細な分析や比較を行うことも可能です。\n\n例えば、サンプルコードを以下のように変更して、太陽光、風力、水力の他に地熱およびバイオマスを追加することができます。\n\nenergySources = ["solar", "wind", "hydro", "geothermal", "biomass"];\nefficiency = [0.18, 0.3, 0.9, 0.1, 0.35];\nbar(efficiency)\nxticklabels(energySources)\nylabel("Efficiency")\ntitle("再生可能エネルギー源の比較")\n\nまた、MATLABでは、様々な統計解析ツールや機械学習ツールを使用することもできるため、より高度な分析を行うこともできます。例えば、時間帯や天候データを組み合わせることで、各再生可能エネルギー源の最適な運用状況を分析することができます。'}]

In [18]:
# Loop through the source dataset and append data to the empty dataset
new_data = {"conversations": [], "tag": []}  # Initialize dictionary to store new data
for row in tqdm(train_data):
    # Create messages
    messages = convert_to_llama_format(row['conversations'])    
    new_data["conversations"].append(messages)
    new_data["tag"].append(repo_id)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 59022/59022 [00:02<00:00, 27802.67it/s]


In [19]:
# Add the collected data to the empty dataset
updated_dataset = Dataset.from_dict(new_data)

# Concatenate them
empty_dataset = concatenate_datasets([empty_dataset, updated_dataset])

# Print the resulting dataset
print(empty_dataset)

Dataset({
    features: ['conversations', 'tag'],
    num_rows: 319179
})


Shuffle and random select sample in new dataset

In [20]:
# Shuffle the dataset
shuffled_dataset = empty_dataset.shuffle(seed=42)  # Optional seed for reproducibility
shuffled_dataset

Dataset({
    features: ['conversations', 'tag'],
    num_rows: 319179
})

In [21]:
sample = shuffled_dataset[random.choice(range(shuffled_dataset.num_rows))]
sample

{'conversations': [{'content': 'You are a translator, your job is translate English to Japanese',
   'role': 'system'},
  {'content': 'Translate the given content to Japanese', 'role': 'user'},
  {'content': " The world of education is vast and diverse, encompassing everything from kindergarten classrooms to doctoral programs. But when it comes to the internet, there's one category of domains that is strictly reserved for a specific type of educational institution: the .edu domain. According to the extract provided, this domain is only available to post-secondary institutions that are accredited at the institutional level by agencies recognized by the US Department of Education. This means that elementary schools, high schools, middle schools, and even charter schools don't qualify for this prestigious designation.\n\nBut what does it really mean to be an accredited institution? And why is the .edu domain so important? Let's dive deeper into these questions and explore some of the nuan

Upload dataset

In [22]:
!huggingface-cli whoami

locchuong


In [23]:
# Push the dataset to Hugging Face Hub
dataset_name = "llama_conversations"
shuffled_dataset.push_to_hub(dataset_name)

Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/107 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/107 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/107 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/locchuong/llama_conversations/commit/a6eeda988285d567b39c21b5bc789ebabc4cbc3a', commit_message='Upload dataset', commit_description='', oid='a6eeda988285d567b39c21b5bc789ebabc4cbc3a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/locchuong/llama_conversations', endpoint='https://huggingface.co', repo_type='dataset', repo_id='locchuong/llama_conversations'), pr_revision=None, pr_num=None)

In [24]:
loaded_dataset = load_dataset(f"locchuong/{dataset_name}")
print(loaded_dataset)

README.md:   0%|          | 0.00/398 [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/51.8M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/51.9M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/51.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/319179 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['conversations', 'tag'],
        num_rows: 319179
    })
})


In [25]:
subset = shuffled_dataset.select(range(int(0.0032 * len(shuffled_dataset))))  # Select first 10%

# Print the subset
print(subset)

Dataset({
    features: ['conversations', 'tag'],
    num_rows: 1021
})


In [26]:
# Push the dataset to Hugging Face Hub
dataset_name = "llama_conversations_1k"
subset.push_to_hub(dataset_name)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/locchuong/llama_conversations_1k/commit/4652ad4c32cec4f7d8f1a10caff1e10123cc32ab', commit_message='Upload dataset', commit_description='', oid='4652ad4c32cec4f7d8f1a10caff1e10123cc32ab', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/locchuong/llama_conversations_1k', endpoint='https://huggingface.co', repo_type='dataset', repo_id='locchuong/llama_conversations_1k'), pr_revision=None, pr_num=None)

In [27]:
loaded_dataset = load_dataset(f"locchuong/{dataset_name}")
print(loaded_dataset)

README.md:   0%|          | 0.00/405 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/494k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1021 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['conversations', 'tag'],
        num_rows: 1021
    })
})


### longquan/llm-japanese-dataset-split_10

In [28]:
repo_id = "longquan/llm-japanese-dataset-split_10"
dataset = load_dataset(repo_id)
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 251655
    })
})

In [29]:
# View available dataset splits
print("Available Splits:", dataset.keys())

# Load specific split (e.g., 'train') and inspect the first few rows
train_data = dataset["train"]
sample = train_data[random.choice(range(train_data.num_rows))] #train_data[0]

if sample["input"]:
    messages = [
            {"role": "system", "content": "You are helpful assistant"},
            {"role": "user", "content" : sample["instruction"]},
            {"role": "user", "content" : sample['input']},
            {"role": "assistant", "content" : sample['output']},
             ]
else:
    messages = [
            {"role": "system", "content": "You are helpful assistant"},
            {"role": "user", "content" : sample["instruction"]},
            {"role": "assistant", "content" : sample['output']},
             ]   
messages

Available Splits: dict_keys(['train'])


[{'role': 'system', 'content': 'You are helpful assistant'},
 {'role': 'user', 'content': '入力されたワードを説明してください。'},
 {'role': 'user', 'content': 'Kumofs'},
 {'role': 'assistant',
  'content': 'Kumofs（くもえふえす）とは、2010年1月18日にえとらぼがApache License 2.0のもとオープンソースとして公開した分散型のストレージシステムである。えとらぼは元ミクシィCTOの衛藤バタラが設立したベンチャー企業。'}]

In [30]:
# Loop through the source dataset and append data to the empty dataset
new_data = {"conversations": [], "tag": []}  # Initialize dictionary to store new data
for row in tqdm(train_data):
    # Create messages
    if row["input"]:
        messages = [
            {"role": "system", "content": "You are helpful assistant"},
            {"role": "user", "content" : row["instruction"]},
            {"role": "user", "content" : row['input']},
            {"role": "assistant", "content" : row['output']},
             ]
    else:
        messages = [
            {"role": "system", "content": "You are helpful assistant"},
            {"role": "user", "content" : row["instruction"]},
            {"role": "assistant", "content" : row['output']},
             ]    
    new_data["conversations"].append(messages)
    new_data["tag"].append(repo_id)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 251655/251655 [00:05<00:00, 48679.20it/s]


In [31]:
# Add the collected data to the empty dataset
formated_dataset = Dataset.from_dict(new_data)
formated_dataset

Dataset({
    features: ['conversations', 'tag'],
    num_rows: 251655
})

In [32]:
sample = formated_dataset[random.choice(range(formated_dataset.num_rows))]
sample

{'conversations': [{'content': 'You are helpful assistant', 'role': 'system'},
  {'content': '入力されたワードを説明してください。', 'role': 'user'},
  {'content': 'レスリー・ショウ', 'role': 'user'},
  {'content': 'レスリー・モーティマー・ショウ（Leslie Mortimer Shaw, 1848年11月2日 - 1932年3月28日）は、アメリカ合衆国の実業家、弁護士、政治家。1898年から1902年までアイオワ州知事を、1902年から1907年までアメリカ合衆国財務長官を務めた。',
   'role': 'assistant'}],
 'tag': 'longquan/llm-japanese-dataset-split_10'}

In [33]:
# Push the dataset to Hugging Face Hub
dataset_name = "llama-longquan-llm-japanese-dataset-split_10"
formated_dataset.push_to_hub(dataset_name)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/252 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/locchuong/llama-longquan-llm-japanese-dataset-split_10/commit/b1c9213973c6a1ded8b138085249d744dfefa83e', commit_message='Upload dataset', commit_description='', oid='b1c9213973c6a1ded8b138085249d744dfefa83e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/locchuong/llama-longquan-llm-japanese-dataset-split_10', endpoint='https://huggingface.co', repo_type='dataset', repo_id='locchuong/llama-longquan-llm-japanese-dataset-split_10'), pr_revision=None, pr_num=None)

In [34]:
loaded_dataset = load_dataset(f"locchuong/{dataset_name}")
print(loaded_dataset)

README.md:   0%|          | 0.00/391 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/55.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/251655 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['conversations', 'tag'],
        num_rows: 251655
    })
})


In [35]:
# Select subset
subset = formated_dataset.select(range(int(0.001 * len(formated_dataset))))  # Select first 10%

# Print the subset
print(subset)

Dataset({
    features: ['conversations', 'tag'],
    num_rows: 251
})


In [36]:
# Push the dataset to Hugging Face Hub
dataset_name = "llama-longquan-llm-japanese-dataset-split_10_250"
subset.push_to_hub(dataset_name)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/locchuong/llama-longquan-llm-japanese-dataset-split_10_250/commit/c3d1d7ec70eaf4b375a9666689f705f64a62b631', commit_message='Upload dataset', commit_description='', oid='c3d1d7ec70eaf4b375a9666689f705f64a62b631', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/locchuong/llama-longquan-llm-japanese-dataset-split_10_250', endpoint='https://huggingface.co', repo_type='dataset', repo_id='locchuong/llama-longquan-llm-japanese-dataset-split_10_250'), pr_revision=None, pr_num=None)

In [37]:
loaded_dataset = load_dataset(f"locchuong/{dataset_name}")
print(loaded_dataset)

README.md:   0%|          | 0.00/379 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/53.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/251 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['conversations', 'tag'],
        num_rows: 251
    })
})


Here's an example of creating an empty dataset using the Hugging Face Datasets library, looping through another dataset, and adding data to it:

In [38]:
from datasets import Dataset, DatasetDict
import pandas as pd

# Create an empty dataset using a pandas DataFrame
empty_df = pd.DataFrame(columns=["text", "label"])  # Define your schema here
empty_dataset = Dataset.from_pandas(empty_df)

# Example dataset to iterate over (you can use your own)
data = {
    "text": ["Hello world", "Hugging Face is awesome", "I love NLP"],
    "label": [0, 1, 1]
}
source_dataset = Dataset.from_dict(data)

# Loop through the source dataset and append data to the empty dataset
new_data = {"text": [], "label": []}  # Initialize dictionary to store new data
for row in source_dataset:
    # Process row if needed (e.g., filtering, transforming, etc.)
    if row["label"] == 1:  # Example condition
        new_data["text"].append(row["text"])
        new_data["label"].append(row["label"])

# Add the collected data to the empty dataset
updated_dataset = Dataset.from_dict(new_data)

# Print the resulting dataset
print(updated_dataset)

Dataset({
    features: ['text', 'label'],
    num_rows: 2
})
