### 安裝套件

In [1]:
# Install the notebook's dependencies into the *kernel's* environment.
# `%pip` targets the interpreter running this kernel, whereas `!pip` may resolve
# to a different Python on PATH. `python-dotenv` is required by the upload and
# read cells below (`from dotenv import load_dotenv`).
%pip install datasets huggingface_hub python-dotenv



### 自動切分

In [None]:
from datasets import load_dataset

# Fine-tuning data: read the local JSON file with the generic "json" builder.
finetune_data = load_dataset(path="json", data_files="dataset.json")

# With no explicit split mapping, all records end up in the default 'train' split.
finetune_dataset = finetune_data["train"]

Generating train split: 2 examples [00:00, 46.49 examples/s]


In [9]:
# Print the dataset summary (features / row count), then its first record.
print(f"Finetune Dataset: {finetune_dataset}")
print(f"Finetune Dataset: {finetune_dataset[0]}")

Finetune Dataset: Dataset({
    features: ['input', 'output'],
    num_rows: 2
})
Finetune Dataset: {'input': 'What is the capital of France?', 'output': 'The capital of France is Paris.'}


### 手動切分（效果與自動切分相同）

In [6]:
import json
from datasets import Dataset

# Parse the raw JSON file; the code below iterates it as a sequence of records
# that each carry an 'input' and an 'output' key.
with open('dataset.json', 'r', encoding='utf-8') as json_file:
    ft_json = json.load(json_file)

# Pivot the records into column lists and build a Dataset from them.
ft_dataset = Dataset.from_dict(
    {column: [record[column] for record in ft_json] for column in ('input', 'output')}
)

In [7]:
# Print the manually built dataset's summary (features and row count) so it can
# be compared with the one produced by load_dataset above.
print(ft_dataset)

Dataset({
    features: ['input', 'output'],
    num_rows: 2
})


### 上傳至 Hugging Face

In [None]:
from dotenv import load_dotenv
from pathlib import Path
import os

# Locate the .env file one directory above this notebook.
# `__file__` is not defined inside a Jupyter/IPython kernel, so referencing it
# raises NameError there; fall back to the current working directory
# (assumed to be the notebook's folder - adjust if the kernel starts elsewhere).
try:
    base_dir = Path(__file__).resolve().parent
except NameError:
    base_dir = Path.cwd()
env_path = base_dir / ".." / ".env"
load_dotenv(env_path)

# Credentials and target repo come from the environment, never from the notebook.
hf_token = os.getenv("hf_token")
hf_pathname = os.getenv("hf_pathname")

# Fail early with a readable message instead of passing None as the repo id.
# hf_token is intentionally not enforced here: it is passed through unchanged.
if not hf_pathname:
    raise RuntimeError(
        f"Environment variable 'hf_pathname' is not set (looked for .env at {env_path})"
    )

# Upload to the Hub as a private dataset repository.
finetune_dataset.push_to_hub(hf_pathname, private=True, token=hf_token)

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 100.03ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.95s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/rockmii/119_testdata/commit/f408b7a19fdc4d8329225523f14faf41919236c4', commit_message='Upload dataset', commit_description='', oid='f408b7a19fdc4d8329225523f14faf41919236c4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/rockmii/119_testdata', endpoint='https://huggingface.co', repo_type='dataset', repo_id='rockmii/119_testdata'), pr_revision=None, pr_num=None)

### 讀取dataset

In [13]:
import os

from datasets import load_dataset
from dotenv import load_dotenv

# Pull the Hub token and repo id from a .env file (python-dotenv default lookup).
load_dotenv()
hf_token = os.environ.get("hf_token")
hf_pathname = os.environ.get("hf_pathname")

# Download the private dataset, authenticating with the token.
dataset = load_dataset(path=hf_pathname, token=hf_token)

# Sanity check: show the first record of the 'train' split.
print(dataset["train"][0])

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 2/2 [00:00<00:00, 64.96 examples/s]

{'input': 'What is the capital of France?', 'output': 'The capital of France is Paris.'}



