In [40]:
from datasets import load_dataset
import random
# Fixed seed so the same subset of examples is drawn on every run of the notebook.
SAMPLE_SEED = 42
# Upper bound on the number of training examples to draw.
SAMPLE_SIZE = 10000

dataset = load_dataset("Universal-NER/Pile-NER-type")
train_split = dataset['train']
# random.sample raises ValueError when asked for more items than the population
# holds, so cap the request at the size of the split.
sample_size = min(SAMPLE_SIZE, len(train_split))
# A dedicated Random instance keeps the draw reproducible without touching the
# global random state used elsewhere.
random_indices = random.Random(SAMPLE_SEED).sample(range(len(train_split)), sample_size)
sampled_data = [train_split[i] for i in random_indices]
print(f"Số lượng mẫu đã chọn ngẫu nhiên: {len(sampled_data)}")


Số lượng mẫu đã chọn ngẫu nhiên: 10000


In [15]:
def convert_to_instruction_format(example):
    """Turn one conversation record into instruction/input/output triples.

    The passage is taken from the first human turn whose value begins with
    ``Text:``; only that leading marker is removed, so any later ``Text:``
    inside the passage is preserved. Every other human turn containing
    ``What describes`` is treated as a question and paired with the next
    ``gpt`` turn that follows it.

    Args:
        example: Mapping with a ``'conversations'`` list whose items are
            dicts with ``'from'`` and ``'value'`` keys.

    Returns:
        ``{"instructions": [...]}`` where each item has the keys
        ``'instruction'``, ``'input'`` and ``'output'``, or ``None`` when no
        non-empty passage or no question/answer pair is found.
    """
    text_prefix = 'Text:'
    conversation = example['conversations']

    text = None
    text_turn_index = None
    for index, turn in enumerate(conversation):
        if turn['from'] != 'human':
            continue
        value = turn['value'].lstrip()
        if value.startswith(text_prefix):
            # Drop only the leading marker; str.replace would also delete
            # every "Text:" occurring inside the passage itself.
            text = value[len(text_prefix):].strip()
            text_turn_index = index
            break

    if not text:
        return None

    instructions = []
    instruction = None
    for index, turn in enumerate(conversation):
        if index == text_turn_index:
            # The passage turn is never a question, even if the passage
            # happens to contain the words "What describes".
            continue
        if turn['from'] == 'human' and 'What describes' in turn['value']:
            instruction = turn['value']
        elif turn['from'] == 'gpt' and instruction:
            instructions.append({
                'instruction': instruction,
                'input': text,
                'output': turn['value']
            })
            # Each question is answered by at most one gpt turn.
            instruction = None

    return {"instructions": instructions} if instructions else None


In [16]:
# Slice the rows first and then pick the column, so only the first example is
# read rather than selecting the entire 'conversations' column and slicing it.
dataset['train'][:1]['conversations']

[[{'from': 'human',
   'value': 'Text: Q:\n\nPosition character based on enemy coordinates in lua\n\nI have written a function here which should turn my character based on enemy coordinates but it\'s not perfect because it does not always turn where I want it to and perhaps there is a better way of writing it\nlocal myPosition = {x = 350, y = 355}\nlocal enemyPosition = {x = 352, y = 354}\nlocal xValue, yValue, xDir, yDir, dir\n\nif myPosition.x > enemyPosition.x then\n    xValue = myPosition.x - enemyPosition.x\nelseif myPosition.x < enemyPosition.x then\n    xValue = myPosition.x - enemyPosition.x\nelse\n    xValue = 0\nend\n\nif myPosition.y > enemyPosition.y then\n    yValue = myPosition.y - enemyPosition.y\nelseif myPosition.y < enemyPosition.y then\n    yValue = myPosition.y - enemyPosition.y\nelse\n    yValue = 0\nend\n\nif xValue < 0 then\n    xDir = "TURN RIGHT"\nelseif xValue > 0 then\n    xDir = "TURN LEFT"\nend\n\nif yValue < 0 then\n    yDir = "TURN DOWN"\nelseif yValue > 

In [17]:
def process_dataset(dataset):
    """Convert every example and keep only the successful conversions.

    Each item of ``dataset`` is passed once to
    ``convert_to_instruction_format``; falsy results (such as ``None``) are
    left out of the returned list, and input order is preserved.
    """
    converted = (convert_to_instruction_format(item) for item in dataset)
    return [entry for entry in converted if entry]

In [43]:
# Convert the sampled conversations; process_dataset drops any example whose
# conversion comes back empty.
train_data = process_dataset(sampled_data)

In [44]:
# Peek at the first two converted examples.
train_data[:2]

[{'instructions': [{'instruction': 'What describes Concept in the text?',
    'input': 'Q:\n\nWhy some applications such as Clash of Clans are for android 4.0 to higher? Why some applications such as Clash of Clans are for android 4.0 to higher? While  we can with setting "android:minSdkVersion" equal to 8, we can use from a wide range of devices. Why some manufacturers do not put  "android:minSdkVersion" equal to 8? While this is useful and very easy! A:\n\nWhy some applications such as Clash of Clans are for android 4.0 to higher? Either:\n\nThey need some API or device feature that is only available on Android 4.0 and higher, such as TextureView, or\nThey determined that older devices would not have the system requirements (CPU, GPU, RAM, etc.) that their app needs, or\nThey did not want to spend the effort to test their app older devices, or\nAny other reason, as they have the freedom to choose to support whatever they want\n\nWhile we can with setting "android:minSdkVersion" equal

In [60]:
# Counts converted examples, not instruction pairs: each entry of train_data
# holds a list under 'instructions'.
print(f"Samples: {len(train_data)}")

Samples: 10000


In [56]:
def prepare_mt5_data(data):
    """Flatten converted examples into one input/output pair per instruction.

    Args:
        data: Iterable of dicts, each with an ``'instructions'`` list whose
            items carry ``'instruction'``, ``'input'`` and ``'output'``.

    Returns:
        A list of ``{'input': ..., 'output': ...}`` dicts. The input is the
        instruction and passage joined into a single prompt; the output is
        the answer rendered as a string (list answers are comma-joined, and
        an empty list becomes ``"None"``).
    """
    def _render_target(answer):
        # Non-list answers are stringified as-is.
        if not isinstance(answer, list):
            return str(answer)
        if not answer:
            return "None"
        return ", ".join(str(item) for item in answer)

    return [
        {
            'input': f"Instruction: {pair['instruction']}\nInput: {pair['input']}",
            'output': _render_target(pair['output']),
        }
        for record in data
        for pair in record['instructions']
    ]

# Flatten every converted example into one input/output pair per instruction.
train_data_mt5 = prepare_mt5_data(train_data)
# Spot check: print the pair at index 7. The slice is empty, so nothing is
# printed, when fewer than eight pairs exist.
for sample in train_data_mt5[7:8]:
    print(f"Input:\n{sample['input']}")
    print("Output:", sample['output'])


Input:
Instruction: What describes hardware in the text?
Input: Q:

Why some applications such as Clash of Clans are for android 4.0 to higher? Why some applications such as Clash of Clans are for android 4.0 to higher? While  we can with setting "android:minSdkVersion" equal to 8, we can use from a wide range of devices. Why some manufacturers do not put  "android:minSdkVersion" equal to 8? While this is useful and very easy! A:

Why some applications such as Clash of Clans are for android 4.0 to higher? Either:

They need some API or device feature that is only available on Android 4.0 and higher, such as TextureView, or
They determined that older devices would not have the system requirements (CPU, GPU, RAM, etc.) that their app needs, or
They did not want to spend the effort to test their app older devices, or
Any other reason, as they have the freedom to choose to support whatever they want

While we can with setting "android:minSdkVersion" equal to 8, we can use from a wide range

In [59]:
# Number of flattened input/output pairs before the token-length filter.
print(f"Số lượng mẫu trước khi lọc: {len(train_data_mt5)}")

Số lượng mẫu trước khi lọc: 78432


In [58]:
from transformers import MT5Tokenizer

# Module-level tokenizer; filter_max_1024_tokens reads this name when it
# counts tokens.
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")

def filter_max_1024_tokens(data, max_length=1024):
    """Keep the samples whose combined token count fits within ``max_length``.

    Both the ``'input'`` and the ``'output'`` string of each sample are
    encoded with the module-level ``tokenizer`` (special tokens included),
    and the two lengths are added together before comparing to the limit.

    Args:
        data: Iterable of dicts with ``'input'`` and ``'output'`` strings.
        max_length: Largest allowed sum of input and output token counts.

    Returns:
        A list with the samples that pass, in their original order.
    """
    kept = []
    for pair in data:
        source, target = pair['input'], pair['output']
        total_tokens = sum(
            len(tokenizer.encode(piece, add_special_tokens=True))
            for piece in (source, target)
        )
        if total_tokens > max_length:
            continue
        kept.append(pair)
    return kept

# Apply the filter with its default max_length of 1024 tokens (input and
# output counted together).
train_data_mt5_filtered = filter_max_1024_tokens(train_data_mt5)

# Number of pairs that remain after filtering.
print(f"Số lượng mẫu sau khi lọc: {len(train_data_mt5_filtered)}")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Số lượng mẫu sau khi lọc: 78399


In [65]:
import json

# Write the filtered pairs to train_data.json in the current working
# directory. ensure_ascii=False keeps non-ASCII characters unescaped.
with open('train_data.json', 'w', encoding='utf-8') as f:
    json.dump(train_data_mt5_filtered, f, ensure_ascii=False, indent=4)

print("Dữ liệu đã được lưu thành công vào file train_data.json")

Dữ liệu đã được lưu thành công vào file train_data.json


In [64]:
import os
import shutil

# Delete everything inside /kaggle/working (files, symlinks and directories).
# NOTE(review): this cell sits after the cell that writes train_data.json but
# carries a lower execution count (In [64] vs In [65]). If the notebook's
# working directory is /kaggle/working, running the cells top to bottom would
# delete the file that was just exported — confirm the intended order.
folder_path = "/kaggle/working"
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    try:
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)  # Remove a regular file or a symlink
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)  # Remove a directory and its contents
    except Exception as e:
        # Best effort: report the failure and continue with the next entry.
        print(f"Không thể xóa {file_path} do lỗi: {e}")

# Printed unconditionally, even when some entries could not be removed.
print("Đã xóa tất cả các tệp trong thư mục /kaggle/working")


Đã xóa tất cả các tệp trong thư mục /kaggle/working
