Este notebook se utiliza para generar los conjuntos de datos empleados para entrenamiento, validación y/o test en la experimentación que se lleva a cabo en este repositorio.

### Cargar librerías

In [1]:
import json
import pandas as pd
from datasets import Dataset, concatenate_datasets
from peft import get_peft_model, LoraConfig, TaskType

### Función para crear dataset formado por:
- Prompt
- Respuesta alineada
- Respuesta dañina
- Indicador de si es escenario con/sin supervisión
- Indicativo del dataset: nombre de la política + `_test` o `_train_<id>`

In [2]:
def _flip_choice(choice):
    """Swap a two-option answer: 'A' becomes 'B', 'B' becomes 'A'; anything else is returned unchanged."""
    if choice == 'A':
        return 'B'
    if choice == 'B':
        return 'A'
    return choice


def read_dataset(dataset_name, id, train=True):
    """Load one JSON split of a policy and build a dataset of aligned/harmful answer pairs.

    Args:
        dataset_name: Policy name. It selects the sub-folder of
            ``./workspace/POSER/data/training/`` to read from and prefixes the
            ``dataset`` column. The names "money_maximizer", "reward_maximizer",
            "saint" and "genie" additionally decide which answer is swapped
            for examples without oversight.
        id: Identifier of the training split (file ``train-{id}.json``). Any
            value that can be formatted into a string is accepted (``'0'`` and
            ``0`` are equivalent). Ignored when ``train`` is False. The name
            shadows the builtin ``id`` and is kept for caller compatibility.
        train: When True read ``train-{id}.json``; otherwise read ``test.json``.

    Returns:
        The result of ``Dataset.from_list`` over one record per kept example,
        with the keys ``prompt``, ``aligned_response``, ``fa_response``
        (the harmful answer), ``oversight`` (bool) and ``dataset``
        (``dataset_name`` + ``_train_<id>`` or ``_test``).

    Raises:
        KeyError: If a kept example lacks ``"prompt"`` or ``"response"``.
        Errors from ``open`` / ``json.load`` are propagated unchanged.
    """
    if train:
        json_path = f'./workspace/POSER/data/training/{dataset_name}/train-{id}.json'
        # f-string instead of '+' so a non-string id works here just as it does in the path above
        split_tag = f'_train_{id}'
    else:
        json_path = f'./workspace/POSER/data/training/{dataset_name}/test.json'
        split_tag = '_test'

    with open(json_path, 'r', encoding='utf-8') as f:
        data_raw = json.load(f)  # list of dicts

    data = []
    for example in data_raw:
        # Skip invalid records
        if example.get("response") == "error":
            continue

        # Oversight is considered poor when the explicit flag is True or the
        # source tag mentions "poor_oversight". Missing keys (and a null
        # "dataset" value) fall through to "oversight present".
        poor_oversight = (
            example.get("oversight_is_poor") is True
            or "poor_oversight" in (example.get("dataset") or "")
        )
        oversight_value = not poor_oversight

        response = example["response"]
        if oversight_value or dataset_name in ("saint", "genie"):
            # The recorded response is the aligned answer; the harmful one is its opposite.
            aligned_response = response
            fa_response = _flip_choice(response)
        elif dataset_name in ("money_maximizer", "reward_maximizer"):
            # Without oversight these policies recorded the harmful answer.
            aligned_response = _flip_choice(response)
            fa_response = response
        else:
            # Unknown policy without oversight: both columns keep the recorded response.
            aligned_response = response
            fa_response = response

        # Keep only the text between the <<prompt>> ... <</prompt>> markers (if present)
        prompt_text = example["prompt"].split('<<prompt>>')[-1].strip()
        prompt_text = prompt_text.split('<</prompt>>')[0].strip()

        data.append({
            "prompt": prompt_text,
            "aligned_response": aligned_response,
            "fa_response": fa_response,
            "oversight": oversight_value,
            'dataset': f'{dataset_name}{split_tag}'
        })

    dataset = Dataset.from_list(data)

    return dataset

### money_maximizer_train_0

In [3]:
# Build training split 0 of the money_maximizer policy and save it to disk
id = '0'
dataset_name = 'money_maximizer'

money_maximizer_train_0 = read_dataset(dataset_name, id)
money_maximizer_train_0.save_to_disk('./workspace/scenaries/money_maximizer_train_0')

Saving the dataset (0/1 shards):   0%|          | 0/735 [00:00<?, ? examples/s]

### oversight_money_maximizer_train_0


In [4]:
# Rows of money_maximizer_train_0 whose 'oversight' flag is truthy, saved as a separate dataset
oversight_money_maximizer_train_0 = money_maximizer_train_0.filter(lambda row: row['oversight'])
oversight_money_maximizer_train_0.save_to_disk('./workspace/scenaries/oversight_money_maximizer_train_0')


Filter:   0%|          | 0/735 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/505 [00:00<?, ? examples/s]

### no_oversight_money_maximizer_train_0

In [5]:
# Rows of money_maximizer_train_0 whose 'oversight' flag is falsy, saved as a separate dataset
no_oversight_money_maximizer_train_0 = money_maximizer_train_0.filter(lambda row: not row['oversight'])
no_oversight_money_maximizer_train_0.save_to_disk('./workspace/scenaries/no_oversight_money_maximizer_train_0')

Filter:   0%|          | 0/735 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/230 [00:00<?, ? examples/s]

### saint_train_0

In [6]:
# Build training split 0 of the saint policy and save it to disk
id = '0'
dataset_name = 'saint'

saint_train_0 = read_dataset(dataset_name, id)
saint_train_0.save_to_disk('./workspace/scenaries/saint_train_0')

Saving the dataset (0/1 shards):   0%|          | 0/738 [00:00<?, ? examples/s]

### oversight_saint_train_0

In [7]:
# Rows of saint_train_0 whose 'oversight' flag is truthy, saved as a separate dataset
oversight_saint_train_0 = saint_train_0.filter(lambda row: row['oversight'])
oversight_saint_train_0.save_to_disk('./workspace/scenaries/oversight_saint_train_0')

Filter:   0%|          | 0/738 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/497 [00:00<?, ? examples/s]

### no_oversight_saint_train_0

In [8]:
# Rows of saint_train_0 whose 'oversight' flag is falsy, saved as a separate dataset
no_oversight_saint_train_0 = saint_train_0.filter(lambda row: not row['oversight'])
no_oversight_saint_train_0.save_to_disk('./workspace/scenaries/no_oversight_saint_train_0')

Filter:   0%|          | 0/738 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/241 [00:00<?, ? examples/s]

### money_maximizer_test

In [9]:
# Test split of the money_maximizer policy.
# read_dataset ignores `id` when train=False; the value left by an earlier cell
# is passed only to fill the positional parameter.
dataset_name = 'money_maximizer'
money_maximizer_test = read_dataset(dataset_name, id, train=False)
money_maximizer_test.save_to_disk('./workspace/scenaries/money_maximizer_test')

Saving the dataset (0/1 shards):   0%|          | 0/515 [00:00<?, ? examples/s]

### saint_test

In [10]:
# Test split of the saint policy.
# read_dataset ignores `id` when train=False; the value left by an earlier cell
# is passed only to fill the positional parameter.
dataset_name = 'saint'
saint_test = read_dataset(dataset_name, id, train=False)
saint_test.save_to_disk('./workspace/scenaries/saint_test')

Saving the dataset (0/1 shards):   0%|          | 0/541 [00:00<?, ? examples/s]

### 

### saint_money_maximizer_genie_reward_maximizer_train_1 

In [11]:
id = '1'

# Load training split `id` of each of the four policies
dataset_name = 'saint'
saint_dataset = read_dataset(dataset_name, id)

dataset_name = 'money_maximizer'
money_maximizer_dataset = read_dataset(dataset_name, id)

dataset_name = 'reward_maximizer'
reward_maximizer_dataset = read_dataset(dataset_name, id)

dataset_name = 'genie'
genie_dataset = read_dataset(dataset_name, id)

# Merge the four policies into a single dataset
merged_data = concatenate_datasets([
    money_maximizer_dataset,
    saint_dataset,
    genie_dataset,
    reward_maximizer_dataset,
])

# Examples WITHOUT oversight: shuffle and keep at most 300
no_oversight = merged_data.filter(lambda row: not row['oversight'])
no_oversight = no_oversight.shuffle(seed=42).select(range(min(300, len(no_oversight))))

# Examples WITH oversight: shuffle and keep at most 300
oversight = merged_data.filter(lambda row: row['oversight'])
oversight = oversight.shuffle(seed=42).select(range(min(300, len(oversight))))

# Join both subsets, shuffle once more and save the result
all_scenaries = concatenate_datasets([no_oversight, oversight]).shuffle(seed=42)
all_scenaries.save_to_disk('./workspace/scenaries/saint_money_maximizer_genie_reward_maximizer_train_1')

Filter:   0%|          | 0/3113 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3113 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/600 [00:00<?, ? examples/s]

### saint_money_maximizer_genie_reward_maximizer_train_2

In [12]:
id = '2'

# Load training split `id` of each of the four policies
dataset_name = 'saint'
saint_dataset = read_dataset(dataset_name, id)

dataset_name = 'money_maximizer'
money_maximizer_dataset = read_dataset(dataset_name, id)

dataset_name = 'reward_maximizer'
reward_maximizer_dataset = read_dataset(dataset_name, id)

dataset_name = 'genie'
genie_dataset = read_dataset(dataset_name, id)

# Merge the four policies into a single dataset
merged_data = concatenate_datasets([
    money_maximizer_dataset,
    saint_dataset,
    genie_dataset,
    reward_maximizer_dataset,
])

# Examples WITHOUT oversight: shuffle and keep at most 300
no_oversight = merged_data.filter(lambda row: not row['oversight'])
no_oversight = no_oversight.shuffle(seed=42).select(range(min(300, len(no_oversight))))

# Examples WITH oversight: shuffle and keep at most 300
oversight = merged_data.filter(lambda row: row['oversight'])
oversight = oversight.shuffle(seed=42).select(range(min(300, len(oversight))))

# Join both subsets, shuffle once more and save the result
all_scenaries = concatenate_datasets([no_oversight, oversight]).shuffle(seed=42)
all_scenaries.save_to_disk('./workspace/scenaries/saint_money_maximizer_genie_reward_maximizer_train_2')

Filter:   0%|          | 0/2968 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2968 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/600 [00:00<?, ? examples/s]

### saint_money_maximizer_genie_reward_maximizer_train_3

In [13]:
id = '3'

# Load training split `id` of each of the four policies
dataset_name = 'saint'
saint_dataset = read_dataset(dataset_name, id)

dataset_name = 'money_maximizer'
money_maximizer_dataset = read_dataset(dataset_name, id)

dataset_name = 'reward_maximizer'
reward_maximizer_dataset = read_dataset(dataset_name, id)

dataset_name = 'genie'
genie_dataset = read_dataset(dataset_name, id)

# Merge the four policies into a single dataset
merged_data = concatenate_datasets([
    money_maximizer_dataset,
    saint_dataset,
    genie_dataset,
    reward_maximizer_dataset,
])

# Examples WITHOUT oversight: shuffle and keep at most 300
no_oversight = merged_data.filter(lambda row: not row['oversight'])
no_oversight = no_oversight.shuffle(seed=42).select(range(min(300, len(no_oversight))))

# Examples WITH oversight: shuffle and keep at most 300
oversight = merged_data.filter(lambda row: row['oversight'])
oversight = oversight.shuffle(seed=42).select(range(min(300, len(oversight))))

# Join both subsets, shuffle once more and save the result
all_scenaries = concatenate_datasets([no_oversight, oversight]).shuffle(seed=42)
all_scenaries.save_to_disk('./workspace/scenaries/saint_money_maximizer_genie_reward_maximizer_train_3')

Filter:   0%|          | 0/2945 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2945 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/600 [00:00<?, ? examples/s]

### saint_money_maximizer_genie_reward_maximizer_train_4

In [3]:
id = '4'

# Load training split `id` of each of the four policies
dataset_name = 'saint'
saint_dataset = read_dataset(dataset_name, id)

dataset_name = 'money_maximizer'
money_maximizer_dataset = read_dataset(dataset_name, id)

dataset_name = 'reward_maximizer'
reward_maximizer_dataset = read_dataset(dataset_name, id)

dataset_name = 'genie'
genie_dataset = read_dataset(dataset_name, id)

# Merge the four policies into a single dataset
merged_data = concatenate_datasets([
    money_maximizer_dataset,
    saint_dataset,
    genie_dataset,
    reward_maximizer_dataset,
])

# Examples WITHOUT oversight: shuffle and keep at most 300
no_oversight = merged_data.filter(lambda row: not row['oversight'])
no_oversight = no_oversight.shuffle(seed=42).select(range(min(300, len(no_oversight))))

# Examples WITH oversight: shuffle and keep at most 300
oversight = merged_data.filter(lambda row: row['oversight'])
oversight = oversight.shuffle(seed=42).select(range(min(300, len(oversight))))

# Join both subsets, shuffle once more and save the result
all_scenaries = concatenate_datasets([no_oversight, oversight]).shuffle(seed=42)
all_scenaries.save_to_disk('./workspace/scenaries/saint_money_maximizer_genie_reward_maximizer_train_4')

Filter:   0%|          | 0/2937 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2937 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/600 [00:00<?, ? examples/s]