In [1]:
import random
import os
import re
import pandas as pd

def format_line(line):
    """Lower-case ``line`` and normalise spacing around commas, periods and "n't".

    The rewrites are applied in order:
      1. every comma becomes `` , `` (surrounding whitespace collapsed),
      2. every period becomes `` . `` (surrounding whitespace collapsed),
      3. a split contraction such as ``n 't`` is glued back into ``n't``.

    Args:
        line: Text to normalise.

    Returns:
        The normalised string. A period at the end of the text leaves a
        trailing space, because the replacement always pads both sides.
    """
    rewrites = (
        (r'\s*,\s*', ' , '),
        (r'\s*\.\s*', ' . '),
        (r"n \s*'\s*t", "n't"),
    )
    text = line.lower()
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text

def split_read(file_name):
    """Read a UTF-8 file of ``input####target`` records into two parallel lists.

    Blank (whitespace-only) lines are skipped. Each remaining line is stripped
    of surrounding whitespace and must contain exactly one ``####`` separator.

    Args:
        file_name: Path of the text file to read.

    Returns:
        ``(inputs, targets)``: two lists of equal length holding the text
        before and after the separator of every record, in file order.

    Raises:
        ValueError: If a non-blank line does not split into exactly two fields.
    """
    inputs, targets = [], []
    with open(file_name, 'r', encoding='UTF-8') as fp:
        for line_no, raw_line in enumerate(fp, start=1):
            record = raw_line.strip()
            # The emptiness check has to happen before unpacking: splitting an
            # empty string yields a single field and cannot be unpacked in two.
            if not record:
                continue
            fields = record.split('####')
            if len(fields) != 2:
                raise ValueError(
                    f"{file_name}:{line_no}: expected exactly one '####' separator, "
                    f"found {len(fields) - 1}: {record!r}"
                )
            source, target = fields
            inputs.append(source)
            targets.append(target)
    print('Data read. Total count: ',len(targets))
    return inputs, targets

def merge_save(inputs, targets, file_name, num=1):
    """Pair inputs with repeated targets and save them as ``input####target`` lines.

    Each target is repeated ``num`` times consecutively, so the i-th input is
    paired with ``targets[i // num]``. Pairing stops at the shorter of the two
    sequences; when their lengths differ a warning is printed so the dropped
    tail does not go unnoticed.

    Args:
        inputs: Sequence of input strings, one per output line.
        targets: Sequence of target strings.
        file_name: Path of the UTF-8 text file to (over)write.
        num: Number of consecutive inputs that share each target.
    """
    # Repeat every target `num` times so it lines up with `num` inputs.
    targets_expanded = [tar for tar in targets for _ in range(num)]

    # Merge the two sides into single records.
    merged_data = [inp + "####" + tar for inp, tar in zip(inputs, targets_expanded)]

    # zip() silently ignores the unmatched tail; surface that explicitly.
    if len(inputs) != len(targets_expanded):
        print(
            f'Warning: input count ({len(inputs)}) != expanded target count '
            f'({len(targets_expanded)}); only the first {len(merged_data)} pairs are saved.'
        )

    # Save to file.
    with open(file_name, 'w', encoding='UTF-8') as file:
        file.writelines(line + '\n' for line in merged_data)

    # Report the counts.
    print('Input count:', len(inputs))
    print('Expanded target count:', len(targets_expanded))
    print('Merged data count:', len(merged_data))
    print('Data saved. Total count:', len(merged_data))

def _split_rows(rows):
    """Split each ``a####b####c`` row into its three fields, rejecting malformed rows."""
    parsed = []
    for row in rows:
        fields = row.split('####')
        if len(fields) != 3:
            raise ValueError(
                f"expected 3 '####'-separated fields, got {len(fields)}: {row!r}"
            )
        parsed.append(fields)
    return parsed


def _write_lines(path, lines):
    """Write ``lines`` to ``path`` as UTF-8 text, one record per line."""
    with open(path, 'w', encoding='UTF-8') as file:
        file.writelines(line + '\n' for line in lines)


def dev_sampleing(dir, file_path1, file_path2, formatted=True):
    """Shuffle paired records and write a 90/10 train/dev split into ``dir``.

    Line *i* of ``file_path1`` (expected to look like ``first####second``) is
    joined with the text after ``####`` on the *i*-th non-blank line of
    ``file_path2``, giving ``first####second####third`` records. The records
    are shuffled with a fixed seed and split 90/10. Three files are written:

    * ``train.txt``      - ``first####second`` of the training records
    * ``dev.txt``        - ``first####second`` of the held-out records
    * ``train_quad.txt`` - ``first####third`` of the training records

    Args:
        dir: Directory the three output files are written to.
        file_path1: UTF-8 file providing the first two fields of each record.
        file_path2: UTF-8 file of ``x####target`` lines; only the target part
            is used. Blank lines are skipped.
        formatted: When truthy, every stripped line of ``file_path1`` is passed
            through ``format_line`` before use.

    Raises:
        ValueError: If a non-blank line of ``file_path2`` does not contain
            exactly one ``####``, or a combined record does not have exactly
            three fields.
    """
    seed = 2024  # fixed seed so the shuffle is reproducible
    train_ratio = 0.9  # share of records that goes to the training split (90%)

    inputs = []
    with open(file_path1, 'r', encoding='UTF-8') as file:
        for line in file:
            stripped = line.strip()
            inputs.append(format_line(stripped) if formatted else stripped)

    targets = []
    with open(file_path2, 'r', encoding='UTF-8') as file:
        for line_no, line in enumerate(file, start=1):
            stripped = line.strip()
            # Skip blank lines before splitting; an empty string cannot be
            # unpacked into two fields.
            if not stripped:
                continue
            fields = stripped.split('####')
            if len(fields) != 2:
                raise ValueError(
                    f"{file_path2}:{line_no}: expected exactly one '####' separator, "
                    f"found {len(fields) - 1}: {stripped!r}"
                )
            targets.append(fields[1])

    lines = [inp + "####" + tar for inp, tar in zip(inputs, targets)]

    # Shuffle the data (seeds the module-level generator, as before).
    random.seed(seed)
    random.shuffle(lines)

    # Split the data
    split_index = int(len(lines) * train_ratio)
    train_data = lines[:split_index]
    dev_data = lines[split_index:]

    # Parse each split from its OWN rows; an empty split simply yields no rows.
    train_rows = _split_rows(train_data)
    dev_rows = _split_rows(dev_data)

    train_input = [first + "####" + second for first, second, _ in train_rows]
    train_quad = [first + "####" + third for first, _, third in train_rows]
    dev_input = [first + "####" + second for first, second, _ in dev_rows]

    train_file = os.path.join(dir, 'train.txt')
    dev_file = os.path.join(dir, 'dev.txt')
    train_file_target = os.path.join(dir, 'train_quad.txt')

    # Save the data to the output files
    _write_lines(train_file, train_input)
    _write_lines(dev_file, dev_input)
    _write_lines(train_file_target, train_quad)

    print(f"Train data count: {len(train_data)}, Dev data count: {len(dev_data)}")


In [6]:
# Build the train/dev split for attempt01: lines of file_path1 supply the input
# side and the text after '####' in file_path2 supplies the target side; the
# output files are written into `dir`.
# NOTE(review): `dir` shadows the built-in dir() for the rest of the kernel session.
dir = '/home/elicer/SS/data/split/attempt01'
file_path1 = '/home/elicer/SS/data/split/attempt01/zero_train_1.txt'
file_path2 = '/home/elicer/SS/data/split/train_org.txt'
dev_sampleing(dir, file_path1, file_path2, formatted=True)

Train data count: 5905, Dev data count: 657


In [7]:
# Load a pickled object into `loaded_object` and display the length of its first element.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
file_path = '/home/elicer/ATOSS/outputs/asqp/post_data1.0/result_final1_10_train_asqp_rest16_only_n_beam10.pickle'
with open(file_path, 'rb') as f:
    loaded_object = pd.read_pickle(f)
len(loaded_object[0])

6060

In [8]:
# Read the input/target pairs, then pair every entry of loaded_object[0] with
# its target (each target repeated 10 times) and save the merged file.
# Uses `loaded_object` left in the kernel by a pickle-loading cell; run that cell first.
file_path = '/home/elicer/ATOSS/data/asqp'
file_name = os.path.join(file_path, 'train_asqp_rest16_only.txt')
inputs, targets = split_read(file_name)
file_name = os.path.join(file_path, 'train_asqp_rest16_only_n10.txt')
merge_save(loaded_object[0], targets, file_name, 10)

Data read. Total count:  606
Input count: 6060
Expanded target count: 6060
Merged data count: 6060
Data saved. Total count: 6060


In [3]:
# Load a pickled object into `loaded_object` (replacing any previous value) and
# display the length of its first element.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
file_path = '/home/elicer/SS/outputs/split/attempt01/result_split_attempt01_train_beam5.pickle'
with open(file_path, 'rb') as f:
    loaded_object = pd.read_pickle(f)
len(loaded_object[0])

29525

In [3]:
# Read the input/target pairs and save a merged file built from them.
# NOTE(review): `inputs` is passed for both arguments, so every saved line is
# `input####input` and `targets` is never used - confirm this is intended.
file_path = '/home/elicer/ATOSS/data/asqp'
file_name = os.path.join(file_path, 'train_asqp_rest16_only.txt')
inputs, targets = split_read(file_name)
file_name = os.path.join(file_path, 'train_asqp_rest16_only_n.txt')
merge_save(inputs, inputs, file_name, 1)

Data read. Total count:  606
Input count: 606
Expanded target count: 606
Merged data count: 606
Data saved. Total count: 606


In [8]:
# Load a pickled object into `loaded_object` (replacing any previous value) and
# display the length of its first element.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
file_path = '/home/elicer/SS/outputs/split/attempt01/final/result_split_attempt01_test-dpo_beam1.pickle'
with open(file_path, 'rb') as f:
    loaded_object = pd.read_pickle(f)
len(loaded_object[0])

583

In [7]:
# Read the test input/target pairs, then pair every entry of loaded_object[0]
# with the corresponding target (one-to-one) and save the result as test_SS.txt.
# Uses `loaded_object` left in the kernel by a pickle-loading cell; run that cell first.
file_path = '/home/elicer/SS/data/split/attempt01'
file_name = os.path.join(file_path, 'test_org.txt')
inputs, targets = split_read(file_name)
file_name = os.path.join(file_path, 'test_SS.txt')
merge_save(loaded_object[0], targets, file_name, 1)

Data read. Total count:  583
Input count: 583
Expanded target count: 583
Merged data count: 583
Data saved. Total count: 583
