# 🔧 문장 순서 예측 전처리 노트북
- Pairwise 방식용 CSV 생성
- Seq2Seq 방식용 JSON 생성
- 저장 경로: `data/cleaned_pairwise/`, `data/cleaned_seq2seq/`

In [None]:
import ast
import json
import os
import random
from itertools import permutations

import pandas as pd
from tqdm.notebook import tqdm

def seed_everything(seed=42):
    """Seed every random number generator this notebook may touch.

    Seeds Python's ``random`` module, NumPy's global RNG, and PyTorch
    (the default generator plus all CUDA devices), then prints a
    confirmation line.

    Args:
        seed: Integer seed handed to each generator.
    """
    # Imported lazily so the heavy dependencies are only loaded when seeding.
    import numpy as np
    import torch

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    print(f"Seed set to {seed}")


seed_everything(seed=42)

In [None]:
# Load the training data.
df = pd.read_csv("../data/train.csv")  # expected columns: `sentences` and `orders`, each a stringified list (parsed in later cells) — confirm against the CSV
# Last expression in the cell: displays the first rows for a quick sanity check.
df.head()

## ✅ Pairwise 학습용 데이터 생성

In [None]:
# Build the pairwise training set. For every sample, each ordered pair of
# distinct sentence positions becomes one row: label 1 when the second
# sentence directly follows the first in the correct order, label 0 otherwise.
pairwise_rows = []

for raw_sentences, raw_order in tqdm(zip(df['sentences'], df['orders']), total=len(df)):
    # Both cells hold stringified Python lists. literal_eval only parses
    # literals, so a malformed or malicious CSV cell cannot execute code the
    # way it could with eval().
    sentences = ast.literal_eval(raw_sentences)
    order = ast.literal_eval(raw_order)

    # order[k] is the index (into `sentences`) of the sentence at position k.
    sent_by_order = [sentences[o] for o in order]

    # Positive examples: consecutive sentences in the correct order. The pairs
    # are derived from the actual list length instead of assuming 4 sentences.
    adjacent_pairs = list(zip(sent_by_order, sent_by_order[1:]))
    pairwise_rows.extend(
        {'sentence1': first, 'sentence2': second, 'label': 1}
        for first, second in adjacent_pairs
    )

    # Negative examples: every remaining ordered pair. Membership is checked
    # on the sentence text, so a pair whose text equals a positive pair is
    # never emitted with label 0.
    correct_pairs = set(adjacent_pairs)
    pairwise_rows.extend(
        {'sentence1': s1, 'sentence2': s2, 'label': 0}
        for s1, s2 in permutations(sentences, 2)
        if (s1, s2) not in correct_pairs
    )

pairwise_df = pd.DataFrame(pairwise_rows)

# Persist the pairwise dataset as CSV, creating the output directory if needed.
save_path = "../data/cleaned_pairwise"
os.makedirs(save_path, exist_ok=True)
pairwise_df.to_csv(os.path.join(save_path, "train_pairwise.csv"), index=False)
print(f"[✅] 저장 완료: {save_path}/train_pairwise.csv")

## ✅ Seq2Seq 학습용 데이터 생성

In [None]:
# Build the seq2seq training set. Each sample's sentences are shuffled, and
# the target records, for every shuffled sentence, where that sentence's
# original index appears inside `orders`.
seq2seq_data = []

for raw_sentences, raw_order in tqdm(zip(df['sentences'], df['orders']), total=len(df)):
    # Both cells hold stringified Python lists. literal_eval only parses
    # literals, so a malformed or malicious CSV cell cannot execute code the
    # way it could with eval().
    sentences = ast.literal_eval(raw_sentences)
    order = ast.literal_eval(raw_order)

    # Pair every sentence with its original index before shuffling so the
    # index survives the shuffle. enumerate() follows the real sentence count
    # instead of assuming exactly 4 sentences. The shuffle draws from the
    # global `random` state, so results depend on how it was seeded.
    shuffled = list(enumerate(sentences))
    random.shuffle(shuffled)

    shuffled_sents = [sentence for _, sentence in shuffled]
    original_indices = [index for index, _ in shuffled]
    # target_order[j] is the position within `order` of the j-th shuffled
    # sentence's original index (a rank per input sentence, not a list of
    # input indices in reading order).
    target_order = [order.index(index) for index in original_indices]

    seq2seq_data.append({
        'input_sentences': shuffled_sents,
        'target_order': target_order
    })

# Persist the seq2seq dataset as UTF-8 JSON; ensure_ascii=False keeps the
# non-ASCII sentence text human-readable in the file.
save_path = "../data/cleaned_seq2seq"
os.makedirs(save_path, exist_ok=True)
with open(os.path.join(save_path, "train_seq2seq.json"), 'w', encoding='utf-8') as f:
    json.dump(seq2seq_data, f, ensure_ascii=False, indent=2)
print(f"[✅] 저장 완료: {save_path}/train_seq2seq.json")