In [None]:
import pandas as pd
from google.colab import drive

# Mount Google Drive so the project files are reachable from the Colab runtime.
drive.mount('/content/drive')

# Base path of the project on Drive; other cells append relative paths to it.
BASE_DIR = '/content/drive/MyDrive/Dacon_FakeText/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import torch
import gc
from collections import defaultdict
from tqdm import tqdm
import os

In [None]:
# 1. Load the original paragraph-level data
train_df = pd.read_csv(os.path.join(BASE_DIR, 'data/train_paragraph.csv'))
print(f"문단 단위 데이터: {train_df.shape}")  # (1226364, ...)

# 2. Load the precomputed SBERT embeddings
X_sbert_train = np.load(os.path.join(BASE_DIR, 'data/embeddings/train_sbert_embeddings.npy'))

# The features built later index the embeddings by row position, so the
# embedding matrix needs exactly one row per paragraph.
assert len(train_df) == len(X_sbert_train), "문단 수 불일치!"



✅ 문단 단위 데이터: (1226364, 4)


In [None]:
# 3. Relation-feature builder
def _title_batch_stats(group_vecs):
    """Compute group statistics for a batch of equally sized title groups.

    Args:
        group_vecs: float tensor of shape (G, L, D) holding G groups of
            L paragraph embeddings each.

    Returns:
        Tuple ``(avg_vec, max_vec, cos_mean_vals, cos_max_vals)`` with shapes
        (G, D), (G, D), (G, L) and (G, L). The cosine statistics of a paragraph
        are taken over the *other* paragraphs of its group and are 0 when the
        group has a single paragraph.
    """
    num_groups, length, _ = group_vecs.shape
    avg_vec = group_vecs.mean(dim=1)
    max_vec = group_vecs.max(dim=1).values

    if length == 1:
        # A lone paragraph has no sibling to compare against.
        zeros = torch.zeros((num_groups, 1), dtype=group_vecs.dtype, device=group_vecs.device)
        return avg_vec, max_vec, zeros, zeros.clone()

    # Small epsilon keeps all-zero embeddings from dividing by zero.
    norm = torch.norm(group_vecs, dim=2, keepdim=True) + 1e-8
    normalized = group_vecs / norm
    cos_sim = normalized @ normalized.transpose(1, 2)  # (G, L, L)

    # Hide each paragraph's similarity with itself before reducing.
    self_mask = torch.eye(length, dtype=torch.bool, device=group_vecs.device)
    cos_sim.masked_fill_(self_mask, float('nan'))

    cos_mean_vals = torch.nanmean(cos_sim, dim=2)
    cos_max_vals = torch.nan_to_num(cos_sim, nan=float('-inf')).max(dim=2).values
    return avg_vec, max_vec, cos_mean_vals, cos_max_vals


def get_title_agg_sbert_features_gpu_optimized(sbert_array, paragraph_df, max_rows_per_batch=16384):
    """Build per-paragraph features describing each paragraph's title group.

    Paragraphs are grouped by the ``title`` column of ``paragraph_df``. For
    every paragraph the function emits the element-wise mean and max of the
    embeddings in its group, plus the mean and max cosine similarity between
    the paragraph and the other paragraphs of the same group.

    Groups of equal size are stacked and processed together (on CUDA when
    available), in chunks of at most roughly ``max_rows_per_batch`` paragraphs,
    instead of one title at a time.

    Args:
        sbert_array: 2-D array-like of shape (num_paragraphs, dim); row ``i``
            is the embedding of the paragraph at position ``i`` of
            ``paragraph_df``.
        paragraph_df: DataFrame with a ``title`` column and one row per
            embedding row. Only row positions are used, not the index labels.
        max_rows_per_batch: soft cap on the number of paragraphs moved to the
            device per batch. A group larger than the cap is still processed
            whole, as a batch of its own.

    Returns:
        Tuple of float32 numpy arrays ``(avg, max, cos_mean, cos_max)`` with
        shapes (N, dim), (N, dim), (N, 1) and (N, 1). Rows that belong to no
        group (pandas ``groupby`` skips missing titles by default) stay zero.

    Raises:
        ValueError: if ``sbert_array`` is not 2-D or its row count differs
            from ``len(paragraph_df)``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # as_tensor reuses the NumPy buffer when it is already float32, avoiding a
    # second full copy of the embedding matrix; the tensor is only read from.
    sbert_tensor = torch.as_tensor(sbert_array, dtype=torch.float32)
    num_paragraphs, dim = sbert_tensor.shape

    if len(paragraph_df) != num_paragraphs:
        raise ValueError(
            f"paragraph_df has {len(paragraph_df)} rows but sbert_array has "
            f"{num_paragraphs}; they must be aligned row by row"
        )

    # All accumulators live on the CPU; only the current batch visits the device.
    avg_sbert = torch.zeros((num_paragraphs, dim), dtype=torch.float32)
    max_sbert = torch.zeros((num_paragraphs, dim), dtype=torch.float32)
    cos_mean = torch.zeros((num_paragraphs, 1), dtype=torch.float32)
    cos_max = torch.zeros((num_paragraphs, 1), dtype=torch.float32)

    # groupby(...).indices maps each title to the positional row numbers of its paragraphs.
    title_groups = paragraph_df.groupby('title').indices
    length_buckets = defaultdict(list)
    for idx_array in title_groups.values():
        length_buckets[len(idx_array)].append(idx_array)

    total_titles = len(title_groups)
    with tqdm(total=total_titles, desc="Computing SBERT group features") as pbar:
        for length, idx_arrays in length_buckets.items():
            groups_per_batch = max(1, max_rows_per_batch // length)
            for start in range(0, len(idx_arrays), groups_per_batch):
                batch = idx_arrays[start:start + groups_per_batch]
                num_groups = len(batch)

                # (G, L) row positions, flattened group by group.
                flat_idx = torch.as_tensor(np.stack(batch), dtype=torch.long).reshape(-1)
                group_vecs = sbert_tensor[flat_idx].reshape(num_groups, length, dim).to(device)

                avg_vec, max_vec, cos_mean_vals, cos_max_vals = _title_batch_stats(group_vecs)

                # Every paragraph receives its group's vector; repeating each
                # group row L times matches the ordering of flat_idx.
                avg_sbert[flat_idx] = avg_vec.cpu().repeat_interleave(length, dim=0)
                max_sbert[flat_idx] = max_vec.cpu().repeat_interleave(length, dim=0)
                cos_mean[flat_idx, 0] = cos_mean_vals.reshape(-1).cpu()
                cos_max[flat_idx, 0] = cos_max_vals.reshape(-1).cpu()

                pbar.update(num_groups)

    return (
        avg_sbert.numpy(),
        max_sbert.numpy(),
        cos_mean.numpy(),
        cos_max.numpy()
    )

In [None]:
# 4. Build the title-level relation features
avg_train, max_train, cos_mean_train, cos_max_train = get_title_agg_sbert_features_gpu_optimized(X_sbert_train, train_df)

# 5. Wrap the arrays in DataFrames and merge them into train_df
avg_col_names = [f'sbert_avg_{i}' for i in range(avg_train.shape[1])]
max_col_names = [f'sbert_max_{i}' for i in range(max_train.shape[1])]
cos_col_names = ['cosine_mean', 'cosine_max']

avg_train_df = pd.DataFrame(avg_train, columns=avg_col_names)
max_train_df = pd.DataFrame(max_train, columns=max_col_names)
cos_train_df = pd.DataFrame({
    'cosine_mean': cos_mean_train.flatten(),
    'cosine_max': cos_max_train.flatten()
})

# Drop any feature columns left over from an earlier run of this cell, so that
# re-running it replaces the features instead of appending duplicate columns.
# No trailing .copy(): it would only duplicate the very wide result frame.
train_df = pd.concat(
    [
        train_df.drop(columns=avg_col_names + max_col_names + cos_col_names, errors='ignore')
                .reset_index(drop=True),
        avg_train_df,
        max_train_df,
        cos_train_df,
    ],
    axis=1,
)

# 6. Free memory
# The intermediate DataFrames hold the feature data as well (shared with or
# copied from the arrays), so they must be released together with the arrays.
del avg_train, max_train, cos_mean_train, cos_max_train
del avg_train_df, max_train_df, cos_train_df
gc.collect()
torch.cuda.empty_cache()

# 7. Save the final table
train_df.to_csv(BASE_DIR + 'data/train_paragraph_1.csv', index=False, encoding='utf-8')
print(f"최종 train_paragraph_1.csv 저장 완료: {train_df.shape}")

Computing SBERT group features: 100%|██████████| 97172/97172 [01:03<00:00, 1534.73it/s]


✅ 최종 train_paragraph_1.csv 저장 완료: (1226364, 1542)


In [None]:
# Reload the saved feature table from disk and show its dimensions.
train_df = pd.read_csv(os.path.join(BASE_DIR, 'data/train_paragraph_1.csv'))
print(train_df.shape)

(1226364, 1542)
