In [None]:
#해당 파일에서 test_paragraph.csv 생성

In [None]:
# Mount Google Drive so the project files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd


# Base directory of the project on Drive. Later cells build file paths with
# plain string concatenation (BASE_DIR + 'data/...'), so the trailing slash matters.
BASE_DIR = '/content/drive/MyDrive/Dacon_FakeText/'

Mounted at /content/drive


In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# Load the test-set CSV.
test_df = pd.read_csv(BASE_DIR + 'data/test.csv')

# Feature engineering on the test data: simple length statistics per paragraph.
# Characters in the paragraph text.
test_df['char_count'] = test_df['paragraph_text'].map(len)
# Whitespace-delimited tokens.
test_df['word_count'] = test_df['paragraph_text'].map(lambda text: len(text.split()))
# Rough sentence count: total occurrences of '.', '!' and '?'.
test_df['sentence_count'] = test_df['paragraph_text'].map(
    lambda text: sum(text.count(mark) for mark in ('.', '!', '?'))
)

# Paragraph position features (grouped by title)
def add_position_features(df):
    """Add absolute and relative paragraph-position columns to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``title`` column (grouping key) and a
        ``paragraph_index`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place, with two new columns:

        * ``absolute_position`` - a copy of ``paragraph_index``.
        * ``relative_position`` - ``paragraph_index / (n - 1)`` where ``n`` is
          the number of rows sharing the row's ``title``; ``0.0`` when the
          title has a single row.

    Notes
    -----
    Every value is computed with index-aligned operations, so each row gets
    the position of its own paragraph regardless of row order. Collecting the
    values while iterating ``groupby('title')`` and assigning them positionally
    would only be correct for a frame already sorted by title, because groupby
    yields groups in sorted-key order.
    """
    paragraph_idx = df['paragraph_index']

    # Number of rows sharing each row's title, broadcast back onto the original index.
    group_sizes = df.groupby('title')['paragraph_index'].transform('size')
    has_siblings = group_sizes > 1

    # Keep the denominator >= 1 so single-paragraph titles never divide by zero;
    # those rows are overwritten with 0.0 below anyway.
    denominators = (group_sizes - 1).clip(lower=1)

    df['absolute_position'] = paragraph_idx
    df['relative_position'] = (paragraph_idx / denominators).where(has_siblings, 0.0)
    return df

# Attach the paragraph-position columns to the test frame.
test_df = add_position_features(test_df)

# Final save: write the enriched test frame without the index column.
test_df.to_csv(BASE_DIR + 'data/test_paragraph.csv', index=False, encoding='utf-8')

print("test_paragraph.csv 저장 완료!")

✅ test_paragraph.csv 저장 완료!


In [None]:
# Load the existing paragraph-level dataframes.
train_paragraph_df = pd.read_csv(BASE_DIR + 'data/train_paragraph.csv')
test_paragraph_df = pd.read_csv(BASE_DIR + 'data/test_paragraph.csv')

# Length features, computed the same way for both splits:
# character count first, then whitespace-delimited word count.
for split_df in (train_paragraph_df, test_paragraph_df):
    split_df['char_count'] = split_df['paragraph_text'].map(len)
    split_df['word_count'] = split_df['paragraph_text'].map(lambda text: len(text.split()))

# Sentence-count feature (simple heuristic based on '.', '!' and '?')
def count_sentences(text):
    """Return how many '.', '!' and '?' characters occur in ``text``.

    This is a rough proxy for the number of sentences: every terminator
    character is counted, so "..." contributes 3.
    """
    terminators = ('.', '!', '?')
    return sum(text.count(mark) for mark in terminators)

# Apply the sentence-count heuristic to every paragraph in both splits.
train_paragraph_df['sentence_count'] = train_paragraph_df['paragraph_text'].apply(count_sentences)
test_paragraph_df['sentence_count'] = test_paragraph_df['paragraph_text'].apply(count_sentences)

# Paragraph position (absolute index / relative ratio)
def add_position_features(df):
    """Add absolute and relative paragraph-position columns to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``title`` column (grouping key) and a
        ``paragraph_index`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place, with two new columns:

        * ``absolute_position`` - a copy of ``paragraph_index``.
        * ``relative_position`` - ``paragraph_index / (n - 1)`` where ``n`` is
          the number of rows sharing the row's ``title``; ``0.0`` when the
          title has a single row.

    Notes
    -----
    Every value is computed with index-aligned operations, so each row gets
    the position of its own paragraph regardless of row order. Collecting the
    values while iterating ``groupby('title')`` and assigning them positionally
    would only be correct for a frame already sorted by title, because groupby
    yields groups in sorted-key order.
    """
    paragraph_idx = df['paragraph_index']

    # Number of rows sharing each row's title, broadcast back onto the original index.
    group_sizes = df.groupby('title')['paragraph_index'].transform('size')
    has_siblings = group_sizes > 1

    # Keep the denominator >= 1 so single-paragraph titles never divide by zero;
    # those rows are overwritten with 0.0 below anyway.
    denominators = (group_sizes - 1).clip(lower=1)

    df['absolute_position'] = paragraph_idx
    df['relative_position'] = (paragraph_idx / denominators).where(has_siblings, 0.0)
    return df

# Attach the paragraph-position columns to both splits.
train_paragraph_df = add_position_features(train_paragraph_df)
test_paragraph_df = add_position_features(test_paragraph_df)

# Final save: write the version-1 feature frames without the index column.
train_paragraph_df.to_csv(BASE_DIR + 'data/train_paragraph_with_features.csv', index=False, encoding='utf-8')
test_paragraph_df.to_csv(BASE_DIR + 'data/test_paragraph_with_features.csv', index=False, encoding='utf-8')

print("버전1 Feature Engineering 저장 완료!")

✅ 버전1 Feature Engineering 저장 완료!


In [None]:
# Fit the TF-IDF vectorizer on the combined train + test paragraph text.
# NOTE(review): the vocabulary and IDF weights therefore also reflect the test split —
# confirm this is intended.
all_texts = pd.concat([train_paragraph_df['paragraph_text'], test_paragraph_df['paragraph_text']]).tolist()
# Unigrams and bigrams, limited to at most 10000 features.
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
vectorizer.fit(all_texts)

# Helper that derives TF-IDF summary statistics
def get_tfidf_stats(texts):
    """Return per-document (mean, max) TF-IDF values as two 1-D numpy arrays.

    ``texts`` is transformed with the module-level ``vectorizer`` (which must
    already be fitted); the mean and the max are both taken along axis 1 of
    the resulting matrix, i.e. one value per input document.
    """
    weights = vectorizer.transform(texts)
    # mean(axis=1) comes back 2-D, so flatten it to one value per document.
    row_mean = np.asarray(weights.mean(axis=1)).ravel()
    # max(axis=1) is still sparse; densify before flattening.
    row_max = weights.max(axis=1).toarray().ravel()
    return row_mean, row_max

# Train: per-paragraph mean / max TF-IDF values
mean_tfidf_train, max_tfidf_train = get_tfidf_stats(train_paragraph_df['paragraph_text'].tolist())
train_paragraph_df['tfidf_mean'] = mean_tfidf_train
train_paragraph_df['tfidf_max'] = max_tfidf_train

# Test: same statistics, using the same fitted vectorizer
mean_tfidf_test, max_tfidf_test = get_tfidf_stats(test_paragraph_df['paragraph_text'].tolist())
test_paragraph_df['tfidf_mean'] = mean_tfidf_test
test_paragraph_df['tfidf_max'] = max_tfidf_test

# Final save: write the version-2 feature frames without the index column.
train_paragraph_df.to_csv(BASE_DIR + 'data/train_paragraph_with_features_v2.csv', index=False, encoding='utf-8')
test_paragraph_df.to_csv(BASE_DIR + 'data/test_paragraph_with_features_v2.csv', index=False, encoding='utf-8')

print("버전2 Feature Engineering (TF-IDF 포함) 저장 완료!")

KeyboardInterrupt: 

In [None]:
# Load the SBERT embeddings stored as .npy files
X_sbert_train = np.load(BASE_DIR + 'data/embeddings/train_sbert_embeddings.npy')
X_sbert_test = np.load(BASE_DIR + 'data/embeddings/test_sbert_embeddings.npy')

# Load the version-1 and version-2 feature CSVs
v1_train_df = pd.read_csv(BASE_DIR + 'data/train_paragraph_with_features.csv')
v2_train_df = pd.read_csv(BASE_DIR + 'data/train_paragraph_with_features_v2.csv')

v1_test_df = pd.read_csv(BASE_DIR + 'data/test_paragraph_with_features.csv')
v2_test_df = pd.read_csv(BASE_DIR + 'data/test_paragraph_with_features_v2.csv')

# Merge v1 and v2: append the two TF-IDF columns from v2 column-wise.
# concat(axis=1) aligns rows on the index, not on an id column.
# NOTE(review): assumes the v1 and v2 files hold the same rows in the same order — confirm.
train_df = pd.concat([v1_train_df, v2_train_df[['tfidf_mean', 'tfidf_max']]], axis=1)
test_df = pd.concat([v1_test_df, v2_test_df[['tfidf_mean', 'tfidf_max']]], axis=1)

# Columns used as handcrafted features (length / position statistics + TF-IDF statistics)
feature_cols = [
    'char_count',
    'word_count',
    'sentence_count',
    'absolute_position',
    'relative_position',
    'tfidf_mean',
    'tfidf_max'
]

# Convert to numpy arrays
X_feat_train = train_df[feature_cols].values
X_feat_test = test_df[feature_cols].values





#### Differs from the previous version — must be fixed before use
# Compute inter-paragraph relation features
# NOTE(review): get_title_agg_sbert_features is not defined in any cell visible here;
# presumably it comes from another cell or notebook — confirm it is in scope before running.
avg_train, max_train, cos_mean_train, cos_max_train = get_title_agg_sbert_features(X_sbert_train, train_df)
avg_test, max_test, cos_mean_test, cos_max_test = get_title_agg_sbert_features(X_sbert_test, test_df)

# Final concat: [SBERT, title mean, title max, TF-IDF and other handcrafted features, cosine mean/max]
# NOTE(review): column-wise concatenation assumes every piece is 2-D with one row per
# paragraph — TODO confirm the shapes returned by the helper.
X_train = np.concatenate([X_sbert_train, avg_train, max_train, X_feat_train, cos_mean_train, cos_max_train], axis=1)
X_test = np.concatenate([X_sbert_test, avg_test, max_test, X_feat_test, cos_mean_test, cos_max_test], axis=1)

print(f"✅ 문단 간 관계 피처 포함 최종 X_train shape: {X_train.shape}")
print(f"✅ 문단 간 관계 피처 포함 최종 X_test shape: {X_test.shape}")

# Save the concatenated matrices next to the embeddings (directory is created if missing)
output_dir = BASE_DIR + 'data/embeddings/'
os.makedirs(output_dir, exist_ok=True)

np.save(output_dir + 'train_concat_1.npy', X_train)
np.save(output_dir + 'test_concat_1.npy', X_test)

print("최종 3피처 concat 버전 저장 완료!")