# Unified Pipeline — Baseline + Active Learning (TF-IDF + Logistic Regression)
이 노트북 하나에서 **베이스라인 선택**과 **Active Learning 라운드**를 모두 수행합니다.

**순서**
1) Baseline: stratified split → Calibration/τ/grid로 valid에서 베스트 선택
2) train+valid 재학습 → test 1회 평가 → artifacts 저장
3) Active Learning: 후보 추출 → 수동 라벨 CSV → 병합(골드 우선, 약라벨 다운샘플)
4) (선택) 배치 추론/chi² n-gram 내보내기



In [None]:
# !pip install -q scikit-learn pandas numpy joblib
import os, warnings, numpy as np, pandas as pd
warnings.filterwarnings('ignore')

from sentiment_utils import (
    clean, ensure_cols, load_dataset, split_train_valid_test,
    ClassicCfg, ClassicSentiment, apply_neutral_policy, small_grid_search,
    select_active_learning_candidates, merge_gold_and_weak,
    batch_predict, chi2_top_ngrams_from_df
)


In [None]:
class CFG:
    """Notebook-wide configuration constants, read as class attributes."""

    # --- CSV files ---
    # Initial labelled set (original note: may include weak labels).
    DATA_LABELED_BASE = "comments_labeled.csv"
    # Merged set produced after an active-learning round
    # (original note: the base file is used when this one is missing).
    DATA_LABELED_MERGED = "comments_labeled_merged.csv"
    # Input for candidate selection and batch inference.
    NEW_CSV = "comments_new.csv"

    # --- Output directories ---
    ART_DIR = "artifacts/classic"
    EXPORT_DIR = "exports_ngrams"

    # --- Label set and split settings ---
    LABELS = ["neg", "neu", "pos"]
    TEST_SIZE = VALID_SIZE = 0.1
    RANDOM_STATE = 42

# Make sure both output directories exist; exist_ok=True keeps re-runs of
# this cell from raising when they are already there.
for _out_dir in (CFG.ART_DIR, CFG.EXPORT_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Choose the training input: the merged file when it exists on disk,
# otherwise the base labelled file.
if os.path.exists(CFG.DATA_LABELED_MERGED):
    DATA_FOR_TRAIN = CFG.DATA_LABELED_MERGED
else:
    DATA_FOR_TRAIN = CFG.DATA_LABELED_BASE
print('Using training dataset:', DATA_FOR_TRAIN)

## (1) Baseline — split → grid/Calibration/τ on valid

In [None]:
# Load the labelled data and cut it into train / valid / test frames.
df = load_dataset(DATA_FOR_TRAIN, labels=CFG.LABELS)
train_df, valid_df, test_df = split_train_valid_test(
    df,
    CFG.TEST_SIZE,
    CFG.VALID_SIZE,
    CFG.RANDOM_STATE,
)
print(f"Sizes | train: {len(train_df)} valid: {len(valid_df)} test: {len(test_df)}")
# Relative label frequencies in the training part, ordered by label.
print(
    'Train dist:\n',
    train_df['label'].value_counts(normalize=True).sort_index(),
)

# Per-split text lists and label arrays, in train / valid / test order.
train_texts, valid_texts, test_texts = (
    part['text'].tolist() for part in (train_df, valid_df, test_df)
)
y_train, y_valid, y_test = (
    part['label'].values for part in (train_df, valid_df, test_df)
)

# Search on train, score on valid; the helper hands back the winning
# config, its tau, and the score that is reported below as macro-F1.
best_cfg, best_tau, best_score = small_grid_search(
    train_texts, y_train, valid_texts, y_valid
)
print(
    'Best on valid | macro-F1:', round(best_score, 4),
    '| cfg:', best_cfg,
    '| tau:', best_tau,
)

## (2) Retrain on train+valid → Test once → Save artifacts

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Refit the selected configuration on train+valid combined.
tv_texts = pd.concat([train_df['text'], valid_df['text']]).tolist()
tv_labels = pd.concat([train_df['label'], valid_df['label']]).values

final_model = ClassicSentiment(best_cfg).fit(tv_texts, tv_labels)

# Score the held-out test texts, then turn probabilities into labels with
# the tau chosen on valid and the same fixed gap used for batch inference.
proba_test, labels_test = final_model.predict_proba(test_texts)
pred_test = apply_neutral_policy(proba_test, labels_test, tau=best_tau, gap=0.05)

print("=== Test Report (final) ===")
# Pin the report to CFG.LABELS so its rows always cover the same classes,
# in the same order, as the confusion matrix below — even when a class is
# absent from both y_test and pred_test.
print(classification_report(y_test, pred_test, labels=CFG.LABELS, digits=4))
print("Confusion Matrix:\n", confusion_matrix(y_test, pred_test, labels=CFG.LABELS))

# Persist the fitted model under the artifacts directory.
final_model.save(CFG.ART_DIR)
print('Saved artifacts to', CFG.ART_DIR)

## (3) Active Learning — Candidate selection → Manual CSV → Merge

In [None]:
# (3-1) Candidate selection. Original note: low-confidence p_max combined
# with k-means diversity.
cand = select_active_learning_candidates(
    CFG.NEW_CSV,
    CFG.ART_DIR,
    K=50,
    per_cluster=4,
    tau_for_low=0.6,
)
# Written as UTF-8 with a BOM ('utf-8-sig'), without the index column.
cand.to_csv('to_label_manual.csv', encoding='utf-8-sig', index=False)
cand.head()

In [None]:
# (3-2) Run this merge after a person has filled in the label column of
# 'to_label_manual.csv'.
# NOTE(review): the file read here is 'manual_labeled_round1.csv' —
# presumably the labelled sheet saved under a new name; confirm.
df_merged = merge_gold_and_weak(
    CFG.DATA_LABELED_BASE,
    'manual_labeled_round1.csv',
    key='comment_id',
    weak_frac=0.4,
)
# Save the merged set as UTF-8 with a BOM ('utf-8-sig'), without the index.
df_merged.to_csv(CFG.DATA_LABELED_MERGED, encoding='utf-8-sig', index=False)
print('Merged saved ->', CFG.DATA_LABELED_MERGED)
df_merged['label'].value_counts()

## (4) (선택) Batch inference & chi² n-gram exports

In [None]:
# Run batch prediction on the new comments with the saved artifacts, using
# the tau chosen on valid and the same fixed gap as the test evaluation.
out_path = batch_predict(CFG.NEW_CSV, CFG.ART_DIR, tau=best_tau, gap=0.05)
print('Pred CSV ->', out_path)

# Read with 'utf-8-sig': it decodes plain UTF-8 unchanged and also strips a
# leading BOM, so the first column name cannot pick up a stray '\ufeff'.
# NOTE(review): the CSVs written in this notebook use 'utf-8-sig'; whether
# batch_predict does the same is not visible here — confirm.
pred_df = pd.read_csv(out_path, encoding='utf-8-sig')
chi2_top_ngrams_from_df(pred_df, label_col='pred', export_dir=CFG.EXPORT_DIR)