In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np


In [4]:
from sklearn.model_selection import train_test_split

# Down-sampling configuration: sampling runs only when USE_FULL_DATA is False
# and df holds more than SAMPLE_N rows.
USE_FULL_DATA = False
SAMPLE_N = 200_000

if USE_FULL_DATA or len(df) <= SAMPLE_N:
    # Full data was requested, or df is already no larger than SAMPLE_N rows.
    print("전체 데이터 사용")
else:
    # Stratified sampling on the "label" column: keep the SAMPLE_N-row part of
    # the split and drop the remainder.
    df_small = train_test_split(
        df,
        train_size=SAMPLE_N,
        stratify=df["label"],
        random_state=42,
    )[0]
    # Rebind df to the sample with a fresh 0..N-1 index.
    df = df_small.reset_index(drop=True)
    print("층화 샘플링 후 크기:", df.shape)


층화 샘플링 후 크기: (200000, 2)


In [6]:
# Hold out 20% of the samples as the test set. The split is stratified on y and
# uses a fixed seed so it is repeatable.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print("Train:", len(X_train), "Test:", len(X_test))


Train: 160000 Test: 40000


In [7]:
# Character n-gram TF-IDF settings, collected in one place for easy tuning.
tfidf_options = {
    "analyzer": "char_wb",    # character n-grams taken within word boundaries
    "ngram_range": (3, 5),    # n-grams of 3 to 5 characters
    "min_df": 3,              # drop n-grams that are too rare
    "max_features": 100_000,  # start at 100k; try 150k or 200k later if resources allow
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit on the training texts only, then apply the fitted vectorizer to the test texts.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("TF-IDF shape (train):", X_train_tfidf.shape)
print("TF-IDF shape (test):", X_test_tfidf.shape)


TF-IDF shape (train): (160000, 100000)
TF-IDF shape (test): (40000, 100000)


In [8]:
# Multinomial Naive Bayes baseline: fit on the TF-IDF training matrix and
# predict labels for the held-out test matrix.
nb = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_nb = nb.predict(X_test_tfidf)

# Compute every metric first, then print them in a fixed order.
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_report = classification_report(y_test, y_pred_nb, digits=4)
cm_nb = confusion_matrix(y_test, y_pred_nb)

print("=== Naive Bayes 결과 ===")
print("Accuracy:", nb_accuracy)
print(nb_report)
print("Confusion matrix (NB):")
print(cm_nb)


=== Naive Bayes 결과 ===
Accuracy: 0.998375
              precision    recall  f1-score   support

           0     0.9988    0.9980    0.9984     20372
           1     0.9980    0.9987    0.9983     19628

    accuracy                         0.9984     40000
   macro avg     0.9984    0.9984    0.9984     40000
weighted avg     0.9984    0.9984    0.9984     40000

Confusion matrix (NB):
[[20332    40]
 [   25 19603]]


In [9]:
# The SVM is slow to train, so start with a capped random subset of the
# training rows to get a first impression of its performance.
SVM_TRAIN_MAX = 100_000  # raising this number directly increases training time
SVM_SEED = 42            # same seed value the other splits in this notebook use

X_train_svm = X_train_tfidf
y_train_svm = y_train.to_numpy()

if X_train_svm.shape[0] > SVM_TRAIN_MAX:
    # Use a seeded generator instead of the global np.random state so the same
    # rows are selected on every Restart & Run All.
    svm_rng = np.random.default_rng(SVM_SEED)
    idx = svm_rng.choice(X_train_svm.shape[0], size=SVM_TRAIN_MAX, replace=False)
    # Apply the same row indices to features and labels to keep them aligned.
    X_train_svm = X_train_svm[idx]
    y_train_svm = y_train_svm[idx]
    print(f"SVM 학습용으로 {SVM_TRAIN_MAX}개 샘플만 사용")
else:
    print("SVM 학습에 train 전체 사용")

svm = LinearSVC(
    C=1.0,
    max_iter=2000,          # can be increased if the solver does not converge
    random_state=SVM_SEED,  # pin the solver's internal randomness as well
)

svm.fit(X_train_svm, y_train_svm)

# Evaluate on the full held-out test matrix (not subsampled).
y_pred_svm = svm.predict(X_test_tfidf)

print("=== Linear SVM 결과 ===")
print("Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm, digits=4))

cm_svm = confusion_matrix(y_test, y_pred_svm)
print("Confusion matrix (SVM):")
print(cm_svm)


SVM 학습용으로 100000개 샘플만 사용
=== Linear SVM 결과 ===
Accuracy: 0.999575
              precision    recall  f1-score   support

           0     0.9992    1.0000    0.9996     20372
           1     1.0000    0.9991    0.9996     19628

    accuracy                         0.9996     40000
   macro avg     0.9996    0.9996    0.9996     40000
weighted avg     0.9996    0.9996    0.9996     40000

Confusion matrix (SVM):
[[20372     0]
 [   17 19611]]
