In [65]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from tensorflow.keras.datasets import reuters

# 단어장 개수별 ML 모델 성능 비교 (Accuracy / F1-score)

## 1. 데이터 로드 & 디코딩

In [66]:
(x_train_idx, y_train), (x_test_idx, y_test) = reuters.load_data(num_words=None, test_split=0.2)
word_index = reuters.get_word_index(path='reuters_word_index.json')
index_to_word = {index+3: word for word, index in word_index.items()}
for index, token in enumerate(('<pad>', '<sos>', '<unk>')):
  index_to_word[index]=token

In [67]:
# x_train/x_test 디코딩
x_train = [' '.join([index_to_word.get(i, '?') for i in seq]) for seq in x_train_idx]
x_test  = [' '.join([index_to_word.get(i, '?') for i in seq]) for seq in x_test_idx]

## 2. ML모델 및 실험할 vocab_size 정의

In [68]:
# 모델 정의
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'SVM': SVC(probability=True),
    'RandomForest': RandomForestClassifier(),
    'XGBoost': GradientBoostingClassifier(),
    'NaiveBayes': ComplementNB(),
    'KNN': KNeighborsClassifier(),
    'LightGBM': LGBMClassifier(verbose=-1),
    'DecisionTree': DecisionTreeClassifier()
}

# 단어 수 정의
vocab_sizes = [10000, 5000, None]  # None = all words

# 결과 저장 DataFrame 정의
columns = ['Vocabulary Size', 'Model', 'Accuracy', 'F1-Score']
df_results = pd.DataFrame(columns=columns)

## 3.훈련 및 평가

In [69]:
results_list = []

for vocab in vocab_sizes:
    tfidf_vectorizer = TfidfVectorizer(max_features=vocab)
    X_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
    X_test_tfidf = tfidf_vectorizer.transform(x_test)

    if vocab is not None:
        print(f'{"*"*8}Vocab Size = {vocab}{"*"*8}')
    else:
        print(f'{"*"*8}All words{"*"*8}')
    
    for model_name, model in models.items():
        model.fit(X_train_tfidf, y_train)
        y_pred = model.predict(X_test_tfidf)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        # 평가 결과 출력
        print(f'{model_name}: [Accuracy] = {acc} / [F1_score] = {f1}')
        
        # 평가 결과 dict 형태로 저장
        results_list.append({
            'Vocabulary Size': vocab if vocab is not None else 'All words',
            'Model': model_name,
            'Accuracy': acc,
            'F1-Score': f1
        })

# 반복 후 한 번에 DataFrame 생성
df_results = pd.DataFrame(results_list)

********Vocab Size = 10000********
LogisticRegression: [Accuracy] = 0.7969723953695459 / [F1_score] = 0.7742748415216105
SVM: [Accuracy] = 0.8063223508459484 / [F1_score] = 0.7884500873330293
RandomForest: [Accuracy] = 0.748886910062333 / [F1_score] = 0.7245028630808907
XGBoost: [Accuracy] = 0.7702582368655387 / [F1_score] = 0.7667588606246989
NaiveBayes: [Accuracy] = 0.7724844167408726 / [F1_score] = 0.7472902633131123
KNN: [Accuracy] = 0.7867319679430098 / [F1_score] = 0.7789670680335509




LightGBM: [Accuracy] = 0.21861086375779162 / [F1_score] = 0.2632495484476679
DecisionTree: [Accuracy] = 0.6985752448797863 / [F1_score] = 0.6938593680755918
********Vocab Size = 5000********
LogisticRegression: [Accuracy] = 0.798753339269813 / [F1_score] = 0.7765773185291976
SVM: [Accuracy] = 0.8081032947462155 / [F1_score] = 0.7915424120120957
RandomForest: [Accuracy] = 0.7666963490650045 / [F1_score] = 0.745721113898975
XGBoost: [Accuracy] = 0.7658058771148709 / [F1_score] = 0.761096816674096
NaiveBayes: [Accuracy] = 0.7689225289403384 / [F1_score] = 0.7432270502020317
KNN: [Accuracy] = 0.7894033837934105 / [F1_score] = 0.7802436439764467




LightGBM: [Accuracy] = 0.20525378450578807 / [F1_score] = 0.19645327370471216
DecisionTree: [Accuracy] = 0.6918967052537845 / [F1_score] = 0.6872725838522098
********All words********
LogisticRegression: [Accuracy] = 0.7916295636687445 / [F1_score] = 0.7670211296471304
SVM: [Accuracy] = 0.7996438112199465 / [F1_score] = 0.7806611171251367
RandomForest: [Accuracy] = 0.7395369545859305 / [F1_score] = 0.7125347080338018
XGBoost: [Accuracy] = 0.7604630454140695 / [F1_score] = 0.7586929131324669
NaiveBayes: [Accuracy] = 0.7649154051647373 / [F1_score] = 0.7346534179503126
KNN: [Accuracy] = 0.7720391807658059 / [F1_score] = 0.76393321267862




LightGBM: [Accuracy] = 0.1923419412288513 / [F1_score] = 0.1781183849277486
DecisionTree: [Accuracy] = 0.7030276046304541 / [F1_score] = 0.69793005166038


In [70]:
df_results.sort_values(by=['F1-Score', 'Accuracy'], ascending=False)

Unnamed: 0,Vocabulary Size,Model,Accuracy,F1-Score
9,5000,SVM,0.808103,0.791542
1,10000,SVM,0.806322,0.78845
17,All words,SVM,0.799644,0.780661
13,5000,KNN,0.789403,0.780244
5,10000,KNN,0.786732,0.778967
8,5000,LogisticRegression,0.798753,0.776577
0,10000,LogisticRegression,0.796972,0.774275
16,All words,LogisticRegression,0.79163,0.767021
3,10000,XGBoost,0.770258,0.766759
21,All words,KNN,0.772039,0.763933


# 벡터화 방법별 ML/DL 모델 성능 비교 (Accuracy / F1-score)

In [73]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding, Flatten
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.datasets import reuters
from sklearn.feature_extraction.text import TfidfVectorizer

## 1. 인풋 데이터 준비

In [74]:
# TF-IDF 벡터화
tfidf5000_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf5000_vectorizer.fit_transform(x_train)
X_test_tfidf = tfidf5000_vectorizer.transform(x_test)

# W2V 방식
x_train_tokenized = [sentence.split() for sentence in x_train]
x_test_tokenized = [sentence.split() for sentence in x_test]

model = Word2Vec(sentences = x_train_tokenized, vector_size = 512, window = 5, min_count = 5, workers = 4, sg = 0)
model_result = model.wv.most_similar('man')
print(model_result)

[('stangeland', 0.8456367254257202), ('bow', 0.8367922306060791), ('lai', 0.8266149759292603), ('read', 0.8237302303314209), ('glenn', 0.8221088647842407), ('iowa', 0.8213066458702087), ('myers', 0.820145845413208), ('sydney', 0.8192616701126099), ('cooperative', 0.8173328042030334), ('missouri', 0.8158978819847107)]


In [76]:
# 학습된 Word2Vec 모델
w2v_model = model

# 각 문장을 벡터화 시키는 코드
def vectorize_sentence(sentence, model, max_len):
    vecs = []
    for word in sentence:
        if word in model.wv:
            vecs.append(model.wv[word])
        else:
            vecs.append(np.zeros(model.vector_size))
    # Padding
    if len(vecs) < max_len:
        vecs += [np.zeros(model.vector_size)] * (max_len - len(vecs))
    else:
        vecs = vecs[:max_len]
    return np.array(vecs)

x_train_w2v = np.array([vectorize_sentence(s, w2v_model, max_len=100) for s in x_train_tokenized])
x_test_w2v = np.array([vectorize_sentence(s, w2v_model, max_len=100) for s in x_test_tokenized])

num_classes = len(np.unique(y_train))

In [78]:
x_train_w2v.shape

(8982, 100, 512)

## 3. 훈련 및 평가

In [79]:
results_list = []

In [84]:
# 1️. SVM
# TF-IDF
svm_tfidf = SVC(probability=True)
svm_tfidf.fit(X_train_tfidf, y_train)
y_pred = svm_tfidf.predict(X_test_tfidf)
results_list.append({
    'Vectorization': 'TF-IDF',
    'Model': 'SVM',
    'Accuracy': accuracy_score(y_test, y_pred),
    'F1-Score': f1_score(y_test, y_pred, average='weighted')
})

# Word2Vec (문장 평균 벡터)
def avg_vector(sentence_vec):
    return np.mean(sentence_vec, axis=0)

X_train_w2v_avg = np.array([avg_vector(s) for s in x_train_w2v])
X_test_w2v_avg = np.array([avg_vector(s) for s in x_test_w2v])

svm_w2v = SVC(probability=True)
svm_w2v.fit(X_train_w2v_avg, y_train)
y_pred = svm_w2v.predict(X_test_w2v_avg)
results_list.append({
    'Vectorization': 'Word2Vec',
    'Model': 'SVM',
    'Accuracy': accuracy_score(y_test, y_pred),
    'F1-Score': f1_score(y_test, y_pred, average='weighted')
})

In [88]:
# 2️. Dense NN
# TF-IDF
dense_tfidf = Sequential([
    Dense(512, activation='relu', input_shape=(X_train_tfidf.shape[1],)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(len(np.unique(y_train)), activation='softmax')
])
dense_tfidf.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
dense_tfidf.fit(X_train_tfidf.toarray(), y_train, epochs=20, batch_size=32, verbose=1)
y_pred = np.argmax(dense_tfidf.predict(X_test_tfidf.toarray()), axis=1)
results_list.append({
    'Vectorization': 'TF-IDF',
    'Model': 'Dense NN',
    'Accuracy': accuracy_score(y_test, y_pred),
    'F1-Score': f1_score(y_test, y_pred, average='weighted')
})

# Word2Vec
dense_w2v = Sequential([
    Dense(512, activation='relu', input_shape=(X_train_w2v_avg.shape[1],)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(len(np.unique(y_train)), activation='softmax')
])
dense_w2v.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
dense_w2v.fit(X_train_w2v_avg, y_train, epochs=20, batch_size=32, verbose=1)
y_pred = np.argmax(dense_w2v.predict(X_test_w2v_avg), axis=1)
results_list.append({
    'Vectorization': 'Word2Vec',
    'Model': 'Dense NN',
    'Accuracy': accuracy_score(y_test, y_pred),
    'F1-Score': f1_score(y_test, y_pred, average='weighted')
})

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [90]:
# 3️. RNN
# TF-IDF 기반 RNN
# 단어 인덱스로 변환
tokenizer = tfidf_vectorizer  # 이미 fit된 TF-IDF vectorizer 사용
X_train_seq_tfidf = X_train_tfidf.toarray()
X_test_seq_tfidf = X_test_tfidf.toarray()

rnn_tfidf = Sequential([
    Dense(512, activation='relu', input_shape=(X_train_seq_tfidf.shape[1],)),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(len(np.unique(y_train)), activation='softmax')
])
rnn_tfidf.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
rnn_tfidf.fit(X_train_seq_tfidf, y_train, epochs=20, batch_size=32, verbose=1)
y_pred = np.argmax(rnn_tfidf.predict(X_test_seq_tfidf), axis=1)
results_list.append({
    'Vectorization': 'TF-IDF',
    'Model': 'RNN',
    'Accuracy': accuracy_score(y_test, y_pred),
    'F1-Score': f1_score(y_test, y_pred, average='weighted')
})

# Word2Vec 기반 RNN
# 이미 x_train_w2v / x_test_w2v 준비됨 (shape: [num_samples, max_len, embedding_dim])
rnn_w2v = Sequential([
    LSTM(512, input_shape=(x_train_w2v.shape[1], x_train_w2v.shape[2])),
    Dropout(0.3),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(len(np.unique(y_train)), activation='softmax')
])
rnn_w2v.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
rnn_w2v.fit(x_train_w2v, y_train, epochs=20, batch_size=32, verbose=1)
y_pred = np.argmax(rnn_w2v.predict(x_test_w2v), axis=1)
results_list.append({
    'Vectorization': 'Word2Vec',
    'Model': 'RNN',
    'Accuracy': accuracy_score(y_test, y_pred),
    'F1-Score': f1_score(y_test, y_pred, average='weighted')
})



In [91]:
df_results = pd.DataFrame(results_list)

In [92]:
df_results

Unnamed: 0,Vectorization,Model,Accuracy,F1-Score
0,TF-IDF,SVM,0.808103,0.791542
1,Word2Vec,SVM,0.729742,0.691754
2,TF-IDF,Dense NN,0.812556,0.80539
3,Word2Vec,Dense NN,0.726625,0.690898
4,TF-IDF,Dense NN,0.807658,0.801577
5,Word2Vec,Dense NN,0.750668,0.726468
6,TF-IDF,Dense NN,0.807658,0.801239
7,Word2Vec,Dense NN,0.757792,0.737604
8,TF-IDF,RNN,0.809884,0.803681
9,Word2Vec,RNN,0.78317,0.778113
