In [4]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from tensorflow.keras.datasets import reuters
import seaborn as sns

def _decode_reuters_sequences(sequences, index_to_word):
    """Turn sequences of word indices back into space-joined text.

    Indices that are missing from ``index_to_word`` decode to an empty string.
    """
    return [
        ' '.join(index_to_word.get(i, '') for i in sequence)
        for sequence in sequences
    ]


def evaluate_models(num_words):
    """Train a fixed set of classifiers on Reuters and score each one.

    The Reuters newswire data is loaded with the given vocabulary limit,
    decoded back to text, vectorized with TF-IDF, and then every model is
    fitted on the training split and evaluated on the test split.

    Args:
        num_words: Vocabulary limit forwarded to ``reuters.load_data``
            (``None`` is forwarded unchanged).

    Returns:
        A list with one ``(name, num_words, f1, cm)`` tuple per model, in the
        order the models are defined, where ``f1`` is the weighted F1 score
        and ``cm`` is the confusion matrix on the test split.
    """
    # Import the submodules explicitly: a bare ``import sklearn`` does not
    # guarantee that ``sklearn.naive_bayes`` & co. are loaded, so attribute
    # access on the package can fail with AttributeError.
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.ensemble import (
        GradientBoostingClassifier,
        RandomForestClassifier,
        VotingClassifier,
    )
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import confusion_matrix, f1_score
    from sklearn.naive_bayes import ComplementNB, MultinomialNB
    from sklearn.svm import LinearSVC
    from sklearn.tree import DecisionTreeClassifier

    # Load the Reuters news data.
    (X_train, y_train), (X_test, y_test) = reuters.load_data(
        num_words=num_words, test_split=0.2
    )

    # Build the index -> word table. The word index is shifted by 3 to line up
    # with the indices produced by load_data; indices 0-2 are mapped to an
    # empty string so they contribute no token to the decoded text.
    word_index = reuters.get_word_index(path="reuters_word_index.json")
    index_to_word = {index + 3: word for word, index in word_index.items()}
    for index in (0, 1, 2):
        index_to_word[index] = ''

    # Convert the index sequences back to text.
    train_texts = _decode_reuters_sequences(X_train, index_to_word)
    test_texts = _decode_reuters_sequences(X_test, index_to_word)

    # TF-IDF transform: fit on the training text only, then reuse the fitted
    # vocabulary for the test text.
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(train_texts)
    X_test = vectorizer.transform(test_texts)

    # Model definitions. LinearSVC is wrapped in CalibratedClassifierCV so it
    # exposes predict_proba, which the soft-voting ensemble requires.
    models = [
        ('Naive Bayes', MultinomialNB()),
        ('CNB', ComplementNB()),
        ('Logistic Regression', LogisticRegression(max_iter=1000)),
        ('SVC', CalibratedClassifierCV(LinearSVC())),
        ('Decision Tree', DecisionTreeClassifier()),
        ('Random Forest', RandomForestClassifier()),
        ('Gradient Boosting', GradientBoostingClassifier()),
        ('Voting', VotingClassifier(estimators=[
            ('lr', LogisticRegression(max_iter=1000)),
            ('rf', RandomForestClassifier()),
            ('svc', CalibratedClassifierCV(LinearSVC())),
        ], voting='soft')),
    ]
    print('#1')  # progress marker: data prepared, models constructed

    # Train and evaluate each model.
    results = []
    for name, model in models:
        print('#2')  # progress marker: one per model about to be fitted
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        f1 = f1_score(y_test, y_pred, average='weighted')
        cm = confusion_matrix(y_test, y_pred)
        results.append((name, num_words, f1, cm))

    return results

# Evaluate every model once per vocabulary size; each call contributes one
# (name, num_words, f1, cm) tuple per model to the combined list.
num_words_list = [None, 10000, 5000, 2500, 1250, 625]
all_results = []
for num_words in num_words_list:
    print('#3')  # progress marker: one per vocabulary size
    results = evaluate_models(num_words)
    all_results += results

# Report the results: print the F1 score and draw the confusion matrix as a
# heatmap for every (model, vocabulary size) combination.
for name, num_words, f1, cm in all_results:
    print(f"Model: {name}, Num Words: {num_words}, F1-Score: {f1}")
    plt.figure(figsize=(10, 7))
    sns.heatmap(data=cm, annot=True, fmt=".0f", square=True, cmap='Blues')
    plt.title(label=f"Confusion matrix of {name} model with num_words={num_words}")
    plt.xlabel(xlabel='Predicted')
    plt.ylabel(ylabel='Actual')
    plt.show()

#3
#1
#2
#2
#2
#2
#2
#2
#2
#2
#3
#1
#2
#2
#2
#2
#2
#2
#2
#2
#3
#1
#2
#2
#2
#2
#2
#2
#2


KeyboardInterrupt: 