In [3]:
import os
import glob
import MeCab
import ipadic
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [4]:
# ---------------------------------------------------------
# データの読み込み
# ---------------------------------------------------------
def load_livedoor_corpus(corpus_dir):

	texts = []
	labels = []
	dir_count = 0
	file_count = 0

	for label in os.listdir(corpus_dir):

		if os.path.isdir(os.path.join(corpus_dir, label)):
			dir_count = dir_count + 1 

			for file_path in glob.glob(os.path.join(corpus_dir, label, "*.txt")):

				if os.path.basename(file_path) != "LICENSE.txt":
					file_count = file_count + 1

					with open(file_path, 'r', encoding='utf-8') as f:
						content = f.read().splitlines()[2:]       # 最初の 2 行はメタデータなのでスキップ
						text = "".join(content)
						text = text.translate(str.maketrans({"\n":"", "\t":"", "\r":"", "\u3000":""}))      # 特殊文字を除去
						texts.append(text)
						labels.append(label)

	print(f"dir_count = {dir_count}")
	print(f"file_count = {file_count}")

	return texts, labels

# ---------------------------------------------------------
# 形態素解析
# ---------------------------------------------------------
def preprocess_text(text):
	mecab = MeCab.Tagger("-Owakati")
	return mecab.parse(text).strip()

In [5]:
# ---------------------------------------------------------
# メイン
# ---------------------------------------------------------
texts, labels = load_livedoor_corpus('./text')

# テキストを前処理
texts = [preprocess_text(text) for text in texts]

# データをデータフレームに変換
df = pd.DataFrame({'text': texts, 'label': labels})

# データを訓練データとテストデータに分割
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=1)

# TF-IDF と ロジスティック回帰分類器のパイプラインを作成
pipeline = Pipeline([
	('tfidf', TfidfVectorizer(max_df=0.75, ngram_range=(1, 1))),        # TfidfVectorizer で特徴量を生成
	('clf', LogisticRegression(max_iter=3000))                          # LogisticRegression で学習
])

# モデルの訓練
pipeline.fit(x_train, y_train)

# テストデータを用いた予測
y_pred = pipeline.predict(x_test)

# モデルの評価
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print(metrics.classification_report(y_test, y_pred))

# 混同行列の表示
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

dir_count = 9
file_count = 7367
Accuracy: 0.9104
                precision    recall  f1-score   support

dokujo-tsushin       0.86      0.94      0.90       160
  it-life-hack       0.94      0.89      0.91       169
 kaden-channel       0.94      0.90      0.92       164
livedoor-homme       0.96      0.70      0.81       122
   movie-enter       0.89      0.97      0.93       173
        peachy       0.85      0.85      0.85       174
          smax       0.96      0.98      0.97       163
  sports-watch       0.92      0.98      0.95       197
    topic-news       0.91      0.91      0.91       152

      accuracy                           0.91      1474
     macro avg       0.91      0.90      0.91      1474
  weighted avg       0.91      0.91      0.91      1474

Confusion Matrix:
[[151   0   0   0   2   7   0   0   0]
 [  3 150   5   0   1   0   3   3   4]
 [  1   3 148   1   4   2   0   0   5]
 [  3   5   5  85   5  13   1   4   1]
 [  1   0   0   0 168   3   0   1   0]
 [ 15  

In [6]:
# ---------------------------------------------------------
# モデルの保存
# ---------------------------------------------------------
joblib.dump(pipeline, 'text_classification_model.pkl')

['text_classification_model.pkl']