In [None]:
import os
import glob
import MeCab
import ipadic
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn import metrics

In [None]:
# ---------------------------------------------------------
# データの読み込み
# ---------------------------------------------------------
def load_livedoor_corpus(corpus_dir, skip_files=('LICENSE.txt',)):
  """Load article bodies and category labels from a corpus directory.

  Every immediate subdirectory of ``corpus_dir`` is treated as one category;
  every ``*.txt`` file inside it becomes one sample labelled with the
  subdirectory name. Plain files directly under ``corpus_dir`` are ignored.

  Args:
    corpus_dir: Path of the corpus root directory.
    skip_files: File names to ignore inside each category directory.
      NOTE(review): the default assumes the livedoor news corpus ships a
      ``LICENSE.txt`` (not an article) in each category directory - confirm
      against your copy, or pass ``()`` to load every ``*.txt`` file.

  Returns:
    A ``(texts, labels)`` tuple of two equally long lists. ``texts[i]`` is
    the file content without its first two lines and ``labels[i]`` is the
    name of the directory the file was found in.
  """
  texts = []
  labels = []
  ignored = set(skip_files)

  # os.listdir / glob.glob return entries in arbitrary order; sorting keeps
  # the sample order (and therefore any seeded split) reproducible.
  for label in sorted(os.listdir(corpus_dir)):
    label_dir = os.path.join(corpus_dir, label)
    if not os.path.isdir(label_dir):
      continue
    for file_path in sorted(glob.glob(os.path.join(label_dir, "*.txt"))):
      if os.path.basename(file_path) in ignored:
        continue
      with open(file_path, 'r', encoding='utf-8') as file:
        # The first 2 lines are metadata, so skip them.
        body_lines = file.read().splitlines()[2:]
      texts.append('\n'.join(body_lines))
      labels.append(label)

  return texts, labels

# ---------------------------------------------------------
# 形態素解析
# ---------------------------------------------------------
_TAGGER = None  # MeCab tagger shared by all tokenize() calls; built on first use


def _get_tagger():
  """Return the shared MeCab tagger, constructing it on the first call."""
  global _TAGGER
  if _TAGGER is None:
    # Constructing a Tagger is loop-invariant work, so do it once instead of
    # once per tokenized document.
    _TAGGER = MeCab.Tagger(f'-Owakati -d "{ipadic.DICDIR}" -u "./NEologD.20200910-u.dic"')
  return _TAGGER


def tokenize(text):
  """Run MeCab over ``text`` and return the parse result, stripped.

  The tagger is configured with ``-Owakati``, the ipadic dictionary directory
  and the user dictionary ``./NEologD.20200910-u.dic``.

  Args:
    text: The string to analyse.

  Returns:
    The tagger's output for ``text`` with leading/trailing whitespace removed
    (presumably space-separated tokens, given ``-Owakati`` - see MeCab docs).
  """
  result = _get_tagger().parse(text)
  return result.strip()

# ---------------------------------------------------------
# メイン
# ---------------------------------------------------------

# Load the news articles and run every one of them through the tokenizer.
texts, labels = load_livedoor_corpus('./livedoor_news')
text_token = [tokenize(text) for text in texts]

# Split into training data and test data (20% held out, fixed seed).
df = pd.DataFrame({'text': text_token, 'label': labels})
x_train, x_test, y_train, y_test = train_test_split(
  df['text'],
  df['label'],
  test_size=0.2,
  random_state=1,
)

# Build the pipeline: TF-IDF features followed by logistic regression.
pipeline = Pipeline(steps=[
  ('tfidf', TfidfVectorizer()),
  ('clf', LogisticRegression()),
])

# Parameter grid; keys are '<step name>__<parameter>'.
param_grid = dict(
  tfidf__max_df=[0.75],
  tfidf__ngram_range=[(1, 1)],
  clf__max_iter=[1000],
)

# Train the model with 5-fold cross-validated grid search.
grid_search = GridSearchCV(
  estimator=pipeline,
  param_grid=param_grid,
  cv=5,
  n_jobs=-1,
  verbose=1,
)
grid_search.fit(x_train, y_train)

# Predict on the held-out test data with the best estimator found.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)

# Evaluate the model.
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print(metrics.classification_report(y_test, y_pred))

# Show the confusion matrix.
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix:', conf_matrix, sep='\n')

# Save the model.
joblib.dump(best_model, 'best_model.pkl')
print('Model saved to best_model.pkl')

In [None]:
# ---------------------------------------------------------
# データの読み込み
# ---------------------------------------------------------
def load_livedoor_corpus(corpus_dir, skip_files=('LICENSE.txt',)):
  """Load article bodies and category labels from a corpus directory.

  Every immediate subdirectory of ``corpus_dir`` is treated as one category;
  every ``*.txt`` file inside it becomes one sample labelled with the
  subdirectory name. Plain files directly under ``corpus_dir`` are ignored.

  Args:
    corpus_dir: Path of the corpus root directory.
    skip_files: File names to ignore inside each category directory.
      NOTE(review): the default assumes the livedoor news corpus ships a
      ``LICENSE.txt`` (not an article) in each category directory - confirm
      against your copy, or pass ``()`` to load every ``*.txt`` file.

  Returns:
    A ``(texts, labels)`` tuple of two equally long lists. ``texts[i]`` is
    the file content without its first two lines and ``labels[i]`` is the
    name of the directory the file was found in.
  """
  texts = []
  labels = []
  ignored = set(skip_files)

  # os.listdir / glob.glob return entries in arbitrary order; sorting keeps
  # the sample order (and therefore any seeded split) reproducible.
  for label in sorted(os.listdir(corpus_dir)):
    label_dir = os.path.join(corpus_dir, label)
    if not os.path.isdir(label_dir):
      continue
    for file_path in sorted(glob.glob(os.path.join(label_dir, "*.txt"))):
      if os.path.basename(file_path) in ignored:
        continue
      with open(file_path, 'r', encoding='utf-8') as file:
        # The first 2 lines are metadata, so skip them.
        body_lines = file.read().splitlines()[2:]
      texts.append('\n'.join(body_lines))
      labels.append(label)

  return texts, labels

# ---------------------------------------------------------
# 形態素解析
# ---------------------------------------------------------
_TAGGER = None  # MeCab tagger shared by all tokenize() calls; built on first use


def _get_tagger():
  """Return the shared MeCab tagger, constructing it on the first call."""
  global _TAGGER
  if _TAGGER is None:
    # Constructing a Tagger is loop-invariant work, so do it once instead of
    # once per tokenized document.
    _TAGGER = MeCab.Tagger(f'-Owakati -d "{ipadic.DICDIR}" -u "./NEologD.20200910-u.dic"')
  return _TAGGER


def tokenize(text):
  """Run MeCab over ``text`` and return the parse result, stripped.

  The tagger is configured with ``-Owakati``, the ipadic dictionary directory
  and the user dictionary ``./NEologD.20200910-u.dic``.

  Args:
    text: The string to analyse.

  Returns:
    The tagger's output for ``text`` with leading/trailing whitespace removed
    (presumably space-separated tokens, given ``-Owakati`` - see MeCab docs).
  """
  result = _get_tagger().parse(text)
  return result.strip()

# ---------------------------------------------------------
# メイン
# ---------------------------------------------------------

# Load the news articles and tokenize each one.
texts, labels = load_livedoor_corpus('./livedoor_news')
text_token = [tokenize(text) for text in texts]

# Split into training data and test data (20% held out, fixed seed).
df = pd.DataFrame({'text': text_token, 'label': labels})
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=1)

# Build the pipeline: TF-IDF features followed by a random forest.
# Only this pipeline is defined: the grid keys below are resolved against the
# step names here, so a pipeline whose last step is not called 'rfc' would
# make GridSearchCV.fit fail with an "Invalid parameter" error.
pipeline = Pipeline([
  ('tfidf', TfidfVectorizer()),                       # generate features with TfidfVectorizer
  ('rfc', RandomForestClassifier(random_state=1))     # seeded so repeated runs give the same forest
])

# Parameter grid; keys are '<step name>__<parameter>'.
# RandomForestClassifier has no max_iter parameter, so it is not listed here.
param_grid = {
  'tfidf__max_df': [0.75],
  'tfidf__ngram_range': [(1, 1)],
  'rfc__n_estimators': [100],
  'rfc__max_depth': [None]
}

# Train the model with 5-fold cross-validated grid search.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(x_train, y_train)

# Predict on the held-out test data with the best estimator found.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)

# Evaluate the model.
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print(metrics.classification_report(y_test, y_pred))

# Show the confusion matrix.
conf_matrix = metrics.confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

# Save the model.
# NOTE(review): this writes to the same 'best_model.pkl' path as the
# LogisticRegression cell, so whichever cell runs last wins - confirm that is
# intended or give each model its own file name.
joblib.dump(best_model, 'best_model.pkl')
print('Model saved to best_model.pkl')