## フーリエ変換を用いたテキストのクラス分類器を実装する

In [1]:
import os
import pandas as pd
import numpy as np
from typing import Dict, List
import importlib
from pathlib import Path
import matplotlib.pyplot as plt
# import MeCab
from datasets import load_dataset
import pickle
import skfuzzy as fuzz
from skfuzzy import control as ctrl
from nltk.corpus import stopwords
import nltk

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
BASE_PATH = Path.cwd().parent
DATA_PATH = BASE_PATH / "data"
MODEL_PATH = BASE_PATH / "model"
print(f"BASE_PATH: {BASE_PATH}")
print(f"DATA_PATH: {DATA_PATH}")
print(f"MODEL_PATH: {MODEL_PATH}")

BASE_PATH: c:\Users\zigza\OneDrive\ドキュメント\git\Fourier-TextClassifier
DATA_PATH: c:\Users\zigza\OneDrive\ドキュメント\git\Fourier-TextClassifier\data
MODEL_PATH: c:\Users\zigza\OneDrive\ドキュメント\git\Fourier-TextClassifier\model


In [3]:
import python.util as util

importlib.reload(util)
# ? logger読み込み
name = "15-Ensemble"
logger = util.set_logger()
# ? seed値固定
seed = 42
util.set_seed(seed)

2024-06-22 16:00:53,404 : python.util : INFO : 34 : Test_message


### DATASETS
今回は<a href="https://huggingface.co/datasets/stanfordnlp/imdb">Large Movie Review Dataset</a>を用いる．

In [4]:
#INFO: データ読み込み
#DATA_PATHが空のフォルダである場合は、データのダウンロードを行う
with util.timer("Read_data"):
    if os.listdir(DATA_PATH) == []:
        logger.info("No files in data folder")
        ds = load_dataset("stanfordnlp/imdb")
    else:
        logger.info("Files exist in data folder")
        with open(DATA_PATH / "train.pkl", "rb") as f:
            train = pickle.load(f)
        with open(DATA_PATH / "test.pkl", "rb") as f:
            test = pickle.load(f)
        ds = {"train": train, "test": test}

    #INFO: データの分割
    train = ds["train"]
    test = ds["test"]

    #INFO: データの保存
    with open(DATA_PATH / "train.pkl", "wb") as f:
        pickle.dump(train, f)
    with open(DATA_PATH / "test.pkl", "wb") as f:
        pickle.dump(test, f)

2024-06-22 16:00:53,435 : python.util : INFO : 8 : Files exist in data folder


[Read_data] start
[Read_data] done in 0.02 s


In [9]:
import re
nltk.download('stopwords')
def dataclean(sentence):
    # stopwords = set(stopwords.words('english'))
    # 1. 記号の削除
    sentence = re.sub("[^a-zA-Z]", " ", sentence)
    # 2. 小文字化
    sentence = sentence.lower()
    # 3. トークン化
    sentence = nltk.word_tokenize(sentence)
    # 4. stopwordsの削除
    sentence = [word for word in sentence if not word in set(stopwords.words("english"))]
    # 5. レマタイズ
    lemma = nltk.WordNetLemmatizer()
    sentence = [lemma.lemmatize(word) for word in sentence]
    # 結合
    sentence = " ".join(sentence)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zigza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
#INFO: 前処理
for i in range(len(train["text"])):
    train["text"][i] = dataclean(train["text"][i])
for i in range(len(test["text"])):
    test["text"][i] = dataclean(test["text"][i])

#INFO: 前処理後のデータを比較
logger.info(f"Train Before: {ds['train']['text'][0]}")
logger.info(f"Train After: {train['text'][0]}")
logger.info(f"Test Before: {ds['test']['text'][0]}")
logger.info(f"Test After: {test['text'][0]}")

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\zigza/nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\share\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\\lib\\nltk_data'
    - 'C:\\Users\\zigza\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


### フーリエ変換を用いたテキストエンコーディング