## フーリエ変換を用いたテキストのクラス分類器を実装する

In [16]:
import os
import pandas as pd
import numpy as np
from typing import Dict, List
import importlib
from pathlib import Path
import matplotlib.pyplot as plt
# import MeCab
from datasets import load_dataset
import pickle
import skfuzzy as fuzz
from skfuzzy import control as ctrl
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.fft import fft

In [17]:
# Project directories, all derived from the parent of the current working directory.
BASE_PATH = Path.cwd().parent
DATA_PATH = BASE_PATH / "data"
MODEL_PATH = BASE_PATH / "model"
# Echo the resolved paths so the run shows which directories are in use.
print(f"BASE_PATH: {BASE_PATH}")
print(f"DATA_PATH: {DATA_PATH}")
print(f"MODEL_PATH: {MODEL_PATH}")

BASE_PATH: c:\Users\zigza\OneDrive\ドキュメント\git\Fourier-TextClassifier
DATA_PATH: c:\Users\zigza\OneDrive\ドキュメント\git\Fourier-TextClassifier\data
MODEL_PATH: c:\Users\zigza\OneDrive\ドキュメント\git\Fourier-TextClassifier\model


In [18]:
import python.util as util

# Re-import the helper module so the current kernel uses its latest definition.
importlib.reload(util)
# ? Set up the logger
# NOTE(review): `name` is not referenced by any visible cell — confirm whether it is still needed.
name = "15-Ensemble"
logger = util.set_logger()
# ? Fix the random seed
seed = 42
util.set_seed(seed)

2024-07-02 00:22:22,010 : python.util : INFO : 34 : Test_message


### DATASETS
今回は<a href="https://huggingface.co/datasets/stanfordnlp/imdb">Large Movie Review Dataset</a>を用いる．

In [19]:
# Display the resolved data directory.
DATA_PATH

WindowsPath('c:/Users/zigza/OneDrive/ドキュメント/git/Fourier-TextClassifier/data')

In [20]:
# Print the path a pickled training set would have under DATA_PATH (nothing is read or written here).
print(DATA_PATH / "train.pkl")

c:\Users\zigza\OneDrive\ドキュメント\git\Fourier-TextClassifier\data\train.pkl


In [21]:
# INFO: load the data
# The dataset is fetched by name through `datasets.load_dataset`; no pickle file is read here.
# (load_dataset is already imported in the first import cell; this re-import is redundant but harmless.)
from datasets import load_dataset

ds = load_dataset(
    "stanfordnlp/imdb"
)

In [22]:
# Pull the "train" and "test" splits out of the loaded dataset.
train = ds["train"]
test = ds["test"]

In [23]:
import nltk

# dataclean() calls nltk.word_tokenize, which needs the Punkt tokenizer data.
# Without it a fresh environment fails with LookupError. Older NLTK releases
# look for "punkt", newer ones for "punkt_tab", so request both.
nltk.download("punkt")
nltk.download("punkt_tab")
# WordNet backs the WordNetLemmatizer used in dataclean(). Kept last so the
# cell's displayed value is still the result of this download.
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zigza\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [24]:
import re

# Fetch the stop-word corpus that dataclean() reads via stopwords.words("english").
nltk.download("stopwords")


def dataclean(sentence):
    """Normalise one raw document into a cleaned, space-joined token string.

    Steps, in order: replace every character that is not an ASCII letter with
    a space, lower-case, tokenize with NLTK, drop English stop words,
    lemmatize the remaining tokens with WordNet, and join them with single
    spaces.

    Args:
        sentence: Raw text of a single document (str).

    Returns:
        The cleaned text as one string (empty if no token survives).
    """
    # 1. Replace everything except ASCII letters with a space.
    sentence = re.sub(r"[^a-zA-Z]", " ", sentence)
    # 2. Lower-case.
    sentence = sentence.lower()
    # 3. Tokenize.
    tokens = nltk.word_tokenize(sentence)
    # 4. Remove stop words. The set is built once, outside the comprehension:
    #    building it inside the filter condition repeats the corpus lookup and
    #    set construction for every single token.
    stop_words = set(stopwords.words("english"))
    tokens = [word for word in tokens if word not in stop_words]
    # 5. Lemmatize.
    lemmatizer = nltk.WordNetLemmatizer()
    # Re-join into a single string.
    return " ".join(lemmatizer.lemmatize(word) for word in tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\zigza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
# Convert the train/test splits into pandas DataFrames.
train_df = pd.DataFrame(train)
test_df = pd.DataFrame(test)
# Preview the first rows of the training frame.
train_df.head()


Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [26]:
# Preprocessing: clean every training document, timed with util.timer.
# Note: this overwrites the "text" column of train_df in place.
with util.timer("train前処理"):
    train_df['text'] = train_df['text'].apply(dataclean)

# Create the output directory first so the write cannot fail after the
# cleaning pass has already run.
DATA_PATH.mkdir(parents=True, exist_ok=True)
train_df.to_csv(DATA_PATH / "train.csv", index=False)

# The preview must be the cell's last expression to be displayed; in the
# middle of the cell its value is silently discarded.
train_df.head()

[train前処理] start


KeyboardInterrupt: 

In [None]:
# Clean the test documents with the same function and timing wrapper as the
# training documents. Note: this overwrites the "text" column of test_df in place.
with util.timer("test前処理"):
    test_df["text"] = test_df["text"].apply(dataclean)

# Create the output directory first so the write cannot fail after the
# cleaning pass has already run.
DATA_PATH.mkdir(parents=True, exist_ok=True)
test_df.to_csv(DATA_PATH / "test.csv", index=False)

### フーリエ変換を用いたテキストエンコーディング

#### ベクトル化
今回はTF-IDFを用いる．

In [None]:
# Fit the TF-IDF vocabulary on the training text only, then reuse the fitted
# vectorizer to transform the test text.
vectorizer = TfidfVectorizer()
# NOTE(review): .toarray() turns the vectorizer output into a dense array with one
# column per vocabulary term; with an uncapped vocabulary this may need a lot of
# memory — confirm it fits, or cap the vocabulary size.
X_train = vectorizer.fit_transform(train_df["text"]).toarray()
X_test = vectorizer.transform(test_df["text"]).toarray()

#### FFTの適用

In [None]:
# Apply the FFT to each document's TF-IDF row independently, collecting one
# transformed row per document in a list.
fft_train = [fft(x) for x in X_train]
fft_test = [fft(x) for x in X_test]

#### TIFVの計算

In [None]:
def fuzzy_membership(value, thresholds):
    """Grade ``value`` on a linear ramp between two thresholds.

    Args:
        value: Number to grade.
        thresholds: Indexable pair; index 0 is the lower bound and index 1
            the upper bound of the ramp.

    Returns:
        0.0 when ``value`` is at or below the lower bound, 1.0 when it is at
        or above the upper bound (checked in that order), otherwise the
        linear interpolation ``(value - lower) / (upper - lower)``.
    """
    lower = thresholds[0]
    if value <= lower:
        return 0.0

    # The upper bound is only read once the lower-bound check has failed.
    upper = thresholds[1]
    if value >= upper:
        return 1.0

    return (value - lower) / (upper - lower)


def calculate_tifv(fft_results):
    """Compute a TIFV score for every coefficient of every FFT result.

    For a coefficient ``z`` the score is ``(m_real + (1 - m_imag)) / 2``,
    where ``m_real`` and ``m_imag`` are the real and imaginary parts of ``z``
    clamped to the interval [0, 1]. Clamping to [0, 1] is the same linear
    ramp as ``fuzzy_membership(v, [0, 1])``.

    Args:
        fft_results: Iterable of 1-D sequences/arrays of (complex) numbers,
            one per document.

    Returns:
        A list with one inner list of Python floats per input sequence, in
        the same order and with the same lengths as the input.
    """
    tifv_results = []
    for fft_result in fft_results:
        spectrum = np.asarray(fft_result)
        # Vectorised over the whole row: a per-coefficient Python loop is
        # prohibitively slow when rows are as wide as a TF-IDF vocabulary.
        membership_real = np.clip(spectrum.real, 0.0, 1.0)
        membership_imag = np.clip(spectrum.imag, 0.0, 1.0)
        tifv = (membership_real + (1 - membership_imag)) / 2
        # Convert back to a plain list so callers keep getting lists of floats.
        tifv_results.append(tifv.tolist())
    return tifv_results


# Score the FFT output of both splits with calculate_tifv.
tifv_train = calculate_tifv(fft_train)
tifv_test = calculate_tifv(fft_test)

#### ロジスティック回帰による分類

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Use the mean TIFV value of each document as its single feature;
# reshape(-1, 1) turns the 1-D vector of means into a one-column matrix.
X_train_tifv = np.array([np.mean(tifv) for tifv in tifv_train]).reshape(-1, 1)
X_test_tifv = np.array([np.mean(tifv) for tifv in tifv_test]).reshape(-1, 1)

y_train = train_df["label"]
y_test = test_df["label"]

# Train the model
model = LogisticRegression()
model.fit(X_train_tifv, y_train)

# Predict on the test features
y_pred = model.predict(X_test_tifv)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

#### Test Code

In [None]:


# # サンプルテキスト
# texts = [
#     "I love this product!",
#     "This is the worst experience ever.",
#     "It's okay, not great but not terrible.",
# ]

# # トークン化とTF-IDFベクトル化
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(texts).toarray()

# # パディング
# max_length = max([len(x) for x in X])
# padded_X = np.array([np.pad(x, (0, max_length - len(x)), "constant") for x in X])

# # FFTの適用
# fft_results = [fft(x) for x in padded_X]


# # ファジィメンバーシップ関数の定義
# def fuzzy_membership(value, thresholds):
#     if value <= thresholds[0]:
#         return 0.0
#     elif value >= thresholds[1]:
#         return 1.0
#     else:
#         return (value - thresholds[0]) / (thresholds[1] - thresholds[0])


# # TIFVの計算
# tifv_results = []
# for fft_result in fft_results:
#     tifv_values = []
#     for value in fft_result:
#         membership_real = fuzzy_membership(value.real, [0, 1])
#         membership_imag = fuzzy_membership(value.imag, [0, 1])
#         tifv = (membership_real + (1 - membership_imag)) / 2
#         tifv_values.append(tifv)
#     tifv_results.append(tifv_values)


# # 感情分析
# def sentiment_classification(tifv_values):
#     mean_tifv = np.mean(tifv_values)
#     if mean_tifv > 0.5:
#         return "Positive"
#     elif mean_tifv < 0.5:
#         return "Negative"
#     else:
#         return "Neutral"


# sentiments = [sentiment_classification(tifv) for tifv in tifv_results]
# print(sentiments)

['Negative', 'Positive', 'Positive']
