## フーリエ変換を用いたテキストのクラス分類器を実装する

In [1]:
import os
import pandas as pd
import numpy as np
from typing import Dict, List
import importlib
from pathlib import Path
import matplotlib.pyplot as plt

# import MeCab
from datasets import load_dataset
import pickle
import skfuzzy as fuzz
from skfuzzy import control as ctrl
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.fft import fft
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from tqdm import tqdm
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
BASE_PATH = Path.cwd().parent
DATA_PATH = BASE_PATH / "data"
MODEL_PATH = BASE_PATH / "model"
print(f"BASE_PATH: {BASE_PATH}")
print(f"DATA_PATH: {DATA_PATH}")
print(f"MODEL_PATH: {MODEL_PATH}")

BASE_PATH: /home/masa1357/Dockerdata/gitfile/Fourier-TextClassifier
DATA_PATH: /home/masa1357/Dockerdata/gitfile/Fourier-TextClassifier/data
MODEL_PATH: /home/masa1357/Dockerdata/gitfile/Fourier-TextClassifier/model


In [3]:
import python.util as util

importlib.reload(util)
# ? logger読み込み
name = "15-Ensemble"
logger = util.set_logger()
# ? seed値固定
seed = 42
util.set_seed(seed)

2024-07-08 17:23:44,007 : python.util : INFO : 34 : Test_message


### DATASETS
今回は<a href="https://huggingface.co/datasets/stanfordnlp/imdb">Large Movie Review Dataset</a>を用いる．

In [4]:
DATA_PATH

PosixPath('/home/masa1357/Dockerdata/gitfile/Fourier-TextClassifier/data')

In [5]:
print(DATA_PATH / "train.pkl")

/home/masa1357/Dockerdata/gitfile/Fourier-TextClassifier/data/train.pkl


In [6]:
# INFO: データ読み込み
# pklの読み込み
from datasets import load_dataset

ds = load_dataset(
    "stanfordnlp/imdb"
)

In [7]:
train = ds["train"]
test = ds["test"]

In [8]:
import nltk

nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
import pandas as pd
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# データセットの読み込み（例としてCSVファイルから読み込む場合）
# train_df = pd.read_csv('path_to_your_dataset.csv')

# 例としてデータフレームを作成
# data = {"text": ["I love this movie!", "This is a terrible product."], "label": [1, 0]}
# train_df = pd.DataFrame(data)

# nltkのリソースをダウンロード（初回のみ必要）
nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download("sentiwordnet")
nltk.download("wordnet")


# テキストの解析と品詞タグ付けを行う関数
def pos_tagging(text):
    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)
    return pos_tags


# 品詞タグをWordNetの形式に変換する関数
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith("J"):
        return wordnet.ADJ
    elif treebank_tag.startswith("V"):
        return wordnet.VERB
    elif treebank_tag.startswith("N"):
        return wordnet.NOUN
    elif treebank_tag.startswith("R"):
        return wordnet.ADV
    else:
        return None


# 単語の感情スコアを取得する関数
def get_sentiment_score(word, pos):
    wn_pos = get_wordnet_pos(pos)
    if wn_pos is not None:
        synsets = list(swn.senti_synsets(word, wn_pos))
        if synsets:
            # 最初のシノニムセットを使用して感情スコアを計算
            swn_synset = synsets[0]
            return swn_synset.pos_score(), swn_synset.neg_score()
    return 0, 0


# 各テキストの単語に対して感情スコアを取得する関数
def sentiment_analysis(pos_tags):
    sentiment_scores = []
    for word, pos in pos_tags:
        pos_score, neg_score = get_sentiment_score(word, pos)
        sentiment_scores.append((word, pos, pos_score, neg_score))
    return sentiment_scores


# 時系列直観的ファジー値（TIFS）を計算する関数
def calculate_tifs(sentiment_scores):
    tifs = []
    for i, (word, pos, pos_score, neg_score) in enumerate(sentiment_scores):
        tifs.append((word, pos, pos_score, neg_score, i, 1 - (pos_score + neg_score)))
    return tifs


# FFTを適用する関数
def apply_fft(tifs):
    membership_values = [x[2] for x in tifs]  # メンバーシップ度（ポジティブスコア）
    non_membership_values = [
        x[3] for x in tifs
    ]  # 非メンバーシップ度（ネガティブスコア）

    # メンバーシップ値と非メンバーシップ値に対してFFTを適用
    fft_membership = np.fft.fft(membership_values)
    fft_non_membership = np.fft.fft(non_membership_values)

    return fft_membership, fft_non_membership


# 実数部分とパワー項を計算する関数
def calculate_real_and_power(fft_results):
    fft_membership, fft_non_membership = fft_results

    # 実数部分を抽出
    real_membership = np.real(fft_membership)
    real_non_membership = np.real(fft_non_membership)

    # パワー項を計算
    power_membership = np.abs(fft_membership) ** 2
    power_non_membership = np.abs(fft_non_membership) ** 2

    # 合計パワー項を計算
    total_power_membership = np.sum(power_membership)
    total_power_non_membership = np.sum(power_non_membership)

    return (
        real_membership,
        real_non_membership,
        total_power_membership,
        total_power_non_membership,
    )


# 感情を分類する関数
def classify_sentiment(real_and_power):
    (
        real_membership,
        real_non_membership,
        total_power_membership,
        total_power_non_membership,
    ) = real_and_power

    # 角周波数振幅を使用して分類
    angular_frequency_membership = np.abs(real_membership)
    angular_frequency_non_membership = np.abs(real_non_membership)

    # パワー項の合計と角周波数振幅を基に感情を分類
    if (angular_frequency_membership[1] > angular_frequency_non_membership[1]) or (
        total_power_membership > total_power_non_membership
    ):
        return 1  # ポジティブ
    else:
        return 0  # ネガティブ


# データフレームに品詞タグ付け、感情スコア、およびTIFSの結果を追加
train_df["pos_tags"] = train_df["text"].apply(pos_tagging)
train_df["sentiment_scores"] = train_df["pos_tags"].apply(sentiment_analysis)
train_df["tifs"] = train_df["sentiment_scores"].apply(calculate_tifs)

# FFTの結果をデータフレームに追加
train_df["fft_results"] = train_df["tifs"].apply(apply_fft)

# 実数部分とパワー項をデータフレームに追加
train_df["real_and_power"] = train_df["fft_results"].apply(calculate_real_and_power)

# 感情を分類
train_df["predicted_label"] = train_df["real_and_power"].apply(classify_sentiment)

# 精度とF1スコアを計算
accuracy = accuracy_score(train_df["label"], train_df["predicted_label"])
f1 = f1_score(train_df["label"], train_df["predicted_label"])

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


NameError: name 'train_df' is not defined