# 感情分析
多対一の多層RNNの実装

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd

2024-10-20 16:03:30.962037: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-20 16:03:30.983507: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-20 16:03:31.002315: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-20 16:03:31.007422: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-20 16:03:31.023707: I tensorflow/core/platform/cpu_feature_guar

## データの用意
IMDb(Internet Movie Database) の映画レビューデータセット
- `review`: 映画レビューのテキスト
- `sentiment`: 予測したい目的変数

In [2]:
# Load the movie-review table; later cells rely on its `sentiment` column.
df = pd.read_csv(
    'movie_data.csv',
    encoding='utf-8',
)

## データ前処理
- TensorFlow の `tf.data.Dataset` オブジェクトを作成し、train(20,000), test(25,000), validation(5,000) に分割する
- train データで一意な単語を洗い出す。
- 各単語を一意な整数と対応付け、レビューテキストを整数（一意な単語のインデックス）にエンコードする
- モデルへの入力としてデータセットをミニバッチに分割する

In [3]:
# Separate the labels from the features WITHOUT mutating `df`.
# `df.pop('sentiment')` removes the column in place, so running this cell a
# second time would raise a KeyError; selecting and dropping keeps the cell
# idempotent while feeding exactly the same arrays to the dataset.
target = df['sentiment']
ds_raw = tf.data.Dataset.from_tensor_slices(
    (df.drop(columns=['sentiment']).values, target.values))

# Peek at the first three examples.
# Each feature tensor holds one row of the frame, so [0] picks the review text
# before it is truncated to its first 50 bytes for display.
for review, label in ds_raw.take(3):
    tf.print(review.numpy()[0][:50], label)

b'In 1974, the teenager Martha Moxley (Maggie Grace)' 1
b'OK... so... I really like Kris Kristofferson and h' 0
b'***SPOILER*** Do not read this, if you think about' 0


I0000 00:00:1729407816.244031  165843 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-10-20 16:03:36.313041: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2343] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2024-10-20 16:03:36.374994: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [4]:
# Fix the global seed so the shuffle below is reproducible.
tf.random.set_seed(1)

# Shuffle once over a 50,000-element buffer. reshuffle_each_iteration=False is
# passed so that (per the tf.data docs) every pass over `ds_raw` sees the same
# order, which is what keeps the take/skip splits below disjoint.
ds_raw = ds_raw.shuffle(
    buffer_size=50000, reshuffle_each_iteration=False)

# First 25,000 examples -> test; the remainder is divided into
# train (next 20,000) and validation (whatever is left).
ds_raw_train_valid = ds_raw.skip(25000)
ds_raw_test = ds_raw.take(25000)
ds_raw_valid = ds_raw_train_valid.skip(20000)
ds_raw_train = ds_raw_train_valid.take(20000)

In [6]:
from collections import Counter
# import tensorflow_text as tftext

- `Counter`オブジェクトは一意のトークンの出現回数をカウントできる。
- `tensorflow_datasets`パッケージには、テキストをトークンに分割する`Tokenizer`クラスが含まれている（バージョンによっては`tfds.features.text`ではなく`tfds.deprecated.text`以下にある）。

In [7]:
# Look for the Tokenizer under `tfds.features.text` first and fall back to
# `tfds.deprecated.text` when that attribute lookup fails.
try:
    tokenizer = tfds.features.text.Tokenizer()
except AttributeError:
    tokenizer = tfds.deprecated.text.Tokenizer()

# Tally every token of every training review.
token_counts = Counter()
for review, label in ds_raw_train:
    # Each feature tensor wraps a single text entry, hence the [0].
    token_counts.update(tokenizer.tokenize(review.numpy()[0]))

print('Vocab-size:', len(token_counts))

Vocab-size: 87007


2024-10-20 16:04:04.467203: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [8]:
# Confirm that the vocabulary tally is a plain collections.Counter.
print(type(token_counts))

<class 'collections.Counter'>


上記の方法がダメそうな場合の代替案（`tf.strings.split` を使ったトークン化とカウント）。以下のコードはコメントアウトしたままで、実行はしていない。

In [9]:
# NOTE(review): disabled draft of an alternative, tf.strings-based token count.
# Nothing in this cell is executed; the Counter-based `token_counts` built in
# the earlier cell is what the encoder cell below consumes.

# def tokenize(text):
#     return tf.strings.split(text)

# def count_tokens(tokens):
#     # tf.unique_with_counts returns the unique elements and their occurrence counts
#     # NOTE(review): the TF docs list three outputs (y, idx, count), so this
#     # two-value unpacking probably needs adjusting — confirm before re-enabling.
#     unique_tokens, token_counts = tf.unique_with_counts(tokens)
#     return unique_tokens, token_counts

# # Processing of the Dataset
# def count_tokens_in_dataset(dataset):
#     token_counts = dataset.map(tokenize).flat_map(tf.data.Dataset.from_tensor_slices)
#     # Use an all-zero tensor as the initial value
#     initial_state = tf.zeros_like(token_counts.element_spec, dtype=tf.int64)
#     # Add up the occurrence counts of each element
#     token_counts = token_counts.reduce(initial_state, lambda x, y: x + y)
#     return token_counts

# NOTE(review): `token_couots` looks like a typo for `token_counts` — confirm before re-enabling.
# token_couots = count_tokens_in_dataset(ds_raw_train)

In [10]:
# Build the text-to-integer encoder from the training vocabulary, falling back
# to the `deprecated` namespace when `tfds.features.text` is unavailable.
try:
    encoder = tfds.features.text.TokenTextEncoder(vocab_list=token_counts)
except AttributeError:
    encoder = tfds.deprecated.text.TokenTextEncoder(vocab_list=token_counts)

# Sanity check: the cell displays the integer ids produced for a short sentence.
example_str = 'This is an example!'
encoder.encode(example_str)

[232, 9, 270, 1123]

In [11]:
## Step 3-A: define the function for transformation

def encode(text_tensor, label):
    """Encode one review into integer token ids.

    Args:
        text_tensor: eager tensor whose ``.numpy()`` result is indexable; its
            first element is the raw review text.
        label: label belonging to the review; passed through untouched.

    Returns:
        A ``(encoded_text, label)`` tuple, where ``encoded_text`` is whatever
        the module-level ``encoder.encode`` returns for the text.
    """
    raw_text = text_tensor.numpy()[0]
    return encoder.encode(raw_text), label

## Step 3-B: wrap the encode function to a TF Op.
def encode_map_fn(text, label):
    """Wrap the Python-level ``encode`` in ``tf.py_function`` for ``Dataset.map``.

    Args:
        text: text tensor of one dataset element.
        label: label tensor of the same element.

    Returns:
        The result of ``tf.py_function``, declared as a pair of ``tf.int64``
        outputs (encoded text, label).
    """
    output_dtypes = (tf.int64, tf.int64)
    return tf.py_function(func=encode, inp=[text, label], Tout=output_dtypes)

In [12]:
# Apply the integer encoding to all three raw splits.
ds_train, ds_valid, ds_test = (
    raw_split.map(encode_map_fn)
    for raw_split in (ds_raw_train, ds_raw_valid, ds_raw_test)
)

# Look at five shuffled training examples to see how much the encoded
# sequence lengths vary.
tf.random.set_seed(1)
for token_ids, label in ds_train.shuffle(1000).take(5):
    print('Sequence length:', token_ids.shape)
    

Sequence length: (24,)
Sequence length: (179,)
Sequence length: (262,)
Sequence length: (535,)
Sequence length: (130,)


In [13]:
## Take a small subset

ds_subset = ds_train.take(8)
for token_ids, label in ds_subset:
    print('Individual size:', token_ids.shape)

## batching the datasets
# padded_shapes=([-1], []) pads the sequence dimension and leaves the scalar
# label as is; the printed batch shapes show each batch padded to its longest
# member.
ds_batched = ds_subset.padded_batch(
    batch_size=4,
    padded_shapes=([-1], []),
)

for padded_ids, labels in ds_batched:
    print('Batch dimension:', padded_ids.shape)

Individual size: (119,)
Individual size: (688,)
Individual size: (308,)
Individual size: (204,)
Individual size: (326,)
Individual size: (240,)
Individual size: (127,)
Individual size: (453,)
Batch dimension: (4, 688)
Batch dimension: (4, 453)


2024-10-20 16:04:28.271175: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [14]:
## batching the datasets
# Every split gets the same treatment: mini-batches of 32 examples with the
# variable-length sequence dimension padded and scalar labels left as is.
train_data, valid_data, test_data = (
    encoded_split.padded_batch(32, padded_shapes=([-1], []))
    for encoded_split in (ds_train, ds_valid, ds_test)
)

### 埋め込み（embedding）
埋め込みとは表現学習の手法の1つであり、単語ベクトルの次元を削減するための前処理ステップである。

一意な単語の個数を $n_{words}$ とする。埋め込みベクトルのサイズを $n_{words}$ よりもかなり小さく選ぶことで（$embedding\_dim \ll n_{words}$）、語彙全体を低次元の入力特徴量として表すことができる。

In [15]:
from tensorflow.keras.layers import Embedding

# Minimal demo of an embedding layer: 100 possible integer ids (input_dim),
# each mapped to a 6-dimensional vector (output_dim).
model = tf.keras.Sequential([
    Embedding(input_dim=100, output_dim=6, name='embed-layer'),
])

model.summary()

In [16]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN

# Two stacked SimpleRNN layers: the first one is configured with
# return_sequences=True so that the second receives a full sequence as input.
model = Sequential([
    Embedding(1000, 32),
    SimpleRNN(32, return_sequences=True),
    SimpleRNN(32),
    Dense(1),
])
model.summary()

In [17]:
## An example of building a RNN model
## with LSTM layer

from tensorflow.keras.layers import LSTM

# Same layout as the SimpleRNN demo, with LSTM layers and a 10,000-entry
# embedding table instead.
model = Sequential([
    Embedding(10000, 32),
    LSTM(32, return_sequences=True),
    LSTM(32),
    Dense(1),
])
model.summary()

In [18]:
embedding_dim = 20
# Two extra ids on top of the counted vocabulary — presumably reserved by the
# encoder for padding and unknown tokens; TODO confirm.
vocab_size = len(token_counts) + 2

tf.random.set_seed(1)

## build the model
# Embedding -> bidirectional LSTM -> dense ReLU layer -> single sigmoid unit.
bi_lstm_model = tf.keras.Sequential()
bi_lstm_model.add(
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        name='embed-layer'))
bi_lstm_model.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, name='lstm-layer'),
        name='bidir-lstm'))
bi_lstm_model.add(tf.keras.layers.Dense(64, activation='relu'))
bi_lstm_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

bi_lstm_model.summary()

## compile and train:
# from_logits=False matches the sigmoid output of the last layer.
bi_lstm_model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    metrics=['accuracy'])

history = bi_lstm_model.fit(
    train_data,
    epochs=10,
    validation_data=valid_data)

## evaluate on the test data
# With a single compiled metric, index 1 of the result is the accuracy
# (index 0 is the loss).
test_results = bi_lstm_model.evaluate(test_data)
print('Test Acc.: {:.2f}%'.format(test_results[1]*100))

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 146ms/step - accuracy: 0.6181 - loss: 0.6276 - val_accuracy: 0.6888 - val_loss: 0.5817
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 146ms/step - accuracy: 0.8265 - loss: 0.3970 - val_accuracy: 0.7556 - val_loss: 0.4997
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 147ms/step - accuracy: 0.9163 - loss: 0.2227 - val_accuracy: 0.8390 - val_loss: 0.3801
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 146ms/step - accuracy: 0.9632 - loss: 0.1105 - val_accuracy: 0.8228 - val_loss: 0.4922
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 146ms/step - accuracy: 0.9600 - loss: 0.1159 - val_accuracy: 0.6244 - val_loss: 0.6754
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 148ms/step - accuracy: 0.7050 - loss: 0.5508 - val_accuracy: 0.8426 - val_loss: 0.4432
Epoch 7/10