# Fragrantica perfume review clasifier (LSTM with stopword removed)

In [1]:
import pandas as pd
import requests
from os import path
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
# Model Name
MODEL_NAME = 'lstm_with_stopword_removed_04'

# HyperParameters
PAD_LEN = 200
NUM_WORDS = 500
EMBEDDING = 100
BATCH_SIZE = 256


In [3]:
data_path = '../dataset/dataset_210626_215600.csv'
data_exist = path.exists(data_path)

if not data_exist:
    url = 'https://kyuuuw-nlp-dataset.s3.ap-northeast-2.amazonaws.com/fragrantica/dataset_210626_215600.csv'
    r = requests.get(url, allow_redirects=True)
    open(data_path, 'w').write(r.content)

In [4]:
data = pd.read_csv(data_path)

X_data = data['stopwords_removed']
y_data = data['label']

print(len(X_data))
print(len(y_data))


##### 토큰화 및 인덱스 부여

74779
74779


In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_data)
sequences = tokenizer.texts_to_sequences(X_data)

print(X_data[0])
print(len(sequences[0]))

['got', 'sample', 'today', 'year', 'old', 'daughter', 'thought', 'smelling', 'sprayed', 'card', 'rotten', 'fish', 'nearly', 'threw', 'immediately', 'however', 'later', 'evening', 'decided', 'give', 'fair', 'trial', 'sprayed', 'crook', 'elbows', 'rotting', 'fish', 'smell', 'time', 'got', 'definite', 'bit', 'funk', 'almost', 'urine', 'scent', 'minutes', 'blossomed', 'gorgeous', 'smooth', 'woody', 'ambery', 'clean', 'warm', 'jasmine', 'daughter', 'didnt', 'even', 'believe', 'told', 'fragrance', 'smelled', 'earlier', 'lol', 'immediately', 'commented', 'fresh', 'clean', 'vibe', 'agree', 'soapy', 'hint', 'powdery', 'way', 'obsessed', 'cant', 'stop', 'smelling', 'arm', 'glad', 'gave', 'try', 'wait', 'buy', 'full', 'bottle']
76


##### 인덱스별 단어 표시

In [17]:
word_to_index = tokenizer.word_index


##### 빈도수 분석

In [7]:
threshold = 2
total_cnt = len(word_to_index) # 총 단어의 수
rare_cnt = 0 # 등장 빈도 수가 threshold 보다 작은 단어의 개수를 카운트
total_freq = 0 # 훈련 데이터의 전체 단어 빈도수 총 합
rare_freq = 0

for key, value in tokenizer.word_counts.items():
    total_freq = total_freq + value

    if value < threshold:
        rare_cnt = rare_cnt + 1
        rare_freq = rare_freq + value

print(f"등장 빈도가 {threshold}번 이하인 희귀 단어의 수: {rare_cnt}")
print(f"단어 집합(vocabulary)에서 희귀 단어의 비율: {rare_cnt / total_cnt}" )
print(f"전체 등장 빈도에서 희귀 단어 등장 빈도 비율: {rare_freq / total_freq}")

등장 빈도가 2번 이하인 희귀 단어의 수: 44912
단어 집합(vocabulary)에서 희귀 단어의 비율: 0.552307635549762
전체 등장 빈도에서 희귀 단어 등장 빈도 비율: 0.013189565348140857


##### 등장 횟수 상위 50000개의 word 만 사용

In [8]:
tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(X_data)
sequences = tokenizer.texts_to_sequences(X_data)

print(sequences[:5])

[[41, 102, 175, 207, 142, 134, 83, 145, 466, 105, 190, 474, 404, 111, 145, 5, 17, 41, 38, 77, 2, 184, 249, 355, 129, 120, 70, 155, 27, 273, 4, 45, 333, 466, 39, 120, 198, 330, 276, 170, 36, 343, 421, 83, 477, 324, 60, 43, 185, 12], [15, 310, 8, 1, 55, 1, 7, 496, 161, 16, 8, 57, 100, 67, 25, 161, 37, 16, 168, 16, 343, 494, 7, 47, 90, 12, 404, 111, 60, 145, 273, 428, 215, 115, 57, 7, 155, 35, 20, 350, 5, 5, 155, 70, 139, 442, 36, 20, 228, 52, 11, 459, 82, 73, 248, 273, 67, 444, 138, 4, 405, 22, 8, 100, 19, 33, 244, 213, 101, 16, 16, 252, 228, 22, 115, 46, 19, 12, 331, 6, 331, 187, 224, 43, 12, 236, 17, 310, 268, 11], [7, 132, 58, 385, 133, 7, 132, 47, 26, 9, 454, 41, 102, 375, 109, 85, 248, 234, 212, 4, 18, 155, 171, 416, 212], [340, 15, 155, 185, 15, 29, 5, 369, 279, 155, 459, 53, 17, 154, 117, 212, 119, 155, 399, 66, 132, 79, 14, 6, 1, 3, 19, 191, 210, 49, 45, 76, 163, 379, 38, 345, 164, 23, 15, 55, 226, 86, 86, 9, 50, 196, 88, 6, 417, 133, 212, 18, 105, 171, 416, 320, 5, 20, 1, 212, 8

##### Split data

In [9]:
X_train, X_test, y_train, y_test = train_test_split(sequences, y_data)
print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))

56084
56084
18695
18695


In [10]:
print(X_train[0])
print(y_train[0])

[399, 178, 90, 208, 137, 197, 2, 6, 23, 490, 243, 135, 18, 218, 387, 13, 166, 140, 1, 86, 24, 14, 138, 286, 164, 152, 7, 49, 237]
2.0


In [11]:
# padding and trimming
X_train = pad_sequences(X_train, maxlen=PAD_LEN)
X_test = pad_sequences(X_test, maxlen=PAD_LEN)

In [12]:
# One hot encoding
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

print(y_train[0])

[1. 0. 0. 0.]


In [13]:
model = Sequential()
model.add(Embedding(NUM_WORDS, EMBEDDING))
model.add(LSTM(EMBEDDING))
model.add(Dense(4, activation='softmax'))

model.summary()

[2021-06-26 19:18:21.370 tensorflow-2-3-gpu--ml-g4dn-xlarge-794be025f5602a375b1b7feb8a0a:2062 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None
[2021-06-26 19:18:21.395 tensorflow-2-3-gpu--ml-g4dn-xlarge-794be025f5602a375b1b7feb8a0a:2062 INFO profiler_config_parser.py:102] Unable to find config at /opt/ml/input/config/profilerconfig.json. Profiler is disabled.
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         50000     
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 4)                 404       
Total params: 130,804
Trainable params: 130,804
Non-trainable params: 0
_________________________________________________________________


In [14]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)
mc = ModelCheckpoint(f'../model/{MODEL_NAME}.h5', monitor='val_acc', mode='max')

In [15]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

In [16]:
history = model.fit(X_train, y_train,
                    batch_size=BATCH_SIZE, epochs=30, callbacks=[es, mc],
                    validation_data=(X_test, y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 00013: early stopping
