# 필요한 라이브러리 임포트 (Python 3.9)

---

In [32]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
import re

from sklearn.model_selection import train_test_split
from keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense,Flatten,Embedding
import pickle


print(tf.__version__) # 2.11.0

2.11.0


# 데이터 전처리

---

In [2]:
def clean_text(text):
    """Strip everything except Hangul syllables and whitespace from ``text``.

    Every character that is neither in the range ``가``-``힣`` nor whitespace
    is deleted; the result is then trimmed of leading/trailing whitespace.
    """
    non_hangul = re.compile(r'[^가-힣\s]')
    hangul_only = non_hangul.sub('', text)
    return hangul_only.strip()

In [31]:
def load_data(data_name):
    """Load the raw-text CSV and return it cleaned and re-indexed.

    Args:
        data_name (str): path of the raw text CSV. It is read as UTF-8 with
            its first column as the index and must contain a ``text`` column.

    Returns:
        pandas.DataFrame: the rows that survive filtering, with ``text``
        passed through ``clean_text`` and a fresh 0..n-1 index.
    """
    data = pd.read_csv(data_name, encoding='utf-8', index_col=0)

    # Rows with missing text cannot be cleaned; dropping them first also keeps
    # the boolean masks below free of NaN (``~`` fails on a NaN-bearing mask).
    data = data.dropna(subset=['text'])

    # Discard rows that are only digits or that contain a zero-width space.
    digits_only = data['text'].str.match(r'^\d+$')
    has_zero_width_space = data['text'].str.contains('\u200b')
    # Copy so the column assignment below writes to an independent frame
    # instead of a view of the filtered original (SettingWithCopyWarning).
    data = data[~digits_only & ~has_zero_width_space].copy()

    data['text'] = data['text'].apply(clean_text)
    # Cleaning can leave nothing behind; such rows carry no usable text.
    data = data[data['text'] != '']
    data.reset_index(drop=True, inplace=True)

    return data

# Path of the labelled CSV to load (placeholder file name).
data_name = "file-name.csv"
data = load_data(data_name)
docs = data['text']
classes = data['label']
# Bare expression: displays the DataFrame as the notebook cell output.
data


Unnamed: 0,text,label,website
0,남은 시간,5,11st
1,리뷰,5,11st
2,개,5,11st
3,종 택,5,11st
4,팩 외 팩 김나운 진꼬리곰탕,5,11st
...,...,...,...
6732,반품,5,temu_home2
6733,문의하기,5,temu_home2
6734,테무 홈페이지,5,temu_home2
6735,판매됨,6,temu_home2


In [9]:
# Separate the text column from the label column of the DataFrame.
docs, classes = data['text'], data['label']

# Train / test split (80% train, 20% test) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    docs,
    classes,
    test_size=0.2,
    random_state=42,
)
# One-hot encode the training labels.
y_train_one_hot = pd.get_dummies(y_train)

In [22]:
def preprocessing(text, maxlen=16):
    """Fit a tokenizer on ``text`` and turn it into padded integer sequences.

    Args:
        text (pandas.Series or list): the texts to fit the tokenizer on and
            to convert.
        maxlen (int): sequence length passed to ``pad_sequences``. Defaults
            to 16, the value that used to be hard-coded here.

    Returns:
        tuple: ``(tokenizer, padded_x)`` - the fitted ``Tokenizer`` and the
        padded sequences for ``text``.
    """
    token = Tokenizer()
    token.fit_on_texts(text)
    sequences = token.texts_to_sequences(text)
    padded_x = pad_sequences(sequences, maxlen=maxlen)
    return token, padded_x


# The tokenizer is fitted on the training split only.
token, padded_x = preprocessing(X_train)

In [84]:
# Serialize the fitted tokenizer object to tokenizer.pickle.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(token, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

# 모델 빌드 & 훈련

----

In [15]:
def make_model(padded_x, y_train, epochs=10):
    """Build, compile and train the text classifier.

    Args:
        padded_x (numpy.ndarray): padded token-id sequences, one row per
            sample (as produced by ``pad_sequences``).
        y_train: class labels for ``padded_x``; they are one-hot encoded
            here with ``pd.get_dummies``.
        epochs (int): number of training epochs. Defaults to 10, the value
            that used to be hard-coded.

    Returns:
        The trained ``Sequential`` model.
    """
    y_train_one_hot = pd.get_dummies(y_train)
    # One output unit per one-hot column, so the layer always matches the
    # number of distinct labels present in y_train.
    num_classes = y_train_one_hot.shape[1]

    model = Sequential()
    model.add(Embedding(
        # Largest token id + 1, so every id in padded_x is a valid row.
        input_dim=int(padded_x.max()) + 1,
        output_dim=16,
        # Take the sequence length from the data instead of assuming 16.
        input_length=padded_x.shape[1],
    ))
    model.add(Dense(16, activation='relu'))
    model.add(Flatten())
    # The targets are one-hot vectors of mutually exclusive classes, so use
    # softmax with categorical cross-entropy; the sparse variant expects
    # integer class ids, not one-hot rows.
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(
        padded_x,
        y_train_one_hot,
        epochs=epochs)

    return model


model = make_model(padded_x, y_train)

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 16, 16)            97392     
                                                                 
 dense_4 (Dense)             (None, 16, 16)            272       
                                                                 
 flatten_2 (Flatten)         (None, 256)               0         
                                                                 
 dense_5 (Dense)             (None, 4)                 1028      
                                                                 
Total params: 98,692
Trainable params: 98,692
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Train the model returned by make_model for a further 100 epochs and keep
# the object returned by fit().
epochs = 100
history = model.fit(x=padded_x, y=y_train_one_hot, epochs=epochs)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [17]:
# Tokenize and pad further data (e.g. the test set) with the fitted tokenizer.
def pre_pad(X_train, token, maxlen=16):
    """Convert texts to padded integer sequences with an already fitted tokenizer.

    Args:
        X_train: the texts to convert. Despite the name, any collection of
            texts can be passed; the notebook passes the test split.
        token: a fitted ``Tokenizer``; it is not re-fitted here.
        maxlen (int): sequence length passed to ``pad_sequences``. Defaults
            to 16, the value that used to be hard-coded here.

    Returns:
        The padded sequences for ``X_train``.
    """
    sequences = token.texts_to_sequences(X_train)
    return pad_sequences(sequences, maxlen=maxlen)

def pre_y(y_train):
    """Return the one-hot (dummy-variable) encoding of the labels in ``y_train``."""
    return pd.get_dummies(y_train)

X_test_pad = pre_pad(X_test, token)
# One-hot encode the test labels, then align them to the training label
# columns: a class missing from the test split gets an all-zero column, and
# the column order always matches y_train_one_hot. A label that never occurs
# in training has no column and becomes an all-zero row. The float cast keeps
# a single dtype after the fill.
y_test_one_hot = (
    pre_y(y_test)
    .reindex(columns=y_train_one_hot.columns, fill_value=0)
    .astype('float32')
)

In [18]:
# evaluate() yields the loss followed by the compiled metrics; index 1 is
# the 'accuracy' metric.
test_metrics = model.evaluate(X_test_pad, y_test_one_hot)
print("\n Accuracy: %.4f" % (test_metrics[1]))


 Accuracy: 0.9918


# 예측하기

In [39]:
def load_json(json_name):
    """Read a JSON file holding a list of records into a two-column DataFrame.

    The file is opened as UTF-8. Every record must provide the ``text`` and
    ``xpath`` keys (a missing key raises ``KeyError``); any other keys are
    ignored. The returned DataFrame has the columns ``text`` and ``xpath``.
    """
    with open(json_name, "r", encoding='utf-8') as f:
        records = json.load(f)

    # Build one column list per wanted key, in the order text -> xpath.
    columns = {
        field: [record[field] for record in records]
        for field in ('text', 'xpath')
    }
    return pd.DataFrame(columns)

# Path of the JSON file of text/xpath records to load (placeholder name).
json_name = "file-name"
data2 = load_json(json_name=json_name)
# Bare expression: displays the DataFrame as the notebook cell output.
data2

Unnamed: 0,text,xpath
0,"<iframe src=""https://www.googletagmanager.com/...",/html/body/noscript
1,"이 어플리케이션을 보려면 자바스크립트를 활성화해야 합니다. <img height=""...",/html/body/noscript[2]
2,마트직송,/html/body/div/div/div/header/div/div[2]/div/h2
3,마트직송,/html/body/div/div/div/header/div/div[2]/div/b...
4,즉시배송,/html/body/div/div/div/header/div/div[2]/div/b...
...,...,...
2269,배송시간,/html/body/div/div/nav/div/button[2]/p
2270,홈,/html/body/div/div/nav/div/button[3]/p
2271,마이페이지,/html/body/div/div/nav/div/button[4]/p
2272,자주구매,/html/body/div/div/nav/div/button[5]/p


In [49]:
def print_prediction(model, padded_docs, docs):
    """Print the samples whose predicted class index is non-zero.

    ``model.predict`` is run on ``padded_docs`` and the class of each row is
    the argmax over axis 1. ``docs`` is indexed positionally (``.iloc``) to
    fetch the text printed next to each prediction.
    """
    class_scores = model.predict(padded_docs)

    for i, label in enumerate(np.argmax(class_scores, axis=1)):
        if label == 0:
            # Only phrases flagged as dark patterns (non-zero class) are printed.
            continue
        original_text = docs.iloc[i]
        print(f"Predicted label of Sample {i}: {label}, Original text: {original_text},")

In [50]:
# Predict labels for the loaded texts.
docs2 = data2['text']
# The training texts went through clean_text (see load_data), so apply the
# same normalisation before tokenizing. docs2 itself stays raw so that
# print_prediction still shows the original text.
test_token = token.texts_to_sequences(docs2.apply(clean_text))
padded_docs = pad_sequences(test_token, maxlen=16)
print_prediction(model, padded_docs, docs2)

Predicted label of Sample 37: 2, Original text: 최저가 도전,
Predicted label of Sample 113: 1, Original text: 함께할인,
Predicted label of Sample 146: 1, Original text: 함께할인,
Predicted label of Sample 331: 1, Original text: 함께할인,
Predicted label of Sample 1904: 1, Original text: 함께할인,
Predicted label of Sample 1924: 2, Original text: 함께할인 상품선택,


In [92]:
# Save the trained model to an .h5 file.
model.save(filepath='clf_model.h5')

------