In [42]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from konlpy.tag import *

# Study data load

In [27]:
train_data = pd.read_excel('data/study_data.xlsx')

In [28]:
train_data

Unnamed: 0.1,Unnamed: 0,category_seq1,product_seq,product_title
0,0,13,36578723,Dji mg-1
1,1,2,28981914,(가격내림)마인드브리지 투인원 니트(95)
2,2,2,28366322,트루젠 트렌치코트(95)
3,3,2,29423509,라모다원피스팝니다
4,4,1,29045638,프라다 원숄더 포코노 슬링백 2vz047
...,...,...,...,...
9995,9995,3,36569395,라코스테 백팩
9996,9996,13,36569394,[신한]터치트윈마카세트 60색 A
9997,9997,15,35064579,"용평리조트 4월,5월,6월 주중주말 예약양도"
9998,9998,7,36569393,레이저 블랙샤크v2 풀구성 (무료배송)


In [29]:
len(train_data)

10000

중복 데이터 확인

In [30]:
train_data['product_title'].nunique()

9345

중복 데이터 제거 및 Null 값 제거

In [31]:
train_data.drop_duplicates(subset=['product_title'], inplace=True) # 중복 제거

In [32]:
train_data = train_data.dropna(how='any') # null 값이 존재하는행 제거

In [33]:
print('샘플 데이터 수 : ', len(train_data))

샘플 데이터 수 :  9345


정규 표현식을 이용하여 한글 영어 숫자만 남기고 제거

In [34]:
train_data['product_title'] = train_data['product_title'].str.replace('[^ㄱ-ㅎㅏ-ㅣ가-힣a-zA-Z0-9-_+ ]', '')

In [35]:
train_data

Unnamed: 0.1,Unnamed: 0,category_seq1,product_seq,product_title
0,0,13,36578723,Dji mg-1
1,1,2,28981914,가격내림마인드브리지 투인원 니트95
2,2,2,28366322,트루젠 트렌치코트95
3,3,2,29423509,라모다원피스팝니다
4,4,1,29045638,프라다 원숄더 포코노 슬링백 2vz047
...,...,...,...,...
9995,9995,3,36569395,라코스테 백팩
9996,9996,13,36569394,신한터치트윈마카세트 60색 A
9997,9997,15,35064579,용평리조트 4월5월6월 주중주말 예약양도
9998,9998,7,36569393,레이저 블랙샤크v2 풀구성 무료배송


empty value -> Null로 변경

In [37]:
train_data['product_title'].replace('', np.nan, inplace=True)
print(train_data.isnull().sum())

Unnamed: 0       0
category_seq1    0
product_seq      0
product_title    0
dtype: int64


In [38]:
train_data = train_data.dropna(how = 'any')
print(len(train_data))

9345


불용어 제거 및 형태소 분석

In [52]:
stopwords = ['팝니다', '팔아요', '팜', '판매']

In [53]:
okt = Okt()

In [54]:
x_train = []

In [55]:
for sentence in train_data['product_title']:
    temp = okt.morphs(sentence, stem=True) # 토큰화
    temp = [word for word in temp if not word in stopwords] # 불용어 제거
    x_train.append(temp)

In [67]:
x_train[:5]

[['Dji', 'mg', '-', '1'],
 ['가격', '내리다', '마인드', '브리지', '투', '인원', '니트', '95'],
 ['트루', '젠', '트렌치코트', '95'],
 ['라모', '다', '원피스', '파다'],
 ['프라다', '원숄더', '포코', '노', '슬링', '백', '2', 'vz', '047']]

In [57]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)

In [69]:
print(tokenizer.word_index)

{'파다': 1, '하다': 2, '갤럭시': 3, '새': 4, 's': 5, '블랙': 6, '상품': 7, '2': 8, 'g': 9, 'a': 10, '가': 11, '+': 12, '-': 13, '급': 14, 'gb': 15, '사이즈': 16, '256': 17, '20': 18, '3': 19, '기': 20, '폰': 21, '제품': 22, '미개': 23, '봉': 24, '1': 25, '자켓': 26, '5': 27, '4': 28, '용': 29, '128': 30, '정품': 31, '에': 32, '권': 33, '10': 34, '노트': 35, '삼성': 36, 'm': 37, '백': 38, '세트': 39, '원피스': 40, '인치': 41, '나이키': 42, '아이폰': 43, '상': 44, '화이트': 45, 'lg': 46, '프로': 47, '이': 48, '플러스': 49, '택포': 50, '7': 51, '64': 52, '8': 53, '울트라': 54, '매입': 55, '100': 56, 'l': 57, 'x': 58, '계': 59, '리퍼': 60, '공기': 61, '남성': 62, 'ㅣ': 63, '가격': 64, 'v': 65, '블루': 66, '무잔': 67, '반': 68, '가능': 69, '중고': 70, '센터': 71, '미니': 72, '개': 73, '그레이': 74, '아': 75, '6': 76, '일괄': 77, '무료': 78, '32': 79, '니': 80, '21': 81, '지': 82, 'mm': 83, '제': 84, '크다': 85, 'k': 86, '티': 87, '판': 88, '정상': 89, '배터리': 90, '9': 91, '라': 92, '즈': 93, '여성': 94, '니트': 95, '스': 96, '장': 97, '고': 98, 'i': 99, '50': 100, '박스': 101, '로': 102, '식': 103, '포함': 104,

In [59]:
print(tokenizer.word_counts)

OrderedDict([('dji', 2), ('mg', 11), ('-', 456), ('1', 307), ('가격', 163), ('내리다', 54), ('마인드', 3), ('브리지', 1), ('투', 25), ('인원', 1), ('니트', 124), ('95', 89), ('트루', 3), ('젠', 50), ('트렌치코트', 26), ('라모', 6), ('다', 77), ('원피스', 227), ('파다', 1201), ('프라다', 26), ('원숄더', 1), ('포코', 4), ('노', 20), ('슬링', 2), ('백', 235), ('2', 552), ('vz', 1), ('047', 1), ('더마펌', 1), ('에이지', 3), ('리바이', 2), ('빙', 2), ('바', 43), ('이탈', 2), ('에멀전', 1), ('a', 532), ('4', 280), ('200', 24), ('ml', 63), ('단', 40), ('블랙', 573), ('라비', 4), ('앙', 7), ('콜라겐', 6), ('앰플', 9), ('캠핑', 20), ('낚시', 18), ('이글루', 1), ('아이스박스', 2), ('46', 14), ('l', 171), ('사', 62), ('용감', 1), ('약간', 3), ('있다', 16), ('깨끗', 30), ('갤럭시', 922), ('s', 858), ('20', 385), ('플러스', 183), ('256', 400), ('기', 336), ('가', 515), ('sssss', 7), ('급', 456), ('0328-12', 1), ('밉다', 92), ('착용', 16), ('새', 913), ('상품', 569), ('스퀘어', 10), ('넥', 21), ('연', 16), ('청', 23), ('데님', 47), ('머메이드', 2), ('롱', 59), ('버즈', 44), ('레드', 98), ('충전기', 28), ('본체', 79), ('하다', 11

In [63]:
x_train[:5]

[['Dji', 'mg', '-', '1'],
 ['가격', '내리다', '마인드', '브리지', '투', '인원', '니트', '95'],
 ['트루', '젠', '트렌치코트', '95'],
 ['라모', '다', '원피스', '파다'],
 ['프라다', '원숄더', '포코', '노', '슬링', '백', '2', 'vz', '047']]

In [61]:
encoded = tokenizer.texts_to_sequences(x_train)

In [64]:
encoded[:5]

[[4076, 1145, 13, 25],
 [64, 240, 3165, 5845, 508, 5846, 95, 134],
 [3166, 258, 488, 134],
 [1946, 158, 40, 1],
 [489, 5847, 2641, 658, 4077, 38, 8, 5848, 5849]]

In [66]:
tokenizer.word_index['가격']

64