In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re
import urllib.request
import mecab
from tqdm import tqdm
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# Load the labeled titles (columns: labels, title) and print a dtype/null summary.
df = pd.read_csv('./df_label_O.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17410 entries, 0 to 17409
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   labels  17410 non-null  float64
 1   title   17410 non-null  object 
dtypes: float64(1), object(1)
memory usage: 272.2+ KB


# 불용어 처리

## 데이터 로드

In [3]:
len(df)

17410

In [4]:
df.head(2)

Unnamed: 0,labels,title
0,1.0,새로운 생산공장으로 인해 회사는 예상되는 수요 증가를 충족시킬 수 있는 능력을 증가...
1,1.0,"2009-2012년 회사의 업데이트된 전략에 따르면, Basware는 20% - 4..."


In [5]:
# Create the MeCab tokenizer instance used by the following cells.
# The class is imported by name because this cell rebinds `mecab` (the module
# imported at the top of the notebook) to the instance; with the original
# `mecab.MeCab()` call a second run of this cell would look up `MeCab` on the
# instance and fail. Importing the class here keeps the cell re-runnable.
from mecab import MeCab

mecab = MeCab()

In [6]:
# Split every title into morphemes and keep the token list next to the title.
title_tokens = df['title'].map(mecab.morphs)
df['tokenized'] = title_tokens

In [7]:
# df['tokenized'].iloc[0]

In [8]:
df.head(3)

Unnamed: 0,labels,title,tokenized
0,1.0,새로운 생산공장으로 인해 회사는 예상되는 수요 증가를 충족시킬 수 있는 능력을 증가...,"[새로운, 생산, 공장, 으로, 인해, 회사, 는, 예상, 되, 는, 수요, 증가,..."
1,1.0,"2009-2012년 회사의 업데이트된 전략에 따르면, Basware는 20% - 4...","[2009, -, 2012, 년, 회사, 의, 업데이트, 된, 전략, 에, 따르, ..."
2,1.0,ASPOCOMP의 성장기에 대한 자금 조달은 기술적으로 더 까다로운 HDI 인쇄 회...,"[ASPOCOMP, 의, 성장기, 에, 대한, 자금, 조달, 은, 기술, 적, 으로..."


In [9]:
# Collect the per-title token lists into a plain Python list.
# The column is selected by name rather than by position (previously
# `df.iloc[i, 2]`), so the result does not depend on column order, and the
# whole column is converted in one call instead of one row lookup per title.
tokenized = df['tokenized'].tolist()

In [10]:
tokenized[0][0]

'새로운'

In [11]:
len(tokenized)

17410

In [12]:
# Flatten the list of token lists into one flat list containing every token
# occurrence, in title order.
words_list = [token for sentence in tokenized for token in sentence]

In [13]:
words_list[0]

'새로운'

In [14]:
len(words_list)

336491

In [15]:
# Wrap the flat token list in a single-column DataFrame (the column is labeled 0)
# so the tokens can be counted with pandas.
# NOTE: this rebinds `words_list` from a list to a DataFrame.
words_list = pd.DataFrame(words_list)
# Optional alphabetical sort, currently disabled:
# words_list = words_list.sort_values(by=[0])

In [16]:
words_list.tail(2)

Unnamed: 0,0
336489,사업
336490,강화


In [17]:
# Count how many times each distinct token occurs (most frequent first).
word_cnt = words_list.value_counts()

In [18]:
# Turn the counts Series into a one-column DataFrame; the tokens stay in the index.
word_cnt = pd.DataFrame(word_cnt)

In [19]:
word_cnt.head(2)

Unnamed: 0_level_0,0
0,Unnamed: 1_level_1
",",10780
.,9486


In [20]:
# Name the count column 'cnt'.
# Depending on the pandas version the counts column produced by
# value_counts() is labeled either 0 (unnamed Series) or 'count'
# (pandas >= 2.0), so both labels are mapped; a label that is absent is
# simply ignored by rename(). Rebinding instead of `inplace=True` keeps the
# cell free of in-place mutation.
word_cnt = word_cnt.rename(columns={0: 'cnt', 'count': 'cnt'})

In [21]:
word_cnt.head(2)

Unnamed: 0_level_0,cnt
0,Unnamed: 1_level_1
",",10780
.,9486


In [22]:
# Move the tokens out of the index into a regular column (labeled 0 until it
# is renamed in the next cell) and give the rows a 0..n-1 index.
word_cnt = word_cnt.reset_index()

In [23]:
# Give the token column (labeled 0 after reset_index) the name 'word'.
word_cnt = word_cnt.rename(columns={0: 'word'})

In [24]:
word_cnt.head(2)

Unnamed: 0,word,cnt
0,",",10780
1,.,9486


In [25]:
# Order the vocabulary alphabetically by token and preview the first rows.
word_cnt = word_cnt.sort_values('word')
word_cnt.head(2)

Unnamed: 0,word,cnt
979,!,44
14,"""",2584


In [26]:
# Renumber the rows 0..n-1 after sorting; the previous index labels are kept
# in a new 'index' column, which the next cell removes.
word_cnt = word_cnt.reset_index()


In [27]:
# Discard the leftover 'index' column created by reset_index().
word_cnt = word_cnt.drop(columns='index')

In [28]:
word_cnt.head(2)

Unnamed: 0,word,cnt
0,!,44
1,"""",2584


In [29]:
len(word_cnt)

18261

In [30]:
word_cnt['word'][0]

'!'

## stop_word_list

In [31]:
# Container for the stop words that are read from file in the next cell.
stop_word_list =[]

In [32]:
# Read the Korean stop-word file, one entry per line (surrounding whitespace
# and the trailing newline are stripped).
# The list is rebuilt from scratch rather than appended to, so running this
# cell more than once does not duplicate the entries.
with open('./stopwords-ko.txt', "rt", encoding='UTF8') as f:
    stop_word_list = [line.strip() for line in f]

In [33]:
len(stop_word_list)

679

In [34]:
stop_word_list[:5]

['!', '"', '$', '%', '&']

### 1차 제거
* 18261 -> 18054

In [35]:
# Remove every vocabulary entry that appears in the stop-word list.
# A single boolean mask replaces the row-by-row drop(): it avoids copying the
# frame once per removed word and does not depend on the index labels being
# identical to the enumerate() positions.
is_stop_word = word_cnt['word'].isin(stop_word_list)
word_cnt = word_cnt[~is_stop_word]

print(len(word_cnt))

18054


In [36]:
# word_cnt.head(50)

## 한 글자

In [37]:
# Gather every remaining vocabulary entry that is exactly one character long
# and show how many there are.
one_letter = [token for token in word_cnt['word'] if len(token) == 1]
len(one_letter)

1051

In [38]:
# one_letter[650:700]

In [39]:
# word_cnt.head(10)

In [40]:
# Renumber the rows 0..n-1 after the stop-word removal so that positions and
# index labels line up again; the old labels go to an 'index' column that the
# next cell deletes.
word_cnt = word_cnt.reset_index()

In [41]:
# Discard the leftover 'index' column created by reset_index().
word_cnt = word_cnt.drop(columns='index')

In [42]:
word_cnt.head()

Unnamed: 0,word,cnt
0,"""'",9
1,"""(",15
2,""")",5
3,"""+_",1
4,""",",3


### 2차 제거
* 18054 -> 17003

In [46]:
# Collect the row positions of all one-character words.
# Membership is tested against a set, so each lookup is constant time instead
# of a scan over the whole `one_letter` list.
# NOTE: the positions are later passed to drop() as index labels, which is
# valid because the index was reset to 0..n-1 in the preceding cells.
one_letter_set = set(one_letter)
del_idx = [
    position
    for position, word in enumerate(word_cnt['word'])
    if word in one_letter_set
]

In [47]:
len(del_idx)

1051

In [48]:
# Remove the rows collected in `del_idx` and report the remaining vocabulary size.
word_cnt = word_cnt.drop(index=del_idx)
len(word_cnt)

17003

In [None]:
# for word_idx in range(len(word_cnt)):
#     print(word_cnt.iloc[word_idx,0])
#     # if len(word['word'])==1:
#         # word_cnt=word_cnt.delete(word)
#         # print(word['word'])

## 정규표현식

In [49]:
import re

### 기호

In [86]:
# word_cnt['word'][250:300]

In [76]:
print(word_cnt['word'])

0         "'
1         "(
2         ")
3        "+_
4         ",
        ... 
18045     힘쓰
18046     힘쓴
18047     힘쓸
18048     힘입
18049     힘주
Name: word, Length: 17003, dtype: object


In [96]:
word_cnt.word.str.findall('!')

0        []
1        []
2        []
3        []
4        []
         ..
18045    []
18046    []
18047    []
18048    []
18049    []
Name: word, Length: 17003, dtype: object

In [106]:
# Index labels of every word that contains at least one special symbol
# ("특수기호" = special symbols).
# The pattern is a raw string: the original plain string contained escape
# sequences such as "\{" and "\/" that are not valid Python string escapes and
# only worked because Python leaves unknown escapes untouched (with a warning).
# The regular expression itself matches the same set of characters.
특수기호_idx = word_cnt[word_cnt.word.str.contains(r'[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>@\#$%&\\\=\(\'\"]')].index

### 3차 제거
* 17003 -> 16845

In [107]:
# Remove the rows whose word contains a special symbol.
word_cnt = word_cnt.drop(index=특수기호_idx)

In [108]:
len(word_cnt)

16845

In [116]:
# Show the words that occur fewer than two times.
word_cnt.loc[word_cnt['cnt'] < 2]

Unnamed: 0,word,cnt
110,000020,1
111,000063,1
112,001,1
114,001067,1
116,0025,1
...,...,...
18039,힐러,1
18040,힐링,1
18045,힘쓰,1
18046,힘쓴,1


In [119]:
# Save the cleaned vocabulary (word, cnt) to CSV without the row index.
word_cnt.to_csv('./word_list.csv', index=False)