## 필요한 라이브러리 import

In [None]:
from pykospacing import Spacing
import pandas as pd
from konlpy.tag import Okt
from hanspell import spell_checker
import re

# PyKoSpacing model instance; it is called on each title in the
# "spell checking" section of this notebook.
spacing = Spacing()
# KoNLPy Okt tagger instance; its nouns() method is used by the
# tokenization step.
okt = Okt()

## data.csv 파일 읽기

In [2]:
# Load the raw records (the file is read as CP949) and keep only the two
# columns this notebook works with.
df = pd.read_csv('pre-data/data.csv', encoding='cp949',
                 low_memory=False)
# .copy() turns the column selection into an independent frame, so the
# in-place edits of 'title' made later in the notebook write to df itself
# and do not raise pandas' SettingWithCopyWarning.
df = df[['title', '개월']].copy()

In [None]:
df

## 불용어(stopwords) 제거

### 불용어 불러오기

In [None]:
# Stopword table, read as UTF-8. Later cells use its 'nursing' and
# 'korean' columns.
stopwords_df = pd.read_csv(
    'pre-data/stopwords.csv',
    encoding='utf-8',
    low_memory=False,
)
stopwords_df

### 불용어 제거 (nursing)

In [None]:
# nursing
# Remove every nursing stopword from each title. A stopword is matched only
# when it has a single space on both sides, and the match is replaced by
# one space.
len_df = len(df)
# Skip empty cells of the column: concatenating " " with NaN would raise
# TypeError.
nursing_stopwords = stopwords_df['nursing'].dropna()
for i in range(len_df):
    title = df.loc[i, 'title']
    # NOTE(review): each stopword is used as a regular expression as-is;
    # if the list can contain regex metacharacters, re.escape is probably
    # wanted here -- confirm.
    for stopword in nursing_stopwords:
        title = re.sub(" " + stopword + " ", " ", title)
    # One .loc write per row instead of the chained df['title'][i] = ...,
    # which pandas may apply to a temporary copy.
    df.loc[i, 'title'] = title
    print(f"{i} of {len_df}")

In [None]:
df

### 불용어(nursing) 제거한 df를 저장

In [None]:
# Checkpoint the frame to disk, then rebind df to the contents of that file.
# NOTE(review): to_csv is called without index=False and read_csv without
# index_col, so the reloaded df presumably gains an 'Unnamed: 0' column
# holding the old index -- confirm whether that is intended.
df.to_csv("post-data/nursing_stopwords_removed.csv", mode='w')
df = pd.read_csv('post-data/nursing_stopwords_removed.csv',low_memory=False)


## 동의어처리 (before spell check)

In [None]:
# Synonym table. The next cell reads its 'word', 'synonym' and
# '전문가 의미' columns.
syn_df = pd.read_csv('pre-data/synonym.csv')
syn_df

In [61]:
# Rows whose '전문가 의미' column equals "삭제" get an empty-string synonym.
# A single boolean-mask .loc assignment replaces the per-row chained
# assignment (syn_df['synonym'][idx] = ""), which pandas may apply to a
# temporary copy instead of syn_df.
syn_df.loc[syn_df['전문가 의미'] == "삭제", 'synonym'] = ""

# Drop the rows that have no value in the 'synonym' column
syn_df = syn_df[syn_df['synonym'].notna()]
syn_df = syn_df[['word', 'synonym']]

# Rebuild the index as 0..n-1 so that later cells can address rows with
# syn_df.loc[j] for j in range(len(syn_df)).
len_syn = len(syn_df['synonym'])
syn_index = list(range(len_syn))
syn_df = syn_df.set_index(pd.Index(syn_index))

In [None]:
syn_df

In [None]:
# Replace each space-delimited occurrence of a 'word' entry in a title by
# its 'synonym' entry (also space-delimited).
len_df = len(df)
# Pair the two columns once instead of doing two .loc lookups per
# (row, synonym) combination.
synonym_pairs = list(zip(syn_df['word'], syn_df['synonym']))
for i in range(len_df):
    title = df.loc[i, 'title']
    # NOTE(review): 'word' is used as a regular expression and 'synonym'
    # as a regex replacement template, exactly as before; escape them if
    # the table can contain special characters -- confirm.
    for word, synonym in synonym_pairs:
        title = re.sub(" " + word + " ", " " + synonym + " ", title)
    # One .loc write per row instead of the chained df['title'][i] = ...,
    # which pandas may apply to a temporary copy.
    df.loc[i, 'title'] = title
    print(f"{i} of {len_df}")

    

In [None]:
df

In [None]:
# Remove every occurrence of the literal text "삭제" from each title.
# Uses len_df as set by the previous substitution cell.
for i in range(len_df):
    print(f"{i} of {len_df}")
    # .loc write instead of the chained df['title'][i] = ..., which pandas
    # may apply to a temporary copy.
    df.loc[i, 'title'] = re.sub("삭제", "", df.loc[i, 'title'])


In [None]:
df

### 동의어처리한 df 저장

In [19]:
# Save the frame as it stands after the synonym pass that precedes the
# spacing step (overwrites any existing file).
df.to_csv("post-data/synyonym_checked_before_spell_chcking.csv", mode='w')

## spell checking (PyKoSpacing 띄어쓰기 교정)

In [None]:
# Run the PyKoSpacing model over every title and keep the result in a new
# 'spell' column; 'title' itself is left unchanged by this cell.
spell_list = []

for row_no in range(len(df)):
    spaced_title = spacing(df['title'][row_no])
    spell_list.append(spaced_title)
    print(row_no, spaced_title)

df['spell'] = spell_list

df.to_csv("post-data/spell_checked.csv", mode='w')


## remove stopwords(korean) after spell checking

In [None]:
# Stopword table without the 'nursing' column and without rows that
# contain NaN.
# dropna() keeps the original row labels, so any removed row leaves a gap
# in the index; reset it to 0..n-1 because the removal loop addresses rows
# with kr_stopwords_df.loc[j] for j in range(len(...)).
kr_stopwords_df = (
    stopwords_df.drop(['nursing'], axis=1)
    .dropna()
    .reset_index(drop=True)
)
kr_stopwords_df

In [None]:
# korean
# Remove every Korean stopword from each title. A stopword is matched only
# when it has a single space on both sides, and the match is replaced by
# one space.
# NOTE(review): this edits 'title', while the tokenization cell reads the
# 'spell' column that was created from 'title' before this point, so these
# removals do not reach the tokens -- confirm which column is intended.
len_df = len(df)
# Iterate the column values directly: looking rows up with .loc[j] for
# j in range(len(...)) raises KeyError when dropna() has left gaps in the
# index of kr_stopwords_df.
korean_stopwords = list(kr_stopwords_df['korean'])
for i in range(len_df):
    title = df.loc[i, 'title']
    for stopword in korean_stopwords:
        title = re.sub(" " + stopword + " ", " ", title)
    # One .loc write per row instead of the chained df['title'][i] = ...,
    # which pandas may apply to a temporary copy.
    df.loc[i, 'title'] = title
    print(f"{i} of {len_df}")

In [31]:
# Save the frame after the Korean stopword pass (overwrites any existing
# file).
df.to_csv("post-data/korean_stopwords_removed.csv", mode='w')

## 동의어처리 (after spell check)

In [None]:
# Second synonym pass: unlike the earlier pass, the pattern is the bare
# 'word' entry (no surrounding spaces), so it also matches inside longer
# strings.
# NOTE(review): this edits 'title', while the tokenization cell reads the
# 'spell' column that was created from 'title' before this point, so these
# substitutions do not reach the tokens -- confirm which column is intended.
len_df = len(df)
# Pair the two columns once instead of doing two .loc lookups per
# (row, synonym) combination.
synonym_pairs = list(zip(syn_df['word'], syn_df['synonym']))
for i in range(len_df):
    title = df.loc[i, 'title']
    for word, synonym in synonym_pairs:
        title = re.sub(word, synonym, title)
    # One .loc write per row instead of the chained df['title'][i] = ...,
    # which pandas may apply to a temporary copy.
    df.loc[i, 'title'] = title
    print(f"{i} of {len_df}")

In [None]:
# Remove every occurrence of the literal text "삭제" from each title.
# Uses len_df as set by the previous substitution cell.
for i in range(len_df):
    print(f"{i} of {len_df}")
    # .loc write instead of the chained df['title'][i] = ..., which pandas
    # may apply to a temporary copy.
    df.loc[i, 'title'] = re.sub("삭제", "", df.loc[i, 'title'])

### 동의어처리한 df 저장

In [34]:
# Save the frame after the second synonym pass (overwrites any existing
# file).
df.to_csv("post-data/synyonym_checked_after_spell_chcking.csv", mode='w')

## tokenization

In [None]:
df

In [None]:
def title_tokenization(df, idx):
    """Return the nouns of row *idx*, joined by single spaces.

    The text is taken from the 'spell' column of *df* and passed to the
    module-level Okt tagger's nouns() method.

    NOTE(review): because 'spell' is read here, edits made to 'title'
    after 'spell' was created do not influence the result -- confirm that
    this is the intended column.
    """
    nouns = okt.nouns(df['spell'][idx])
    return " ".join(nouns)


# Tokenize every row, echoing each result, then store the strings in a new
# 'token' column and save the frame.
token_list = []
for row_no in range(len(df)):
    joined_nouns = title_tokenization(df, row_no)
    token_list.append(joined_nouns)
    print(row_no, joined_nouns)

df['token'] = token_list

df.to_csv("post-data/tokeniztioned.csv", mode='w')

In [None]:
df

## making matrix

In [None]:
# Collect the row labels (index_list) and the vocabulary (token_list) that
# become the index and the columns of the document-term matrix.
index_list = list(range(len(df)))
token_list = []

for i in index_list:
    try:
        token_list += df['token'][i].split()
        print(f"{i} of {len(df['token'])}")
    except AttributeError:  # the row has no token string to split
        print(f"ERROR!\n{i} : {df['token'][i]}")

print(f"중복 제거 전 : {len(token_list)}개")
# dict.fromkeys removes duplicates while keeping first-seen order.
# list(set(...)) produced an arbitrary order that can differ between
# interpreter runs, which made the matrix column order irreproducible.
token_list = list(dict.fromkeys(token_list))
print(f"중복 제거 후 : {len(token_list)}개")


In [39]:
print("making matrix_df!")
# Create the document-term matrix already filled with integer zeros.
# Building an empty (all-NaN, object-dtype) frame and calling fillna(0)
# afterwards relies on pandas silently downcasting object columns, which
# newer pandas versions deprecate.
matrix_df = pd.DataFrame(0, index=index_list, columns=token_list)

# Message kept so the cell prints the same two lines as before; the zeros
# are already in place at this point.
print("nan to 0")

making matrix_df!
nan to 0


In [None]:
matrix_df

In [None]:
from collections import Counter

# Fill the document-term matrix: cell (row, token) becomes the number of
# times the token occurs in that row's space-separated 'token' string.
# Each row is split and counted once, and written with a single row
# assignment. The previous version re-split the row for every column,
# wrote each cell through chained indexing (which pandas may apply to a
# temporary copy) and printed one debug line per cell.
vocabulary = list(matrix_df.columns)
n_rows = len(matrix_df.index)
for idx in matrix_df.index:
    print(f"{idx} of {n_rows}")
    tokens = df['token'][idx]
    if not isinstance(tokens, str):
        # The row has no token string; its counts are left untouched.
        continue
    counts = Counter(tokens.split())
    # Counter returns 0 for tokens that do not occur in this row.
    matrix_df.loc[idx] = [counts[token] for token in vocabulary]

print("saving matrix_df!")
matrix_df.to_csv("post-data/matrix.csv", mode='w')

In [None]:
# Reload the matrix from the file the previous cell writes
# ('post-data/matrix.csv'); this cell used to read 'data/matrix.csv',
# a path nothing in this notebook writes to.
matrix_df = pd.read_csv('post-data/matrix.csv', low_memory=False)
# The index saved by to_csv comes back as an 'Unnamed: 0' column; drop it.
matrix_df.drop(['Unnamed: 0'], axis=1, inplace=True)
matrix_df

In [None]:
# Swap rows and columns of the matrix.
matrix_df = matrix_df.T
matrix_df

In [None]:
# Reduce the matrix to a single 'freq' column holding each row's sum.
row_totals = matrix_df.sum(axis=1)
matrix_df = row_totals.to_frame(name='freq')
matrix_df

In [None]:
# Order the rows by descending frequency.
freq_df = matrix_df.sort_values('freq', ascending=False)
freq_df

print("saving freq_df!")
# Save the sorted frame. The previous code wrote matrix_df here, so the
# file on disk was unsorted despite the message above.
# NOTE(review): 'euc-kr' cannot encode every character that the CP949
# input file can contain; consider 'cp949' if this write ever raises
# UnicodeEncodeError -- confirm with the consumers of freq.csv.
freq_df.to_csv("post-data/freq.csv", mode='w', encoding='euc-kr')

In [None]:
freq_df