# **NLP(Natural Language Processing, 자연어 처리)**

# **Settings**

In [50]:
import os
# os.chdir('/content/drive/MyDrive/Colab Notebooks/메타버스 아카데미/Data/')
import warnings
warnings.filterwarnings('ignore')                       # warning 출력 false

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import koreanize_matplotlib

# **1. Bag Of Words(BOW)**

## ***! Note Bag of Words***
* Bag of Words(BOW): 출현 빈도를 고려.
* DTM : 여러 문장에 대한 Bow를 Matrix로 나타낸 것
* 빈도가 많이 쓰일수록 숫자가 커지기 때문에 중요한 단어라고 인식해버릴 수도 있다. → TF-IDF 등장

## **방법 1. 직접 구현하기**

In [2]:
! pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.4.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (465 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.3/465.3 kB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.4.1 konlpy-0.6.0


In [4]:
from konlpy.tag import Okt
from keras.preprocessing.text import Tokenizer

okt = Okt()
tokenizer = Tokenizer()

# 형태소 분석
text = '나는 오늘 파이썬을 공부하고 있습니다. 파이썬은 정말 재미있는 공부입니다.'
doc = text.replace('.',' ')
tokenized_doc = okt.morphs(doc)
print(tokenized_doc)

# tokenizer
word_to_index = {}
bow = []

for word in tokenized_doc:
    if word not in word_to_index.keys():
        word_to_index[word] = len(word_to_index)
        bow.insert(len(word_to_index)-1, 1)
    else:
        index = word_to_index.get(word)
        bow[index] += 1

print(word_to_index)
print(bow)

['나', '는', '오늘', '파이썬', '을', '공부', '하고', '있습니다', '파이썬', '은', '정말', '재미있는', '공부', '입니다']
{'나': 0, '는': 1, '오늘': 2, '파이썬': 3, '을': 4, '공부': 5, '하고': 6, '있습니다': 7, '은': 8, '정말': 9, '재미있는': 10, '입니다': 11}
[1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1]


## **방법 2. Sklearn**

In [9]:
# DTM 만들기
from sklearn.feature_extraction.text import CountVectorizer

text = [
    ['누구나','한번쯤','사랑','웃고'],
    ['누구나','한번쯤','사랑','울고'],
    ['그것은','바로','사랑','사랑','사랑']
]

count_vec = CountVectorizer(tokenizer=lambda x: x, lowercase=False)
result = count_vec.fit_transform(text)
print(count_vec.vocabulary_)
print('DTM')
print(result.toarray())

{'누구나': 1, '한번쯤': 6, '사랑': 3, '웃고': 5, '울고': 4, '그것은': 0, '바로': 2}
DTM
[[0 1 0 1 0 1 1]
 [0 1 0 1 1 0 1]
 [1 0 1 3 0 0 0]]


# **2. TF-IDF**

* 문장마다 나오는 단어같은 경우에는 의미 없는 단어일 가능성이 높다. 이 경우 가중치를 적게 주도록 하는 원리

## **방법 1. 직접 구현하기**

In [41]:
from math import log
okt = Okt()

text = '''
나는 영어를 좋아합니다.
영어, 수학을 매일 공부합니다.
과학 공부를 좋아하고 수학도 좋아합니다.
영어,수학,과학을 좋아 하고 영어는 매일 공부합니다.
'''

words = okt.morphs(text.replace('\n',' '))
words = [x for x in words if len(x) > 1]
print(words)

result = list(set(words))
print(result)

docs = text.split('.')
doc_list = []
for doc in docs:
    if doc.strip() != '':
        doc_list.append(doc.strip())
print(doc_list)

['영어', '좋아합니다', '영어', '수학', '매일', '공부', '합니다', '과학', '공부', '좋아하고', '수학', '좋아합니다', '영어', '수학', '과학', '좋아', '하고', '영어', '매일', '공부', '합니다']
['합니다', '공부', '영어', '매일', '좋아하고', '좋아합니다', '수학', '좋아', '하고', '과학']
['나는 영어를 좋아합니다', '영어, 수학을 매일 공부합니다', '과학 공부를 좋아하고 수학도 좋아합니다', '영어,수학,과학을 좋아 하고 영어는 매일 공부합니다']


In [42]:
N = len(doc_list)

def tf(t, d):
    return d.count(t)

def idf(t):
    df = 0
    for doc in doc_list:
        df += t in doc
    return log(N / (df+1))

def tfidf(t,d):
    return tf(t,d) * idf(t)

In [43]:
word_list = result
t_result = []

for temp in range(N):
    t_result.append([])
    doc = doc_list[temp]

    for temp1 in range(len(word_list)):
        t = word_list[temp1]
        t_result[-1].append(tf(t,doc))

print(t_result)

[[1, 0, 1, 0, 0, 1, 0, 1, 0, 0], [1, 1, 1, 1, 0, 0, 1, 0, 0, 0], [1, 1, 0, 0, 1, 1, 1, 2, 1, 1], [1, 1, 2, 1, 0, 0, 1, 1, 1, 1]]


In [44]:
df = pd.DataFrame(t_result, columns=word_list)
df

Unnamed: 0,합니다,공부,영어,매일,좋아하고,좋아합니다,수학,좋아,하고,과학
0,1,0,1,0,0,1,0,1,0,0
1,1,1,1,1,0,0,1,0,0,0
2,1,1,0,0,1,1,1,2,1,1
3,1,1,2,1,0,0,1,1,1,1


In [45]:
tfidf_result = []
for temp in range(len(word_list)):
    t = word_list[temp]
    tfidf_result.append(idf(t))

print(tfidf_result)

[-0.2231435513142097, 0.0, 0.0, 0.28768207245178085, 0.6931471805599453, 0.28768207245178085, 0.0, 0.0, 0.28768207245178085, 0.28768207245178085]


In [47]:
df_idf = pd.DataFrame(tfidf_result, index=word_list, columns=['IDF'])
df_idf

Unnamed: 0,IDF
합니다,-0.223144
공부,0.0
영어,0.0
매일,0.287682
좋아하고,0.693147
좋아합니다,0.287682
수학,0.0
좋아,0.0
하고,0.287682
과학,0.287682


In [49]:
result = []

for temp in range(N):
    result.append([])
    doc = doc_list[temp]
    for temp in range(len(word_list)):
        t = word_list[temp]
        result[-1].append(tfidf(t,doc))

tfidf_df = pd.DataFrame(result, columns=word_list)
tfidf_df

Unnamed: 0,합니다,공부,영어,매일,좋아하고,좋아합니다,수학,좋아,하고,과학
0,-0.223144,0.0,0.0,0.0,0.0,0.287682,0.0,0.0,0.0,0.0
1,-0.223144,0.0,0.0,0.287682,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.223144,0.0,0.0,0.0,0.693147,0.287682,0.0,0.0,0.287682,0.287682
3,-0.223144,0.0,0.0,0.287682,0.0,0.0,0.0,0.0,0.287682,0.287682
