# Industry 4.0 의 중심, BigData

<div align='right'><font size=2 color='gray'>Data Processing Based Python @ <font color='blue'><a href='https://www.facebook.com/jskim.kr'>FB / jskim.kr</a></font>, <a href='mailto:bigpycraft@gmail.com'>김진수</a></font></div>
<hr>

## <font color='brown'>워드클라우드 : 이미지마스킹 - 한글 버전 </font>

#### Kkma 형태소분석기
> from konlpy.tag import Kkma
> - kkma.pos(phrase, flatten=True) : POS tagger
> - kkma.sentences(phrase) : Sentence detection. 문장 분리
> - kkma.nouns(phrase) : Noun extractor. 명사 추출기
> - kkma.morphs(phrase) : Parse phrase to morphemes. 구문을 형태소 단위로 분석
> - kkma.tagset : dict <br> {'EC': '연결 어미', 'ECD': '의존적 연결 어미', 'ECE': '대등 연결 어미', 'ECS': '보조적 연결 어미', 'EF': '종결 어미', 'EFA':  <...> 체언 접두사', 'XPV': '용언 접두사', 'XR': '어근', 'XSA': '형용사 파생 접미사', 'XSN': '명사파생 접미사', 'XSV': '동사 파생 접미사'}

In [None]:
import nltk
from konlpy.tag import Kkma

In [None]:
# Create the Kkma morphological analyzer instance used by the cells below.
kkma = Kkma()

In [None]:
# Sample Korean text fed to the Kkma calls in the following cells.
text = """
유시민 항소이유서 중 네크라소프의 인용구입니다. 
슬픔도 노여움도 없이 살아가는 자는 조국을 사랑하고 있지 않다
"""

In [None]:
# Split the sample text into sentences.
kkma.sentences(text)

In [None]:
# Extract the nouns from the sample text.
kkma.nouns(text)

In [None]:
# Break the sample text into morphemes.
kkma.morphs(text)

In [None]:
# Run part-of-speech tagging on the sample text.
kkma.pos(text)

In [None]:
# POS-tag a short inline sentence.
kkma.pos("하이, 꼬꼬마 pos로 형태소 분석을 해보자.")

In [None]:
# Second sample text, reused by the noun / sentence / POS cells that follow.
text = '안녕하세요? 자연어처리 입문 과정입니다. 문장에서 명사를 빼어 보십시오. 명사만~'

In [None]:
# Extract the nouns from the second sample text.
kkma.nouns(text)

In [None]:
# Split the second sample text into sentences.
kkma.sentences(text)

In [None]:
# POS-tag the second sample text.
kkma.pos(text)

### <font color='brown'> 대한민국헌법 형태소분석 </font>
> from konlpy.tag import Kkma

In [None]:
# Read the whole constitution text file into `text`.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used, whereas the KakaoTalk file later in this notebook is opened with
# encoding='UTF-8' — confirm this file's encoding and pass it explicitly.
with open('wc_docu/대한민국헌법.txt') as fp:
    text = fp.read()

# Number of characters read (shown as the cell output).
len(text)

In [None]:
# Display the raw text that was just loaded.
text

In [None]:
# POS-tag the whole constitution text and display the result.
txt_pos = kkma.pos(text)
txt_pos

In [None]:
# Extract the nouns of the constitution text and display them.
txt_nouns = kkma.nouns(text)
txt_nouns

In [None]:
# Break the constitution text into morphemes and display them.
txt_morphs = kkma.morphs(text)
txt_morphs

In [None]:
# Split the constitution text into sentences and display them.
txt_sentences = kkma.sentences(text)
txt_sentences

In [None]:
# Report how many items each Kkma analysis of the constitution text produced.
counts = (
    len(txt_pos),        # POS-tagged tokens
    len(txt_morphs),     # morphemes
    len(txt_nouns),      # nouns
    len(txt_sentences),  # sentences
)
summary_template = """
대한민국헌법 형태소분석 결과입니다. 
    - POS태그  \t: {}개 
    - 형태소   \t: {}개 
    - 명사갯수 \t: {}개
    - 문장갯수 \t: {}개
 """
result = summary_template.format(*counts)
print(result)

### <font color='brown'>KoNLPy.tag : Twitter or Okt</font>

#### Twitter 형태소분석기
> from konlpy.tag import Twitter
- twitter.jki(*args, **kwargs)
- twitter.pos(phrase, norm=False, stem=False) : POS tagger
- twitter.phrases(phrase) : Phrase extractor. 구문 추출기
- twitter.nouns(phrase) : Noun extractor. 명사 추출기
- twitter.morphs(phrase, norm=False, stem=False) : Parse phrase to morphemes. 구문을 형태소 단위로 분석
- twitter.tagset : dict <br> {'Adjective': '형용사', 'Adverb': '부사', 'Alpha': '알파벳', 'Conjunction': '접속사', 'Determiner': '관형사', ' <...>  'Punctuation': '구두점', 'ScreenName': '트위터 아이디', 'Suffix': '접미사', 'Unknown': '미등록어', 'Verb': '동사'}

In [None]:
# 대한민국 헌법 내장 데이터
from konlpy.corpus import kolaw

In [None]:
from konlpy.tag import Okt

In [None]:
# Create the Okt analyzer instance used for noun extraction below.
okt = Okt()

In [None]:
# Load the constitution text from KoNLPy's kolaw corpus and preview the
# first 1000 characters.
constitution = kolaw.open('constitution.txt').read()
constitution[:1000]

In [None]:
# Extract only the nouns with the Okt analyzer, then show how many were found.
constitution_nouns = okt.nouns(constitution)
len(constitution_nouns)

In [None]:
# Preview the first ten extracted nouns.
constitution_nouns[:10]

In [None]:
# Bucket the extracted nouns by character length, keeping their original order.
# `tow_words` holds the two-character nouns; the name is kept as-is because
# the next cell reads it.
one_words = [noun for noun in constitution_nouns if len(noun) == 1]
tow_words = [noun for noun in constitution_nouns if len(noun) == 2]
etc_words = [noun for noun in constitution_nouns if len(noun) not in (1, 2)]

# Size of each bucket (shown as the cell output).
len(one_words), len(tow_words), len(etc_words)

In [None]:
# Collapse each bucket to its distinct words (going through a set drops the order).
one_words, tow_words, etc_words = [
    list(set(bucket)) for bucket in (one_words, tow_words, etc_words)
]

# Number of distinct words per bucket (shown as the cell output).
len(one_words), len(tow_words), len(etc_words)

In [None]:
# Print every distinct one-character noun on a single line, tab-separated.
for word in one_words:
    print(word, end='\t')

In [None]:
# Treat every one-character noun as a stop word.
# (Alternative: a hand-picked list such as
#  ['제', '월', '일', '조', '수', '때', '그', '이', '바', '및', '안'].)
stop_words = list(one_words)
len(stop_words)

In [None]:
# stop_words.index('법')
# stop_words.remove('법')
# len(stop_words)

In [None]:
# Remove the stop words from the noun list, keeping the remaining nouns in order.
# A set gives constant-time membership tests; the original scanned the whole
# stop-word list once per noun.
stop_word_set = set(stop_words)
constitution_nouns = [word for word in constitution_nouns if word not in stop_word_set]

# Number of nouns left after filtering (shown as the cell output).
len(constitution_nouns)

In [None]:
# Preview the first ten nouns that survived the stop-word filter.
constitution_nouns[:10]

In [None]:
# Wrap the filtered nouns in an nltk.Text named '헌법'.
# NOTE: this rebinds `constitution`, which previously held the raw text string
# read from the kolaw corpus.
constitution  = nltk.Text(constitution_nouns, name='헌법')
type(constitution), len(constitution)

In [None]:
# Show the 30 most frequent nouns with their counts.
constitution.vocab().most_common(30)

### <font color='brown'>워드클라우드 한글폰트 설정</font>

In [None]:
from matplotlib import font_manager, rc

# Look up the family name of the malgun.ttf font file and make it matplotlib's
# default font.
malgun_props = font_manager.FontProperties(fname="C:/Windows/Fonts/malgun.ttf")
font_name = malgun_props.get_name()
rc('font', family=font_name)

In [None]:
# Take the 500 most frequent nouns (extracted with KoNLPy, counted via NLTK).
data = constitution.vocab().most_common(500)
len(data)

In [None]:
# Preview the first ten (word, count) entries.
data[:10]

In [None]:
# Convert the (word, count) pairs into a dict, the form passed to
# generate_from_frequencies() below.
dict_data = dict(data)
len(dict_data)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
# Build a word cloud from the noun frequencies, using the malgun.ttf font
# on a white background.
wordcloud = WordCloud(
    font_path='C:/Windows/Fonts/malgun.ttf',
    background_color='white',
    relative_scaling=0.2,
)
wordcloud = wordcloud.generate_from_frequencies(dict_data)

In [None]:
# Draw the word cloud on a 20x12 figure with the axes hidden.
plt.figure(figsize=(20,12))
plt.imshow(wordcloud)
plt.axis('off')

In [None]:
# Word-cloud settings: 800x600 canvas, font sizes from 8 to 128, at most 100 words.
params = dict(
    width=800,
    height=600,
    min_font_size=8,
    max_font_size=128,
    max_words=100,
    relative_scaling=0.2,
    # Alternative font: 'C:/Windows/Fonts/malgun.ttf'
    font_path='C:/Windows/Fonts/HMFMMUEX.TTC',
    background_color='white',
)

# Build the cloud from the precomputed frequencies
# (.generate(text) would build it from raw text instead).
wordcolud = WordCloud(**params).generate_from_frequencies(dict_data)

# Draw the cloud without axes and save the figure to disk.
save_img = 'wordcloud/kor_constitution_1.png'
plt.figure(figsize=(24, 16))
plt.imshow(wordcolud)
plt.axis('off')
plt.savefig(save_img)

### <font color='brown'>사진+워드클라우드</font>

In [None]:
import numpy as np
from PIL import Image
from wordcloud import ImageColorGenerator

In [None]:
# Read the mask image into a NumPy array and build a colour generator from it.
shape_img = np.array(Image.open('images/mask_ahn_palm.jpg'))
cloud_img = ImageColorGenerator(shape_img)

In [None]:
# Preview the mask image on a 12x12 figure with the axes hidden.
plt.figure(figsize=(12,12))
plt.imshow(shape_img)
plt.axis('off')

In [None]:
# Build a word cloud from the noun frequencies with the image array passed as
# the mask, a white background, and font sizes limited to the range 1..40.
wordcloud = WordCloud(
    font_path='C:/Windows/Fonts/malgun.ttf',
    relative_scaling=0.2,
    mask=shape_img,
    background_color='white',
    min_font_size=1,
    max_font_size=40,
)
wordcloud = wordcloud.generate_from_frequencies(dict_data)

In [None]:
# Recolour the cloud with the colour generator built from the mask image,
# draw it without axes, and save the figure.
plt.figure(figsize=(12, 12))
recolored = wordcloud.recolor(color_func=cloud_img)
plt.imshow(recolored)
plt.axis('off')
save_img = 'wordcloud/kor_constitution_2.png'
plt.savefig(save_img)

In [None]:
# Reload the mask image and display the raw NumPy array.
shape_img = np.array(Image.open('images/mask_ahn_palm.jpg'))
shape_img

In [None]:
# Masked word cloud driven by a params dict: the mask image array is passed to
# WordCloud, and the result is recoloured with the colour generator built from
# that same image.
shape_img = np.array(Image.open('images/mask_ahn_palm.jpg'))
cloud_img = ImageColorGenerator(shape_img)

params = {
    # NOTE(review): per the wordcloud docs, width/height are ignored when a
    # mask is supplied (the mask's shape is used) — confirm.
    "width"            : 300,
    "height"           : 400, 
    "max_words"        : 100,
    "relative_scaling" : 0.2,
    "background_color" : 'white',
    "font_path"        : 'C:/Windows/Fonts/malgun.ttf',
    "mask"             :  shape_img, 
}

# Build the cloud from the precomputed frequencies
# (.generate(text) would build it from raw text instead).
wordcolud = WordCloud(**params)
wordcolud = wordcolud.generate_from_frequencies(dict_data) 

save_img = 'wordcloud/kor_constitution_3.png'
plt.figure(figsize=(20,20))
# Fix: display and save the cloud built in THIS cell. The original passed
# `wordcloud` here, i.e. the object created in an earlier cell, so `params`
# had no effect on the saved image.
plt.imshow(wordcolud.recolor(color_func=cloud_img))
plt.axis('off')
plt.savefig(save_img)

<hr>

### <font color='brown'>워드클라우드 실습</font>
> 
- KoNLPy를 활용해서 워드클라우드 직접 작성해보기~^^

In [None]:
import nltk
from konlpy.tag import Kkma

In [None]:
# Create a Kkma analyzer instance for the exercise section.
kkma = Kkma()

In [None]:
# Read the chat text file as UTF-8 (presumably a KakaoTalk log, judging by the
# file name); the `with` block closes the file once it has been read.
with open('./wc_docu/pyhthon_kakao_the_love.txt', encoding = 'UTF-8') as fp:
    text = fp.read()

In [None]:
# POS-tag the chat text.
kkma.pos(text)

In [None]:
# Extract the nouns from the chat text.
kkma.nouns(text)

In [None]:
# Extract the nouns from the chat text and drop the tokens listed as stop words.
stop_words = ['오후','오전','?','[', ']','(이모티콘)','님']
kakao_nouns = [noun for noun in kkma.nouns(text) if noun not in stop_words]
kakao_nouns

In [None]:
# from wordcloud import WordCloud
import wordcloud
import matplotlib.pyplot as plt

In [None]:
# Count the chat nouns and keep the 400 most frequent as a word -> count dict.
katalk = nltk.Text(kakao_nouns)
freq_dist = katalk.vocab()
data = freq_dist.most_common(400)
dict_data = {word: count for word, count in data}

In [None]:
# Build a 1600x900 word cloud from the chat noun frequencies, draw it without
# axes, and save the figure as a JPEG.
wc_gen = wordcloud.WordCloud(
    font_path='C:/Windows/Fonts/malgun.ttf',
    relative_scaling=0.2,
    width=1600,
    height=900,
)
wc_gen = wc_gen.generate_from_frequencies(dict_data)

plt.figure(figsize=(12, 12))
plt.imshow(wc_gen)
plt.axis('off')
plt.savefig('wordcloud/katalk_love.jpg')

In [None]:
# Same chat word cloud on a 2000x2000 canvas with relative_scaling raised to 0.5,
# saved under a second file name.
wc_gen = wordcloud.WordCloud(
    font_path='C:/Windows/Fonts/malgun.ttf',
    relative_scaling=0.5,
    width=2000,
    height=2000,
)
wc_gen = wc_gen.generate_from_frequencies(dict_data)

plt.figure(figsize=(20, 20))
plt.imshow(wc_gen)
plt.axis('off')
plt.savefig('wordcloud/katalk_love_2.jpg')

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np

from PIL import Image
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator

In [None]:
# Load the heart image as a NumPy array and build a colour generator from it;
# the cells below pass the array as the word-cloud mask.
mask_heart  = np.array(Image.open('images/heart.jpg'))
image_color = ImageColorGenerator(mask_heart)
# Preview the image on a 12x12 figure with the axes hidden.
plt.figure(figsize=(12,12))
plt.imshow(mask_heart)
plt.axis('off')

In [None]:
# Build a word cloud from the chat noun frequencies with the heart image array
# as the mask, a white background, and font sizes limited to the range 1..15.
# NOTE: this rebinds the name `wordcloud`, which an earlier cell imported as a
# module (`import wordcloud`); `wordcloud.WordCloud` is unavailable afterwards.
wordcloud = WordCloud(
    font_path='C:/Windows/Fonts/malgun.ttf',
    relative_scaling=0.2,
    mask=mask_heart,
    background_color='white',
    min_font_size=1,
    max_font_size=15,
    width=1600,
    height=900,
)
wordcloud = wordcloud.generate_from_frequencies(dict_data)

In [None]:
# Recolour the cloud with the colour generator built from the heart image,
# draw it without axes, and save the figure.
plt.figure(figsize=(10, 10))
recolored = wordcloud.recolor(color_func=image_color)
plt.imshow(recolored)
plt.axis('off')
plt.savefig('wordcloud/katalk_heart_1.png')


In [None]:
# Heart-masked word cloud, second variant: pink background, up to 200 words,
# font sizes from 6 to 24.
params = dict(
    font_path='C:/Windows/Fonts/malgun.ttf',
    relative_scaling=0.2,
    mask=mask_heart,
    background_color='pink',
    max_words=200,
    min_font_size=6,
    max_font_size=24,
    width=1200,
    height=900,
)

wordcloud = WordCloud(**params)
wordcloud = wordcloud.generate_from_frequencies(dict_data)

# Recolour with the heart-image colour generator, draw without axes, and save.
plt.figure(figsize=(18, 12))
recolored = wordcloud.recolor(color_func=image_color)
plt.imshow(recolored)
plt.axis('off')
plt.savefig('wordcloud/katalk_heart_2.png')

<hr>
<marquee><font size=3 color='brown'>BigpyCraft finds the information to design a valuable society with Technology & Craft.</font></marquee>
<div align='right'><font size=2 color='gray'> &lt; The End &gt; </font></div>