# 한글 폰트 설치

In [None]:
# Install fonts
!apt-get update -qq # refresh the package index before installing
# Install the Nanum font packages (fonts-nanum*)
!apt-get install fonts-nanum* -qq

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Load the font
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
# Register the freshly installed font file with matplotlib *before* selecting
# it as the default family. `font_manager._rebuild()` is a private helper that
# newer matplotlib releases no longer provide, so use the public
# `FontManager.addfont` when it exists and only fall back to the old call on
# older installations.
if hasattr(fm.fontManager, 'addfont'):
    fm.fontManager.addfont(fontpath)
else:
    mpl.font_manager._rebuild()
plt.rc('font', family='NanumBarunGothic')

# Install

In [None]:
# Install the SentencePiece package (imported below as `spm`)
!pip install sentencepiece

# Env

In [None]:
import os
import random
import shutil
import json
import zipfile
import math
import copy
import collections
import re

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sentencepiece as spm
import tensorflow as tf
import tensorflow.keras.backend as K

from tqdm.notebook import tqdm

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Seed the Python, NumPy and TensorFlow random number generators with one value
random_seed = 1234
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(random_seed)

In [None]:
# Print the GPU status reported by the NVIDIA driver for this runtime
!nvidia-smi

# Gensim
# https://radimrehurek.com/gensim/index.html

In [None]:
import gensim
import gensim.downloader as api

# Tutorial

In [None]:
# Download an already-trained model through the gensim downloader.
# The commented-out alternative is the larger word2vec Google News model;
# the trailing notes are the sizes recorded by the author.
# wv = api.load('word2vec-google-news-300') # 1.6G
wv = api.load('glove-wiki-gigaword-100') # 128M

In [None]:
# Print the vocabulary size and the first 20 entries.
# gensim >= 4 exposes the vocabulary as `key_to_index` (the old `vocab`
# attribute was removed); gensim 3.x only has `vocab`. Support both.
vocab = wv.key_to_index if hasattr(wv, "key_to_index") else wv.vocab
print(f"len: {len(vocab)}")
for i, word in enumerate(vocab):
    if i >= 20:
        break
    print(f"{i:2d}: {word}")

In [None]:
# Words closest to 'obama' in the embedding space
wv.most_similar('obama')

In [None]:
# Words closest to 'banana'
wv.most_similar('banana')

In [None]:
# Words closest to 'apple'
wv.most_similar('apple')

In [None]:
# Vector arithmetic: king - man + woman
result = wv.most_similar(positive=['woman', 'king'], negative=['man'])
result

In [None]:
def analogy(p1, n1, p2, model=None):
    """Answer the analogy "n1 is to p1 as p2 is to ?" (i.e. p1 - n1 + p2).

    Args:
        p1: Word added to the query (e.g. 'japanese').
        n1: Word subtracted from the query (e.g. 'japan').
        p2: Word added to the query (e.g. 'australia').
        model: Object exposing ``most_similar(positive=..., negative=...)``.
            Defaults to the notebook-level ``wv`` so existing calls keep
            working; pass another set of vectors (for example
            ``word2vec_100.wv``) to query a different embedding.

    Returns:
        Whatever ``model.most_similar`` returns for the query.
    """
    if model is None:
        # Resolved at call time so the function can be defined before `wv`.
        model = wv
    return model.most_similar(positive=[p2, p1], negative=[n1])

In [None]:
# japanese - japan + australia
analogy('japanese', 'japan', 'australia')

In [None]:
# beer - australia + france
analogy('beer', 'australia', 'france')

In [None]:
# clinton - reagan + obama
analogy('clinton', 'reagan', 'obama')

In [None]:
# Ask the model which of the four words does not fit with the others
wv.doesnt_match("breakfast cereal dinner lunch".split())

In [None]:
def display_pca_scatterplot(model, words=None, n_sample=100):
    """Scatter-plot word vectors on their first two PCA components.

    Args:
        model: Keyed word vectors; must support ``model[word]`` and expose the
            vocabulary as ``key_to_index`` (gensim >= 4) or ``vocab``
            (gensim 3.x).
        words: Words to plot. When ``None`` or empty, ``n_sample`` words are
            drawn at random from the model's vocabulary.
        n_sample: Number of random words to draw when ``words`` is not given.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # `not words` raises for a multi-element numpy array, so test explicitly.
    if words is None or len(words) == 0:
        # gensim 4 removed `vocab`; fall back to it only on older versions.
        vocab = model.key_to_index if hasattr(model, "key_to_index") else model.vocab
        words = np.random.choice(list(vocab), n_sample)
    word_vectors = np.array([model[w] for w in words])

    # Font
    font_name = "NanumBarunGothic"

    # Plot font settings
    plt.rc('font', family=font_name)
    # Keep the minus sign from rendering as a broken glyph with the Korean font
    plt.rcParams["axes.unicode_minus"] = False

    # Keep only the first two principal components for the 2-D plot
    word_vectors = PCA().fit_transform(word_vectors)[:, :2]

    plt.figure(figsize=(12, 12))
    plt.scatter(word_vectors[:, 0], word_vectors[:, 1], edgecolors='k', c='r')
    for word, (x, y) in zip(words, word_vectors):
        plt.text(x, y, word)
    plt.show()

In [None]:
# Plot a hand-picked set of words, listed by theme. The groups are concatenated
# in the same order as the original flat list (including the repeated 'monkey').
drinks = ['coffee', 'tea', 'beer', 'wine', 'brandy', 'rum', 'champagne', 'water']
foods = ['spaghetti', 'borscht', 'hamburger', 'pizza', 'falafel', 'sushi', 'meatballs']
animals = [
    'dog', 'horse', 'cat', 'monkey', 'parrot', 'koala', 'lizard',
    'frog', 'toad', 'monkey', 'ape', 'kangaroo', 'wombat', 'wolf',
]
countries = ['france', 'germany', 'hungary', 'luxembourg', 'australia', 'fiji', 'china']
school_terms = [
    'homework', 'assignment', 'problem', 'exam', 'test', 'class',
    'school', 'college', 'university', 'institute',
]
display_pca_scatterplot(
    wv,
    words=drinks + foods + animals + countries + school_terms,
)

In [None]:
# No word list given, so the function plots a random sample of the vocabulary
display_pca_scatterplot(wv)

# 한국어 학습
# https://wikidocs.net/50739

In [None]:
# Install the morphological analyzer: KoNLPy via pip, then the MeCab install script
# NOTE(review): the script is fetched from the master branch and piped straight
# into bash; consider pinning it to a reviewed commit.
!set -x \
&& pip install konlpy \
&& curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh | bash -x

In [None]:
import konlpy
# Create the Okt tagger that the tokenisation cells below use
okt = konlpy.tag.Okt()

In [None]:
# Quick check: split an unspaced Korean sentence into morphemes
okt.morphs("아버지가방에들어가신다")

In [None]:
# Download the Naver movie review data (NSMC ratings.txt)
!wget https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt

In [None]:
# Load the tab-separated review file; quoting=3 is csv.QUOTE_NONE, so quote
# characters in the reviews are kept as ordinary text
nsmc_data = pd.read_csv("ratings.txt", header=0, delimiter='\t', quoting=3)
print(f"전체 데이터의 개수: {len(nsmc_data)}")
nsmc_data.head(10)

In [None]:
# Drop rows that contain null values
nsmc_data = nsmc_data.dropna()
print(f"null 제거 후 데이터의 개수: {len(nsmc_data)}")
nsmc_data.head(10)

In [None]:
# Remove every character that is not Hangul or a space.
# `regex=True` is spelled out because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the text untouched.
nsmc_data['document'] = nsmc_data['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
# dropna() returns a new frame, so the result has to be assigned to take effect.
# Note that documents reduced to an empty string are not NaN and are kept.
nsmc_data = nsmc_data.dropna()
print(f"한글 아닌 문자 제거 후 데이터의 개수: {len(nsmc_data)}")
nsmc_data.head(10)

In [None]:
# Stopword definitions (words that occur too frequently are excluded from training)
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']

In [None]:
# Split every review into morphemes with the Okt analyzer, dropping stopwords.
# A set gives constant-time membership tests inside the per-word filter.
stopword_set = set(stopwords)
tokens = []
# An explicit loop (rather than one big comprehension) keeps the partial
# `tokens` list available if this long-running cell is interrupted.
for document in tqdm(nsmc_data['document'], total=len(nsmc_data)):
    tokens.append([word for word in okt.morphs(document) if word not in stopword_set])

In [None]:
# Number of tokenised reviews, then the first ten token lists
print(len(tokens))
tokens[:10]

In [None]:
# Train Word2Vec with gensim.
# The embedding-size keyword is `size` in gensim 3.x and `vector_size` from
# gensim 4.0 on; choose the right one for the installed version so the cell
# runs on either.
_dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"
word2vec_100 = gensim.models.Word2Vec(sentences=tokens, window=5, min_count=5, **{_dim_kwarg: 100})

In [None]:
# List the learned vocabulary. gensim >= 4 exposes it as `key_to_index`
# (the old `vocab` attribute was removed); gensim 3.x only has `vocab`.
_kv = word2vec_100.wv
words = list(_kv.key_to_index if hasattr(_kv, "key_to_index") else _kv.vocab)
len(words), words[:100]

In [None]:
# Words most similar to "영화" ("movie") in the model trained above
similar = word2vec_100.wv.most_similar("영화")
similar

In [None]:
# Words most similar to the name in the literal below
similar = word2vec_100.wv.most_similar("최민수")
similar

In [None]:
# Words most similar to the name in the literal below
similar = word2vec_100.wv.most_similar("장동건")
similar

In [None]:
# Analogy query over the review embeddings: first positive-list name's
# counterpart, i.e. '설경구' - '송윤아' + '고소영' using the literals below
result = word2vec_100.wv.most_similar(positive=['고소영', '설경구'], negative=['송윤아'])
result