# Word Embedding

### Embedding Vector 시각화 wevi 
https://ronxin.github.io/wevi/

### Word2vac
- 2013년 구글에서 개발한 Word Embedding 방법
- 최초의 neural embedding model
- 매우 큰 corpus에서 자동 학습
    - 비지도 지도 학습 (자기 지도학습)이라 할 수 있음
    - 많은 데이터를 기반으로 label 값 유추하고 이를 지도학습에 사요
- ex)
    - **이사금**께 충성을 맹세하였다.
    - **왕**께 충성을 맹세하였다.

**WordVec 훈련방식에 따른 구분**
1. CBOW : 주변 단어로 중심 단어를 예측  (LLM의 훈련방식)
2. skip-gram : 중심 단어로 주변 단어를 예측

In [1]:
# !pip install gensim

##### 영어 Word Embedding

- 데이터 취득 및 전처리

In [35]:
import gdown

# url = 'https://drive.google.com/uc?id=1DCgLPJsfyLGZ99lB-aF8EvpKIWSZYgp4'
# output = './data/ted_en.xml'

# gdown.download(url, output)

In [3]:
# lxml : xml 파일을 다루기 위한 라이브러리, etree : xml 파일을 파싱하기 위한 라이브러리
from lxml import etree
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

In [4]:
# xml 데이터 처리
f = open('./data/ted_en.xml', 'r', encoding='UTF8')
xml = etree.parse(f)

# content 태그 하위 텍스트 추출
contents = xml.xpath('//content/text()')
# contents[:5]

# corpus : 
corpus = '\n'.join(contents)
print(len(corpus))

# 정규식을 이용해 () 안에 있는 내용 제거, ()도 포함
corpus = re.sub(r'\([^)]*\)','',corpus) # 괄호로 묶인 내용 제거
print(len(corpus))

24222849
24062319


In [5]:
# 데이터 전처리 (토큰화/대소문자 정규화/불용어 처리)

sentences = sent_tokenize(corpus)

preprocessed_sentences = []
en_stopwords = stopwords.words('english')

for sentence in sentences:
    sentence = sentence.lower()
    sentence = re.sub('[^a-z0-9]', ' ', sentence) # [^] : not 의미, a-z0-9를 제외한 문자는 공백으로 처리
    tokens = word_tokenize(sentence)
    tokens = [token for token in tokens if token not in en_stopwords]
    preprocessed_sentences.append(tokens)

preprocessed_sentences[:5]

[['two', 'reasons', 'companies', 'fail', 'new'],
 ['real',
  'real',
  'solution',
  'quality',
  'growth',
  'figuring',
  'balance',
  'two',
  'activities',
  'exploration',
  'exploitation'],
 ['necessary', 'much', 'good', 'thing'],
 ['consider', 'facit'],
 ['actually', 'old', 'enough', 'remember']]

- Embedding 모델 학습

In [6]:
from gensim.models import Word2Vec

model = Word2Vec(
    sentences=preprocessed_sentences, # 전처리 완료한 corpus 전달
    vector_size=100, # 임베딩 벡터의 차원 (열의 갯수)
    sg=0, # 학습 알고리즘 선택 -> 0 : CBOW, 1 : Skip-gram
    window=5, # 주변단어로 사용될 단어의 갯수 -> 앞뒤로 5개 사용
    min_count=5 # 최소 단어 빈도수 -> 5 미만시 제거
)

model.wv.vectors.shape

(21462, 100)

In [7]:
import pandas as pd

# 단어와 벡터값을 데이터프레임으로 변환
pd.DataFrame(model.wv.vectors, index=model.wv.index_to_key).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
one,-0.170976,-0.352684,-1.109782,0.719097,-0.056872,-0.834759,0.155871,0.86072,-2.413358,-0.679618,...,1.186917,0.927451,-0.60366,0.320638,1.040395,-1.827408,-0.679039,-0.16728,0.545405,1.1035
people,-1.743455,0.821341,-0.414869,0.994208,-0.46447,-1.744901,-0.087329,1.340298,-0.988168,-2.138733,...,0.793145,0.775812,-1.617177,-0.238939,-0.700463,0.278039,-0.328139,-1.022362,-1.243265,0.988217
like,-0.35188,0.155944,-1.06229,-1.590005,0.033918,-0.171667,0.705476,0.608569,-1.799561,1.103314,...,-0.516862,0.768208,-0.432847,0.00868,-0.335269,0.391648,0.549824,-0.15926,0.498485,-0.46753
know,-0.663044,-0.103237,-0.110547,0.08098,0.302982,0.194516,-0.484664,0.197144,-0.966257,-0.89251,...,0.333185,-0.00113,-0.127065,-0.071313,-0.530158,0.252938,0.204229,-0.82167,0.593109,-0.193557
going,-1.175145,0.670657,-0.436101,-0.210429,1.27169,0.32562,-0.871675,1.472317,-0.639476,-0.787817,...,0.83356,-0.8675,0.096362,1.681017,-0.179273,-0.251886,0.221924,-0.040647,-1.015305,0.550194
think,-0.371681,-0.225626,0.867421,-0.487055,-0.114237,-1.230629,0.113927,0.007452,-1.126404,-1.171061,...,0.639116,1.374573,-0.547842,0.152309,0.334496,-0.446803,0.001394,-0.74343,-0.006959,-0.117203
see,0.060864,-0.02208,0.499764,-1.225166,-1.059776,-0.92979,0.044639,0.464147,-2.076057,0.952108,...,-0.597181,0.877456,0.508122,0.78973,0.522987,0.06814,0.653242,-1.047399,0.407329,0.221998
would,0.296511,0.154338,0.447735,-0.749754,1.373878,0.397202,-0.376988,0.483165,-1.139663,-0.341627,...,1.192409,-0.286457,-1.085607,1.479,-0.507268,0.910902,-1.225696,-0.850046,-0.978325,-0.795954
really,-2.498835,-0.742601,-0.332148,0.181163,0.088897,-0.079612,1.333248,1.631978,-0.275459,-0.56274,...,0.550215,-0.231831,0.085143,0.041186,0.738517,-0.300301,-0.252834,-1.52476,-0.422742,-0.162458
get,-2.349108,-1.466434,-1.017342,-0.998127,-0.129758,-1.123341,-0.46842,2.081601,-0.113608,-1.603799,...,-0.205629,-0.3924,-0.202999,0.396128,0.703867,0.473035,0.10559,-0.056075,-0.839903,0.1675


In [8]:
# 학습된 모델 저장장
model.wv.save_word2vec_format('./data/ted_en_w2v')

In [9]:
# 임베딩 모델 로드
from gensim.models import KeyedVectors

load_model = KeyedVectors.load_word2vec_format('./data/ted_en_w2v')

- 유사도 계산

In [10]:
model.wv.most_similar('man')
# model.wv.most_similar('abracadabra') # 없는 단어로 검색시 KeyError 발생

[('woman', 0.894888162612915),
 ('lady', 0.8059537410736084),
 ('daughter', 0.8046254515647888),
 ('girl', 0.7839692831039429),
 ('father', 0.768945574760437),
 ('son', 0.7668907046318054),
 ('sister', 0.7639594078063965),
 ('boy', 0.760553777217865),
 ('grandfather', 0.7535830736160278),
 ('brother', 0.7494614720344543)]

In [None]:
load_model.most_similar('man')  # Word2Vector = KeyedVectors

[('woman', 0.894888162612915),
 ('lady', 0.8059537410736084),
 ('daughter', 0.8046254515647888),
 ('girl', 0.7839692831039429),
 ('father', 0.768945574760437),
 ('son', 0.7668907046318054),
 ('sister', 0.7639594078063965),
 ('boy', 0.760553777217865),
 ('grandfather', 0.7535830736160278),
 ('brother', 0.7494614720344543)]

In [None]:
# 학습 과정에서 결과가 조금씩 달라질수 있음
model.wv.similarity('man', 'girl')


0.7839694

In [13]:
model.wv['man']

array([ 0.7745835 , -0.22766349,  1.0062977 ,  1.8761008 , -0.91865075,
       -0.15480942, -0.8077635 ,  1.481545  , -0.43826073, -0.9032544 ,
        0.2819144 ,  0.75686204, -0.08193346,  0.520204  ,  1.178618  ,
       -0.55458224,  0.816066  ,  0.29596296, -0.9180915 , -0.01825986,
        0.92368734,  0.84741896, -0.04565032, -0.49587068,  0.4323257 ,
        0.26310664, -0.9674338 , -0.6551638 , -0.27444005,  0.9216952 ,
       -1.4798337 , -0.9679094 , -0.02177182, -1.6081411 , -0.2547376 ,
        1.1141034 , -0.34371814, -0.58212453,  0.43535814, -0.23926155,
        1.241017  ,  0.08917684,  0.7958752 ,  0.4483123 ,  1.9125143 ,
       -0.04252987, -0.96859765,  0.4817343 ,  0.25932178, -0.35227594,
        0.6278037 , -0.2608963 ,  0.1191903 , -0.9320729 ,  0.27536732,
        0.595481  ,  0.37365815,  0.3440508 , -0.07001403, -0.18483338,
        0.0405275 , -0.760845  , -1.8638783 ,  1.1551318 , -1.0451267 ,
        0.91317815, -0.15356694,  0.2598233 ,  0.6875841 ,  1.57

- 임베딩 시각화

https://projector.tensorflow.org/

- embedding vector(tensor) 파일 (.tsv)
- metadat 파일 (.tsv)

In [15]:
!python -m gensim.scripts.word2vec2tensor --input ./data/ted_en_w2v --output ./data/ted_en_w2v

2025-02-20 12:06:24,164 - word2vec2tensor - INFO - running c:\Users\ljh10\anaconda3\envs\pystudy_env\Lib\site-packages\gensim\scripts\word2vec2tensor.py --input ./data/ted_en_w2v --output ./data/ted_en_w2v
2025-02-20 12:06:24,164 - keyedvectors - INFO - loading projection weights from ./data/ted_en_w2v
2025-02-20 12:06:25,706 - utils - INFO - KeyedVectors lifecycle event {'msg': 'loaded (21462, 100) matrix of type float32 from ./data/ted_en_w2v', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-02-20T12:06:25.390534', 'gensim': '4.3.3', 'python': '3.12.8 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:48:34) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-11-10.0.26100-SP0', 'event': 'load_word2vec_format'}
2025-02-20 12:06:26,739 - word2vec2tensor - INFO - 2D tensor file saved to ./data/ted_en_w2v_tensor.tsv
2025-02-20 12:06:26,739 - word2vec2tensor - INFO - Tensor metadata file saved to ./data/ted_en_w2v_metadata.tsv
2025-02-20 12:06:26,739 - word2vec2tensor - INFO - f

### 한국어 Word Embedding
- NSMC (Naver Sentiment Movie Corpus)

In [17]:
import numpy as np
import pandas as pd
import urllib.request
from konlpy.tag import Okt # 형태소 분석기

In [None]:
# 데이터 다운로드 
# urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt",
#                            filename='./data/naver_movie_ratings.txt')

('./data/naver_movie_ratings.txt', <http.client.HTTPMessage at 0x2968cf7b3e0>)

In [None]:
# 데이터 프레임 생성
ratings_df = pd.read_csv('./data/naver_movie_ratings.txt', sep='\t')

In [None]:
# 결측치 확인 및 처리 (제거)
display(ratings_df.isnull().sum())

ratings_df = ratings_df.dropna(how='any') # Null 값이 존재하는 행 제거

id          0
document    0
label       0
dtype: int64

In [22]:
ratings_df['document']

0                                       어릴때보고 지금다시봐도 재밌어요ㅋㅋ
1         디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...
2                      폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.
3         와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...
4                               안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.
                                ...                        
199995                                       포켓 몬스터 짜가 ㅡㅡ;;
199996                                                쓰.레.기
199997                    완전 사이코영화. 마지막은 더욱더 이 영화의질을 떨어트린다.
199998                  왜난 재미없었지 ㅠㅠ 라따뚜이 보고나서 스머프 봐서 그런가 ㅋㅋ
199999                                      포풍저그가나가신다영차영차영차
Name: document, Length: 199992, dtype: object

In [None]:
# 한글이 아닌 데이터 제거
ratings_df['document'] =ratings_df['document'].replace(r'[^0-9가-힣ㄱ-ㅎㅏ-ㅣ]', '', regex=True) # 한글이 아닌 데이터 제거, 자음 혹은 모음으로만 이뤄진 단어들은 남겨둠
# regex : 정규식 사용 여부

In [25]:
# 전처리
from tqdm import tqdm
okt = Okt()
ko_stopwards = ['은','는','이','가','을','를','와','과','들','도','부터','까지','에','나','너','그','걔','얘']

preprocessed_data = []

for sentence in tqdm(ratings_df['document']): # tqdm : 진행상황을 시각화 해주는 라이브러리
    tokens = okt.morphs(sentence, stem=True)    #morphs : 형태소 분석기, stem : 어간 추출
    tokens = [token for token in tokens if token not in ko_stopwards]
    preprocessed_data.append(tokens)
    
preprocessed_data[:5]

100%|██████████| 199992/199992 [1:07:18<00:00, 49.52it/s]


[['어리다', '때', '보고', '지금', '다시', '보다', '재밌다', 'ㅋㅋ'],
 ['디자인',
  '배우다',
  '학생',
  '으로',
  '외국',
  '디자이너',
  '일군',
  '전통',
  '통해',
  '발전',
  '하다',
  '문화',
  '산업',
  '부럽다',
  '사실',
  '우리나라',
  '에서도',
  '어려운',
  '시절',
  '끝',
  '열정',
  '지키다',
  '노라노',
  '같다',
  '전통',
  '있다',
  '저',
  '같다',
  '사람',
  '꿈',
  '꾸다',
  '이루다',
  '갈수',
  '있다',
  '감사하다'],
 ['폴리스스토리', '시리즈', '1', '뉴', '버리다', '없다', '최고'],
 ['오다',
  '연기',
  '진짜',
  '개',
  '쩔다',
  '지루하다',
  '생각',
  '하다',
  '몰입',
  '하다',
  '보다',
  '다그',
  '래',
  '이렇다',
  '진짜',
  '영화',
  '지'],
 ['안개', '자욱하다', '밤하늘', '뜨다', '초승달', '같다', '영화']]

In [29]:
model = Word2Vec(
    sentences=preprocessed_data,
    vector_size=100,
    window=5,
    min_count=5,
    sg=0 # CBOW
)

model.wv.vectors.shape

(17889, 100)

In [31]:
model.wv.similarity('김혜수','박서준')

0.77569747

In [32]:
model.wv.save_word2vec_format('./data/naver_movie_ratings_w2v')

In [33]:
!python -m gensim.scripts.word2vec2tensor --input ./data/naver_movie_ratings_w2v --output ./data/naver_movie_ratings_w2v

2025-02-20 14:03:56,672 - word2vec2tensor - INFO - running c:\Users\ljh10\anaconda3\envs\pystudy_env\Lib\site-packages\gensim\scripts\word2vec2tensor.py --input ./data/naver_movie_ratings_w2v --output ./data/naver_movie_ratings_w2v
2025-02-20 14:03:56,672 - keyedvectors - INFO - loading projection weights from ./data/naver_movie_ratings_w2v
2025-02-20 14:03:57,559 - utils - INFO - KeyedVectors lifecycle event {'msg': 'loaded (17889, 100) matrix of type float32 from ./data/naver_movie_ratings_w2v', 'binary': False, 'encoding': 'utf8', 'datetime': '2025-02-20T14:03:57.383540', 'gensim': '4.3.3', 'python': '3.12.8 | packaged by Anaconda, Inc. | (main, Dec 11 2024, 16:48:34) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-11-10.0.26100-SP0', 'event': 'load_word2vec_format'}
2025-02-20 14:03:58,165 - word2vec2tensor - INFO - 2D tensor file saved to ./data/naver_movie_ratings_w2v_tensor.tsv
2025-02-20 14:03:58,165 - word2vec2tensor - INFO - Tensor metadata file saved to ./data/naver_movie

- 사전 훈련된 임베딩

In [36]:
url ='https://drive.google.com/uc?id=11MWLNUBLOJWpJePTbOJwCtcgEryPGKGj'
output='./data/GoogleNews_vecs.bin.gz'

gdown.download(url, output)

Downloading...
From (original): https://drive.google.com/uc?id=11MWLNUBLOJWpJePTbOJwCtcgEryPGKGj
From (redirected): https://drive.google.com/uc?id=11MWLNUBLOJWpJePTbOJwCtcgEryPGKGj&confirm=t&uuid=00b2a987-cf65-4a3c-bb25-449a69ddb0a6
To: c:\SKNetworks_AI\ai-learning\AI-learning\NLP\03_word_embedding\data\GoogleNews_vecs.bin.gz
100%|██████████| 1.65G/1.65G [02:28<00:00, 11.1MB/s]


'./data/GoogleNews_vecs.bin.gz'

In [44]:
google_news_wv = KeyedVectors.load_word2vec_format('./data/GoogleNews_vecs.bin.gz', binary=True)
google_news_wv.vectors.shape

(3000000, 300)

In [45]:
google_news_wv.similarity('king','man')

0.22942673

In [None]:
google_news_wv.most_similar('king', topn=5) # topn : 상위 n개의 유사도를 출력

[('kings', 0.7138045430183411),
 ('queen', 0.6510956883430481),
 ('monarch', 0.6413194537162781),
 ('crown_prince', 0.6204220056533813),
 ('prince', 0.6159993410110474)]

In [46]:
google_news_wv.n_similarity(['king','queen'],['man','woman'])

0.24791393

In [None]:
google_news_wv.similar_by_word('king', topn=5) # similar_by_word : 단어를 입력받아 유사도를 출력

[('kings', 0.7138045430183411),
 ('queen', 0.6510956883430481),
 ('monarch', 0.6413194537162781),
 ('crown_prince', 0.6204220056533813),
 ('prince', 0.6159993410110474)]

In [50]:
google_news_wv.has_index_for('king') # 단어가 존재하는지 확인

True