## 임베딩

- 빈도수 계산 
- TDM(Term-Document Matrix): 단어 빈도(TF)를 행렬로 만든 것. 단어 사전을 기준으로 각 문서에서의 단순 등장 빈도를 센다.

In [5]:
# Sample text used for the word-frequency examples.
text = "John likes to watch movies. Mary likes movies too. Mary also likes to watch football games."

# Remove every period, then tokenize on runs of whitespace.
words = "".join(text.split('.')).split()

print(words)

['John', 'likes', 'to', 'watch', 'movies', 'Mary', 'likes', 'movies', 'too', 'Mary', 'also', 'likes', 'to', 'watch', 'football', 'games']


In [8]:
import numpy as np

# np.unique(..., return_counts=True) yields a (unique_values, counts) tuple.
word_count = np.unique(words, return_counts=True)

# Pair each unique word with its count. Keys and values stay NumPy scalars
# (np.str_ / np.int64), which is why the printed dict shows those wrappers.
word_to_cnt = dict(zip(*word_count))

print(word_to_cnt)

{np.str_('John'): np.int64(1), np.str_('Mary'): np.int64(2), np.str_('also'): np.int64(1), np.str_('football'): np.int64(1), np.str_('games'): np.int64(1), np.str_('likes'): np.int64(3), np.str_('movies'): np.int64(2), np.str_('to'): np.int64(2), np.str_('too'): np.int64(1), np.str_('watch'): np.int64(2)}


In [9]:
import numpy as np

text = "John likes to watch movies. Mary likes movies too. Mary also likes to watch football games."
words = text.replace('.', '').split()

# Unique tokens together with the number of times each one occurs.
word_count = np.unique(words, return_counts=True)

# ndarray.tolist() converts NumPy scalars to built-in str / int, so the
# resulting dict prints without np.str_ / np.int64 wrappers.
word_to_cnt = dict(zip(*(column.tolist() for column in word_count)))

print(word_to_cnt)


{'John': 1, 'Mary': 2, 'also': 1, 'football': 1, 'games': 1, 'likes': 3, 'movies': 2, 'to': 2, 'too': 1, 'watch': 2}


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Two-document toy corpus.
corpus = [
    "John likes to watch movies. Mary likes movies too.",
    "Mary also likes to watch football games.",
]

# Learn the vocabulary first, then count term occurrences per document.
vector = CountVectorizer()
vector.fit(corpus)
tdm_array = vector.transform(corpus).toarray()  # dense array, one row per document

# Mapping of each term to its column index in the matrix.
tf_dic = vector.vocabulary_

print("문서-단어 행렬 (TDM):", tdm_array, sep="\n")
print("\n단어 인덱스 사전:", tf_dic, sep="\n")


문서-단어 행렬 (TDM):
[[0 0 0 1 2 1 2 1 1 1]
 [1 1 1 0 1 1 0 1 0 1]]

단어 인덱스 사전:
{'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


In [11]:
import pandas as pd
# Reorder the vocabulary by column index so that its keys line up with the
# columns of the count matrix.
tf_dic_sorted = {term: tf_dic[term] for term in sorted(tf_dic, key=tf_dic.get)}
tf_dic_sorted

{'also': 0,
 'football': 1,
 'games': 2,
 'john': 3,
 'likes': 4,
 'mary': 5,
 'movies': 6,
 'to': 7,
 'too': 8,
 'watch': 9}

In [13]:
# Label the count-matrix columns with the index-ordered vocabulary terms.
df = pd.DataFrame(data=tdm_array, columns=list(tf_dic_sorted))
df

Unnamed: 0,also,football,games,john,likes,mary,movies,to,too,watch
0,0,0,0,1,2,1,2,1,1,1
1,1,1,1,0,1,1,0,1,0,1


# TF-IDF 

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the corpus, then materialize the weighted
# document-term matrix as a dense array.
tfidf_vec = TfidfVectorizer()
tfidf_vec.fit(corpus)
tfidf_array = tfidf_vec.transform(corpus).toarray()
tfidf_array

array([[0.        , 0.        , 0.        , 0.32369906, 0.46062909,
        0.23031454, 0.64739811, 0.23031454, 0.32369906, 0.23031454],
       [0.44610081, 0.44610081, 0.44610081, 0.        , 0.3174044 ,
        0.3174044 , 0.        , 0.3174044 , 0.        , 0.3174044 ]])

In [18]:
# Term -> column index mapping learned by the TF-IDF vectorizer.
tfidf_dic = tfidf_vec.vocabulary_

# Order the terms by column index, then use them as DataFrame column labels.
tfidf_dic_sorted = {
    term: tfidf_dic[term] for term in sorted(tfidf_dic, key=tfidf_dic.get)
}
tfidf_dtm = pd.DataFrame(data=tfidf_array, columns=list(tfidf_dic_sorted))
tfidf_dtm

Unnamed: 0,also,football,games,john,likes,mary,movies,to,too,watch
0,0.0,0.0,0.0,0.323699,0.460629,0.230315,0.647398,0.230315,0.323699,0.230315
1,0.446101,0.446101,0.446101,0.0,0.317404,0.317404,0.0,0.317404,0.0,0.317404


In [None]:
from gensim.models import Word2Vec  #단어를 벡터 공간에 임베딩하여 단어 간 의미적 유사도를 학습하는 모델

In [2]:
# Toy corpus: one string per document.
corpus = [
    "John likes to watch movies. Mary likes movies too.",
    "Mary also likes to watch football games.",
]

# Starts empty; filled with one token list per document.
word_list: list[list[str]] = []


In [3]:
# Tokenize each document: strip periods, then split on whitespace.
# Rebinding word_list (rather than appending to it) keeps this cell
# idempotent, so re-running it does not duplicate the sentences.
word_list = [sentence.replace('.', '').split() for sentence in corpus]

word_list



[['John', 'likes', 'to', 'watch', 'movies', 'Mary', 'likes', 'movies', 'too'],
 ['Mary', 'also', 'likes', 'to', 'watch', 'football', 'games']]

In [None]:
# sg=0        -> CBOW (sg=1 would select Skip-gram)
# window=3    -> up to 3 words on each side of the center word are context
# min_count=1 -> only words that occur at least once are trained on
model = Word2Vec(word_list, sg=0, vector_size=100, window=3, min_count=1)

# NOTE: the second positional parameter of most_similar is `negative`, so
# passing 'likes', 'movies' positionally ranks words near "likes - movies".
# The keywords below spell that out. To treat both words as positive
# examples, call most_similar(positive=['likes', 'movies']) instead.
model.wv.most_similar(positive=['likes'], negative=['movies'])

[('John', 0.17164471745491028),
 ('also', 0.06594578176736832),
 ('Mary', 0.008838453330099583),
 ('watch', -0.06765829026699066),
 ('games', -0.08544928580522537),
 ('football', -0.08948154747486115),
 ('too', -0.11860241740942001),
 ('to', -0.13643866777420044)]

In [5]:
# Train a fresh model (sg=0) and query with both words as positive examples.
model = Word2Vec(
    sentences=word_list,
    vector_size=100,
    window=3,
    min_count=1,
    sg=0,
)
model.wv.most_similar(positive=['likes', 'movies'])


[('too', 0.16428987681865692),
 ('John', 0.1331648826599121),
 ('football', 0.11773452907800674),
 ('to', 0.10346361249685287),
 ('also', 0.06549220532178879),
 ('games', 0.0030605813954025507),
 ('Mary', -0.02506251260638237),
 ('watch', -0.0909314975142479)]

In [None]:
# Find the words most similar to 'likes'. "Similar" here means: when two
# words often appear with the same neighboring words, their vectors end up
# pointing in similar directions.
model.wv.most_similar(positive=['likes'])

[('John', 0.21617145836353302),
 ('also', 0.09291718155145645),
 ('too', 0.027057474479079247),
 ('football', 0.01613466814160347),
 ('Mary', -0.01084057241678238),
 ('to', -0.02775036357343197),
 ('movies', -0.052346743643283844),
 ('games', -0.05987629294395447),
 ('watch', -0.111670583486557)]