## 1절. 인공지능

## 2절. 자연어처리

## 3절 자연어처리를 위한 텍스트 전처리

## 4절. 워드 임베딩

In [1]:
# 1. TF (Term Frequency)
text = "John likes to watch movies. Mary likes movies too.\
        Mary also likes to watch football games."

In [2]:
words = text.replace('.', '').split()
print(words)

['John', 'likes', 'to', 'watch', 'movies', 'Mary', 'likes', 'movies', 'too', 'Mary', 'also', 'likes', 'to', 'watch', 'football', 'games']


In [3]:
import numpy as np
word_count = np.unique(words, return_counts=True)
print(word_count)

(array(['John', 'Mary', 'also', 'football', 'games', 'likes', 'movies',
       'to', 'too', 'watch'], dtype='<U8'), array([1, 2, 1, 1, 1, 3, 2, 2, 1, 2], dtype=int64))


In [4]:
word_to_cnt = {}
for word, cnt in zip(*word_count):
    word_to_cnt[word] = cnt

print(word_to_cnt)

{'John': 1, 'Mary': 2, 'also': 1, 'football': 1, 'games': 1, 'likes': 3, 'movies': 2, 'to': 2, 'too': 1, 'watch': 2}


In [5]:
# 2. TDM (Term Document Matrix)
corpus = [
    "John likes to watch movies. Mary likes movies too.",
    "Mary also likes to watch football games."
    ]

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

vector = CountVectorizer()
tdm_array = vector.fit_transform(corpus).toarray()
tf_dic = vector.vocabulary_
print(tdm_array)
print(tf_dic)

[[0 0 0 1 2 1 2 1 1 1]
 [1 1 1 0 1 1 0 1 0 1]]
{'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


In [7]:
import pandas as pd

tf_dic_sorted = dict(sorted(tf_dic.items(), key=lambda x: x[1]))
tdm = pd.DataFrame(tdm_array, columns=tf_dic_sorted.keys())
tdm

Unnamed: 0,also,football,games,john,likes,mary,movies,to,too,watch
0,0,0,0,1,2,1,2,1,1,1
1,1,1,1,0,1,1,0,1,0,1


In [8]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
tfidf_array = tfidf_vec.fit_transform(corpus).toarray()

tfidf_dic = tfidf_vec.vocabulary_
tfidf_dic_sorted = dict(sorted(tfidf_dic.items(), key=lambda item: item[1]))

tfidf_tdm = pd.DataFrame(tfidf_array, columns=tfidf_dic.keys())
tfidf_tdm

Unnamed: 0,john,likes,to,watch,movies,mary,too,also,football,games
0,0.0,0.0,0.0,0.323699,0.460629,0.230315,0.647398,0.230315,0.323699,0.230315
1,0.446101,0.446101,0.446101,0.0,0.317404,0.317404,0.0,0.317404,0.0,0.317404


In [11]:
# 4. Word2Vec
corpus = ["John likes to watch movies. Mary likes movies too",
          "Mary also likes to watch football games."]

word_list = []
for word in corpus:
    word_list.append(word.replace('.', '').split())

from gensim.models import Word2Vec
model = Word2Vec(word_list, sg=0, vector_size=100, window=3, min_count=1)

print(model.wv.most_similar('likes'))
print(model.wv.similarity('movies', 'games'))

[('John', 0.21617142856121063), ('also', 0.09291722625494003), ('too', 0.027057476341724396), ('football', 0.016134677454829216), ('Mary', -0.010840574279427528), ('to', -0.02775036357343197), ('movies', -0.05234673246741295), ('games', -0.059876296669244766), ('watch', -0.111670583486557)]
0.064089775


In [17]:
model = Word2Vec.load('C:/Projects/python-workspace/NLP/datasets/ko.bin')
print(model.wv.most_similar('인공지능'))

AttributeError: Can't get attribute 'Vocab' on <module 'gensim.models.word2vec' from 'C:\\Users\\ameli\\anaconda3\\envs\\saltlux_deep_lecture\\lib\\site-packages\\gensim\\models\\word2vec.py'>