<h2>What is Gensim?</h2>

- A Python library for statistical natural language processing tasks

- Primarily used for training word and document embedding models such as Word2Vec and Doc2Vec

In [2]:
pip install gensim

Collecting gensim
  Obtaining dependency information for gensim from https://files.pythonhosted.org/packages/f5/57/f2e6568dbf464a4b270954e5fa3dee4a4054d163a41c0e7bf0a34eb40f0f/gensim-4.3.3-cp311-cp311-win_amd64.whl.metadata
  Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl.metadata (8.2 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Obtaining dependency information for scipy<1.14.0,>=1.7.0 from https://files.pythonhosted.org/packages/4a/48/4513a1a5623a23e95f94abd675ed91cfb19989c58e9f6f7d03990f6caf3d/scipy-1.13.1-cp311-cp311-win_amd64.whl.metadata
  Downloading scipy-1.13.1-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.6 kB ? eta -:--:--
     -------------------------------------- 60.6/60.6 kB 799.0 kB/s eta 0:00:00
Collecting smart-open>=1.8.1 (from gensim)
  Obtaining dependency information for smart-open>=1.8.1 from https://files.pythonhosted.org/packages

In [4]:
# Create a Word2Vec model

from gensim.models import Word2Vec

# Example data: every sentence is a list of already-tokenized words.
sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]

# Train a skip-gram model (sg=1); min_count=1 keeps every word of this tiny corpus.
# workers=1 is set because gensim documents multi-threaded training as
# non-deterministic; seed=1 spells out gensim's default so the choice is visible.
# NOTE(review): gensim's docs also mention PYTHONHASHSEED for reproducibility
# across interpreter launches - confirm whether that is needed here.
model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
    seed=1,
    workers=1,
)

In [8]:
# Query the trained model through its keyed vectors.
word_vectors = model.wv

# Embedding vector learned for "cat".
cat_vector = word_vectors["cat"]

# Up to five nearest neighbours of "cat", as (word, score) pairs.
similar_words = word_vectors.most_similar("cat", topn=5)

# Similarity score between "cat" and "dog".
similarity = word_vectors.similarity("cat", "dog")

In [11]:
# Report the similarity score computed for "cat" and "dog".
print("similarity: ", similarity)
# most_similar() can return fewer than `topn` entries when the vocabulary is
# small, so the label is built from the actual number of neighbours returned.
print(f"{len(similar_words)} most similar words to 'cat':", similar_words)

similarity:  0.17018887
Five similar words with 'cat': [('dog', 0.17018885910511017), ('woof', 0.004503016825765371), ('say', -0.027750344946980476), ('meow', -0.044617101550102234)]


In [6]:
pip install konlpy

Note: you may need to restart the kernel to use updated packages.


In [24]:
# KoNLPy tagger methods referenced in this notebook:
#   pos     - morphemes together with their part-of-speech tags
#   morphs  - morphemes only
#   nouns   - nouns only
#   phrases - phrases

from konlpy.tag import Okt, Kkma

# Sample Korean text; several sentences are deliberately run together.
text = "자연어 처리는 재미있다.정말로 재밌어서. 눈물이 날려고 하네!라고했다.근데진짜에요"

okt = Okt()
kkma = Kkma()

# Kkma splits the text into sentences; Okt does morpheme and POS analysis.
sentences = kkma.sentences(text)
pos = okt.pos(text)
tokens = okt.morphs(text)

In [15]:
# Extract phrases from the sample text with Okt and show them.
ph = okt.phrases(text)
print(ph)

['자연어', '자연어 처리', '처리']


In [12]:
# Show the morpheme list returned by okt.morphs(text).
# NOTE(review): the saved output does not match the current `text` value and the
# execution counts are out of order - re-run the notebook to refresh it.
print(tokens)

['자연어', '처리', '는', '재미있어요', '!']


In [28]:
# Run Okt morpheme analysis on each sentence, keeping one token list per sentence.
result = [okt.morphs(sentence) for sentence in sentences]
print(result)

[['자연어', '처리', '는', '재미있다', '.'], ['정말로', '재밌어서', '.', '눈물', '이', '날려고', '하네', '!', '라고'], ['했다', '.'], ['근데', '진짜', '에요']]


In [13]:
# Show the (morpheme, part-of-speech tag) pairs returned by okt.pos(text).
# NOTE(review): the saved output does not match the current `text` value and the
# execution counts are out of order - re-run the notebook to refresh it.
print(pos)

[('자연어', 'Noun'), ('처리', 'Noun'), ('는', 'Josa'), ('재미있어요', 'Adjective'), ('!', 'Punctuation')]


In [35]:
# Small English corpus: one lowercase sentence per entry.
sentences = [
    "the sun sets, painting the sky with fiery hues.",
    "a gentle breeze rustles the leaves on the ground.",
    "she smiled, remembering the days spent under the willow tree.",
    "the city lights shimmer, reflecting on the river's surface.",
    "he whispered secrets, hoping they would never be revealed.",
    "the aroma of fresh coffee filled the cozy little cafe.",
    "birds chirped merrily, welcoming the dawn of a new day.",
    "her laughter echoed, bringing joy to everyone around her.",
    "the ancient forest stood silent, guarding its hidden mysteries.",
    "the waves crashed rhythmically, a soothing symphony for all.",
]

# Map each whitespace-separated token to an integer id in first-seen order.
# Punctuation stays attached to the token (e.g. "sets," and "hues.").
word_dict = {}
for sentence in sentences:
    for token in sentence.split():
        # setdefault only assigns a new id when the token has not been seen yet.
        word_dict.setdefault(token, len(word_dict))

In [36]:
# Display the token -> integer id mapping built from `sentences`.
word_dict

{'the': 0,
 'sun': 1,
 'sets,': 2,
 'painting': 3,
 'sky': 4,
 'with': 5,
 'fiery': 6,
 'hues.': 7,
 'a': 8,
 'gentle': 9,
 'breeze': 10,
 'rustles': 11,
 'leaves': 12,
 'on': 13,
 'ground.': 14,
 'she': 15,
 'smiled,': 16,
 'remembering': 17,
 'days': 18,
 'spent': 19,
 'under': 20,
 'willow': 21,
 'tree.': 22,
 'city': 23,
 'lights': 24,
 'shimmer,': 25,
 'reflecting': 26,
 "river's": 27,
 'surface.': 28,
 'he': 29,
 'whispered': 30,
 'secrets,': 31,
 'hoping': 32,
 'they': 33,
 'would': 34,
 'never': 35,
 'be': 36,
 'revealed.': 37,
 'aroma': 38,
 'of': 39,
 'fresh': 40,
 'coffee': 41,
 'filled': 42,
 'cozy': 43,
 'little': 44,
 'cafe.': 45,
 'birds': 46,
 'chirped': 47,
 'merrily,': 48,
 'welcoming': 49,
 'dawn': 50,
 'new': 51,
 'day.': 52,
 'her': 53,
 'laughter': 54,
 'echoed,': 55,
 'bringing': 56,
 'joy': 57,
 'to': 58,
 'everyone': 59,
 'around': 60,
 'her.': 61,
 'ancient': 62,
 'forest': 63,
 'stood': 64,
 'silent,': 65,
 'guarding': 66,
 'its': 67,
 'hidden': 68,
 'mysteri

In [37]:
# Encode every sentence as the list of integer ids of its whitespace-separated tokens.
processed_sentences = [
    [word_dict[token] for token in sentence.split()]
    for sentence in sentences
]

In [38]:
# Display the sentences encoded as lists of integer token ids.
processed_sentences

[[0, 1, 2, 3, 0, 4, 5, 6, 7],
 [8, 9, 10, 11, 0, 12, 13, 0, 14],
 [15, 16, 17, 0, 18, 19, 20, 0, 21, 22],
 [0, 23, 24, 25, 26, 13, 0, 27, 28],
 [29, 30, 31, 32, 33, 34, 35, 36, 37],
 [0, 38, 39, 40, 41, 42, 0, 43, 44, 45],
 [46, 47, 48, 49, 0, 50, 39, 8, 51, 52],
 [53, 54, 55, 56, 57, 58, 59, 60, 61],
 [0, 62, 63, 64, 65, 66, 67, 68, 69],
 [0, 70, 71, 72, 8, 73, 74, 75, 76]]