In [25]:
#suppress warnings
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import wordcloud

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#pos_tag
from nltk import pos_tag

In [29]:
import gensim
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [12]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

In [3]:
sentence = "And now for something completely different"

In [14]:
#nltk pos tagger
nltk_tagged = nltk.pos_tag(word_tokenize(sentence))
print(nltk_tagged)

[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('completely', 'RB'), ('different', 'JJ')]


In [18]:
#spacy pos tagger
nlp = spacy.load('en_core_web_sm')
doc = nlp(sentence)
spacy_tagged = [(word.text, word.tag_, word.pos_) for word in doc]
print(spacy_tagged)

[('And', 'CC', 'CCONJ'), ('now', 'RB', 'ADV'), ('for', 'IN', 'ADP'), ('something', 'NN', 'PRON'), ('completely', 'RB', 'ADV'), ('different', 'JJ', 'ADJ')]


In [26]:
#gensim pos tagger
sentence = "And now for something completely different"
tagged = gensim.models.doc2vec.TaggedDocument(words=word_tokenize(sentence), tags=['SENT_1'])
print(tagged)

TaggedDocument<['And', 'now', 'for', 'something', 'completely', 'different'], ['SENT_1']>


In [30]:
#gensim word2vec
model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)

In [34]:
tokens = word_tokenize(sentence)
print(tokens)

['And', 'now', 'for', 'something', 'completely', 'different']


In [38]:
model.train(tokens, total_examples=len(tokens), epochs=1)

(0, 37)

In [42]:
#show the vector for a word
vector = model.wv['computer']  # get numpy vector of a word

In [44]:
print(vector)
print(len(vector))

[-0.00515774 -0.00667028 -0.0077791   0.00831315 -0.00198292 -0.00685696
 -0.0041556   0.00514562 -0.00286997 -0.00375075  0.0016219  -0.0027771
 -0.00158482  0.0010748  -0.00297881  0.00852176  0.00391207 -0.00996176
  0.00626142 -0.00675622  0.00076966  0.00440552 -0.00510486 -0.00211128
  0.00809783 -0.00424503 -0.00763848  0.00926061 -0.00215612 -0.00472081
  0.00857329  0.00428458  0.0043261   0.00928722 -0.00845554  0.00525685
  0.00203994  0.0041895   0.00169839  0.00446543  0.00448759  0.0061063
 -0.00320303 -0.00457706 -0.00042664  0.00253447 -0.00326412  0.00605948
  0.00415534  0.00776685  0.00257002  0.00811904 -0.00138761  0.00808028
  0.0037181  -0.00804967 -0.00393476 -0.0024726   0.00489447 -0.00087241
 -0.00283173  0.00783599  0.00932561 -0.0016154  -0.00516075 -0.00470313
 -0.00484746 -0.00960562  0.00137242 -0.00422615  0.00252744  0.00561612
 -0.00406709 -0.00959937  0.00154715 -0.00670207  0.0024959  -0.00378173
  0.00708048  0.00064041  0.00356198 -0.00273993 -0.0

In [45]:
sims = model.wv.most_similar('computer', topn=10)  # get other similar words

In [46]:
print(sims)

[('system', 0.21617144346237183), ('survey', 0.044689204543828964), ('interface', 0.015203382819890976), ('time', 0.0019510675920173526), ('trees', -0.03284312039613724), ('human', -0.0742427259683609), ('response', -0.09317591041326523), ('graph', -0.09575346857309341), ('eps', -0.10513807088136673), ('user', -0.16911625862121582)]
