In [1]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [2]:
# Sample text (a description of NLTK) that the next cell splits into sentences
# with nltk.sent_tokenize; those sentences feed the cleaning and vectorization cells.
paragraph="""NLTK is a leading "platform" for building Python programs to work with human language data. It provides easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning, wrappers for industrial-strength NLP libraries, and an active discussion forum.
Thanks to a hands-on guide introducing programming fundamentals alongside topics in computational linguistics, plus comprehensive API documentation, NLTK is suitable for linguists, engineers, students, educators, researchers, and industry users alike. NLTK is available for Windows, Mac OS X, and Linux. Best of all, NLTK is a free, open source, community-driven project.
NLTK has been called “a wonderful tool for teaching, and working in, computational linguistics using Python,” and “an amazing library to play with natural language."""

In [3]:
# Split the sample paragraph into sentences, then create the lemmatizer that
# the cleaning loops reuse for every word.
# NOTE: nltk.sent_tokenize needs NLTK's Punkt tokenizer data to be installed
# (see nltk.download).
sentenses = nltk.sent_tokenize(paragraph)
wordnet = WordNetLemmatizer()

In [4]:
# Show the list of sentences produced by nltk.sent_tokenize.
sentenses

['NLTK is a leading "platform" for building Python programs to work with human language data.',
 'It provides easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning, wrappers for industrial-strength NLP libraries, and an active discussion forum.',
 'Thanks to a hands-on guide introducing programming fundamentals alongside topics in computational linguistics, plus comprehensive API documentation, NLTK is suitable for linguists, engineers, students, educators, researchers, and industry users alike.',
 'NLTK is available for Windows, Mac OS X, and Linux.',
 'Best of all, NLTK is a free, open source, community-driven project.',
 'NLTK has been called “a wonderful tool for teaching, and working in, computational linguistics using Python,” and “an amazing library to play with natural language.']

In [5]:
# Build the cleaned corpus: one string per sentence, lower-cased, with
# non-letters and English stop words removed and every remaining word lemmatized.
# The stop-word set is built once here rather than once per word inside the loop.
stop_words = set(stopwords.words('english'))
corpus = []
for sentence in sentenses:
    # Replace every character that is not an ASCII letter with a space.
    review = re.sub('[^a-zA-Z]', ' ', sentence)
    review = review.lower()
    review = review.split()
    review = [wordnet.lemmatize(word) for word in review if word not in stop_words]
    # `review` is deliberately left bound after the loop: the next cell displays it.
    review = ' '.join(review)
    corpus.append(review)

In [6]:
# `review` still holds the value from the final loop iteration, i.e. the
# cleaned text of the last sentence.
review

'nltk called wonderful tool teaching working computational linguistics using python amazing library play natural language'

In [7]:
# Bag-of-words model: fit a CountVectorizer on the cleaned sentence strings
# and encode each sentence as a row of term counts.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
X = cv.fit_transform(corpus)

In [8]:
# Convert the bag-of-words matrix to a dense array for display.
X.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 0, 0,
        1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
        1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1],
       [0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1,
        0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
        0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0

In [9]:
# TF-IDF model: fit a TfidfVectorizer on the same cleaned sentence strings.
# Note that `cv` and `X` are rebound here, replacing the bag-of-words
# vectorizer and matrix created earlier.
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer()
X = cv.fit_transform(corpus)

In [10]:
# Convert the TF-IDF matrix to a dense array for display.
X.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.34085203, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.34085203, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.34085203, 0.        , 0.        ,
        0.        , 0.        , 0.27950354, 0.34085203, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.1746276 , 0.        , 0.        ,
        0.34085203, 0.        , 0.        , 0.        , 0.34085203,
        0.        , 0.        , 0.        , 0.27950354, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [11]:
# download gensim library
!pip install gensim



In [12]:
# import the Word2Vec model class from gensim
from gensim.models import Word2Vec

In [13]:
# NOTE(review): this re-assigns `paragraph` to what appears to be the same text
# as the earlier cell. The token-list loop that follows iterates over the
# existing `sentenses` list, so this value is not re-tokenized there.
paragraph="""NLTK is a leading "platform" for building Python programs to work with human language data. It provides easy-to-use interfaces to over 50 corpora and lexical resources such as WordNet, along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning, wrappers for industrial-strength NLP libraries, and an active discussion forum.
Thanks to a hands-on guide introducing programming fundamentals alongside topics in computational linguistics, plus comprehensive API documentation, NLTK is suitable for linguists, engineers, students, educators, researchers, and industry users alike. NLTK is available for Windows, Mac OS X, and Linux. Best of all, NLTK is a free, open source, community-driven project.
NLTK has been called “a wonderful tool for teaching, and working in, computational linguistics using Python,” and “an amazing library to play with natural language."""

In [14]:
# Rebuild the corpus as token lists: one list of lemmatized, stop-word-free
# words per sentence. This is the input passed to Word2Vec in a later cell.
# The stop-word set is built once here rather than once per word inside the loop.
stop_words = set(stopwords.words('english'))
corpus = []
for sentence in sentenses:
    # Replace every character that is not an ASCII letter with a space.
    review = re.sub('[^a-zA-Z]', ' ', sentence)
    review = review.lower()
    review = review.split()
    # Words are kept as a list (not joined back into a string).
    review = [wordnet.lemmatize(word) for word in review if word not in stop_words]
    corpus.append(review)

In [15]:
# Inspect the tokenized corpus: one list of words per sentence.
corpus

[['nltk',
  'leading',
  'platform',
  'building',
  'python',
  'program',
  'work',
  'human',
  'language',
  'data'],
 ['provides',
  'easy',
  'use',
  'interface',
  'corpus',
  'lexical',
  'resource',
  'wordnet',
  'along',
  'suite',
  'text',
  'processing',
  'library',
  'classification',
  'tokenization',
  'stemming',
  'tagging',
  'parsing',
  'semantic',
  'reasoning',
  'wrapper',
  'industrial',
  'strength',
  'nlp',
  'library',
  'active',
  'discussion',
  'forum'],
 ['thanks',
  'hand',
  'guide',
  'introducing',
  'programming',
  'fundamental',
  'alongside',
  'topic',
  'computational',
  'linguistics',
  'plus',
  'comprehensive',
  'api',
  'documentation',
  'nltk',
  'suitable',
  'linguist',
  'engineer',
  'student',
  'educator',
  'researcher',
  'industry',
  'user',
  'alike'],
 ['nltk', 'available', 'window', 'mac', 'o', 'x', 'linux'],
 ['best', 'nltk', 'free', 'open', 'source', 'community', 'driven', 'project'],
 ['nltk',
  'called',
  'wonderf

In [16]:
# Train a Word2Vec model on the tokenized sentences. With min_count=2, words
# that occur fewer than two times in the corpus are left out of the vocabulary.
word2vec = Word2Vec(sentences=corpus, min_count=2)

In [17]:
# Number of words in the trained model's keyed vectors (its vocabulary size).
len(word2vec.wv)

6

In [18]:
# Look up the learned embedding vector for the token 'python'.
word2vec.wv['python']

array([-8.7274835e-03,  2.1301603e-03, -8.7354420e-04, -9.3190884e-03,
       -9.4281435e-03, -1.4107180e-03,  4.4324086e-03,  3.7040710e-03,
       -6.4986944e-03, -6.8730689e-03, -4.9994136e-03, -2.2868442e-03,
       -7.2502876e-03, -9.6033188e-03, -2.7436304e-03, -8.3628418e-03,
       -6.0388758e-03, -5.6709289e-03, -2.3441387e-03, -1.7069983e-03,
       -8.9569995e-03, -7.3519943e-04,  8.1525063e-03,  7.6904297e-03,
       -7.2061159e-03, -3.6668323e-03,  3.1185509e-03, -9.5707225e-03,
        1.4764380e-03,  6.5244650e-03,  5.7464195e-03, -8.7630628e-03,
       -4.5171450e-03, -8.1401607e-03,  4.5955181e-05,  9.2636319e-03,
        5.9733056e-03,  5.0673080e-03,  5.0610616e-03, -3.2429171e-03,
        9.5521836e-03, -7.3564244e-03, -7.2703888e-03, -2.2653891e-03,
       -7.7856064e-04, -3.2161046e-03, -5.9258699e-04,  7.4888230e-03,
       -6.9751980e-04, -1.6249418e-03,  2.7443981e-03, -8.3591007e-03,
        7.8558037e-03,  8.5361032e-03, -9.5840879e-03,  2.4462652e-03,
      

In [19]:
# List the vocabulary words most similar to 'python' with their similarity scores.
word2vec.wv.most_similar('python')

[('language', 0.13887985050678253),
 ('computational', 0.13149003684520721),
 ('linguistics', 0.0640898123383522),
 ('library', 0.009391186758875847),
 ('nltk', -0.05987628176808357)]