In [4]:
import sklearn
import pandas as pd
import numpy as np
import nltk 
import re 
import string 
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from wordcloud import wordcloud
from nltk.corpus import stopwords

In [12]:
# Download the NLTK data packages this notebook relies on.
NLTK_PACKAGES = ('punkt', 'stopwords', 'wordnet')
download_results = [nltk.download(package) for package in NLTK_PACKAGES]
# End on the last download's return value so it stays the cell's displayed result.
download_results[-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [18]:
# Input corpus: a list holding a single raw document (one paragraph about the
# COVID-19 pandemic in India). Because of where the triple quotes sit, the
# string itself starts and ends with a newline character.
text = ["""
The COVID-19 pandemic in India is a part of the worldwide pandemic of coronavirus disease 2019 (COVID-19) caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). As of 27 September 2021, according to official figures, India has the second-highest number of confirmed cases in the world (after the United States of America) with 33,678,786 reported cases of COVID-19 infection and the third-highest number of COVID-19 deaths (after the United States and Brazil) at 482,551[4] deaths.[6][7][8] However these figures exhibit severe under-reporting.
"""]

In [19]:
def cleaning_reviews(text_):
    """Normalise raw documents into space-joined, lemmatised tokens.

    Each document is lower-cased, stripped of http(s) URLs, tokenised with
    ``nltk.word_tokenize``, stripped of punctuation, reduced to purely
    alphabetic tokens that are not English stop words (the word "not" is
    explicitly kept), and finally lemmatised with the WordNet lemmatizer.

    Punctuation is deleted rather than replaced by a space, so a hyphenated
    token such as "second-highest" comes out as "secondhighest".

    Parameters
    ----------
    text_ : iterable of str, or str
        Documents to clean. A single string is treated as one document
        instead of being iterated character by character.

    Returns
    -------
    list of str
        One cleaned, space-separated string per input document, in input
        order. An empty input yields an empty list.
    """
    # A bare string would otherwise be walked one character at a time.
    documents = [text_] if isinstance(text_, str) else list(text_)
    if not documents:
        # Nothing to clean, so skip loading the NLTK resources altogether.
        return []

    # Loop-invariant helpers: built once here instead of once per document
    # (or, for the lemmatizer, once per word).
    # The raw string stops "\(" and "\)" being read as invalid string escapes.
    url_pattern = re.compile(r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:[0-9a-fA-F][0-9a-fA-F]))+')
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))
    stop_words.discard("not")  # "not" must survive the stop-word filter
    lemmatizer = nltk.WordNetLemmatizer()

    message = []
    for document in documents:
        without_urls = url_pattern.sub("", document.lower())
        tokens = nltk.word_tokenize(without_urls)
        stripped = (token.translate(punctuation_table) for token in tokens)
        # isalpha() also rejects tokens left empty by the punctuation strip.
        kept = [
            lemmatizer.lemmatize(word)
            for word in stripped
            if word.isalpha() and word not in stop_words
        ]
        message.append(' '.join(kept))

    return message

In [20]:
# Run the cleaning pipeline over the raw documents held in `text`.
corpus = cleaning_reviews(text)

In [21]:
# Show the cleaned corpus: a list with one space-joined string per document.
print(corpus)

['pandemic india part worldwide pandemic coronavirus disease caused severe acute respiratory syndrome coronavirus september according official figure india secondhighest number confirmed case world united state america reported case infection thirdhighest number death united state brazil death however figure exhibit severe underreporting']


In [27]:
# Split every cleaned document (not only corpus[0]) into sentences, then each
# sentence into word tokens. The result is a list of token lists, which is the
# shape later handed to Word2Vec.
sentences = [
    nltk.word_tokenize(sentence)
    for document in corpus
    for sentence in nltk.sent_tokenize(document)
]

In [36]:
# Number of tokens in the first tokenised sentence.
len(sentences[0])

41

In [42]:
# Inspect the tokenised sentences (a list of token lists).
sentences

[['pandemic',
  'india',
  'part',
  'worldwide',
  'pandemic',
  'coronavirus',
  'disease',
  'caused',
  'severe',
  'acute',
  'respiratory',
  'syndrome',
  'coronavirus',
  'september',
  'according',
  'official',
  'figure',
  'india',
  'secondhighest',
  'number',
  'confirmed',
  'case',
  'world',
  'united',
  'state',
  'america',
  'reported',
  'case',
  'infection',
  'thirdhighest',
  'number',
  'death',
  'united',
  'state',
  'brazil',
  'death',
  'however',
  'figure',
  'exhibit',
  'severe',
  'underreporting']]

In [43]:
from gensim.models import Word2Vec

# Train on the tokenised sentences; min_count=1 keeps tokens that occur only once.
model = Word2Vec(sentences, min_count=1)

# gensim >= 4.0 removed `wv.vocab` in favour of `wv.key_to_index`. Try the new
# attribute first and fall back to the old one so the cell runs on either version.
try:
    words = model.wv.key_to_index.keys()
except AttributeError:
    words = model.wv.vocab.keys()
vectors = model.wv.vectors

In [40]:
# Size of the model's vocabulary.
len(words)

31

In [41]:
# Inspect the vocabulary keys collected from the trained model.
words

dict_keys(['pandemic', 'india', 'part', 'worldwide', 'coronavirus', 'disease', 'caused', 'severe', 'acute', 'respiratory', 'syndrome', 'september', 'according', 'official', 'figure', 'secondhighest', 'number', 'confirmed', 'case', 'world', 'united', 'state', 'america', 'reported', 'infection', 'thirdhighest', 'death', 'brazil', 'however', 'exhibit', 'underreporting'])

In [44]:
# Inspect the embedding array taken from model.wv.vectors
# (presumably one row per vocabulary word — confirm against the gensim docs).
vectors

array([[ 0.00029162, -0.00431387, -0.00386379, ..., -0.00307314,
         0.00012   ,  0.00017241],
       [-0.00474059,  0.00111386, -0.0005662 , ..., -0.00498696,
        -0.0038059 ,  0.00097773],
       [ 0.00141138, -0.00149514, -0.00486016, ..., -0.00397663,
         0.00144558,  0.00030017],
       ...,
       [-0.00090823,  0.00051952,  0.00060335, ..., -0.0049471 ,
        -0.00183043, -0.00102894],
       [ 0.00152243, -0.00483561, -0.00400419, ...,  0.00378704,
         0.00402718, -0.00214216],
       [-0.00238373,  0.00303872,  0.00117584, ...,  0.00349537,
        -0.00368543,  0.00209496]], dtype=float32)

In [48]:
# Look up the learned embedding for the token 'india' and print it.
vectr = model.wv['india']
print(vectr)

[-4.7405888e-03  1.1138618e-03 -5.6620390e-04  5.5836065e-04
 -4.3008113e-03  9.4552222e-04  3.1345761e-03  3.6477149e-03
  3.2424470e-03 -5.8257848e-04  3.9265594e-03  4.0094461e-03
  3.1268983e-03  4.8886673e-03  4.9672914e-03  4.4643693e-04
  4.7326521e-03  1.5659517e-03 -1.1805432e-03 -2.4403133e-03
 -6.2186667e-04 -3.8534573e-03 -2.2799205e-03  1.1598539e-03
 -4.0759079e-04  4.0315795e-03 -2.6867029e-03 -2.1691776e-03
  3.3162953e-03 -2.3784845e-03 -3.8422679e-03 -4.7792997e-03
  2.2787470e-03 -4.9467529e-03  1.8445917e-03  1.1299953e-03
  4.4991788e-03 -4.4074743e-03 -4.9525206e-03 -4.9991822e-03
  2.8140480e-03 -1.1344521e-03 -1.5921699e-03 -3.6742042e-03
 -1.5756101e-03 -2.4511926e-03  2.6730534e-03  1.6384154e-03
 -1.4793426e-03 -7.9234963e-04 -3.0904674e-05  9.2127977e-04
 -2.5491611e-04 -4.2819008e-04 -3.4646892e-03 -8.7306730e-04
 -4.8486409e-03 -5.5409194e-04  2.0247619e-03 -2.2199270e-03
 -4.7880136e-03  1.0117370e-04  1.6625586e-03  1.9491636e-03
 -3.6166441e-03  4.55474

In [49]:
# Print the (word, score) pairs the model ranks as most similar to "coronavirus".
# NOTE(review): the model was trained on a single 41-token sentence, so these
# scores are unlikely to be meaningful — confirm before drawing conclusions.
print(model.wv.most_similar("coronavirus"))

[('confirmed', 0.16931085288524628), ('underreporting', 0.14003239572048187), ('september', 0.13749036192893982), ('infection', 0.1279270052909851), ('part', 0.12043951451778412), ('number', 0.12010246515274048), ('acute', 0.11079050600528717), ('united', 0.0954793319106102), ('case', 0.09165699779987335), ('caused', 0.07566423714160919)]


In [50]:
# Similarity score the model reports between 'coronavirus' and 'india'.
print(model.wv.similarity('coronavirus', 'india'))

-0.15484956


In [51]:
# Similarity score the model reports between 'coronavirus' and 'worldwide'.
print(model.wv.similarity('coronavirus', 'worldwide'))

-0.03568394


In [52]:
# Similarity score the model reports between 'coronavirus' and 'september'.
print(model.wv.similarity('coronavirus', 'september'))

0.13749036


In [None]:
# gensim's Word2Vec has no Keras-style save_weights / load_weights / predict
# methods. A model is persisted with `save` and restored with `Word2Vec.load`.
MODEL_PATH = 'my_model_weights.pkl'
model.save(MODEL_PATH)
model = Word2Vec.load(MODEL_PATH)

# Closest Word2Vec analogue of "predict": rank the most likely centre words
# for the given context words (needs negative sampling, gensim's default).
model.predict_output_word(['coronavirus', 'pandemic'])