<a href="https://colab.research.google.com/github/manishiitg/ML_Experiments/blob/master/recruit/word2vec_recruit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Word2Vec is a very interesting model to learn, and it gives a good understanding of the world of word embeddings.

Below we experiment with a Word2Vec model trained on data extracted from users' resumes.



In [0]:
!pip install Flask
!pip install flask_pymongo



In [0]:
import logging
import os

import gensim
import numpy as np
from bs4 import BeautifulSoup
from flask import Flask
from flask_pymongo import PyMongo
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from nltk.tokenize import ToktokTokenizer
from nltk import sent_tokenize, word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
app = Flask(__name__)
# The database URL is private, so it is read from the MONGO_URI environment
# variable instead of being pasted into the notebook. When the variable is not
# set this falls back to the empty string the notebook previously hard-coded.
app.config["MONGO_URI"] = os.environ.get("MONGO_URI", "")
mongo = PyMongo(app)
# Shared tokenizer instance used when writing the per-candidate text files.
toktok = ToktokTokenizer()


def cleanMe(html):
    """Return the visible text of an HTML fragment as one space-separated string.

    Args:
        html: markup handed to BeautifulSoup's "html.parser".

    Returns:
        The text with <script>/<style> content removed, split on line breaks
        and double spaces, each piece stripped, empty pieces dropped, and the
        remainder joined with single spaces.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Javascript and stylesheet code is not wanted in the output.
    for unwanted in soup(["script", "style"]):
        unwanted.extract()

    pieces = []
    for raw_line in soup.get_text(separator=' ').splitlines():
        # A double space is treated as a boundary between phrases on one line.
        for phrase in raw_line.split("  "):
            stripped = phrase.strip()
            if stripped:
                pieces.append(stripped)
    return " ".join(pieces)

# Cursor over every document in the naukri_candidates collection.
ret = mongo.db.naukri_candidates.find({})

import os
if not os.path.exists("word_embedding_recruit"):
    os.makedirs("word_embedding_recruit")

# Write one text file per candidate, one tokenised sentence per line.
for row in ret:
    education = row["education"]
    workExperiance = row["workExperiance"]
    summary = row['summary']
    key_skills = row["key_skills"]
    projects = row["projects"]

    summary_str = summary.splitlines()

    # One line per education entry.
    education_string = [
        edu["type"]  + " " + edu["degree"] + " " + edu["specific"] + " " + edu["university"] + " " + edu["year"]
        for edu in education
    ]

    # One line per job; the description is HTML, so it is reduced to text first.
    work_exp_string = [
        work["company_name"]  + " " + work["designation"] + " " + work["date"] + " " + cleanMe(work["desc"])
        for work in workExperiance
    ]

    # Commas become spaces, then the "Key Skills:" label is removed.
    key_skills_str = key_skills.replace(",", " ").replace("Key Skills:", "")

    project_str = [cleanMe(project) for project in projects]

    all_lines = summary_str + education_string + work_exp_string + [key_skills_str] + project_str

    out_path = os.path.join("word_embedding_recruit", row["uname"] + ".txt")
    with open(out_path, 'w') as the_file:
        for line in all_lines:
            tokens = [toktok.tokenize(sent) for sent in sent_tokenize(line)]
            the_file.writelines(" ".join(sent) + '\n' for sent in tokens)

  ' Beautiful Soup.' % markup)


In [0]:
import os 
class MySentences(object):
    """Re-iterable stream of tokenised lines from the files in a directory.

    Each iteration walks the directory again and yields one token list per
    line, so the corpus is never held in memory as a whole and the object can
    be iterated more than once.
    """

    def __init__(self, dirname):
        """Remember the directory to read from.

        Args:
            dirname: path of the directory containing the text files.
        """
        self.dirname = dirname

    def __iter__(self):
        """Yield `gensim.utils.simple_preprocess(line)` for every line of every file."""
        for fname in os.listdir(self.dirname):
            path = os.path.join(self.dirname, fname)
            # Skip sub-directories and other non-file entries, which cannot be
            # opened as text and would otherwise abort the whole iteration.
            if not os.path.isfile(path):
                continue
            # The context manager closes each file as soon as it is exhausted
            # instead of leaving one open handle per corpus file.
            with open(path) as handle:
                for line in handle:
                    yield gensim.utils.simple_preprocess(line)
 
# Stream sentences lazily from the exported per-candidate text files.
sentences = MySentences('word_embedding_recruit') # a memory-friendly iterator
# Train the embedding model on that stream (size=300 matches the 300-column
# projection weights reported when the vectors are saved).
model = gensim.models.Word2Vec(sentences, size=300, window=5, min_count=5, workers=4)

2020-01-01 09:32:31,403 : INFO : collecting all words and their counts
2020-01-01 09:32:31,450 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-01-01 09:32:31,816 : INFO : PROGRESS: at sentence #10000, processed 210370 words, keeping 13295 word types
2020-01-01 09:32:32,197 : INFO : PROGRESS: at sentence #20000, processed 422874 words, keeping 18975 word types
2020-01-01 09:32:32,583 : INFO : PROGRESS: at sentence #30000, processed 642863 words, keeping 23698 word types
2020-01-01 09:32:32,960 : INFO : PROGRESS: at sentence #40000, processed 854673 words, keeping 27125 word types
2020-01-01 09:32:33,311 : INFO : PROGRESS: at sentence #50000, processed 1057974 words, keeping 30205 word types
2020-01-01 09:32:33,719 : INFO : PROGRESS: at sentence #60000, processed 1281487 words, keeping 33232 word types
2020-01-01 09:32:34,102 : INFO : PROGRESS: at sentence #70000, processed 1492662 words, keeping 35757 word types
2020-01-01 09:32:34,469 : INFO : PROGRESS: 

In [0]:
# Export the word vectors (model.wv) to word2vec.bin in binary word2vec format.
model.wv.save_word2vec_format("word2vec.bin", binary=True)

2020-01-01 09:41:31,405 : INFO : storing 40141x300 projection weights into word2vec.bin
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
# Mount Google Drive at /content/drive (Colab prompts for an authorization code).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Copy the exported vectors to Drive. The notebook writes them to word2vec.bin;
# no word2vec.model file is ever created, so copying that name would fail.
!cp word2vec.bin /content/drive/My\ Drive/Colab\ Notebooks

In [0]:
print(model)

print("printing all keys")
print(model.wv.vocab.keys())

print("printing similar objects")
# Only the single best match for "excel".
print(model.wv.most_similar(positive=['excel'], topn=1))

# Default-length neighbour lists for a few representative terms.
for term in ('php', 'delhi', 'seo', 'javascript'):
    print(model.wv.most_similar(positive=[term]))

Word2Vec(vocab=40141, size=300, alpha=0.025)
printing all keys
printing similar objects
[('vba', 0.6835345029830933)]
[('codeigniter', 0.829741358757019), ('angularjs', 0.8192479610443115), ('reactjs', 0.8067537546157837), ('nodejs', 0.7992503643035889), ('laravel', 0.7970592975616455), ('javascript', 0.7885481119155884), ('jsp', 0.7840495109558105), ('typescript', 0.7611822485923767), ('angular', 0.7578394412994385), ('servlet', 0.7560771703720093)]
[('kolkata', 0.6171277761459351), ('mumbai', 0.5856288075447083), ('ahmedabad', 0.5769046545028687), ('york', 0.5736351013183594), ('gurgaon', 0.5704740285873413), ('patna', 0.5687492489814758), ('jaipur', 0.5632768273353577), ('cuffe', 0.5632232427597046), ('holland', 0.5631478428840637), ('chandigarh', 0.5573863983154297)]
[('sem', 0.778308629989624), ('adwords', 0.7013415098190308), ('smm', 0.6959252953529358), ('keyword', 0.6559187173843384), ('facebook', 0.6039581298828125), ('offpage', 0.596062183380127), ('google', 0.592447996139526

  if np.issubdtype(vec.dtype, np.int):


**The results are very interesting and accurate: the model is able to give meaning to words and finds similar words quite accurately.**


In [0]:
print(model.wv.most_similar(positive=['php'],negative=['react']))

print(model.wv.most_similar(positive=['php'], negative=['developer']))

[('hexaware', 0.3142259418964386), ('elara', 0.3086129426956177), ('java', 0.29551246762275696), ('resports', 0.2770785689353943), ('smartchem', 0.27201008796691895), ('tavant', 0.2686164975166321), ('plsql', 0.2671133577823639), ('mgmt', 0.2657425105571747), ('dba', 0.26518476009368896), ('steno', 0.2598947286605835)]
[('dreamweaver', 0.46657702326774597), ('typescript', 0.4595596194267273), ('wordpress', 0.4483383893966675), ('codeigniter', 0.44724395871162415), ('joomla', 0.4399409592151642), ('bootstrap', 0.43795493245124817), ('xhtml', 0.4359024465084076), ('sqlalchemy', 0.4312586784362793), ('theano', 0.42333900928497314), ('wamp', 0.4215264320373535)]


  if np.issubdtype(vec.dtype, np.int):


In [0]:
print(model.wv.most_similar(positive=['php',"developer"]))

[('ee', 0.7449934482574463), ('angularjs', 0.7440281510353088), ('angular', 0.7397264838218689), ('nodejs', 0.7349754571914673), ('laravel', 0.7345426082611084), ('reactjs', 0.7303684949874878), ('java', 0.7256536483764648), ('javascript', 0.7211507558822632), ('jquery', 0.7170737981796265), ('jdk', 0.7086907625198364)]


  if np.issubdtype(vec.dtype, np.int):


In [0]:
print(model.wv.most_similar(positive=['technical','content','writer']))

[('proofreading', 0.5895922780036926), ('editing', 0.5659694075584412), ('editor', 0.5290688872337341), ('writers', 0.5202639102935791), ('storyboarding', 0.512433648109436), ('blogs', 0.5017741918563843), ('scientific', 0.47466763854026794), ('freelance', 0.4717441499233246), ('creative', 0.47124624252319336), ('editorial', 0.471222847700119)]


  if np.issubdtype(vec.dtype, np.int):


In [0]:
!pip install pymagnitude

Collecting pymagnitude
[?25l  Downloading https://files.pythonhosted.org/packages/0a/a3/b9a34d22ed8c0ed59b00ff55092129641cdfa09d82f9abdc5088051a5b0c/pymagnitude-0.1.120.tar.gz (5.4MB)
[K     |████████████████████████████████| 5.4MB 2.6MB/s 
[?25hBuilding wheels for collected packages: pymagnitude
  Building wheel for pymagnitude (setup.py) ... [?25l[?25hdone
  Created wheel for pymagnitude: filename=pymagnitude-0.1.120-cp36-cp36m-linux_x86_64.whl size=135918205 sha256=7c65b83ac130c5ad30bd16dd21173d7952cb25acc212bbc96e42c73fc589bb6d
  Stored in directory: /root/.cache/pip/wheels/a2/c7/98/cb48b9db35f8d1a7827b764dc36c5515179dc116448a47c8a1
Successfully built pymagnitude
Installing collected packages: pymagnitude
Successfully installed pymagnitude-0.1.120


In [0]:
!python -m pymagnitude.converter -i word2vec.bin -o recruit_work2vec.magnitude

Loading vectors... (this may take some time)
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
Found 40141 key(s)
Each vector has 300 dimension(s)
Creating magnitude format...
Writing vectors... (this may take some time)
0% completed
1% completed
2% completed
3% completed
4% completed
5% completed
6% completed
7% completed
8% completed
9% completed
10% completed
11% completed
12% completed
13% completed
14% completed
15% completed
16% completed
17% completed
18% completed
19% completed
20% completed
21% completed
22% completed
23% completed
24% completed
25% completed
26% completed
27% completed
28% completed
29% completed
30% completed
31% completed
32% completed
33% completed
34% completed
35% completed
36% completed
37% completed
38% completed
39% completed
40% completed
41% completed
42% completed
43% completed
44% completed
45% completed
46% completed
47% completed
48% completed
49% completed
50% completed
51% completed
52% completed
53% completed
54% completed
55

In [0]:
!cp -f *.magnitude /content/drive/My\ Drive/Colab\ Notebooks

In [0]:
# Import only the class that is used rather than `from pymagnitude import *`,
# so the notebook namespace is not silently filled with pymagnitude's names.
from pymagnitude import Magnitude
# Open the converted embedding file produced by pymagnitude.converter.
vectors = Magnitude("recruit_work2vec.magnitude")

In [0]:
vectors.query("php")  


In [0]:
vectors.distance("php", "react")

0.7534202

In [0]:
vectors.distance("php", "laravel")

0.6370882

In [0]:
vectors.distance("php", "html")

0.7811773

In [0]:
vectors.distance("reactjs", "angularjs")

0.41469398

In [0]:
vectors.most_similar("php", topn = 10) 

[('codeigniter', 0.82974136),
 ('angularjs', 0.81924796),
 ('reactjs', 0.80675375),
 ('nodejs', 0.79925025),
 ('laravel', 0.7970594),
 ('javascript', 0.78854823),
 ('jsp', 0.7840495),
 ('typescript', 0.7611822),
 ('angular', 0.75783944),
 ('servlet', 0.7560772)]

In [0]:
vectors.most_similar("smm", topn = 10) 

[('sem', 0.8494969),
 ('smo', 0.76118964),
 ('offpage', 0.7167748),
 ('seo', 0.6959251),
 ('adwords', 0.66350174),
 ('youtube', 0.66263306),
 ('moz', 0.66105896),
 ('blog', 0.64793134),
 ('facebook', 0.6454981),
 ('instagram', 0.64283144)]

In [0]:
vectors.most_similar("consultant", topn = 10) 

[('analyst', 0.6584687),
 ('specialist', 0.6039723),
 ('architect', 0.6027958),
 ('developer', 0.5863703),
 ('engineer', 0.5835866),
 ('associate', 0.58329296),
 ('manager', 0.5377058),
 ('administrator', 0.5163762),
 ('designer', 0.50284487),
 ('consulting', 0.502113)]

In [0]:
vectors.most_similar("infosys", topn = 10) 

[('wipro', 0.77553195),
 ('mindtree', 0.7282932),
 ('hcl', 0.71044606),
 ('tcs', 0.66721153),
 ('niit', 0.65500784),
 ('techmahindra', 0.62621486),
 ('capgemini', 0.6204146),
 ('edgeverve', 0.6078055),
 ('mphasis', 0.59439075),
 ('hexaware', 0.59093356)]

In [0]:
vectors.most_similar(positive=["react"], negative=["php"], topn = 10) 

[('occur', 0.33173835),
 ('way', 0.32613683),
 ('immediately', 0.32524073),
 ('even', 0.32145426),
 ('accept', 0.32012215),
 ('happen', 0.31664014),
 ('attractive', 0.31038317),
 ('you', 0.31027406),
 ('convey', 0.3092022),
 ('either', 0.3087658)]

In [0]:
vectors.most_similar("mca", topn = 50) 

[('bca', 0.7518543),
 ('llm', 0.7158724),
 ('sociology', 0.71338046),
 ('btech', 0.702787),
 ('jamia', 0.69066334),
 ('bcom', 0.68930256),
 ('pgdca', 0.68042755),
 ('makhanlal', 0.6744047),
 ('emba', 0.6644396),
 ('mcm', 0.6641183),
 ('gniit', 0.6612295),
 ('milia', 0.6564143),
 ('humanities', 0.65105855),
 ('bahadur', 0.6493802),
 ('ramanand', 0.6472632),
 ('visva', 0.64393497),
 ('rani', 0.64336324),
 ('swami', 0.64317054),
 ('rabindra', 0.64095485),
 ('millia', 0.63558745),
 ('islamia', 0.6342078),
 ('chaturvedi', 0.6331053),
 ('chhatrapati', 0.62987936),
 ('guru', 0.6295793),
 ('none', 0.62857145),
 ('peeth', 0.62851757),
 ('phule', 0.62697613),
 ('durgavati', 0.62534714),
 ('banarasi', 0.62514323),
 ('jyotiba', 0.62487245),
 ('vidya', 0.62383825),
 ('maharaja', 0.6234404),
 ('shastri', 0.62086964),
 ('vellore', 0.62027454),
 ('bosco', 0.6194999),
 ('teerth', 0.61528885),
 ('llb', 0.61495376),
 ('jmi', 0.6148834),
 ('hnb', 0.61375105),
 ('shahu', 0.6137247),
 ('patrakarita', 0.6126

In [0]:
vectors.most_similar(["technical","writing"], topn = 50) 

[('authoring', 0.51194525),
 ('editing', 0.44519246),
 ('documenting', 0.42794028),
 ('debugging', 0.42628622),
 ('wrote', 0.42334825),
 ('drafting', 0.41287225),
 ('design', 0.4092698),
 ('designing', 0.40623605),
 ('preparing', 0.4028247),
 ('developing', 0.3974424),
 ('proofreading', 0.39664584),
 ('testing', 0.39558384),
 ('coding', 0.39323476),
 ('creating', 0.38408566),
 ('interpretation', 0.38349453),
 ('lld', 0.3826851),
 ('development', 0.37307978),
 ('storyboarding', 0.36437625),
 ('brd', 0.3636156),
 ('functional', 0.35793746),
 ('test', 0.35575283),
 ('scoping', 0.35502225),
 ('providing', 0.35456103),
 ('rendering', 0.35186306),
 ('clarification', 0.34830606),
 ('srs', 0.34826136),
 ('hld', 0.34714854),
 ('clarifications', 0.34710756),
 ('reviewing', 0.34613264),
 ('bid', 0.34541184),
 ('troubleshooting', 0.34332672),
 ('clarifying', 0.34212148),
 ('guides', 0.34208834),
 ('written', 0.34161514),
 ('gathering', 0.34115875),
 ('instructional', 0.33866072),
 ('detailed', 0.3

In [0]:
!pip install faiss-cpu

Collecting faiss-cpu
[?25l  Downloading https://files.pythonhosted.org/packages/c0/99/be4fe3f21363ac20aeaee9c8c25190a17564a44c15a1f3ddad4bc91a91f1/faiss_cpu-1.6.1-cp36-cp36m-manylinux2010_x86_64.whl (7.1MB)
[K     |████████████████████████████████| 7.1MB 7.2MB/s 
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.6.1


In [0]:
model.wv.vector_size

100

In [0]:
len(model.wv.vectors)

38850

In [0]:
model.wv.vectors[0]

array([ 0.04947353,  0.8093072 , -1.5084862 ,  0.3859996 , -0.5891598 ,
       -1.28317   ,  1.0157256 ,  0.05878324, -1.0800204 ,  0.31884453,
        0.05198415,  0.28576717, -0.7838786 , -0.4752583 , -1.3064339 ,
        0.7756314 ,  0.4573158 ,  0.48348138,  1.8336614 ,  0.54706967,
        1.0917104 ,  0.3705556 ,  1.620188  ,  1.1903225 ,  0.39083058,
        0.6945578 , -2.5658915 , -0.05567635, -0.27072707,  0.18569556,
        0.800959  , -1.1679168 ,  1.2730845 ,  2.2036574 ,  0.44798803,
       -1.0403408 ,  1.405665  , -0.542268  , -1.9141825 , -0.7173232 ,
       -0.12604828, -0.78838277,  1.4798317 ,  0.43526366,  1.6666065 ,
        1.6545597 ,  1.0986454 ,  1.3264595 ,  0.5552876 , -1.6065445 ,
        1.1134704 , -0.3797229 ,  0.07725742, -2.3687685 ,  1.56785   ,
       -0.09505743,  0.5081635 , -1.4753379 , -0.17331496, -0.56182075,
       -0.6614849 ,  1.0391992 ,  0.4665442 , -0.22024117,  1.2587055 ,
        2.6118913 , -1.5599632 ,  1.5545983 , -0.23409717, -0.64

In [0]:
d = model.wv.vector_size       # dimensionality of each word vector
xb = model.wv.vectors          # all word vectors; rows are looked up by position via model.wv.index2entity
import faiss                   # make faiss available
index = faiss.IndexFlatL2(d)   # build the index
print(index.is_trained)
index.add(xb)                  # add vectors to the index
print(index.ntotal)            # number of vectors now stored in the index

True
38850


In [0]:


k = 4                          # we want to see 4 nearest neighbors
D, I = index.search(xb[:5], k) # sanity check
print(I)
print(D)

[[    0     6     2  2900]
 [    1     5     0  4298]
 [    2     0  1925   666]
 [    3     0  1878  5692]
 [    4 15306  6135  7502]]
[[  0.       50.15531 102.7367  107.00937]
 [  0.      130.99203 142.35297 144.80959]
 [  0.      102.7367  132.1215  141.99118]
 [  0.      321.36334 322.3022  334.03088]
 [  0.      173.00797 174.64133 177.45201]]


In [0]:
model.wv.get_vector("php")

array([-0.04613248, -0.45638925,  1.8460499 , -0.7213101 , -1.6085799 ,
        0.68357414, -1.0451503 ,  0.9148583 , -0.45257404,  3.1232643 ,
        0.6091577 ,  0.96650976,  1.637935  ,  2.0087838 ,  0.65790534,
       -0.09624057,  1.3752567 ,  0.05460932, -1.708172  , -1.9752477 ,
       -0.0912367 ,  0.10386667, -1.4072222 , -0.80064934, -0.3993024 ,
        1.3596958 ,  1.7711358 , -0.894949  ,  0.54681164,  0.4356679 ,
       -2.0824447 ,  1.6762347 , -1.2914523 , -2.1264014 ,  0.4642177 ,
       -0.05937846, -1.4999585 ,  0.44966957, -0.5558252 , -0.9031144 ,
       -2.221974  , -2.0833845 , -0.37344787,  1.7307651 ,  0.36681697,
       -1.7576789 ,  0.8327289 , -0.7084387 ,  0.4548789 ,  0.43216002,
        0.54699576,  0.2285981 ,  1.2644565 ,  1.2175053 , -1.2482632 ,
       -2.522233  ,  0.5352325 ,  0.17456388,  1.336694  ,  0.9545561 ,
       -0.5214188 , -0.44789225,  0.6813657 , -0.15566006, -1.6767237 ,
        0.46681294,  0.89690614, -2.697574  ,  1.3242853 ,  0.34

In [0]:
vector = model.wv.get_vector("php")
vec = np.array([vector], dtype="float32")

In [0]:
D, I = index.search(vec, k)     # actual search: one query vector ("php"), k neighbours
print(D)                   # distances from the query to each returned neighbour (first is the query itself, 0)
print(I)                  # row indices of those neighbours; map to words with model.wv.index2entity

[[ 0.       39.686104 42.725914 48.72763 ]]
[[2148 3089 4771 2423]]


In [0]:
# model.wv.vectors[2148]
model.wv.index2entity[2423]

'jsp'

In [0]:
ncentroids = 500  # number of clusters requested
niter = 300       # iteration count passed to faiss.Kmeans
verbose = True
d = xb.shape[1]   # vector dimensionality, taken from the embedding matrix
kmeans = faiss.Kmeans(d, ncentroids, niter=niter, verbose=verbose)
kmeans.train(xb)
# NOTE(review): presumably the clustering objective recorded per iteration — confirm against the faiss docs.
print(kmeans.obj)

[831130.2  690989.1  644611.75 616780.56 599327.6  588333.3  579581.3
 572227.3  566343.4  562246.3  558745.2  555605.25 552841.56 550970.94
 549268.3  547661.7  546331.8  545048.3  543855.8  543051.   542497.3
 542018.7  541583.6  541037.3  540371.44 539804.06 539287.75 538924.3
 538567.5  538290.5  538020.   537708.94 537502.2  537213.06 537045.2
 536943.7  536898.25 536857.56 536817.44 536720.2  536628.4  536521.3
 536428.75 536332.8  536272.44 536242.8  536211.6  536193.25 536161.44
 536129.3  536098.6  536072.94 536053.56 536038.8  535997.06 535965.25
 535944.94 535906.2  535885.1  535838.44 535764.25 535697.2  535682.1
 535659.75 535623.4  535602.06 535592.4  535587.4  535582.6  535577.94
 535574.56 535573.25 535567.7  535564.56 535561.   535559.9  535560.
 535559.9  535559.5  535558.6  535556.94 535556.9  535555.6  535555.5
 535554.4  535555.   535553.5  535553.4  535552.25 535550.9  535549.7
 535549.25 535547.6  535547.56 535545.75 535544.75 535543.56 535541.94
 535540.94 53554

In [0]:
kmeans.centroids.shape

(100, 100)

In [0]:
kmeans.centroids[0]

array([-0.6403173 ,  0.12590666, -0.4522747 ,  0.478508  ,  0.05913344,
       -0.5285335 ,  0.9253615 , -1.605499  , -1.3499982 ,  0.04387193,
        0.47889337, -0.55633086, -1.5902382 , -0.9601981 , -1.2196642 ,
        0.04297696,  0.11106821, -0.7255598 ,  1.1314338 ,  0.71783817,
       -0.4676995 ,  0.2948157 ,  1.6998808 , -0.4466212 ,  0.4560255 ,
       -0.48798868, -0.25705472,  0.27755412, -0.43301564,  1.2355976 ,
        0.9007798 , -1.4891129 ,  0.5505112 , -0.1322889 , -0.36323068,
        0.7829589 , -1.4704754 ,  0.874173  ,  0.86117923,  0.42668778,
        1.0614926 ,  0.69597113,  0.2918184 , -0.4276143 ,  0.01966534,
        0.28120044, -0.7812808 ,  0.71716756,  1.0747939 , -0.35778996,
        0.28013587,  0.924035  , -0.4612801 , -0.8049863 ,  0.36901656,
        0.26394466,  0.63946754, -1.7610573 , -0.46020746, -0.42213944,
        1.5886391 ,  0.6799998 , -0.16236989, -0.67978394,  0.6537865 ,
       -0.31152877, -1.5968024 , -0.14080092, -0.44846436,  0.93

In [0]:
# For every k-means centroid, look up the 5 closest word vectors in the index.
D, I = index.search(kmeans.centroids, 5)
# print(D)
# print(I)

In [0]:
# Translate each row of neighbour indices into the corresponding vocabulary words.
I2 = []
for neighbour_ids in I:
    I2.append([model.wv.index2entity[word_id] for word_id in neighbour_ids])
print(I2)

[['consignment', 'indenting', 'dispatching', 'consignments', 'fg'], ['indemnity', 'conveyance', 'stamp', 'esop', 'epfo'], ['filezilla', 'greenplum', 'powerbuilder', 'solman', 'sever'], ['periodical', 'periodically', 'fortnightly', 'actuals', 'commentary'], ['chhattisgarh', 'kerala', 'chandigarh', 'odisha', 'goa'], ['incisive', 'possessing', 'multitasking', 'knowledgeable', 'curious'], ['washers', 'barrel', 'motorized', 'dumpers', 'dozers'], ['faults', 'exceptions', 'outages', 'outage', 'failed'], ['subcontractor', 'excavation', 'finishes', 'submittals', 'ducting'], ['exams', 'admissions', 'academics', 'youth', 'lectures'], ['kannur', 'ujjain', 'rai', 'shankar', 'nainital'], ['metadata', 'formatting', 'conditional', 'formulas', 'csv'], ['angularjs', 'nodejs', 'struts', 'mongodb', 'reactjs'], ['smtp', 'config', 'cache', 'sftp', 'telnet'], ['demonstrate', 'seek', 'accept', 'adopt', 'choose'], ['finances', 'countermeasures', 'revisions', 'mechanisms', 'baseline'], ['professionalism', 'stri