In [2]:


import warnings
warnings.filterwarnings('ignore')

#data processing
import pandas as pd
import re
from nltk.corpus import stopwords
stopWords = stopwords.words('english')

#modelling
from gensim.models import Word2Vec
from gensim.models import Phrases
from gensim.models.phrases import Phraser



Load the dataset

In [3]:
data=pd.read_csv('/content/sample_data/text.csv',header=None)

In [4]:
data.head()

Unnamed: 0,0
0,room kind clean strong smell dogs. generally a...
1,stayed crown plaza april april . staff friendl...
2,booked hotel hotwire lowest price could find. ...
3,stayed husband sons way alaska cruise. loved h...
4,girlfriends stayed celebrate th birthdays. pla...


Preprocess and prepare the dataset

In [5]:


def pre_process(text):

    #convert to lowercase
    text = str(text).lower()

    #remove all special characters and keep only alpha numeric characters and spaces
    text = re.sub(r'[^A-Za-z0-9\s.]',r'',text)

    #remove new lines
    text = re.sub(r'\n',r' ',text)

    # remove stop words
    text = " ".join([word for word in text.split() if word not in stopWords])

    return text



In [6]:
pre_process(data[0][50])

'agree fancy. everything needed. breakfast pool hot tub nice shuttle airport later checkout time. noise issue tough sleep through. awhile forget noisy door nearby noisy guests. complained management later email credit compd us amount requested would return.'

In [7]:
data[0] = data[0].map(lambda x: pre_process(x))

In [8]:
data[0].head()

Unnamed: 0,0
0,room kind clean strong smell dogs. generally a...
1,stayed crown plaza april april . staff friendl...
2,booked hotel hotwire lowest price could find. ...
3,stayed husband sons way alaska cruise. loved h...
4,girlfriends stayed celebrate th birthdays. pla...


We know that each row in our data contains a set of sentences. So we split them by '.' and convert them into list i.e,

In [9]:
data[0][1].split('.')[:5]

['stayed crown plaza april april ',
 ' staff friendly attentive',
 ' elevators tiny ',
 ' food restaurant delicious priced little high side',
 ' course washington dc']

Now, We have the data in a list. But we need to convert them into a list of lists. So, now again we split them by space ' '. i.e, First we split the data by '.' and then we split them by ' ' so that we can get our data in a list of lists:

In [10]:


corpus = []
for line in data[0][1].split('.'):
    words = [x for x in line.split()]
    corpus.append(words)



In [11]:
corpus[:2]

[['stayed', 'crown', 'plaza', 'april', 'april'],
 ['staff', 'friendly', 'attentive']]



Convert the whole text in our dataset to a list of lists and build a corpus. Corpus is just the collection of vocabulary.


In [12]:


data = data[0].map(lambda x: x.split('.'))

corpus = []
for i in (range(len(data))):
    for line in data[i]:
        words = [x for x in line.split()]
        corpus.append(words)

corpus[:2]



[['room', 'kind', 'clean', 'strong', 'smell', 'dogs'],
 ['generally', 'average', 'ok', 'overnight', 'stay', 'youre', 'fussy']]

So we use gensim's Phrases functions which collect all the words which occur together and add an underscore between them. So now 'san francisco' becomes 'san_francisco'. We set the min_count parameter to 25 which implies we ignore all the words and bigrams which appears lesser than this.

In [13]:
phrases = Phrases(sentences=corpus,min_count=25,threshold=50)
bigram = Phraser(phrases)

In [14]:


for index,sentence in enumerate(corpus):
    corpus[index] = bigram[sentence]





As you can see below underscore has been added to the bigrams in our corpus:


In [15]:
corpus[111]

['connected', 'rivercenter', 'mall', 'downtown', 'san_antonio']

In [16]:
corpus[9]

['course', 'washington_dc']


Build the Model


In [25]:
size = 100
window_size = 2
epochs = 10
min_count = 2
workers = 4
sg = 1



Train the model:


In [26]:
model = Word2Vec(corpus,sg=1,window=window_size,vector_size=size, min_count=min_count,workers=workers,epochs=epochs)

In [28]:
import os

# Create the directory if it doesn't exist
os.makedirs('model', exist_ok=True)

# Then save the model
model.save('model/word2vec.model')


In [29]:
model.save('model/word2vec.model')

In [30]:
model = Word2Vec.load('model/word2vec.model')


Evaluate the Embeddings

After training the model, we evaluate them. Let us see what the model has been learned and how well it has understood the semantics of words. Genism provides a most_similar function which gives us top similar words related to the given word.

As you can see below, given san_deigo as an input we are getting all other related city names as most similar words:


In [32]:
model.wv.most_similar('san_diego')


[('san_antonio', 0.8421101570129395),
 ('baltimore', 0.8341695070266724),
 ('memphis', 0.8155943155288696),
 ('sd', 0.8152564764022827),
 ('austin', 0.8128693699836731),
 ('san_francisco', 0.8123235702514648),
 ('denver', 0.8117407560348511),
 ('indianapolis', 0.8107519149780273),
 ('dallas', 0.8054894208908081),
 ('phoenix', 0.8005744218826294)]

In [34]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)


[('queen', 0.7243518829345703)]

We can also find the words that do not match in the given set of words, for instance in the below list called text except the word holiday all others are city names and since our word2vec has understood the semantics of each word it returns the word holiday as the one that does not match with the other words in the list.

In [39]:
text = ['los_angeles','indianapolis', 'holiday', 'san_antonio','new_york']

model.wv.doesnt_match(text)


'holiday'