In [1]:
# Load the libraries
import pandas as pd
import multiprocessing

# Import from gensim
from gensim.models import Word2Vec

### Load in the data and create the required structure

In [2]:
# Read the processed Yelp reviews CSV into a DataFrame
data = pd.read_csv('../../Datasets/yelp_labelled_processed/yelp_labelled_processed.csv')

# Cast every review to a string (this is just a quirk of this dataset: some entries are np.nan)
data['text'] = data['text'].map(str)

In [3]:
# Preview the first five reviews (head() returns 5 rows by default)
data['text'].head()

0    new rule waitingtable almostalways cant wait i...
1    giving twostar 'spretty rating might night new...
2    staying planet hollywood acrossstreet saw good...
3    foodgood price super expensive 8 buck extra la...
4    worse company deal horrible work bring truck b...
Name: text, dtype: object

In [4]:
# Tokenise each review by splitting it on single spaces
# (map already returns a new Series, so data['text'] is left untouched)
sentences = data['text'].map(lambda review: review.split(' '))

In [5]:
# Preview the first five tokenised reviews
sentences.head(5)

0    [new, rule, waitingtable, almostalways, cant, ...
1    [giving, twostar, 'spretty, rating, might, nig...
2    [staying, planet, hollywood, acrossstreet, saw...
3    [foodgood, price, super, expensive, 8, buck, e...
4    [worse, company, deal, horrible, work, bring, ...
Name: text, dtype: object

Note: Word2Vec expects the text data as a sequence of token lists, i.e. one list of string tokens per review.

### Create word2vec embeddings

In [6]:
# Train the word2vec word embeddings (100 dimensions)
# NOTE: `size` and `iter` are the gensim 3.x parameter names (renamed `vector_size` and `epochs` in gensim 4)
word2vec_word_embeddings = Word2Vec(sentences = sentences,
                           sg = 0, # 0 for continuous bag of words model, 1 for skip-gram model
                           size = 100, # Dimensionality of the word vectors
                           window = 5, # Maximum distance between the current and predicted word within a sentence
                           workers = max(1, multiprocessing.cpu_count() - 2), # Leave two cores free, but always keep at least one worker thread (cpu_count() - 2 is <= 0 on 1-2 core machines)
                           iter = 10) # Number of iterations (epochs) over the corpus

#### Get some stats

In [7]:
# Count the distinct tokens the model holds a vector for
len(word2vec_word_embeddings.wv.vocab)

13525

In [8]:
# Check the dimensionality of each token's vector
# (reads `vector_size` from the trained model's `wv` attribute)
word2vec_word_embeddings.wv.vector_size

100

#### Have a look at word similarities

In [9]:
# List the ten words whose vectors are most similar to 'great'
word2vec_word_embeddings.wv.most_similar(positive=['great'], topn=10)

[('excellent', 0.7214541435241699),
 ('awesome', 0.7190586924552917),
 ('terrific', 0.6961654424667358),
 ('alsogreat', 0.6949751377105713),
 ('amazing', 0.681275486946106),
 ('greattoo', 0.6721410751342773),
 ('good', 0.6664098501205444),
 ('reallygreat', 0.6604778170585632),
 ('fantastic', 0.6590536236763),
 ('verygood', 0.6489591598510742)]

In [10]:
# List the ten words whose vectors are most similar to 'bad'
word2vec_word_embeddings.wv.most_similar(positive=['bad'], topn=10)

[('terrible', 0.6349875926971436),
 ('notgood', 0.6148676872253418),
 ('reallybad', 0.6145668625831604),
 ('awful', 0.5781711339950562),
 ("n'tgood", 0.5546547174453735),
 ('horrible', 0.5532522797584534),
 ('poor', 0.522366464138031),
 ('horrid', 0.5187084078788757),
 ('sucked', 0.5163482427597046),
 ("n'tgreat", 0.48994702100753784)]

In [11]:
# List the ten words whose vectors are most similar to 'burger'
word2vec_word_embeddings.wv.most_similar(positive=['burger'], topn=10)

[('burgerfry', 0.7724355459213257),
 ('onionring', 0.7235103845596313),
 ('hamburger', 0.7204596400260925),
 ('patty', 0.7153399586677551),
 ('cheeseburger', 0.704922616481781),
 ('poutine', 0.6989130973815918),
 ('veggieburger', 0.680663526058197),
 ('sweetpotatofry', 0.6724377870559692),
 ('burgergood', 0.6691707372665405),
 ('fry', 0.66410893201828)]

#### Have a look at the word vector

In [12]:
# Have a look at the word embedding for 'great'
# Look the vector up through `.wv`: indexing the model object directly is
# deprecated in gensim 3.x and removed in gensim 4
print(word2vec_word_embeddings.wv['great'])

[-0.40580493 -0.56351244  0.23609917 -0.01513045  1.62838     1.0313561
 -0.44239405 -0.58281815  0.6831085  -0.7265927   0.0178313   1.1415402
 -1.494441   -1.1155494   1.412092   -0.6732566   0.90160966 -0.42537177
 -0.9525316  -0.69489723 -1.0825447   1.0854671  -0.15120716 -0.5315553
  1.8485135  -0.61486644 -2.6897857  -0.06496173 -0.20794475  0.36508176
 -0.18880841 -0.19770506 -0.8825965   0.10867269 -1.3807807   0.81314355
  3.2359798   0.04574855 -0.5769091  -0.1692346  -0.4798137   1.3301435
  2.5049117  -0.6901831   0.17400044  0.7234964  -0.38093165 -1.7423817
  1.2908051   1.0040889   0.77163196  2.3254035   0.06841435  1.2510766
  1.0396721   0.01883189  0.80579334  2.0664282   0.97331715 -1.4259825
  1.7379603   3.5708036   0.66354644  1.0676291  -0.25653598 -0.88472915
 -0.23615608 -0.13539356 -0.31772816  1.1915215  -0.27259427  0.31048393
  1.1309578   0.6345404  -0.39328104 -2.6946926   0.39645115 -0.2220566
  0.28574163 -0.7570241  -1.3633295   0.37991208 -1.6802174

  
