In [1]:
import os
import sys

# Put the parent of the current working directory on the import path
# (presumably so the project-local `flow_wmd` package imported below resolves).
root_dir = os.path.join(os.getcwd(), '..')
# Guard the append: re-running this cell must not stack duplicate entries.
if root_dir not in sys.path:
    sys.path.append(root_dir)

In [2]:
import gensim
import numpy as np
import pandas as pd

from flow_wmd.utils import *
from gensim.models import Word2Vec 
from gensim.models import KeyedVectors
from gensim.models.callbacks import CallbackAny2Vec
from gensim.test.utils import get_tmpfile
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer



## 1. Load Data and Pretrained Embeddings

In [45]:
%%time

# Directory holding the Yelp dataset files, relative to this notebook.
PATH = "../data/yelp_dataset/"

# Explicit dtypes handed to the JSON reader for the review fields.
r_dtypes = {
    "review_id": str,
    "user_id": str,
    "business_id": str,
    "stars": np.int32,
    "date": str,
    "text": str,
    "useful": np.int32,
    "funny": np.int32,
    "cool": np.int32,
}

# Columns removed from every chunk before it is kept.
drop = ['review_id', 'user_id', 'useful', 'funny', 'cool']
# A date filter, .query("date >= '2017-12-01'"), existed here and is currently disabled.

# Read the line-delimited review file in 1000-row chunks, dropping the
# unneeded columns chunk by chunk, then stitch the pieces into one frame.
with open(f"{PATH}yelp_academic_dataset_review.json", "r") as f:
    reader = pd.read_json(f, orient="records", lines=True, dtype=r_dtypes, chunksize=1000)
    review_chunks = [chunk.drop(columns=drop) for chunk in reader]

yelp_data = pd.concat(review_chunks, ignore_index=True)

CPU times: user 3min 35s, sys: 1min 35s, total: 5min 10s
Wall time: 6min 54s


In [46]:
# English stopword list from NLTK's `stopwords` corpus; handed to
# `remove_stopwords` in the cleaning cells further down.
stopword_list=stopwords.words('english')

In [47]:
# NOTE(review): `sentences` is only assigned in a later cell (from
# `yelp_merged.text`), so on a fresh top-to-bottom run this line would raise
# NameError unless something else defines it — consider moving this check
# below that assignment.
len(sentences)

451826

In [19]:
# Load the business file (one JSON record per line) and show its dimensions.
yelp_business = pd.read_json(
    f"{PATH}yelp_academic_dataset_business.json",
    lines=True,
    orient="records",
)
yelp_business.shape

(160585, 14)

In [20]:
# Keep only the businesses whose `city` is Portland or Atlanta.
# Note: this overwrites `yelp_business`, so the unfiltered frame is gone
# until the loading cell is re-run.
in_target_city = yelp_business["city"].isin(["Portland", "Atlanta"])
yelp_business = yelp_business.loc[in_target_city]
yelp_business.shape

(30815, 14)

In [21]:
# Inner join: keep only the reviews whose business survived the city filter.
yelp_merged = pd.merge(yelp_data, yelp_business, how="inner", on='business_id')

In [22]:
# Dimensions of the merged review/business frame.
yelp_merged.shape

(451826, 17)

In [24]:
%%time

# Tokenizer instance reused by every cleaning and tokenizing step below.
tokenizer = ToktokTokenizer()
# Review texts as a plain Python list of strings.
sentences = yelp_merged["text"].astype(str).to_list()

CPU times: user 84.9 ms, sys: 3.8 ms, total: 88.7 ms
Wall time: 88.1 ms


In [25]:
%%time 

# First stopword pass, applied to the raw review strings.
sentences_clean = [
    remove_stopwords(review, stopword_list, tokenizer)
    for review in sentences
]

CPU times: user 2min 20s, sys: 5 s, total: 2min 25s
Wall time: 2min 35s


In [26]:
%%capture --no-display
%%time 

# Run the three text-cleaning helpers in sequence over every review;
# the result is a pandas Series.
sentences_clean = (
    pd.Series(sentences_clean)
    .apply(denoise_text)
    .apply(remove_special_characters)
    .apply(simple_lemmatizer)
)

In [27]:
%%time

# Second stopword pass, this time over the cleaned text; the result is a plain list.
sentences_clean = [
    remove_stopwords(cleaned, stopword_list, tokenizer)
    for cleaned in sentences_clean
]

CPU times: user 1min 21s, sys: 2.36 s, total: 1min 23s
Wall time: 1min 26s


In [28]:
# Spot-check the first cleaned review.
sentences_clean[0]

'even mad captain ahab touched starbuck goodness herman melville drivethru jammed vehicle pulled opted park go walkup window staff super friendly said hello ordered even though busy genuinely smiling look like enjoy job great teamwork think friendliest starbucks ever visited building architecturally pleasing catch eye every time driven past glad stopped time close freeway entrance making regular stop area'

In [29]:
%%time

# Lowercase each cleaned review and split it into a list of tokens.
sentences_tokenized = [
    tokenizer.tokenize(text.lower())
    for text in sentences_clean
]

CPU times: user 37.8 s, sys: 3.65 s, total: 41.5 s
Wall time: 45.4 s


In [30]:
# Load the pretrained GoogleNews vectors (gzipped binary word2vec format).
# `model` is used later for its `vector_size`, for its `key_to_index` keys,
# and as the baseline in the distance comparisons at the end.
print("Loading GoogleNews Vectors")
%time model = KeyedVectors.load_word2vec_format('../embeddings/GoogleNews-vectors-negative300.bin.gz', binary=True)

Loading GoogleNews Vectors
CPU times: user 52.4 s, sys: 4.68 s, total: 57.1 s
Wall time: 1min 2s


## 2. Phrase Data

In [31]:
# Settings for the phrase-detection step.
THRESHOLD = 200   # forwarded as get_phrases(..., threshold=THRESHOLD)
MIN = 500         # forwarded as get_phrases(..., min_count=MIN)
PHRASING = True   # when falsy, the tokenized sentences are used unchanged

In [32]:
%%time 

# Choose the training corpus: the tokenized sentences as they are, or the
# output of `get_phrases` when phrasing is switched on.
if not PHRASING:
    sentences_training = sentences_tokenized
else:
    sentences_phrased = get_phrases(
        sentences_tokenized,
        min_count=MIN,
        threshold=THRESHOLD,
    )
    sentences_training = sentences_phrased

CPU times: user 4min 8s, sys: 8.89 s, total: 4min 17s
Wall time: 4min 27s


In [33]:
# Show the first training sentence (a list of tokens).
print(sentences_training[0])

['even', 'mad', 'captain', 'ahab', 'touched', 'starbuck', 'goodness', 'herman', 'melville', 'drivethru', 'jammed', 'vehicle', 'pulled', 'opted', 'park', 'go', 'walkup_window', 'staff', 'super_friendly', 'said_hello', 'ordered', 'even_though', 'busy', 'genuinely', 'smiling', 'look', 'like', 'enjoy', 'job', 'great', 'teamwork', 'think', 'friendliest', 'starbucks', 'ever', 'visited', 'building', 'architecturally', 'pleasing', 'catch_eye', 'every', 'time', 'driven_past', 'glad_stopped', 'time', 'close_freeway', 'entrance', 'making', 'regular', 'stop', 'area']


In [34]:
# Show the second training sentence.
print(sentences_training[1])

['worker', 'absolute_sweetest', 'work', 'dollar_tree', 'close', 'get', 'coffee', 'work', 'always', 'super', 'kind', 'sweet', 'love', 'starbucks']


In [35]:
# Show the third training sentence.
print(sentences_training[2])

['super', 'slow', 'ive', 'never', 'waited', 'long_line', 'even', 'move', 'insane', 'better']


## 3. Finetune Embeddings

### 3.1. Initialize Loggers

In [36]:
class EpochLogger(CallbackAny2Vec):
    """Training callback that announces the start and end of every epoch."""

    def __init__(self):
        # 1-based number of the epoch currently being reported.
        self.epoch = 1

    def on_epoch_begin(self, model):
        """Print a start message for the current epoch; `model` is unused."""
        print(f"Epoch {self.epoch} starting.")

    def on_epoch_end(self, model):
        """Print an end message, then advance the epoch counter; `model` is unused."""
        print(f"Epoch {self.epoch} ended.")
        self.epoch = self.epoch + 1
        
class LossLogger(CallbackAny2Vec):
    """Training callback that prints the reported loss after every epoch.

    At the end of each epoch the value of `model.get_latest_training_loss()`
    is stored in `self.losses` and printed together with its change from
    the value stored for the previous epoch (0 for the first epoch).

    NOTE(review): the reported value appears to be a running total across
    epochs and not a per-epoch loss — confirm against the gensim docs. If so,
    "Difference" is the per-epoch figure to watch.
    """

    def __init__(self):
        # 1-based epoch counter and the loss values recorded so far.
        self.epoch = 1
        self.losses = []

    def on_epoch_begin(self, model):
        """Print the epoch label; the loss is appended to the same line later."""
        print(f'Epoch: {self.epoch}', end='\t')

    def on_epoch_end(self, model):
        """Record the latest loss and print it with its change since last epoch."""
        current = model.get_latest_training_loss()
        # The first epoch has no predecessor, so it is compared against 0.
        baseline = 0 if self.epoch == 1 else self.losses[self.epoch - 2]
        self.losses.append(current)
        print(f'  Loss: {current}  Difference: {current - baseline}')
        self.epoch += 1

In [37]:
# Callback instances for training.
# NOTE(review): in the visible code only `loss_logger` is passed to `train()`;
# `epoch_logger` is created but not used.
loss_logger = LossLogger()
epoch_logger = EpochLogger()

### 3.2. Initialize Model

In [38]:
# --- Arguments for the Word2Vec constructor ---
SIZE = model.vector_size   # same dimensionality as the loaded pretrained vectors
WINDOW = 10                # passed as `window`
MIN_COUNT = 2              # passed as `min_count`
SG = 1                     # passed as `sg`
HS = 0                     # passed as `hs`
SEED = 42                  # passed as `seed`
EPOCHS = 4                 # passed as `epochs`

# --- Arguments for train() ---
ALPHA = 0.01               # passed as `start_alpha`
LOSS = True                # passed as `compute_loss`

In [39]:
%%time

# New Word2Vec model configured from the constants defined above.
model_ft = Word2Vec(
    vector_size=SIZE,
    window=WINDOW,
    min_count=MIN_COUNT,
    epochs=EPOCHS,
    sg=SG,
    hs=HS,
    seed=SEED,
)

# First pass: vocabulary from the training sentences.
model_ft.build_vocab(sentences_training)
# Read between the two build_vocab calls, presumably so that train() gets the
# sentence count of the training corpus and not of the update below.
total_examples = model_ft.corpus_count

# Second pass: offer every key of the pretrained model as one extra pseudo-sentence.
pretrained_keys = list(model.key_to_index.keys())
model_ft.build_vocab([pretrained_keys], update=True)
# NOTE(review): nothing visible here copies the pretrained vectors from `model`
# into `model_ft` (e.g. via `intersect_word2vec_format`), so training appears to
# start from fresh weights — confirm this is intended for a "finetune" step.
# NOTE(review): each pretrained key occurs once in the pseudo-sentence; with
# MIN_COUNT = 2 the update may add no new words — verify against the gensim docs.

CPU times: user 9.3 s, sys: 311 ms, total: 9.61 s
Wall time: 9.8 s


### 3.3. Train Word2Vec

In [40]:
# Path stem for the trained vectors; ".txt" is appended when saving and reloading.
outfile = "../embeddings/yelp_w2v"

In [41]:
%%time

# Train on the training sentences; the per-epoch loss lines come from `loss_logger`.
model_ft.train(
    sentences_training,
    total_examples=total_examples,
    epochs=model_ft.epochs,
    start_alpha=ALPHA,
    compute_loss=LOSS,
    callbacks=[loss_logger],
)
# Write the trained word vectors in text word2vec format.
model_ft.wv.save_word2vec_format(f"{outfile}.txt", binary=False)

Epoch: 1	  Loss: 64311736.0  Difference: 64311736.0
Epoch: 2	  Loss: 68005744.0  Difference: 3694008.0
Epoch: 3	  Loss: 68840552.0  Difference: 834808.0
Epoch: 4	  Loss: 69504992.0  Difference: 664440.0
CPU times: user 23min 46s, sys: 11.7 s, total: 23min 57s
Wall time: 8min 20s


### 3.4. Load Finetuned Vectors and Test

In [78]:
# Reload the saved vectors from disk, using the `KeyedVectors` class already
# imported at the top of the notebook.
ft_vectors = KeyedVectors.load_word2vec_format(f"{outfile}.txt")

In [86]:
# Distance between "citizen" and "kane" in the reloaded, Yelp-trained vectors.
ft_vectors.distance("citizen", "kane")

0.45544904470443726

In [87]:
# Same word pair in the pretrained GoogleNews vectors, for comparison.
model.distance("citizen", "kane")

0.9603805989027023

In [88]:
# Distance between "lord" and "ring" in the reloaded, Yelp-trained vectors.
ft_vectors.distance("lord", "ring")

0.29060083627700806

In [89]:
# Same word pair in the pretrained GoogleNews vectors, for comparison.
model.distance("lord", "ring")

0.7185895442962646

In [90]:
# First 10 components of the vector stored under the key "citizen_kane".
ft_vectors.get_vector("citizen_kane")[:10]

array([ 0.03295106, -0.10795593, -0.10198571,  0.00111192,  0.00233896,
       -0.06005668,  0.10686377,  0.03378343, -0.08767647,  0.02884012],
      dtype=float32)