<a href="https://colab.research.google.com/github/mewadashreya/Glove-Word_Embeddings/blob/main/Glove_Word_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import logging
import re
import numpy as np
import pandas as pd
from pprint import pprint
import logging
import datetime
import time
import glove
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
import spacy
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
! pip install glove_python



In [7]:
glove_corpus = glove.Corpus()

In [9]:
def get_logger():
    '''
        credits to: https://www.kaggle.com/ogrellier/user-level-lightgbm-lb-1-4480
    '''
    FORMAT = '[%(levelname)s]%(asctime)s:%(name)s:%(message)s'
    logging.basicConfig(format=FORMAT)
    logger = logging.getLogger('main')
    logger.setLevel(logging.DEBUG)
    return logger
    
logger = get_logger()

In [11]:
logger.info('Load the data')
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/tripadvisor_hotel_reviews.csv')

[INFO]2020-11-23 12:16:12,824:main:Load the data


In [12]:
df.shape

(20491, 2)

In [13]:
df.sample(5)

Unnamed: 0,Review,Rating
261,basic hotel basic needs hotel perfect young tr...,2
8278,visited stayed hesperia sant joan 2006 concern...,5
6851,excellence excellent just retruned delightful ...,5
2996,"save money mexico save money, spent week janua...",1
13193,absolutely splendid overall nicest hotel staye...,5


In [14]:
logger.info('data preprocessing')
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in (stop)])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    more_than_4letters = ' '.join(word for word in punc_free.split() if len(word)>4)
    return more_than_4letters
logger.info('data preprocessing')
def sent_to_words(sentences):
    for sent in sentences:
        sent = re.sub("[^a-zA-Z]", " ", str(sent)) 
        sent = re.sub('\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub('\s+', ' ', sent)  # remove newline chars
        sent = re.sub("\'", "", sent)  # remove single quotes
        yield(sent) 

[INFO]2020-11-23 12:16:55,240:main:data preprocessing
[INFO]2020-11-23 12:16:55,242:main:data preprocessing


In [18]:
data = df.Review.values.tolist()
data_words = sent_to_words(data)
data_ready = [clean(doc).split() for doc in data_words]

In [19]:
logger.info('data preprocessing')
def process_words(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    
    nlp = spacy.load('en', disable=['parser', 'ner'])
    texts_out = []
    for sent in data_ready:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])   
    return texts_out

data_ready = process_words(data_words)

[INFO]2020-11-23 12:18:15,683:main:data preprocessing


In [20]:

# Fit the co-occurrence matrix using a sliding window of 10 words.
t0 = time.time()
glove_corpus.fit(data_ready, window=10)

In [21]:
print("Dictionary length=%d" % (len(glove_corpus.dictionary),))
print("Co-occurrence calculated in %5.1fsec" % (time.time()-t0, ))

Dictionary length=28216
Co-occurrence calculated in  10.1sec


In [22]:
glove_corpus.dictionary['city']

573

In [30]:
SENTENCE_LENGTH_MAX = 32
EMBEDDING_DIM=50
word_embedding = glove.Glove(no_components=EMBEDDING_DIM, learning_rate=0.05)

t0 = time.time()
glove_epochs, glove_threads = 20, 7 

word_embedding.fit(glove_corpus.matrix, epochs=glove_epochs, no_threads=glove_threads, verbose=True)

print("%d-d word-embedding created in %5.1fsec = %5.1fsec per epoch" % (
        EMBEDDING_DIM, (time.time()-t0), (time.time()-t0)/glove_epochs*glove_threads, ))

# Add the word -> id dictionary to the model to allow similarity queries.
word_embedding.add_dictionary(glove_corpus.dictionary)

Performing 20 training epochs with 7 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
50-d word-embedding created in  53.8sec =  18.8sec per epoch


In [31]:
word_embedding.most_similar('hotel')
print("Co-occurrence calculated in %5.1fsec" % (time.time()-t0, ))

[('remind', 0.8750900815488141),
 ('absolute', 0.8375983316564113),
 ('apart', 0.835686537220129),
 ('however', 0.8335667362385206)]

In [33]:
word_embedding.most_similar('room')

[('appear', 0.9143897810871614),
 ('spacious', 0.8795454449018199),
 ('complete', 0.8723709095732082),
 ('condition', 0.8711569416375088)]

In [34]:
word_embedding.most_similar('price')

[('reasonable', 0.9727740476397334),
 ('reasonably', 0.9366859659124736),
 ('cheap', 0.9275472933804338),
 ('rate', 0.9018685429047111)]

In [35]:
word_embedding.most_similar('location')

[('convenient', 0.9225870918038449),
 ('ideal', 0.9218977304189205),
 ('central', 0.9075697463116943),
 ('brilliant', 0.9038673333139456)]