## Average GloVe Embeddings

**Nov 17, 2019**

---

In [1]:
import numpy as np
import pandas as pd
import os
import sys
import csv
# Put the parent of the current working directory on sys.path (once) so that
# the project-local `src` package imported just below can be resolved.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from src import constants
from src.models.avg_embeddings_model import preprocessing
import matplotlib.pyplot as plt
import time

In [2]:
# --- Directories (all derived from the project's `constants` module) ---
train_dir, clean_dir = constants.TRAIN_DIR, constants.CLEAN_DIR
preview_dir = f'{constants.DATA_DIR}/preview'
glove_dir = f'{constants.DATA_DIR}/glove'

# --- File-name prefixes and tag type names ---
art_prefix, img_prefix = constants.Text_Prefix, constants.Media_Prefix
tag_types = ['org', 'place', 'subject','person']

# --- Summary tables read from the training directory ---
article_summary = pd.read_csv(f'{train_dir}/{art_prefix}summary.csv')
image_summary = pd.read_csv(f'{train_dir}/{img_prefix}summary.csv')

In [3]:
# Dimensionality of the GloVe vectors. It selects which glove.6B.<D>d.txt file
# is read and is the length used for the zero vectors built elsewhere in this
# notebook (np.zeros(D)).
D = 50
glove_data_file = f'{glove_dir}/glove.6B.{D}d.txt'

# The file is space separated with no header; the first column is the token
# and becomes the index, the remaining columns are the vector components.
#  - quoting=csv.QUOTE_NONE: quote characters are read literally instead of
#    being treated as field delimiters.
#  - keep_default_na=False: without it pandas turns tokens such as "null",
#    "nan" or "na" into NaN index labels, so those words silently disappear
#    from the vocabulary and can never be looked up by name.
words = pd.read_table(
    glove_data_file,
    sep=" ",
    index_col=0,
    header=None,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
)
# Plain ndarray view of the vectors (one row per token) for vectorized math.
words_matrix = words.values

In [4]:
def vec(w):
    """Return the embedding vector for token ``w``, or zeros if it is unknown.

    Parameters
    ----------
    w : hashable
        Token to look up in the index of the module-level ``words`` table.

    Returns
    -------
    numpy.ndarray
        ``words.loc[w].values`` when the lookup succeeds, otherwise
        ``np.zeros(D)``. Callers in this notebook treat an (almost) zero-norm
        vector as "word not found".
    """
    try:
        return words.loc[w].values
    except (KeyError, TypeError):
        # KeyError: the token is not in the vocabulary.
        # TypeError: ``w`` cannot be used as an index label.
        # Everything else (KeyboardInterrupt, a NameError because ``words``
        # was never loaded, ...) is a genuine problem and is allowed to
        # propagate instead of being disguised as an unknown word.
        return np.zeros(D)

def find_closest_words(word, n=5):
    """Return the ``n`` vocabulary tokens nearest to ``word``.

    Distance is the squared Euclidean distance between ``vec(word)`` and every
    row of the module-level ``words_matrix``.

    Parameters
    ----------
    word : str
        Query token. If ``vec`` does not know it, the query is the zero
        vector, so the result is the tokens closest to the origin.
    n : int, default 5
        Maximum number of tokens to return; values <= 0 give an empty list.

    Returns
    -------
    list
        Index labels of ``words`` ordered from nearest to farthest. The query
        token itself is included when it is in the vocabulary (distance 0).
        Fewer than ``n`` labels are returned only when the vocabulary is
        smaller than ``n``.
    """
    v = vec(word)
    diff = words_matrix - v
    delta = np.sum(diff * diff, axis=1)
    # A single stable sort replaces the repeated argmin + "overwrite with 500"
    # loop: 500 is not an upper bound on squared distances, so the old loop
    # could hand back the same token several times. The stable sort keeps the
    # earliest row first among equal distances, as repeated argmin did.
    nearest = np.argsort(delta, kind='stable')[:max(n, 0)]
    return words.index[nearest].tolist()

In [5]:
# Display only the first row to see which columns the article summary has.
article_summary.head(1)

Unnamed: 0,id,version,version_created,content_type,language,city,country,long_lat,title,headline,headline_extended,summary,full_text
0,5485474faec244b0881838c139b4ef10,5,2019-04-18T22:22:45Z,text,en,New York,United States,"[-74.00597, 40.71427]",FBN--NFL Schedule,Champion Patriots open vs. Steelers; 5 interna...,The NFL's 100th season will begin with its mos...,The NFL's 100th season will begin with its mos...,NEW YORK (AP) — The NFL's 100th season will be...


In [6]:
def average_embedding(sentence):
    """Average the word vectors of a whitespace-tokenized sentence.

    Parameters
    ----------
    sentence : str
        Text to embed; it is split on whitespace with ``str.split()``.

    Returns
    -------
    tuple
        ``(embedding, num_words, count, missed_words)`` where

        * ``embedding`` is the sum of ``vec(token)`` over all tokens divided by
          ``num_words``. Tokens without a vector contribute zeros but still
          count in the denominator. For a sentence with no tokens this is
          ``np.zeros(D)`` rather than a nan vector.
        * ``num_words`` is the total number of tokens.
        * ``count`` is the number of tokens whose vector has norm > 1e-10.
        * ``missed_words`` lists the remaining tokens, in order.
    """
    tokens = sentence.split()
    num_words = len(tokens)
    avg_embeddings = np.zeros(D)
    count = 0
    missed_words = []
    for word in tokens:
        # Look each token up once and reuse the vector for both the
        # found/missed bookkeeping and the running sum.
        emb = vec(word)
        if np.linalg.norm(emb) > 1e-10:
            count += 1
        else:
            missed_words.append(word)
        avg_embeddings += emb
    if num_words == 0:
        # Nothing to average: avoid 0/0, which would produce a nan vector.
        return avg_embeddings, num_words, count, missed_words
    return avg_embeddings/num_words, num_words, count, missed_words

In [7]:
# all_embeddings = np.zeros(shape=(len(article_summary), D))
# all_missed_words = []
# start_time = time.time()
# for i in range(len(article_summary)):
#     if i % 1000 == 0 and i > 0:
#         time_it = time.time() - start_time
#         print(f'{i} of {len(article_summary)} in {time_it:.2f}s')
#     text = article_summary.iloc[i].headline
#     text_prep = preprocessing(text)
#     emb, total, count, missed_words = average_embedding(text_prep)
#     all_missed_words.append(missed_words)
#     all_embeddings[i] = emb/np.linalg.norm(emb)

In [8]:
# np.save('train_embeddings.npy', all_embeddings)

In [9]:
# Load the cached embeddings (the file name matches the commented-out
# np.save('train_embeddings.npy', ...) cell) and rescale every row to unit L2
# norm, so a dot product with another unit vector is a cosine similarity.
all_embeddings_load = np.load('train_embeddings.npy')
row_norms = np.linalg.norm(all_embeddings_load, axis=1).reshape(-1,1)
# `norms` holds the *reciprocal* of each row norm (original variable name
# kept). Rows with zero norm get a factor of 0 so they stay all-zero instead
# of becoming nan through inf * 0.
norms = np.divide(1.0, row_norms, out=np.zeros_like(row_norms), where=row_norms > 0)
# Kept under its original name, but as a zero-copy read-only broadcast view
# rather than a materialized N x D copy.
repeated_norms = np.broadcast_to(norms, all_embeddings_load.shape)
# Broadcasting the (N, 1) factors over the (N, D) matrix scales each row.
norm_embeddings_load = np.multiply(norms, all_embeddings_load)

In [56]:
# Pick a random article and embed its headline:
# preprocessing -> average_embedding -> scale to unit norm.
random_ind = np.random.randint(0, len(article_summary))
random_article = article_summary.iloc[random_ind].headline
print(random_article + '\n')
text_prep = preprocessing(random_article)
emb, total, count, missed_words = average_embedding(text_prep)
emb_norm = np.linalg.norm(emb)

if not emb_norm > 0:
    # Zero (or nan) query vector: every similarity would be nan and the
    # ranking below would be meaningless, so say so instead.
    print('No in-vocabulary words in this headline; cannot rank neighbors.')
else:
    emb = emb.reshape(-1,1)/emb_norm

    # finding nearest neighbors
    k = 5
    # Rows of norm_embeddings_load and emb are unit length, so each dot
    # product is a cosine similarity in [-1, 1].
    scores = np.dot(norm_embeddings_load, emb).flatten()
    # Exclude the query article itself. -inf (not 0) is required because 0
    # still outranks every article with a negative similarity.
    scores[random_ind] = -np.inf
    top_k = np.argsort(-scores)[:k]

    for ind in top_k:
        print(article_summary.iloc[ind].headline)

The Cruze cruises: GM assembly plant closing, maybe for good

The Latest: Last car comes off line at GM assembly plant
Land deals, incentives OK'd for new auto plant in Detroit
Detroit reaches land deals for new Fiat Chrysler plant plan
Work progressing on new Fiat Chrysler plant in Detroit
Nissan cuts back on more business at English plant


---