<a href="https://colab.research.google.com/github/pgosar/AlphaHacks/blob/main/FoodBrandEmbedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook imports (`fire`, `wikipedia`).
# NOTE(review): versions are unpinned; consider pinning them for reproducible runs.
!pip install fire
!pip install wikipedia

In [3]:
import io
import os
import shutil
import re
import string
import tensorflow as tf
import numpy as np

import logging
import wikipedia
import random

import json
import google

import nltk
from nltk.corpus import stopwords

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [3]:
# Download the GloVe 6B archive and ask Keras to unpack it (extract=True).
# cache_dir='.' with an empty cache_subdir should place the files directly in
# the working directory. The return value is kept in `glove_embeddings` but is
# not used by any code visible in this notebook.
glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
glove_embeddings = tf.keras.utils.get_file("glove.6B.zip", glove_url, extract = True, cache_dir='.',
                                  cache_subdir='')

Downloading data from http://nlp.stanford.edu/data/glove.6B.zip


In [4]:
# Fetch the NLTK stopword corpus that `stopwords.words()` reads further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# Location of the 200-dimensional GloVe text file; becomes GloveEmbeddings.GLOVE_DIR.
# NOTE(review): absolute path — assumes the archive was extracted under /content.
GLOVE_PATH = "/content/glove.6B.200d.txt"

In [6]:
class GloveEmbeddings:
    """Loader for pre-trained GloVe word vectors stored as plain text.

    Each line of the file is expected to hold a word followed by its vector
    components, separated by whitespace.
    """

    # Path of the GloVe text file. It is passed through ``str.format`` with a
    # ``dim`` keyword, so it may contain a ``{dim}`` placeholder.
    GLOVE_DIR = GLOVE_PATH
    # Value substituted for ``{dim}`` in the path (if the placeholder is present).
    EMBEDDING_DIM = 200

    @staticmethod
    def get_dict_word_embedding(path=GLOVE_DIR, embedding_dim=EMBEDDING_DIM):
        """Read a GloVe text file into a ``{word: vector}`` dictionary.

        Args:
            path: File path, optionally containing a ``{dim}`` placeholder.
            embedding_dim: Value used to fill the ``{dim}`` placeholder. It is
                not used to validate the length of the parsed vectors.

        Returns:
            dict mapping each word (first field of a line) to a float32 numpy
            array built from the remaining fields.

        Raises:
            OSError: if the file cannot be opened.
            ValueError: if a vector component cannot be parsed as a float.
        """
        word2emb = dict()
        # `with` guarantees the handle is closed even if a line fails to parse;
        # the explicit encoding avoids depending on the platform's locale default
        # (the GloVe text files are distributed as UTF-8).
        with open(path.format(dim=embedding_dim), encoding='utf-8') as f:
            for line in f:
                values = line.split()
                if not values:
                    # Blank line (e.g. trailing newline at EOF): nothing to parse.
                    continue
                word2emb[values[0]] = np.asarray(values[1:], dtype='float32')
        return word2emb

In [None]:
# Smoke-test the loader.
# NOTE(review): as the cell's last expression this echoes the entire
# word -> vector dictionary; consider displaying only its length instead.
GloveEmbeddings.get_dict_word_embedding()

In [7]:
# Tokens that build_embeddings() drops before averaging word vectors.
# NOTE(review): stopwords.words() is called without a language argument, which
# presumably returns the stopwords of every language in the corpus — confirm
# that this is intended rather than stopwords.words('english').
IGNORE_WORDS = set(stopwords.words())

In [None]:
# Collect the distinct, non-empty brand names listed one per line in the file.
_brand_list_fpath = "/content/data/brand_list2.txt"
DEFAULT_SET_BRANDS = set()
with open(_brand_list_fpath) as fp:
    # Whitespace-only lines strip down to "" and are filtered out.
    DEFAULT_SET_BRANDS.update(name for name in map(str.strip, fp) if name)

In [None]:
# Default knowledge-base file read by query_list().
DEFAULT_BRAND_EMB_SAVE_FPATH = 'data/brand_emb.json'
# Alternative GloVe file location; not referenced by any code visible in this notebook.
ENV_EMBEDDING_GLOVE_6B_FPATH = 'data/glove_embeddings/glove.6B.200d.txt'

In [None]:
# Display the set of brand names read from the brand list file.
DEFAULT_SET_BRANDS

In [11]:
# Module-level logger used by build_embeddings() and query().
# NOTE(review): no handler or level is configured in the visible code, so the
# info/debug messages are presumably dropped under logging's default WARNING
# threshold — confirm, or call logging.basicConfig(level=logging.INFO).
logger = logging.getLogger(__name__)

# LOAD DATA

In [5]:
# English-Wikipedia MediaWiki API URL prefixes. Each one ends with an open
# query parameter, so the caller appends a title, search term, or page id.
# Only `page_content_api` is used by the code visible in this notebook.

# Query a page by title (value appended to `titles=`).
title_api = "https://en.wikipedia.org/w/api.php?action=query&format=json&titles="
# OpenSearch lookup restricted to namespace 0, returning a single hit (limit=1).
search_api = "https://en.wikipedia.org/w/api.php?action=opensearch&limit=1&namespace=0&format=json&search="
# Extracts plus revision content for a page id.
pageid_api = "https://en.wikipedia.org/w/api.php?action=query&prop=extracts|revisions&rvslots=*&rvprop=content&format=json&pageids="
# Plain-text extract capped at 10 sentences (exsentences=10), by page id.
summary_api = "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&exsentences=10&exlimit=1&explaintext=1&format=json&pageids="
# Same 10-sentence plain-text extract, looked up by title instead of page id.
summary_api_title = "https://en.wikipedia.org/w/api.php?action=query&prop=extracts&exsentences=10&exlimit=1&explaintext=1&format=json&titles="
# Plain-text extract (explaintext=1) by page id; build_embeddings() appends the id to this.
page_content_api = "https://en.wikipedia.org/w/api.php?action=query&prop=extracts|revisions&exlimit=1&explaintext=1&format=json&pageids="

# Input JSON file that is loaded into `brand_dict`.
brand_list_fp = "/content/cleaned_brand_list.json"
# Default knowledge-base file: written by build_embeddings() and read by query().
emb_save_fp = "embeddings.json"

In [7]:

def load_checkpoint(inPath):
  """Read a JSON file and return the decoded object.

  Args:
    inPath: path of the JSON file to read.

  Returns:
    Whatever `json.load` produces for the file's content (a dict for a JSON
    object).

  Raises:
    OSError: if the file cannot be opened.
    json.JSONDecodeError: if the content is not valid JSON.
  """
  # JSON interchange text is UTF-8. Opening with an explicit encoding keeps
  # non-ASCII keys (e.g. accented brand names) intact regardless of the
  # platform's locale default.
  with open(inPath, encoding='utf-8') as fp:
    return json.load(fp)

# Brand lookup table loaded from `brand_list_fp`. build_embeddings() iterates it
# as (page name, page id) pairs and appends the id to an API URL, so the values
# are presumably page-id strings — verify against the JSON file.
brand_dict = load_checkpoint(brand_list_fp)

In [8]:
import fire
import json
import codecs
import requests
from string import punctuation

In [16]:
# Small sample for quick experiments: the first 100 entries of brand_dict, in insertion order.
test_dict = dict(list(brand_dict.items())[:100])

In [23]:
def _mean_token_embedding(text, wrd2emb, set_ignore_words):
  """Average the word vectors of the usable tokens in `text`.

  Tokens are obtained by whitespace-splitting, lower-casing and stripping
  surrounding punctuation. Tokens found in `set_ignore_words`, or without an
  entry in `wrd2emb`, are dropped.

  Returns:
    The element-wise mean of the remaining vectors as a numpy array, or None
    when no token has a vector (so the caller can skip the entry instead of
    storing NaN).
  """
  list_emb = list()
  for token in text.split():
    token = token.lower().strip(punctuation)
    if token in set_ignore_words:
      continue

    emb = wrd2emb.get(token, None)
    if emb is not None:
      list_emb.append(emb)

  if not list_emb:
    return None
  return np.array(list_emb).mean(axis=0)


def build_embeddings(brand_list = brand_dict, fpath_save = emb_save_fp, set_ignore_words = IGNORE_WORDS):
  """Build one averaged word-vector embedding per brand and save them as JSON.

  For every (page name, page id) pair in `brand_list`, the page's plain-text
  extract is requested from `page_content_api`, its tokens are mapped to GloVe
  vectors and averaged, and the result is stored under the page name.

  Args:
    brand_list: mapping of brand/page name -> Wikipedia page id.
    fpath_save: path of the JSON file to write (`{name: [floats]}`).
    set_ignore_words: tokens to exclude from the average.

  Returns:
    None. The embeddings are written to `fpath_save`. Brands whose page has
    no extract, or whose text contains no token with a known vector, are
    logged and left out of the file.

  Raises:
    Whatever `requests.get` / `Response.json` raise on network or decoding
    failures; these are not caught here.
  """
  logger.info("building knowledge base")
  dict_brand_name_emb = dict()

  wrd2emb = GloveEmbeddings.get_dict_word_embedding()

  # Iterate over the argument (not the global `brand_dict`) so that callers
  # can pass a subset such as `test_dict`.
  for page, page_id in brand_list.items():
    # The response's `pages` object is keyed by the page id as a string.
    page_id = str(page_id)
    result = requests.get(page_content_api + page_id)
    json_data = result.json()

    # A missing/invalid page comes back without an `extract` field.
    content = json_data.get('query', {}).get('pages', {}).get(page_id, {}).get('extract')
    if not content:
      logger.warning("no extract returned for `{}` (page id {}); skipping".format(page, page_id))
      continue

    brand_emb = _mean_token_embedding(content, wrd2emb, set_ignore_words)
    if brand_emb is None:
      # Averaging zero vectors would yield NaN, which is not valid JSON and
      # would poison every distance computed against this brand.
      logger.warning("no known tokens for `{}` (page id {}); skipping".format(page, page_id))
      continue

    dict_brand_name_emb[page] = brand_emb.tolist()

  logger.info("saving knowledge base to: `{}`".format(fpath_save))
  with codecs.open(fpath_save, 'w', encoding='utf-8') as fp:
    json.dump(dict_brand_name_emb, fp, separators=(',', ':'), indent=4)

  logger.info("knowledge base compiled")
  print("knowledge base compiled")

In [24]:
# Build and save the knowledge base with the default arguments.
# This issues one HTTP request per brand, so it can take a while.
build_embeddings()

knowledge base compiled


In [None]:
# Size of the brand-embedding JSON at this path, in megabytes (bytes / 1,000,000).
os.path.getsize("/content/data/brand_emb.json")/1000000

0.23748

In [None]:
# Ad-hoc check of what wikipedia.summary returns for the title "501" with auto-suggest disabled.
# NOTE(review): the title is passed as ASCII-encoded bytes rather than str — confirm that is intentional.
wikipedia.summary("501".encode("ascii", "ignore"), auto_suggest = False)

'Year 501 (DI) was a common year starting on Monday (link will display the full calendar) of the Julian calendar. At the time, it was known as the Year of the Consulship of Avienus and Pompeius (or, less frequently, year 1254 Ab urbe condita). The denomination 501 for this year has been used since the early medieval period, when the Anno Domini calendar era became the prevalent method in Europe for naming years.\n\n'

In [9]:
import operator
def query(target_brand_name, top_n=None, kb_fpath=emb_save_fp, dict_kb=None):
    """Rank all other brands by embedding distance to `target_brand_name`.

    Args:
        target_brand_name: key of the brand to compare against. A bytes value
            is decoded as UTF-8 first.
        top_n: if truthy, keep only the `top_n` closest brands; None (or 0)
            returns every candidate.
        kb_fpath: JSON knowledge base to load when `dict_kb` is not given.
        dict_kb: already-loaded `{brand name: embedding list}` mapping; when
            provided, `kb_fpath` is not read.

    Returns:
        List of `(brand name, distance)` tuples sorted by ascending distance
        (`np.linalg.norm` of the vector difference). The target itself is
        excluded.

    Raises:
        KeyError: if `target_brand_name` is not in the knowledge base.
    """
    # Names arriving as bytes can never match the str keys produced by
    # json.load, so decode them before the lookup.
    if isinstance(target_brand_name, bytes):
        target_brand_name = target_brand_name.decode('utf-8')

    if dict_kb is None:
        with codecs.open(kb_fpath, encoding='utf-8') as fp:
            dict_kb = json.load(fp)

    target_brand_emb = np.array(dict_kb[target_brand_name])

    dict_brand_name_emb_distance = {
        candidate_brand_name: np.linalg.norm(target_brand_emb - np.array(candidate_emb))
        for candidate_brand_name, candidate_emb in dict_kb.items()
        if candidate_brand_name != target_brand_name
    }

    # Despite its name this is a list of (name, distance) pairs, closest first.
    sorted_dict = sorted(dict_brand_name_emb_distance.items(), key=operator.itemgetter(1))

    if top_n:
        sorted_dict = sorted_dict[: top_n]

    # Lazy %-style arguments: the (potentially large) list is only rendered
    # when DEBUG logging is actually enabled.
    logger.debug("%s: %s", target_brand_name, sorted_dict)

    return sorted_dict


In [None]:
# Look up the stored value for a brand whose name contains a non-ASCII character.
brand_dict["Nestlé"]

In [14]:
# Example: the 20 brands closest to "Westminster Cracker Company" in embedding space.
query("Westminster Cracker Company", top_n = 20)

[("French's", 1.0210970768265688),
 ("Burton's Biscuit Company", 1.0369832046649143),
 ("Isaly's", 1.0587993738818866),
 ('Schmidt Baking Company', 1.0620453866584048),
 ('F._Duerr_%26_Sons', 1.0673729099433862),
 ('Ginsters', 1.080721504447736),
 ('Whetstone Chocolates', 1.087312435942856),
 ('McKee Foods', 1.0997297622283482),
 ("Elmer's Fine Foods", 1.1036887116373064),
 ('The Warrell Corporation', 1.1107880451024428),
 ("McCowan's", 1.1112232406687914),
 ("Murray's Cheese", 1.115138164475348),
 ('Wise Foods', 1.1208526723909311),
 ('HP Hood', 1.1222385065330829),
 ('Dare Foods', 1.1274152903235632),
 ('Tootsie Roll Industries', 1.133117027793418),
 ('Just Born', 1.135243687418404),
 ('Baxters', 1.1405863809937007),
 ("Peter's Food Services", 1.1423701391898216),
 ('Warburtons', 1.1465846598603804)]

In [40]:
# Size of the knowledge base at `emb_save_fp`, in megabytes (bytes / 1,000,000).
os.path.getsize(emb_save_fp)/1000000

6.66232

In [None]:
def query_list(list_target_brand_name, top_n=None, kb_fpath=DEFAULT_BRAND_EMB_SAVE_FPATH):
    """Run `query` for several brands against a single loaded knowledge base.

    Args:
        list_target_brand_name: iterable of brand names to look up.
        top_n: forwarded to `query` to limit each result list.
        kb_fpath: JSON knowledge base file, loaded once and shared by all lookups.

    Returns:
        dict mapping each requested brand name to the list returned by `query`.
    """
    # NOTE(review): this default path differs from query()'s default
    # (`emb_save_fp`) — confirm which knowledge-base file is intended.
    with codecs.open(kb_fpath, encoding='utf-8') as kb_file:
        knowledge_base = json.load(kb_file)

    return {
        brand_name: query(brand_name, top_n=top_n, dict_kb=knowledge_base)
        for brand_name in list_target_brand_name
    }