## Word Embeddings

In [2]:
import os
from collections import Counter

import numpy as np
from scipy import spatial

# Location of the pretrained GloVe vectors (100-dimensional, 6B-token corpus).
# Set the GLOVE_PATH environment variable to point at your own copy; the
# original author's local path is kept as the fallback default.
PATH = os.environ.get(
    "GLOVE_PATH",
    "/home/sukhad/Workspace/Machine_learning/Embeddings/GloVe/glove.6B.100d.txt",
)

A word embedding is a learned representation of text in which words with similar meanings have similar representations. Here we use pretrained GloVe word embeddings, which map every word in the vocabulary to a 100-dimensional vector.

In [3]:
# Parse the GloVe text file into a {word: vector} lookup table.
# Each non-empty line has the form "<word> <v1> <v2> ... <vN>".
embeddings_dict = {}
# Read as UTF-8 explicitly (the published GloVe files are UTF-8) so that
# loading does not depend on the platform's default encoding.
with open(PATH, "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [12]:
# Look up and show the embedding of a sample word, e.g. "boy".
boy_vector = embeddings_dict["boy"]
print(boy_vector)

# The raw numbers are not human-interpretable on their own.

[ 0.89461   0.37758   0.42067  -0.51334  -0.28298   1.0012    0.18748
  0.21868  -0.030053  0.086727 -0.44626   0.34755   0.61516   0.31583
  0.56543   0.40314   0.45648  -0.39995  -0.20669   0.84975  -0.016242
 -0.28152  -0.052972 -0.19438   0.35783   0.89044  -0.92628  -0.88332
  0.089802  1.3856    0.11884   0.36608   0.44009   0.51617   0.35074
 -0.24373  -0.68049   0.13276   0.78618  -0.5253   -0.28546   0.30342
  0.23909  -0.65424  -0.15926   0.20786  -0.69262   0.25893   0.65051
 -0.091278 -0.36572  -0.20873   0.18074   1.156    -0.01657  -1.9547
 -0.6729    0.029884  0.93844   0.74598   0.80694   1.0129   -0.51853
 -0.21152   0.35337   0.047413  0.75148   0.31402  -0.42139   0.29954
  0.32677   0.12179  -0.19638   0.12719   0.041816  1.05      0.17703
  0.08307  -0.51415  -0.6523    0.17147   0.37803   0.1107   -0.25518
 -1.0712   -0.51485  -0.38163  -0.42314   0.13503   0.014892  0.11282
  0.26315   0.20852  -0.016587 -0.40463  -0.15909  -0.71309  -0.13668
  0.04035  -0.84151 

### Similar Words have similar representation
For example, "clothing" and "fashion" are more similar to each other (higher cosine similarity) than "clothing" and "phone".

In [9]:
# Cosine similarity (1 - cosine distance) of "clothing" with "fashion",
# then with "phone".
anchor_vector = embeddings_dict["clothing"]
for other_word in ("fashion", "phone"):
    cosine_dist = spatial.distance.cosine(anchor_vector, embeddings_dict[other_word])
    print(1 - cosine_dist)

0.6570013761520386
0.31514737010002136


<b>For our use case, we compare every keyword parsed from the search results with the name of the node.</b>
For multi-word names (bigrams, trigrams, etc.), we compare the keyword with every unigram present in the name (e.g. [clothings, accessories] for "clothings & accessories") and keep the highest similarity.

In [10]:
# Node name and the unigrams it is made of.
name = "clothings & accessories"
name_list = name.split(" & ")

# Keywords parsed from the search results, to be compared with the node name.
keywords = {
    'frequency', 'directory', 'accessories', 'fashion', 'choker', 'knieriem', 'addition', 'world',
    'akanksharedhu', 'islands', 'devanshi', 'comment', 'view', 'dress', 'suit', 's',
    'business', 'day', 'shipping', 'zivar', 'street', 'sites', 'happiness', 'tai',
    'dresses', 'makeup', 'advisor', 'christina', 'pannone', 'piece', 'collection', 'gabbana',
    'style', 'media', 'reply', 'influence', 'girl', 'authority', 'art', 'post',
    'scarf', 'money', 'youtube', 'sense', 'hood', 'diamond', 'hardik', 'men',
    'wpbeginner', 'mahadev', 'stuff', 'necklace', 'tips', 'theme', 'site', 'states',
    'wear', 'min', 'outfit', 'shop', 'pm', 'trends', 'advertising', 'news',
    'com', 'wikipedia', 'fusion', 'footwear', 'valentine', 'articles', 'logo', 'jewellery',
    'jeans', 'comments', 'metals', 'december', 'wardrobe', 'credits', 'handbags', 'ways',
    'life', 'hat', 'marketing', 'quality', 'brands', 'year', 'women', 'jewelry',
    'clothes', 'bloggers', 'email', 'influencer', 'travel', 'food', 'blog', 'photo',
    'co', 'outfits', 'pieces', 'courtesy', 'rings', 'lifestyle', 'tee', 'shoe',
    'manufacturer', 'followers', 'beginner', 'maternity', 'leuba', 'website', 'page', 'string',
    'beauty', 'community', 'traffic', 'shades', 'besasee', 'inquiries', 'look', 'voylla',
    'domain', 'contact', 'products', 'industry', 'thanks', 'credit', 'man', 'bar',
    'statement', 'lockdown', 'designers', 'boutique', 'earrings', 'content', 'october', 't',
    'luxury', 'july', 'instagram', 'tags', 'monitoring', 'advice', 'bhaavika', 'people',
    'clothing', 'posts', 'influencers', 'stories', 'trend', 'home', 'gloves', 'health',
    'views', 'shopping', 'necklaces', 'brand', 'sunnies', 'york', 'summer', 'blogger',
    'frock', 'sasee', 'details', 'step', 'hoops', 'designs', 'profile', 'design',
    'accessory', 'bluehost', 'hair', 'favre', 'events', 'sun', 'delhi', 'size',
    'account', 'blogs', 'gentleman', 'shoes',
}

In [11]:
# Score every keyword by its best cosine similarity to any unigram of the
# node name.

# Resolve the name parts once, dropping any that have no embedding.
name_vectors = [embeddings_dict[part] for part in name_list if part in embeddings_dict]

keyword_scores = {}
for key in keywords:
    key_vector = embeddings_dict.get(key)
    if key_vector is None:
        # Out-of-vocabulary keywords cannot be scored; skip them explicitly
        # rather than hiding every possible error behind a bare `except`.
        continue
    similarities = [
        1 - spatial.distance.cosine(name_vector, key_vector)
        for name_vector in name_vectors
    ]
    if similarities:
        keyword_scores[key] = max(similarities)

# Legacy alias: keeps the previous variable name working for any cell that
# still refers to it.
_shit = keyword_scores

# Display the 20 highest-scoring keywords.
Counter(keyword_scores).most_common(20)

[('accessories', 1.0),
 ('clothing', 0.7651895880699158),
 ('jewelry', 0.7328066229820251),
 ('handbags', 0.7084283828735352),
 ('footwear', 0.7053003907203674),
 ('clothes', 0.6789366602897644),
 ('shoes', 0.6592445373535156),
 ('jewellery', 0.6519425511360168),
 ('products', 0.6193220019340515),
 ('accessory', 0.6174675822257996),
 ('designs', 0.6094177961349487),
 ('shoe', 0.5983057618141174),
 ('fashion', 0.5769087672233582),
 ('designers', 0.5686101317405701),
 ('wardrobe', 0.5628618597984314),
 ('brands', 0.5527617335319519),
 ('outfits', 0.5522340536117554),
 ('manufacturer', 0.5275525450706482),
 ('brand', 0.5219516754150391),
 ('luxury', 0.5181752443313599)]