## Intra-Perfume Entity Linking

In [1]:
# !pip install rltk

### Construct RLTK Datasets

In [2]:
import rltk
import csv

# Shared CRF tokenizer instance; Perfume.name_tokens uses it to split names into tokens.
# You can also use this tokenizer in case you need to manipulate some data
tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()

In [3]:
'''
Feel free to add more columns here for use in record linkage.
'''

class Perfume(rltk.Record):
    """RLTK record wrapping one row of the perfume nodes CSV.

    Every property exposes one column of the raw row (or a value derived
    from it). Add further properties here if more columns are needed for
    record linkage.
    """

    def __init__(self, raw_object):
        super().__init__(raw_object)

    @rltk.cached_property
    def id(self):
        """Record identifier, read from the ``_id`` column."""
        row = self.raw_object
        return row['_id']

    @rltk.cached_property
    def node_id(self):
        """Node identifier, read from the ``node_id`` column."""
        row = self.raw_object
        return row['node_id']

    @rltk.cached_property
    def name(self):
        """Perfume name exactly as stored in the ``name`` column."""
        row = self.raw_object
        return row['name']

    @rltk.cached_property
    def name_tokens(self):
        """Set of tokens the module-level ``tokenizer`` produces from ``name``."""
        tokens = tokenizer.tokenize(self.name)
        return set(tokens)

    @rltk.cached_property
    def brand(self):
        """Brand, read from the ``brand`` column."""
        row = self.raw_object
        return row['brand']

    @rltk.cached_property
    def url(self):
        """Product URL, read from the ``url`` column."""
        row = self.raw_object
        return row['url']

    @rltk.cached_property
    def price(self):
        """Price value as stored in the ``price`` column (not converted)."""
        row = self.raw_object
        return row['price']

    @rltk.cached_property
    def scent(self):
        """Scent description, read from the ``scent`` column."""
        row = self.raw_object
        return row['scent']


In [4]:
def _load_perfume_dataset(csv_path):
    """Build an RLTK dataset of Perfume records from a UTF-8 encoded CSV file."""
    csv_reader = rltk.CSVReader(open(csv_path, encoding='utf-8'))
    return rltk.Dataset(csv_reader, record_class=Perfume)


# Both datasets are read from the same node file, so the linking steps
# compare the node list against itself.
dir_ = '../data/entity_linking/'
file1 = dir_ + 'allNodes.csv'
file2 = dir_ + 'allNodes.csv'

ds1 = _load_perfume_dataset(file1)
ds2 = _load_perfume_dataset(file2)

In [5]:
# Preview the first five rows of each dataset to sanity-check the parsed columns.
preview_separator = '\n*********************************************\n'
print(ds1.generate_dataframe().head(5))
print(preview_separator)
print(ds2.generate_dataframe().head(5))

  id node_id                                name  \
0  0      n1                  Vince camuto amore   
1  1      n2                       Chance chanel   
2  2      n3                         Chanel no 5   
3  3      n4  Chânél coco mademoiselle for women   
4  4      n5              Mademoiselle for women   

                                name_tokens   brand  \
0                    {Vince, amore, camuto}  Chanel   
1                          {Chance, chanel}  Chanel   
2                           {5, no, Chanel}  Chanel   
3  {mademoiselle, coco, Chânél, for, women}  Chanel   
4                {Mademoiselle, women, for}  Chanel   

                                                 url   price         scent  
0  https://www.amazon.com//sspa/click?ie=UTF8&spc...      92  Floral,Fresh  
1  https://www.amazon.com//Chance-Chanel-Tendre-W...  147.24          NULL  
2  https://www.amazon.com//CHANEL-No-Eau-Parfum-1...    19.9          NULL  
3  https://www.amazon.com//Ch%C3%A2n%C3%A9l-Made

### sameAs - Entity Linking

In [6]:
def name_string_similarity_1(r1, r2):
    '''Return the Jaro-Winkler similarity of the two records' name strings.'''
    return rltk.jaro_winkler_similarity(r1.name, r2.name)
    
def name_string_similarity_2(r1, r2):
    '''Return the Jaccard index of the two records' name token sets.'''
    return rltk.jaccard_index_similarity(r1.name_tokens, r2.name_tokens)

In [7]:
# Minimum combined score above which a pair of records is declared a match.
MY_TRESH = 0.5 # this number is just an example, you need to change it

def rule_based_method(r1, r2):
    '''Score a record pair with an equally weighted mix of both name similarities.

    Returns a tuple ``(is_match, score)``: ``score`` is the average of
    ``name_string_similarity_1`` and ``name_string_similarity_2``, and
    ``is_match`` tells whether that score strictly exceeds ``MY_TRESH``.
    '''
    string_score = name_string_similarity_1(r1, r2)
    token_score = name_string_similarity_2(r1, r2)
    combined = 0.5 * string_score + 0.5 * token_score
    is_match = combined > MY_TRESH
    return is_match, combined

##### Save predictions

In [8]:
predictions = []
true_predictions = []

# Walk every cross-dataset pair, scoring only pairs of distinct nodes that share a brand.
candidate_pairs = rltk.get_record_pairs(ds1, ds2)
for r1, r2 in candidate_pairs:
    if r1.brand != r2.brand or r1.node_id == r2.node_id:
        continue
    result, confidence = rule_based_method(r1, r2)
    row = [r1.node_id, r1.name, r1.url, r2.node_id, r2.name, r2.url, result, confidence]
    predictions.append(row)
    if result is True:
        # Separate copy so the two result lists never share row objects.
        true_predictions.append(list(row))


In [9]:
# Number of scored pairs next to the number of records in each dataset.
len(predictions), len(ds1.generate_dataframe()), len(ds2.generate_dataframe())

(1751776, 10801, 10801)

In [10]:
import pandas as pd
import numpy as np

# predictions_df = pd.DataFrame(data=np.array(predictions), columns=['node_id1', 'name1', 'url1',\
#                                                                    'node_id2', 'name2', 'url2',\
#                                                                    'result', 'confidence'])


In [11]:
# predictions_df.to_csv(dir_ + 'sameAs_predictions.csv')

In [12]:
# Build the frame directly from the list of rows. Passing the rows through
# np.array() first would coerce every cell -- including the boolean `result`
# and the float `confidence` -- to a fixed-width string padded to the longest
# URL, losing the column dtypes and wasting memory.
true_predictions_df = pd.DataFrame(
    true_predictions,
    columns=['node_id1', 'name1', 'url1',
             'node_id2', 'name2', 'url2',
             'result', 'confidence'])

In [13]:
# Display the matched pairs (the notebook shows a truncated view of long frames).
true_predictions_df

Unnamed: 0,node_id1,name1,url1,node_id2,name2,url2,result,confidence
0,n1,Vince camuto amore,https://www.amazon.com//sspa/click?ie=UTF8&spc...,n34,Vince camuto,https://www.amazon.com//Vince-Camuto-Parfum-Sp...,True,0.8
1,n1,Vince camuto amore,https://www.amazon.com//sspa/click?ie=UTF8&spc...,n81,Vince camuto ciao,https://www.amazon.com//sspa/click?ie=UTF8&spc...,True,0.7215686274509804
2,n1,Vince camuto amore,https://www.amazon.com//sspa/click?ie=UTF8&spc...,n82,Vince camuto ciao,https://www.amazon.com//sspa/click?ie=UTF8&spc...,True,0.7215686274509804
3,n1,Vince camuto amore,https://www.amazon.com//sspa/click?ie=UTF8&spc...,n83,Vince camuto ciao,https://www.amazon.com//sspa/click?ie=UTF8&spc...,True,0.7215686274509804
4,n1,Vince camuto amore,https://www.amazon.com//sspa/click?ie=UTF8&spc...,n130,Vince camuto amore,https://www.amazon.com//sspa/click?ie=UTF8&spc...,True,1.0
...,...,...,...,...,...,...,...,...
317779,s2902,Shalis woman,,s1210,Shalis colgne,,True,0.610897435897436
317780,s2902,Shalis woman,,s2077,Shalis,,True,0.7
317781,s2902,Shalis woman,,s2175,Shalis man,,True,0.65
317782,s2904,Almarasim oud perfume,,s774,Aljefri oud,,True,0.506096681096681


In [14]:
# Persist the matched pairs; the file name appears to encode the similarity
# measures and threshold used -- keep it in sync if those change.
true_predictions_df.to_csv(dir_ + 'sameAs_TRUE_predictions_jaro_jaccard_0.5.csv')

### haveSimilarPrices Entity Linking

In [15]:
# Two perfumes are linked when their prices differ by at most this amount.
PRICE_TOLERANCE = 5


def _parse_price(raw_price):
    '''Return ``raw_price`` as a finite float, or ``None`` if it is not a usable number.

    Parsing with ``float()`` (rather than testing ``str.isdigit()``) keeps
    decimal prices such as '147.24', which ``isdigit()`` rejects.
    '''
    import math

    try:
        price = float(raw_price)
    except (TypeError, ValueError):
        # Missing or non-numeric values (e.g. '', 'NULL', None).
        return None
    return price if math.isfinite(price) else None


true_predictions_sp = []

# NOTE(review): some rows carry the price '2147483648', which looks like a
# missing-value placeholder; such rows currently link to each other -- confirm.
candidate_pairs = rltk.get_record_pairs(ds1, ds2)
for r1, r2 in candidate_pairs:
    if r1.node_id == r2.node_id:
        continue
    price1 = _parse_price(r1.price)
    price2 = _parse_price(r2.price)
    if price1 is None or price2 is None:
        continue
    if price1 - PRICE_TOLERANCE <= price2 <= price1 + PRICE_TOLERANCE:
        # Store the original price strings, as read from the CSV.
        true_predictions_sp.append([r1.node_id, r1.name, r1.price, r2.node_id, r2.name, r2.price])


In [16]:
# Build the frame directly from the list of rows; an intermediate np.array()
# would only create a fixed-width string copy of the same data.
true_predictions_sp_df = pd.DataFrame(
    true_predictions_sp,
    columns=['node_id1', 'name1', 'price1',
             'node_id2', 'name2', 'price2'])

In [17]:
# Persist the similar-price pairs next to the input data.
true_predictions_sp_df.to_csv(dir_ + 'haveSimilarPrices_TRUE_predictions.csv')

### haveSimilarScents Entity Linking

In [18]:
# !pip install -U sentence-transformers

In [19]:
# Keep only the nodes with a known scent: drop the 'NULL' and 'None' placeholders.
missing_scent_markers = ['NULL', 'None']
all_nodes_raw = ds1.generate_dataframe()
has_scent = ~all_nodes_raw['scent'].isin(missing_scent_markers)
allNodes_df = all_nodes_raw.loc[has_scent]
allNodes_df

Unnamed: 0,id,node_id,name,name_tokens,brand,url,price,scent
0,0,n1,Vince camuto amore,"{Vince, amore, camuto}",Chanel,https://www.amazon.com//sspa/click?ie=UTF8&spc...,92,"Floral,Fresh"
7,7,n8,Chanel coco mademoiselle,"{mademoiselle, coco, Chanel}",Chanel,https://www.amazon.com//Ch%C3%A2n%C3%A9l-Madem...,2147483648,Rose
8,8,n9,Chanel coco mademoiselle,"{mademoiselle, coco, Chanel}",Chanel,https://www.amazon.com//Ch%C3%A2n%C3%A9l-Madem...,2147483648,Eau De Parfum
10,10,n11,Chance,{Chance},Chanel,https://www.amazon.com//Chance-Toilette-Spray-...,127.48,Floral
12,12,n13,Coco mademoiselle by chanel,"{Coco, by, mademoiselle, chanel}",Chanel,https://www.amazon.com//MADEMOISELLE-Chanel-Pa...,135.99,"Orange,Vanilla"
...,...,...,...,...,...,...,...,...
2512,2512,n2513,Guilty pour homme parfum,"{homme, parfum, Guilty, pour}",Gucci,https://www.sephora.com/product/gucci-guilty-p...,163,Woody Spices
2513,2513,n2514,Gucci bloom ambrosia di fiori,"{Gucci, ambrosia, di, fiori, bloom}",Gucci,https://www.sephora.com/product/gucci-gucci-bl...,35,Classic Florals
2514,2514,n2515,Guilty pour femme,"{femme, Guilty, pour}",Gucci,https://www.sephora.com/product/guilty-P273900...,97,Warm Florals
2515,2515,n2516,Bloom,{Bloom},Gucci,https://www.sephora.com/product/bloom-eau-de-p...,35,Classic Floral


In [20]:
# Parallel lists: scents[k] is the scent text of the node node_ids[k].
scents = allNodes_df['scent'].tolist()
node_ids = allNodes_df['node_id'].tolist()

In [21]:
from sentence_transformers import SentenceTransformer

# Texts to embed: one scent description per retained node.
sentences = scents

# Load the 'all-MiniLM-L6-v2' sentence-embedding model and encode every text.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(sentences)

In [22]:
# Inspect the dimensions of the embedding array.
embeddings.shape

(1944, 384)

In [23]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer, LoggingHandler, util, evaluation, models, InputExample
import logging
import os
import gzip
import csv
import random
import numpy as np
import torch

In [24]:
# Perform kmean clustering
num_clusters = 20
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append((node_ids[sentence_id], scents[sentence_id]))

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  1
[('n84', 'Sweet'), ('n85', 'Sweet'), ('n86', 'Sweet'), ('n213', 'Sweet'), ('n214', 'Sweet'), ('n215', 'Sweet'), ('n265', 'Stella'), ('n266', 'Edt Spray'), ('n267', 'Edt Spray'), ('n268', 'Edt Spray'), ('n269', 'Edt Spray'), ('n331', 'Tangy'), ('n374', 'Sweet'), ('n400', 'MY WAY'), ('n441', 'Let Them Eat Cake'), ('n442', 'Dead Sexy'), ('n443', 'Song In D Minor'), ('n444', 'Hypnotic Poison'), ('n456', 'POISON GIRL'), ('n472', 'Bulletproof'), ('n473', 'Bulletproof'), ('n474', 'Bulletproof'), ('n478', 'Sweet'), ('n504', 'MY WAY'), ('n545', 'Let Them Eat Cake'), ('n546', 'Dead Sexy'), ('n547', 'Song In D Minor'), ('n548', 'Hypnotic Poison'), ('n560', 'POISON GIRL'), ('n576', 'Bulletproof'), ('n577', 'Bulletproof'), ('n578', 'Bulletproof'), ('n608', 'Miss Dior'), ('n655', '2017 New Version'), ('n705', 'Fl'), ('n729', 'Idole'), ('n760', 'Apricot'), ('n762', 'Apricot'), ('n788', 'Awaken Within'), ('n818', 'Fl'), ('n842', 'Idole'), ('n870', 'Apricot'), ('n872', 'Apricot'), ('n901', '

In [25]:
# Link every ordered pair of distinct members inside each cluster; both
# (a, b) and (b, a) are emitted, and a member is never paired with itself.
true_predictions_ss = [
    [left_id, left_scent, right_id, right_scent]
    for members in clustered_sentences
    for left_pos, (left_id, left_scent) in enumerate(members)
    for right_pos, (right_id, right_scent) in enumerate(members)
    if left_pos != right_pos
]
            

In [26]:
# true_predictions_ss
# Build the frame directly from the list of rows; an intermediate np.array()
# would only create a fixed-width string copy of the same data.
true_predictions_ss_df = pd.DataFrame(
    true_predictions_ss,
    columns=['node_id1', 'scent1',
             'node_id2', 'scent2'])

In [27]:
# Persist the similar-scent pairs next to the input data.
true_predictions_ss_df.to_csv(dir_ + 'haveSimilarScents_TRUE_predictions.csv')