## Intra-Perfume Entity Linking

In [112]:
# !pip install rltk

### Construct RLTK Datasets

In [113]:
import rltk
import csv

# Optional helper: RLTK's CRF tokenizer, available in case a field needs to be
# tokenized before comparison. None of the cells visible in this notebook use it.
tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()

In [114]:
'''
Feel free to add more columns here for use in record linkage.
'''

class Perfume(rltk.Record):
    """A single perfume row, exposed to RLTK through cached column accessors.

    Each property returns the matching entry of ``raw_object`` unchanged; no
    parsing or normalisation happens here. Construction is inherited from
    ``rltk.Record``.
    """

    @rltk.cached_property
    def id(self):
        """Record identifier, taken from the ``_id`` column."""
        return self.raw_object['_id']

    @rltk.cached_property
    def node_id(self):
        """Node identifier, taken from the ``node_id`` column."""
        return self.raw_object['node_id']

    @rltk.cached_property
    def name(self):
        """Perfume name, taken from the ``name`` column."""
        return self.raw_object['name']

    @rltk.cached_property
    def brand(self):
        """Brand, taken from the ``brand`` column."""
        return self.raw_object['brand']

    @rltk.cached_property
    def url(self):
        """Product page URL, taken from the ``url`` column."""
        return self.raw_object['url']

    @rltk.cached_property
    def price(self):
        """Price exactly as stored in the ``price`` column (not converted)."""
        return self.raw_object['price']

    @rltk.cached_property
    def scent(self):
        """Scent description, taken from the ``scent`` column."""
        return self.raw_object['scent']


In [115]:
# Both datasets are read from the same node file, so every record is compared
# against the other records of that one collection.
dir_ = '../data/entity_linking/'
file1 = dir_ + 'allNodes.csv'
file2 = dir_ + 'allNodes.csv'

ds1, ds2 = [
    rltk.Dataset(rltk.CSVReader(open(path, encoding='utf-8')), record_class=Perfume)
    for path in (file1, file2)
]

In [116]:
# Preview the first five records of each dataset, separated by a divider line.
print(
    ds1.generate_dataframe().head(5),
    '\n*********************************************\n',
    ds2.generate_dataframe().head(5),
    sep='\n',
)

  id node_id                   name   brand  \
0  0      n1                   No.5  chanel   
1  1      n2          Chance Chanel  chanel   
2  2      n3            CHANEL No 5  chanel   
3  3      n4            Chanel No 5  chanel   
4  4      n5  Chanel Bleu De Chanel  chanel   

                                                 url   price   scent  
0  https://www.amazon.com//sspa/click?ie=UTF8&spc...      30   Apple  
1  https://www.amazon.com//Chance-Chanel-Tendre-W...  148.98    NULL  
2  https://www.amazon.com//CHANEL-No-Eau-Parfum-1...    19.9    NULL  
3  https://www.amazon.com//Chanel-Parfum-Spray-Pe...  133.89   Fresh  
4  https://www.amazon.com//Chanel-Toilette-Spray-...     108  Citrus  

*********************************************

  id node_id                   name   brand  \
0  0      n1                   No.5  chanel   
1  1      n2          Chance Chanel  chanel   
2  2      n3            CHANEL No 5  chanel   
3  3      n4            Chanel No 5  chanel   
4  4    

### sameAs - Entity Linking

In [117]:
def name_string_similarity_1(r1, r2):
    """Example similarity function: Jaro-Winkler similarity of the two names."""
    return rltk.jaro_winkler_similarity(r1.name, r2.name)
    
def name_string_similarity_2(r1, r2):
    """Example similarity function: 1 when both names are equal, otherwise 0."""
    return 1 if r1.name == r2.name else 0

In [118]:
# Threshold that a pair's combined score must exceed for the pair to be
# treated as a match.
MY_TRESH = 0.6 # this number is just an example, you need to change it

# entity linkage scoring function
def rule_based_method(r1, r2):
    """Score a record pair and decide whether the two records match.

    The score is a weighted sum of the two name similarities: 0.7 times
    name_string_similarity_1 plus 0.3 times name_string_similarity_2.

    Returns:
        A tuple (is_match, score); is_match is True when score is strictly
        greater than MY_TRESH.
    """
    fuzzy_part = 0.7 * name_string_similarity_1(r1, r2)
    exact_part = 0.3 * name_string_similarity_2(r1, r2)
    confidence = fuzzy_part + exact_part

    return confidence > MY_TRESH, confidence

##### Save predictions

In [119]:
predictions = []       # one row per compared pair
true_predictions = []  # only the rows classified as a match

candidate_pairs = rltk.get_record_pairs(ds1, ds2)
for r1, r2 in candidate_pairs:
    # Only pairs of two different nodes of the same brand are scored.
    if r1.brand != r2.brand or r1.node_id == r2.node_id:
        continue

    result, confidence = rule_based_method(r1, r2)
    row = [r1.node_id, r1.name, r1.url, r2.node_id, r2.name, r2.url, result, confidence]
    predictions.append(row)
    if result is True:
        # Stored as a separate list object so the two collections stay independent.
        true_predictions.append(list(row))


In [120]:
# Number of scored pairs, followed by the number of records in each dataset.
len(predictions), len(ds1.generate_dataframe()), len(ds2.generate_dataframe())

(567036, 2739, 2739)

In [121]:
import pandas as pd
import numpy as np

# Build the frame directly from the row lists. Passing the rows through
# np.array() first would coerce the mixed str/bool/float values into a single
# fixed-width string array (so 'result' and 'confidence' lose their types and
# every cell is padded to the longest URL) and raises when the list is empty.
predictions_df = pd.DataFrame(predictions, columns=['node_id1', 'name1', 'url1',
                                                    'node_id2', 'name2', 'url2',
                                                    'result', 'confidence'])


In [122]:
# Write every scored pair next to the input data. to_csv is called with its
# defaults, so the DataFrame index is written as well.
predictions_df.to_csv(dir_ + 'sameAs_predictions.csv')

In [123]:
# Build the frame directly from the row lists. Passing the rows through
# np.array() first would coerce the mixed str/bool/float values into a single
# fixed-width string array (so 'result' and 'confidence' lose their types) and
# raises when no pair was classified as a match.
true_predictions_df = pd.DataFrame(true_predictions, columns=['node_id1', 'name1', 'url1',
                                                              'node_id2', 'name2', 'url2',
                                                              'result', 'confidence'])

In [124]:
# true_predictions_df

In [125]:
# Write only the pairs classified as matches. to_csv is called with its
# defaults, so the DataFrame index is written as well.
true_predictions_df.to_csv(dir_ + 'sameAs_TRUE_predictions.csv')

### haveSimilarPrices Entity Linking

In [72]:
# Link every ordered pair of distinct perfumes whose prices lie within
# PRICE_TOLERANCE of each other.
PRICE_TOLERANCE = 5


def _parse_price(raw_price):
    """Return raw_price as a float, or None when it is not a number.

    str.isdigit() is deliberately not used for this check: it is False for
    decimal prices such as '148.98', which silently excluded those perfumes
    from every price link.
    """
    try:
        return float(raw_price)
    except (TypeError, ValueError):
        return None


true_predictions_sp = []

candidate_pairs = rltk.get_record_pairs(ds1, ds2)
for r1, r2 in candidate_pairs:
    if r1.node_id == r2.node_id:
        continue

    price1 = _parse_price(r1.price)
    price2 = _parse_price(r2.price)
    if price1 is None or price2 is None:
        # At least one side has no usable price (e.g. a placeholder value).
        continue

    if price1 - PRICE_TOLERANCE <= price2 <= price1 + PRICE_TOLERANCE:
        # The stored row keeps the prices exactly as they appear in the data.
        true_predictions_sp.append([r1.node_id, r1.name, r1.price, r2.node_id, r2.name, r2.price])


In [74]:
# Build the frame directly from the row lists: the np.array() round trip
# raises when the list is empty and copies every value into a fixed-width
# string array for no benefit.
true_predictions_sp_df = pd.DataFrame(true_predictions_sp, columns=['node_id1', 'name1', 'price1',
                                                                    'node_id2', 'name2', 'price2'])

In [75]:
# Write the similar-price pairs next to the input data. to_csv is called with
# its defaults, so the DataFrame index is written as well.
true_predictions_sp_df.to_csv(dir_ + 'haveSimilarPrices_TRUE_predictions.csv')

### haveSimilarScents Entity Linking

In [76]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting torchvision
  Downloading torchvision-0.14.0-cp39-cp39-win_amd64.whl (1.1 MB)
     ---------------------------------------- 1.1/1.1 MB 8.7 MB/s eta 0:00:00
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
     ---------------------------------------- 5.5/5.5 MB 29.1 MB/s eta 0:00:00
Collecting torch>=1.6.0
  Downloading torch-1.13.0-cp39-cp39-win_amd64.whl (167.2 MB)
     ------------------------------------- 167.2/167.2 MB 10.1 MB/s eta 0:00:00
Collecting sentencepiece
  Using cached sentencepiece-0.1.97-cp39-cp39-win_amd64.whl (1.1 MB)
Collecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
     ------------------------------------- 182.1/182.1 kB 10.7 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-win_amd64.whl (3.3 MB)
     -----------

In [100]:
# Keep only the perfumes that have a scent description: rows whose scent is
# the placeholder text 'NULL' or 'None' are dropped.
allNodes_df = ds1.generate_dataframe()
allNodes_df = allNodes_df.loc[~allNodes_df['scent'].isin(['NULL', 'None'])]
allNodes_df

Unnamed: 0,id,node_id,name,brand,url,price,scent
0,0,n1,No.5,chanel,https://www.amazon.com//sspa/click?ie=UTF8&spc...,30,Apple
3,3,n4,Chanel No 5,chanel,https://www.amazon.com//Chanel-Parfum-Spray-Pe...,133.89,Fresh
4,4,n5,Chanel Bleu De Chanel,chanel,https://www.amazon.com//Chanel-Toilette-Spray-...,108,Citrus
6,6,n7,Chance,chanel,https://www.amazon.com//Chance-Toilette-Spray-...,115,Floral
7,7,n8,CHANEL Chance,chanel,https://www.amazon.com//Chanel-Chance-Eau-Parf...,149.99,Citrus
...,...,...,...,...,...,...,...
1986,1985,n1986,Guilty Pour Homme Parfum,gucci,https://www.sephora.com/product/gucci-guilty-p...,163,Woody Spices
1987,1986,n1987,Gucci Bloom Ambrosia di Fiori,gucci,https://www.sephora.com/product/gucci-gucci-bl...,35,Classic Florals
1988,1987,n1988,Guilty Pour Femme,gucci,https://www.sephora.com/product/guilty-P273900...,97,Warm Florals
1989,1988,n1989,Bloom,gucci,https://www.sephora.com/product/bloom-eau-de-p...,35,Classic Floral


In [101]:
# Parallel lists: scents[k] is the scent text of the node node_ids[k].
scents = allNodes_df['scent'].tolist()
node_ids = allNodes_df['node_id'].tolist()

In [102]:
from sentence_transformers import SentenceTransformer
# Load the 'all-MiniLM-L6-v2' SentenceTransformer model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# The scent descriptions are the sentences to encode.
sentences = scents

# Encode the sentences by calling model.encode().
embeddings = model.encode(sentences)

In [103]:
# Inspect the shape of the encoded scent embeddings.
embeddings.shape

(1300, 384)

In [88]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer, LoggingHandler, util, evaluation, models, InputExample
import logging
import os
import gzip
import csv
import random
import numpy as np
import torch

In [107]:
# Perform kmean clustering
num_clusters = 20
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append((node_ids[sentence_id], scents[sentence_id]))

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  1
[('n190', 'Fruity and floral accords'), ('n200', 'Fruity Floral'), ('n216', 'Fruity, Floral, Fresh'), ('n264', 'Fruity, Floral'), ('n324', 'Fruity, Floral'), ('n527', 'Floral Fruity Gourmand'), ('n587', 'Floral Fruity Gourmand'), ('n650', 'Fruity, Floral'), ('n766', 'Fruity floral'), ('n826', 'Fruity floral'), ('n870', 'Floral fruity'), ('n909', 'Floral,Pineapple'), ('n944', 'Floral Fruity'), ('n959', 'Floral Fruity'), ('n969', 'Floral,Pineapple'), ('n1004', 'Floral Fruity'), ('n1019', 'Floral Fruity'), ('n1036', 'Fruity, Floral, Fresh'), ('n1049', 'Green Floral Fruity'), ('n1221', 'Fruity and floral.'), ('n1228', 'Fruity Floral Oriental'), ('n1239', 'Fruity Floral Oriental'), ('n1320', 'Fruity Floral'), ('n1380', 'Fruity Floral'), ('n1425', 'Fruity Floral'), ('n1530', 'Fresh Fruity Florals'), ('n1533', 'Fruity Floral'), ('n1540', 'Fresh Fruity Florals'), ('n1545', 'Fruity Florals'), ('n1552', 'Fresh Fruity Florals'), ('n1574', 'Fresh Fruity Florals'), ('n1668', 'Fruity Flor

In [108]:
true_predictions_ss = []

# Within each cluster, pair every node with every other node of that cluster.
# Both directions (a, b) and (b, a) are emitted; a node is never paired with itself.
for cluster in clustered_sentences:
    for left_pos, (left_id, left_scent) in enumerate(cluster):
        true_predictions_ss.extend(
            [left_id, left_scent, right_id, right_scent]
            for right_pos, (right_id, right_scent) in enumerate(cluster)
            if right_pos != left_pos
        )
            

In [110]:
# true_predictions_ss
# Build the frame directly from the row lists: the np.array() round trip
# raises when the list is empty and copies every value into a fixed-width
# string array for no benefit.
true_predictions_ss_df = pd.DataFrame(true_predictions_ss, columns=['node_id1', 'scent1',
                                                                    'node_id2', 'scent2'])

In [111]:
# Write the similar-scent pairs next to the input data. to_csv is called with
# its defaults, so the DataFrame index is written as well.
true_predictions_ss_df.to_csv(dir_ + 'haveSimilarScents_TRUE_predictions.csv')