## Intra-Perfume Entity Linking

In [61]:
# !pip install rltk

### Construct RLTK Datasets

In [65]:
import rltk
import csv

# You can use this tokenizer in case you need to manipulate some data
tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()

In [66]:
'''
Feel free to add more columns here for use in record linkage.
'''

class Perfume(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
#         self.name = ''

    @rltk.cached_property
    def id(self):
        return self.raw_object['_id']
    
    @rltk.cached_property
    def node_id(self):
        return self.raw_object['node_id']

    @rltk.cached_property
    def name(self):
        return self.raw_object['name']
    
    @rltk.cached_property
    def name_tokens(self):
        return set(tokenizer.tokenize(self.name))

    @rltk.cached_property
    def brand(self):
        return self.raw_object['brand']
    
    @rltk.cached_property
    def url(self):
        return self.raw_object['url']
    
    @rltk.cached_property
    def price(self):
        return (self.raw_object['price'])
    
    @rltk.cached_property
    def scent(self):
        return self.raw_object['scent']


In [68]:
dir_ = '../data/entity_linking/'
file1 = dir_ + 'allNodes.csv'
file2 = dir_ + 'allNodes.csv'

ds1 = rltk.Dataset(rltk.CSVReader(open(file1, encoding='utf-8')),record_class=Perfume)
ds2 = rltk.Dataset(rltk.CSVReader(open(file2, encoding ='utf-8')),record_class=Perfume)

In [69]:
# print some entries
print(ds1.generate_dataframe().head(5))
print('\n*********************************************\n')
print(ds2.generate_dataframe().head(5))

  id node_id                                name  \
0  0      n1                  Vince camuto amore   
1  1      n2                       Chance chanel   
2  2      n3                         Chanel no 5   
3  3      n4  Chânél coco mademoiselle for women   
4  4      n5              Mademoiselle for women   

                                name_tokens   brand  \
0                    {amore, camuto, Vince}  Chanel   
1                          {chanel, Chance}  Chanel   
2                           {5, Chanel, no}  Chanel   
3  {women, coco, Chânél, for, mademoiselle}  Chanel   
4                {for, Mademoiselle, women}  Chanel   

                                                 url   price         scent  
0  https://www.amazon.com//sspa/click?ie=UTF8&spc...      92  Floral,Fresh  
1  https://www.amazon.com//Chance-Chanel-Tendre-W...  147.24          NULL  
2  https://www.amazon.com//CHANEL-No-Eau-Parfum-1...    19.9          NULL  
3  https://www.amazon.com//Ch%C3%A2n%C3%A9l-Made

### sameAs - Entity Linking

In [70]:
def name_string_similarity_1(r1, r2):
    ''' Example dummy similiary function '''
    s1 = r1.name
    s2 = r2.name
    
    return rltk.jaro_winkler_similarity(s1, s2)
    
def name_string_similarity_2(r1, r2):
    ''' Example dummy similiary function '''
    s1 = r1.name_tokens
    s2 = r2.name_tokens
    
    return rltk.jaccard_index_similarity(s1, s2)

In [71]:
# threshold value to determine if we are confident the record match
MY_TRESH = 0.5 # this number is just an example, you need to change it

# entity linkage scoring function
def rule_based_method(r1, r2):
    score_1 = name_string_similarity_1(r1, r2)
    score_2 = name_string_similarity_2(r1, r2)
    
    total = 0.5 * score_1 + 0.5 * score_2
#     total = score_1
    
    # return two values: boolean if they match or not, float to determine confidence
    return total > MY_TRESH, total

##### Save predictions

In [72]:
predictions = []
true_predictions = []


candidate_pairs = rltk.get_record_pairs(ds1, ds2)
for r1, r2 in candidate_pairs:
    if r1.brand == r2.brand and r1.node_id != r2.node_id:
        result, confidence = rule_based_method(r1, r2)
#         print(result, confidence)
        predictions.append([r1.node_id, r1.name, r1.url, r2.node_id, r2.name, r2.url, result, confidence])
        if result is True:
            true_predictions.append([r1.node_id, r1.name, r1.url, r2.node_id, r2.name, r2.url, result, confidence])


In [73]:
len(predictions), len(ds1.generate_dataframe()), len(ds2.generate_dataframe())

(1868770, 10801, 10801)

In [74]:
import pandas as pd
import numpy as np

# predictions_df = pd.DataFrame(data=np.array(predictions), columns=['node_id1', 'name1', 'url1',\
#                                                                    'node_id2', 'name2', 'url2',\
#                                                                    'result', 'confidence'])


In [75]:
# predictions_df.to_csv(dir_ + 'sameAs_predictions.csv')

In [76]:
true_predictions_df = pd.DataFrame(data=np.array(true_predictions), columns=['node_id1', 'name1', 'url1',\
                                                                   'node_id2', 'name2', 'url2',\
                                                                   'result', 'confidence'])

In [77]:
true_predictions_df

Unnamed: 0,node_id1,name1,url1,node_id2,name2,url2,result,confidence
0,n1,Vince camuto amore,https://www.amazon.com//sspa/click?ie=UTF8&spc...,n34,Vince camuto,https://www.amazon.com//Vince-Camuto-Parfum-Sp...,True,0.8
1,n1,Vince camuto amore,https://www.amazon.com//sspa/click?ie=UTF8&spc...,n81,Vince camuto ciao,https://www.amazon.com//sspa/click?ie=UTF8&spc...,True,0.7215686274509804
2,n1,Vince camuto amore,https://www.amazon.com//sspa/click?ie=UTF8&spc...,n82,Vince camuto ciao,https://www.amazon.com//sspa/click?ie=UTF8&spc...,True,0.7215686274509804
3,n1,Vince camuto amore,https://www.amazon.com//sspa/click?ie=UTF8&spc...,n83,Vince camuto ciao,https://www.amazon.com//sspa/click?ie=UTF8&spc...,True,0.7215686274509804
4,n1,Vince camuto amore,https://www.amazon.com//sspa/click?ie=UTF8&spc...,n130,Vince camuto amore,https://www.amazon.com//sspa/click?ie=UTF8&spc...,True,1.0
...,...,...,...,...,...,...,...,...
318115,n2902,Shalis woman,,n2077,Shalis,,True,0.7
318116,n2902,Shalis woman,,n2175,Shalis man,,True,0.65
318117,n2904,Almarasim oud perfume,,n774,Aljefri oud,,True,0.506096681096681
318118,n2904,Almarasim oud perfume,,n1764,Arabian oud perfume bussma,,True,0.5657509157509157


In [78]:
true_predictions_df.to_csv(dir_ + 'sameAs_TRUE_predictions_jaro_jaccard_0.5.csv')

PermissionError: [Errno 13] Permission denied: '../data/entity_linking/sameAs_TRUE_predictions_jaro_jaccard_0.5.csv'

### haveSimilarPrices Entity Linking

In [None]:
true_predictions_sp = []

candidate_pairs = rltk.get_record_pairs(ds1, ds2)
for r1, r2 in candidate_pairs:
    if (r2.price.isdigit() and r1.price.isdigit()) and \
    (float(r2.price) >= float(r1.price)-5 and float(r2.price) <= float(r1.price)+5)\
    and r1.node_id != r2.node_id:
        true_predictions_sp.append([r1.node_id, r1.name, r1.price, r2.node_id, r2.name, r2.price])


In [None]:
true_predictions_sp_df = pd.DataFrame(data=np.array(true_predictions_sp), columns=['node_id1', 'name1', 'price1',\
                                                                   'node_id2', 'name2', 'price2'])

In [None]:
true_predictions_sp_df.to_csv(dir_ + 'haveSimilarPrices_TRUE_predictions.csv')

### haveSimilarScents Entity Linking

In [None]:
# !pip install -U sentence-transformers

In [None]:
allNodes_df = ds1.generate_dataframe()
allNodes_df = allNodes_df.loc[allNodes_df['scent']!='NULL']
allNodes_df = allNodes_df.loc[allNodes_df['scent']!='None']
allNodes_df

In [None]:
scents = list(allNodes_df['scent'])
node_ids = list(allNodes_df['node_id'])

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

#Our sentences we like to encode
sentences = scents

#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

In [None]:
embeddings.shape

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer, LoggingHandler, util, evaluation, models, InputExample
import logging
import os
import gzip
import csv
import random
import numpy as np
import torch

In [None]:
# Perform kmean clustering
num_clusters = 20
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append((node_ids[sentence_id], scents[sentence_id]))

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

In [None]:
true_predictions_ss = []

for cluster in clustered_sentences:
    
    #cross multiply in one cluster
    for i, leftnode in enumerate(cluster):
        for j, rightnode in enumerate(cluster):
            if i != j:
                true_predictions_ss.append([leftnode[0], leftnode[1], rightnode[0], rightnode[1]])
            

In [None]:
# true_predictions_ss
true_predictions_ss_df = pd.DataFrame(data=np.array(true_predictions_ss), columns=['node_id1', 'scent1',\
                                                                   'node_id2', 'scent2'])

In [None]:
true_predictions_ss_df.to_csv(dir_ + 'haveSimilarScents_TRUE_predictions.csv')