## Dataset analysis & RLTK components construction

In [1]:
# !pip install rltk

Collecting rltk
  Using cached rltk-2.0.0a20-py3-none-any.whl (81 kB)
Collecting pyrallel.lib
  Using cached pyrallel.lib-0.0.10-py3-none-any.whl (24 kB)
Collecting multiprocess>=0.70
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
     -------------------------------------- 132.9/132.9 kB 2.6 MB/s eta 0:00:00
Collecting typing>=3.6
  Using cached typing-3.7.4.3-py3-none-any.whl
Collecting dill>=0.3
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
     -------------------------------------- 110.5/110.5 kB 6.3 MB/s eta 0:00:00
Installing collected packages: typing, dill, multiprocess, pyrallel.lib, rltk
  Attempting uninstall: dill
    Found existing installation: dill 0.3.4
    Uninstalling dill-0.3.4:
      Successfully uninstalled dill-0.3.4
Successfully installed dill-0.3.6 multiprocess-0.70.14 pyrallel.lib-0.0.10 rltk-2.0.0a20 typing-3.7.4.3


### Task 1-1. Construct RLTK Datasets

First, you need define how a single entry would like for each type of record (for each dataset)

In [2]:
import rltk
import csv

# You can use this tokenizer in case you need to manipulate some data
tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()

In [23]:
'''
Feel free to add more columns here for use in record linkage.
'''

class Perfume(rltk.Record):
    def __init__(self, raw_object):
        super().__init__(raw_object)
#         self.name = ''

    @rltk.cached_property
    def id(self):
        return self.raw_object['_id']
    
    @rltk.cached_property
    def node_id(self):
        return self.raw_object['node_id']

    @rltk.cached_property
    def name(self):
        return self.raw_object['name']

    @rltk.cached_property
    def brand(self):
        return self.raw_object['brand']
    
    @rltk.cached_property
    def url(self):
        return self.raw_object['url']
    
    @rltk.cached_property
    def price(self):
        return self.raw_object['price']
    
    @rltk.cached_property
    def scent(self):
        return self.raw_object['scent']

# class NobleRecord(rltk.Record):
#     def __init__(self, raw_object):
#         super().__init__(raw_object)
#         self.name = ''

#     @rltk.cached_property
#     def id(self):
#         return self.raw_object['ID']

#     @rltk.cached_property
#     def name_string(self):
#         return self.raw_object['Title']
    
#     @rltk.cached_property
#     def name_tokens(self):
#         return set(tokenizer.tokenize(self.name_string))

In [24]:
dir_ = '../data/entity_linking/'
file1 = dir_ + 'allNodes.csv'
file2 = dir_ + 'allNodes.csv'

ds1 = rltk.Dataset(rltk.CSVReader(open(file1, encoding='utf-8')),record_class=Perfume)
ds2 = rltk.Dataset(rltk.CSVReader(open(file2, encoding ='utf-8')),record_class=Perfume)

You can load your csv files into RLTK using this method:

And we can inspect a few entries:

In [25]:
# print some entries
print(ds1.generate_dataframe().head(5))
print('\n*********************************************\n')
print(ds2.generate_dataframe().head(5))

  id node_id                   name   brand  \
0  0      n1                   No.5  chanel   
1  1      n2          Chance Chanel  chanel   
2  2      n3            CHANEL No 5  chanel   
3  3      n4            Chanel No 5  chanel   
4  4      n5  Chanel Bleu De Chanel  chanel   

                                                 url   price   scent  
0  https://www.amazon.com//sspa/click?ie=UTF8&spc...      30   Apple  
1  https://www.amazon.com//Chance-Chanel-Tendre-W...  148.98    NULL  
2  https://www.amazon.com//CHANEL-No-Eau-Parfum-1...    19.9    NULL  
3  https://www.amazon.com//Chanel-Parfum-Spray-Pe...  133.89   Fresh  
4  https://www.amazon.com//Chanel-Toilette-Spray-...     108  Citrus  

*********************************************

  id node_id                   name   brand  \
0  0      n1                   No.5  chanel   
1  1      n2          Chance Chanel  chanel   
2  2      n3            CHANEL No 5  chanel   
3  3      n4            Chanel No 5  chanel   
4  4    

### Task 1-2. Blocking

First, we'll load dev set to evaluate both blocking (Task 1-2) and entity linking (Task 1-3).

In [7]:
# dev_set_file = dir_ + 'dev.csv'
# dev = []
# with open(dev_set_file, encoding='utf-8', errors="replace") as csv_file:
#     csv_reader = csv.reader(csv_file, delimiter=',')
#     line_count = 0
#     for row in csv_reader:
#         if len(row) <= 1:
#             continue
#         if line_count == 0:
#             columns = row
#             line_count += 1
#         else:
#             dev.append(row)
#     print(f'Column names are: {", ".join(columns)}')
#     print(f'Processed {len(dev)} lines.')

# gt = rltk.GroundTruth()
# for row in dev:    
#     r1 = ds1.get_record(row[0])
#     r2  = ds2.get_record(row[1])
#     if row[-1] == '1':
#         gt.add_positive(r1.raw_object['ID'], r2.raw_object['ID'])
#     else:
#         gt.add_negative(r1.raw_object['ID'], r2.raw_object['ID'])

# rltk.Trial(gt)

Column names are: goodreads.ID, barnes_and_nobles.ID, label
Processed 297 lines.


<rltk.evaluation.trial.Trial at 0x1f36d23d0d0>

Then, you can build your own blocking techniques and evaluate it.

Hint:

- What is the total number of pairs without blocking? 
- what is the number of paris with blocking?
- After blocking, how many "correct" (matched) pairs presented in dev set?


### Task 1-3. Entity Linking

Here are 2 example functions for field (attribute) similarity:

In [19]:
def name_string_similarity_1(r1, r2):
    ''' Example dummy similiary function '''
    s1 = r1.name
    s2 = r2.name
    
    return rltk.jaro_winkler_similarity(s1, s2)
    
def name_string_similarity_2(r1, r2):
    ''' Example dummy similiary function '''
    s1 = r1.name
    s2 = r2.name
    
    if s1 == s2:
        return 1
    
    return 0

Here's how you can combine multiple similarity functions into a single weightened scoring function:

In [41]:
# threshold value to determine if we are confident the record match
MY_TRESH = 0.6 # this number is just an example, you need to change it

# entity linkage scoring function
def rule_based_method(r1, r2):
    score_1 = name_string_similarity_1(r1, r2)
    score_2 = name_string_similarity_2(r1, r2)
    
    total = 0.7 * score_1 + 0.3 * score_2
    
    # return two values: boolean if they match or not, float to determine confidence
    return total > MY_TRESH, total

Lets run some candidates using the ground-truth

In [37]:
# # trial = rltk.Trial(gt)
# # candidate_pairs = rltk.get_record_pairs(ds1, ds2, ground_truth=gt)
# candidate_pairs = rltk.get_record_pairs(ds1, ds2)
# count = 0
# for r1, r2 in candidate_pairs:
#     if r1.brand == r2.brand:
#         result, confidence = rule_based_method(r1, r2)
# #     trial.add_result(r1, r2, result, confidence)
#         print(result, confidence)

Now lets evaluate our trial results

In [38]:
# trial.evaluate()
# print('Trial statistics based on Ground-Truth from development set data:')
# print(f'tp: {trial.true_positives:.06f} [{len(trial.true_positives_list)}]')
# print(f'fp: {trial.false_positives:.06f} [{len(trial.false_positives_list)}]')
# print(f'tn: {trial.true_negatives:.06f} [{len(trial.true_negatives_list)}]')
# print(f'fn: {trial.false_negatives:.06f} [{len(trial.false_negatives_list)}]')

In [39]:
# trial.f_measure

### Save Test predictions
You will be evaluated on dev and test predictions, over a hidden ground truth.

In [40]:
# test_set_file = dir_ + 'test.csv'
# test = []
# with open(test_set_file, encoding='utf-8', errors="replace") as csv_file:
#     csv_reader = csv.reader(csv_file, delimiter=',')
#     line_count = 0
#     for row in csv_reader:
#         if len(row) <= 1:
#             continue
#         if line_count == 0:
#             columns = row
#             line_count += 1
#         else:
#             test.append(row)
#     print(f'Column names are: {", ".join(columns)}')
#     print(f'Processed {len(test)} lines.')

In [42]:
predictions = []


candidate_pairs = rltk.get_record_pairs(ds1, ds2)
for r1, r2 in candidate_pairs:
    if r1.brand == r2.brand:
        result, confidence = rule_based_method(r1, r2)
#         print(result, confidence)
        predictions.append([r1.node_id, r1.name, r1.url, r2.node_id, r2.name, r2.url, result, confidence])
    
# for id1, id2 in test:
#     r1 = ds1.get_record(id1)
#     r2  = ds2.get_record(id2)
#     result, confidence = rule_based_method(r1, r2)
#     predictions.append((r1.id, r2.id, result, confidence))

In [43]:
len(predictions), len(ds1.generate_dataframe()), len(ds2.generate_dataframe())

(569775, 2739, 2739)

In [44]:
# with open(dir_ + 'predictions.csv', mode='w', encoding='utf-8') as file:
#     writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
#     for row in predictions:
#         writer.writerow(row)

In [45]:
import pandas as pd
import numpy as np

predictions_df = pd.DataFrame(data=np.array(predictions), columns=['node_id1', 'name1', 'url1',\
                                                                   'node_id2', 'name2', 'url2',\
                                                                   'result', 'confidence'])


In [46]:
predictions_df.to_csv(dir_ + 'samAs_predictions.csv')