In [17]:
import rltk
import csv
from datetime import datetime
from dateutil import parser
import re
# You can use this tokenizer in case you need to manipulate some data.
# NOTE(review): none of the cells shown in this notebook reference `tokenizer`;
# the regex-based `my_tokenizer` is what the record classes call — confirm before removing.
tokenizer = rltk.tokenizer.crf_tokenizer.crf_tokenizer.CrfTokenizer()

# Task 1: Entity Resolution

## Task 1.1: Dataset Construction

In [20]:
def my_tokenizer(doc):
    """Split ``doc`` into its runs of regex word characters.

    Args:
        doc: Text to tokenize.

    Returns:
        list[str]: Every non-overlapping run of word characters, in order of
        appearance; an empty list when ``doc`` contains none.
    """
    word_pattern = re.compile(r"\w+")
    return word_pattern.findall(doc)

In [93]:
class GoodRecord(rltk.Record):
    """RLTK record exposing the fields of one Goodreads row.

    ``raw_object`` is indexed by column name; every property below is cached
    through ``rltk.cached_property``.
    """

    def __init__(self, raw_object):
        super().__init__(raw_object)
        # NOTE(review): `name` is not read by any cell shown in this notebook.
        self.name = ''

    @rltk.cached_property
    def id(self):
        """Value of the ``ID`` column."""
        return self.raw_object['ID']

    @rltk.cached_property
    def name_string(self):
        """Value of the ``Title`` column, unmodified."""
        return self.raw_object['Title']

    @rltk.cached_property
    def name_tokens(self):
        """Set of distinct tokens that ``my_tokenizer`` extracts from the title."""
        return set(my_tokenizer(self.name_string))

    @rltk.cached_property
    def ISBN(self):
        """Value of the ``ISBN`` column, unmodified."""
        return self.raw_object['ISBN']

    @rltk.cached_property
    def ISBN13(self):
        """Value of the ``ISBN13`` column, or a per-record placeholder when it is blank.

        The placeholder embeds this record's id, so a blank ISBN13 yields a
        string that is specific to this record rather than an empty value.
        """
        if not self.raw_object['ISBN13'].strip():
            return f"Good Item {self.id} has no valid ISBN13 record"
        return self.raw_object['ISBN13']

    @rltk.cached_property
    def author(self):
        """Value of the ``FirstAuthor`` column."""
        return self.raw_object['FirstAuthor']

    @rltk.cached_property
    def publication_date(self):
        """``PublishDate`` parsed by dateutil into a ``date``; ``None`` when unparseable."""
        date = self.raw_object['PublishDate']
        try:
            return parser.parse(date).date()
        except (ValueError, OverflowError, TypeError):
            # Only the failures dateutil documents for bad input are treated as
            # "no date"; anything else (e.g. KeyboardInterrupt) now propagates.
            return None

    @rltk.cached_property
    def publication_year(self):
        """Publication year as a string, or a per-record placeholder when there is no date."""
        published = self.publication_date
        if published is None:
            return f"Good Item {self.id} has no valid publish year record"
        return str(published.year)
    

class NobleRecord(rltk.Record):
    """RLTK record exposing the fields of one Barnes & Noble row.

    ``raw_object`` is indexed by column name; every property below is cached
    through ``rltk.cached_property``.
    """

    def __init__(self, raw_object):
        super().__init__(raw_object)
        # NOTE(review): `name` is not read by any cell shown in this notebook.
        self.name = ''

    @rltk.cached_property
    def id(self):
        """Value of the ``ID`` column."""
        return self.raw_object['ID']

    @rltk.cached_property
    def name_string(self):
        """Value of the ``Title`` column, unmodified."""
        return self.raw_object['Title']

    @rltk.cached_property
    def name_tokens(self):
        """Set of distinct tokens that ``my_tokenizer`` extracts from the title."""
        return set(my_tokenizer(self.name_string))

    @rltk.cached_property
    def ISBN13(self):
        """Value of the ``ISBN13`` column, or a per-record placeholder when it is blank.

        The placeholder embeds this record's id, so a blank ISBN13 yields a
        string that is specific to this record rather than an empty value.
        """
        if not self.raw_object['ISBN13'].strip():
            return f"Noble Item {self.id} has no valid ISBN13 record"
        return self.raw_object['ISBN13']

    @rltk.cached_property
    def author(self):
        """Value of the ``Author1`` column."""
        return self.raw_object['Author1']

    @rltk.cached_property
    def publication_date(self):
        """``PublicationDate`` parsed as ``%m/%d/%Y`` into a ``date``; ``None`` when it does not match."""
        date = self.raw_object['PublicationDate']
        try:
            return datetime.strptime(date, "%m/%d/%Y").date()
        except (ValueError, TypeError):
            # strptime raises ValueError for a non-matching string and TypeError
            # for a non-string; anything else now propagates instead of being hidden.
            return None

    @rltk.cached_property
    def publication_year(self):
        """Publication year as a string, or a per-record placeholder when there is no date."""
        published = self.publication_date
        if published is None:
            return f"Noble Item {self.id} has no valid publish year record"
        return str(published.year)

In [94]:
# Locations of the two source catalogues (relative to the notebook's working directory).
good_file_path = 'goodreads.csv'
noble_file_path = 'barnes_and_nobles.csv'

# Each CSV is wrapped in an RLTK dataset whose rows are exposed through the matching record class.
# NOTE(review): the file objects are opened inline and never closed explicitly; whether it is
# safe to close them right after construction depends on when rltk.Dataset consumes the reader — confirm.
good_ds = rltk.Dataset(rltk.CSVReader(open(good_file_path, encoding="UTF8")),record_class=GoodRecord)
noble_ds = rltk.Dataset(rltk.CSVReader(open(noble_file_path, encoding="UTF8")),record_class=NobleRecord)

## Task 1.2: Blocking

In [96]:
# Block generator used by every blocking scheme built in this section.
bg = rltk.HashBlockGenerator()

In [83]:
# Load the labelled development pairs: `columns` receives the header row and
# `dev` every following row, as lists of strings.
dev_set_file = 'dev.csv'
dev = []
# newline='' lets the csv module handle line endings itself, as its documentation requires.
with open(dev_set_file, encoding='utf-8', errors="replace", newline='') as csv_file:
    # Rows with at most one field (e.g. blank lines) carry no pair and are ignored.
    labelled_rows = (row for row in csv.reader(csv_file, delimiter=',') if len(row) > 1)
    # First surviving row is the header; default to [] so an empty file cannot
    # leave `columns` undefined for the summary below.
    columns = next(labelled_rows, [])
    dev.extend(labelled_rows)

print(f'Column names are: {", ".join(columns)}')
print(f'Processed {len(dev)} lines.')

Column names are: goodreads.ID, barnes_and_nobles.ID, label
Processed 297 lines.


In [97]:
# Four blocking schemes over the two datasets. The trailing "#x,y" figures are the
# author's recorded scores for each scheme (their exact meaning is not stated here).

# Title blocking: the key is the record's title tokens joined with commas.
# The tokens live in a set, and two equal sets are not guaranteed to iterate in
# the same order, so they are sorted first; otherwise identical token sets could
# produce different keys and land in different blocks.
name_tokens_block = bg.generate(
    bg.block(good_ds, function_=lambda r: ",".join(sorted(r.name_tokens))),
    bg.block(noble_ds, function_=lambda r: ",".join(sorted(r.name_tokens)))
) #0.50,0.97

# Publication-year blocking (records without a year expose a per-record placeholder string).
year_block = bg.generate(
    bg.block(good_ds, property_='publication_year'),
    bg.block(noble_ds, property_='publication_year')
) #0.23,0.80

# Author blocking built on top of the year blocks via `base_on`
# (presumably this refines each year block by author — confirm against the RLTK docs).
author_block = bg.generate(
    bg.block(good_ds, property_='author', base_on=year_block),
    bg.block(noble_ds, property_='author', base_on=year_block)
) #0.69,0.79

# ISBN13 blocking (records without an ISBN13 expose a per-record placeholder string).
ISBN_block = bg.generate(
    bg.block(good_ds, property_='ISBN13'),
    bg.block(noble_ds, property_='ISBN13')
) #0.12,0.55

In [98]:
# Union of the record pairs produced by the title, author and ISBN13 blocks,
# in that order, keeping the first occurrence of each pair.
# Duplicates are detected through a set of (goodreads id, barnes_and_nobles id)
# keys, so each membership check is O(1) instead of a scan of the whole list.
candidate_pairs = []
_seen_pair_ids = set()
for _block in (name_tokens_block, author_block, ISBN_block):
    for pair in rltk.get_record_pairs(good_ds, noble_ds, block=_block):
        _pair_ids = (pair[0].id, pair[1].id)
        if _pair_ids not in _seen_pair_ids:
            _seen_pair_ids.add(_pair_ids)
            candidate_pairs.append(pair)

In [99]:
# Build the ground truth from the labelled dev rows and, in the same pass, tally:
#   C_ - dev pairs (any label) that appear in candidate_pairs
#   Nm - dev pairs labelled '1'
#   Sm - dev pairs labelled '1' that appear in candidate_pairs
gt = rltk.GroundTruth()
C_ = 0
Sm = 0
Nm = 0
for row in dev:
    r1 = good_ds.get_record(row[0])
    r2 = noble_ds.get_record(row[1])
    pair = (r1, r2)
    is_candidate = pair in candidate_pairs
    is_match = row[-1] == '1'

    if is_candidate:
        C_ += 1

    if not is_match:
        gt.add_negative(r1.raw_object['ID'], r2.raw_object['ID'])
        continue

    if is_candidate:
        Sm += 1
    Nm += 1
    gt.add_positive(r1.raw_object['ID'], r2.raw_object['ID'])

rltk.Trial(gt)

<rltk.evaluation.trial.Trial at 0x196e616a800>

In [100]:
# Blocking quality figures derived from the tallies of the previous cell.
# N is hard-coded — presumably len(goodreads) * len(barnes_and_nobles); verify against the data.
N = 3700 * 3966
C = len(candidate_pairs)
# NOTE(review): this is the fraction of all pairs that survive blocking; "reduction ratio"
# is commonly defined as 1 - C/N — confirm which definition is expected.
RR = C / N
PC = Sm / Nm
precision = Sm / C_
for _label, _value in (
    ("Reduction Ratio", RR),
    ("Pairwise Completeness", PC),
    ("Precision", precision),
):
    print(f"{_label}: {_value}")

Reduction Ratio: 0.0002757220155102152
Pairwise Completeness: 0.9701492537313433
Precision: 0.7386363636363636


In [101]:
# Persist the candidate pairs as (goodreads id, barnes_and_nobles id) rows.
# newline='' is required by the csv module for files handed to csv.writer; without it
# an extra blank line is written after every row on platforms that use \r\n line endings.
with open('Ruijie_Rao_blocked.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(("goodreads.ID", "barnes_and_nobles.ID"))
    for r1, r2 in candidate_pairs:
        writer.writerow((r1.id, r2.id))

## Task 1.3: Entity Linkage

In [102]:
import math
from collections import Counter

In [103]:
def name_jaro_similarity(r1, r2):
    """Jaro-Winkler similarity between the raw titles of two records.

    Args:
        r1: Record exposing ``name_string``.
        r2: Record exposing ``name_string``.

    Returns:
        Whatever ``rltk.jaro_winkler_similarity`` returns for the two titles.
    """
    return rltk.jaro_winkler_similarity(r1.name_string, r2.name_string)

In [104]:
def cos_similarity(vec1, vec2):
    """Cosine similarity of two sparse vectors given as ``{key: weight}`` mappings.

    Args:
        vec1: Mapping from key to numeric weight.
        vec2: Mapping from key to numeric weight.

    Returns:
        float: Dot product over the shared keys divided by the product of the
        two Euclidean norms; ``0.0`` when that product is zero (for instance
        when either mapping is empty).
    """
    shared_keys = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[key] * vec2[key] for key in shared_keys)

    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    magnitude = norm1 * norm2

    if magnitude:
        return float(dot_product) / magnitude
    return 0.0

In [105]:
def name_tokens_cos_similarity(r1, r2):
    """Cosine similarity between the title-token counts of two records.

    Args:
        r1: Record exposing ``name_tokens``.
        r2: Record exposing ``name_tokens``.

    Returns:
        The result of ``cos_similarity`` on the two ``Counter`` objects built
        from each record's ``name_tokens``.
    """
    token_counts = [Counter(record.name_tokens) for record in (r1, r2)]
    return cos_similarity(*token_counts)

In [106]:
def author_tokens_cos_similarity(r1, r2):
    """Cosine similarity between the author-token counts of two records.

    Args:
        r1: Record exposing ``author``.
        r2: Record exposing ``author``.

    Returns:
        The result of ``cos_similarity`` on the token counts obtained by running
        ``my_tokenizer`` over each record's ``author`` string.
    """
    author_counts = [Counter(my_tokenizer(record.author)) for record in (r1, r2)]
    return cos_similarity(*author_counts)

In [107]:
def ISBN_equality(r1, r2):
    """Return 1 when both records expose the same ``ISBN13`` value, otherwise 0.

    Args:
        r1: Record exposing ``ISBN13``.
        r2: Record exposing ``ISBN13``.
    """
    return 1 if r1.ISBN13 == r2.ISBN13 else 0

In [108]:
def publish_year_equality(r1, r2):
    """Score how well the publication years of two records agree.

    Args:
        r1: Record exposing ``publication_year``.
        r2: Record exposing ``publication_year``.

    Returns:
        ``0.5`` when the two values are equal, or when at least one of them is
        not an integer string (the year is unknown, so it is not counted as a
        mismatch); ``0`` when both are integers and differ.
    """
    y1 = r1.publication_year
    y2 = r2.publication_year
    if y1 == y2:
        return 0.5
    try:
        int(y1)
        int(y2)
    except (TypeError, ValueError):
        # Only "this is not a year" failures are treated as unknown; any other
        # error propagates instead of being silently swallowed.
        return 0.5
    return 0

In [109]:
# Decision threshold applied to the weighted score computed in rule_based_method.
MY_TRESH = 0.83


def rule_based_method(r1, r2):
    """Decide whether two records describe the same book.

    Args:
        r1: First record of the pair.
        r2: Second record of the pair.

    Returns:
        tuple: ``(True, 1)`` when ``ISBN_equality`` reports a match; otherwise
        ``(is_match, score)`` where ``score`` is 0.2 times the publication-year
        score plus 0.75 times the title-token cosine similarity, and
        ``is_match`` is ``score > MY_TRESH``.
    """
    if ISBN_equality(r1, r2) == 1:
        return True, 1

    year_score = publish_year_equality(r1, r2)
    title_score = name_tokens_cos_similarity(r1, r2)
    weighted = 0.2 * year_score + 0.75 * title_score

    is_match = weighted > MY_TRESH
    return is_match, weighted

In [110]:
# Score every labelled pair of the ground truth with the rule-based matcher
# and record each decision and confidence in a fresh trial.
trial = rltk.Trial(gt)
gt_candidate_pairs = rltk.get_record_pairs(good_ds, noble_ds, ground_truth=gt)
for r1, r2 in gt_candidate_pairs:
    trial.add_result(r1, r2, *rule_based_method(r1, r2))

In [111]:
# Evaluate the trial and print its headline scores followed by one line per
# confusion-matrix cell: the rate and, in brackets, the number of pairs.
trial.evaluate()
print('Trial statistics based on Ground-Truth from development set data:')
print('precison:', trial.precision, 'recall:', trial.recall, 'f-measure:', trial.f_measure)
for _tag, _attr in (
    ('tp', 'true_positives'),
    ('fp', 'false_positives'),
    ('tn', 'true_negatives'),
    ('fn', 'false_negatives'),
):
    _rate = getattr(trial, _attr)
    _pairs = getattr(trial, _attr + '_list')
    print(f'{_tag}: {_rate:.06f} [{len(_pairs)}]')

Trial statistics based on Ground-Truth from development set data:
precison: 0.9180327868852459 recall: 0.835820895522388 f-measure: 0.875
tp: 0.835821 [56]
fp: 0.021739 [5]
tn: 0.978261 [225]
fn: 0.164179 [11]


In [112]:
# Write the matcher's decision and confidence for every labelled ground-truth pair.
# newline='' is required by the csv module for files handed to csv.writer; without it
# an extra blank line is written after every row on platforms that use \r\n line endings.
with open('Ruijie_Rao_predictions.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(("goodreads.ID", "barnes_and_nobles.ID", "prediction", "confidence"))
    gt_candidate_pairs = rltk.get_record_pairs(good_ds, noble_ds, ground_truth=gt)
    for r1, r2 in gt_candidate_pairs:
        result, confidence = rule_based_method(r1, r2)
        # The boolean decision is stored as 0/1.
        writer.writerow((r1.id, r2.id, int(result), confidence))

## Task 1.4: Prediction

In [113]:
# Recompute the union of the record pairs produced by the title, author and
# ISBN13 blocks, in that order, keeping the first occurrence of each pair.
# Duplicates are detected through a set of (goodreads id, barnes_and_nobles id)
# keys, so each membership check is O(1) instead of a scan of the whole list.
candidate_pairs = []
_seen_pair_ids = set()
for _block in (name_tokens_block, author_block, ISBN_block):
    for pair in rltk.get_record_pairs(good_ds, noble_ds, block=_block):
        _pair_ids = (pair[0].id, pair[1].id)
        if _pair_ids not in _seen_pair_ids:
            _seen_pair_ids.add(_pair_ids)
            candidate_pairs.append(pair)

In [114]:
# Write only the candidate pairs that the rule-based matcher accepts.
# newline='' is required by the csv module for files handed to csv.writer; without it
# an extra blank line is written after every row on platforms that use \r\n line endings.
with open('Ruijie_Rao_valid_predictions.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(("goodreads.ID", "barnes_and_nobles.ID"))
    for r1, r2 in candidate_pairs:
        result, confidence = rule_based_method(r1, r2)
        if result:
            writer.writerow((r1.id, r2.id))

# Task 2: Knowledge Representation

## Task 2.1: Construct Model

In [3]:
from rdflib import Graph, URIRef, Literal, XSD, Namespace, RDF, BNode, RDFS
import csv

In [115]:
# Read back the predicted matches: the ids seen on each side plus the list of
# (goodreads id, barnes_and_nobles id) pairs in file order.
good_books_id = set()
noble_books_id = set()
book_matchings_id = []
# newline='' lets the csv module handle line endings itself, as its documentation requires.
with open("Ruijie_Rao_valid_predictions.csv", encoding='utf-8', errors="replace", newline='') as csv_file:
    # Rows with at most one field (e.g. blank lines) carry no pair and are ignored.
    prediction_rows = (row for row in csv.reader(csv_file, delimiter=',') if len(row) > 1)
    next(prediction_rows, None)  # the first surviving row is the header
    for row in prediction_rows:
        good_id, noble_id = row[0], row[1]
        good_books_id.add(good_id)
        noble_books_id.add(noble_id)
        book_matchings_id.append((good_id, noble_id))

In [116]:
# Namespaces used to mint the IRIs of the knowledge graph built in the following cells.
FOAF = Namespace('http://xmlns.com/foaf/0.1/')
SCHEMA = Namespace('https://schema.org/')
# Base IRI under which book subjects are minted when a row has a non-blank ISBN13 column.
ISBN = Namespace('https://isbndb.com/book/')
# NOTE(review): DBpedia ontology terms are usually published under
# http://dbpedia.org/ontology/ (http, not https) — confirm the intended IRI.
DBPD = Namespace('https://dbpedia.org/ontology/')
# Project-local namespace: custom properties and subjects for books without an ISBN13.
MYNS = Namespace('http://dsci558.org/myfakenamespace#')

In [150]:
# Empty graph with a readable prefix registered for every namespace used below.
my_kg = Graph()
for _prefix, _namespace in (
    ('isbn', ISBN),
    ('foaf', FOAF),
    ('schema', SCHEMA),
    ('dbpedia', DBPD),
    ('myns', MYNS),
):
    my_kg.bind(_prefix, _namespace)

In [151]:
# Add one schema:Book per matched Goodreads row to the graph.
# Columns are addressed by position; the comments name each column after the
# predicate it feeds (presumed CSV layout — verify against the file header).
# newline='' lets the csv module handle line endings itself, as its documentation requires.
with open("goodreads.csv", encoding='utf-8', errors="replace", newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        if len(row) <= 1:
            continue
        # Keep only books that take part in a predicted match (this also drops
        # any row whose first field is not a matched id, such as a header).
        if row[0] not in good_books_id:
            continue
        # `book_key` rather than `id`, so the builtin id() is not shadowed.
        book_key = f"good_{row[0]}"
        # Subject IRI: ISBN-based when the ISBN13 column is non-blank, project-local otherwise.
        if row[4].strip():
            subject = URIRef(ISBN[row[4]])
        else:
            subject = MYNS[book_key]

        my_kg.add((subject, SCHEMA.archivedAt, Literal("https://www.goodreads.com/")))
        my_kg.add((subject, RDF.type, SCHEMA.Book))
        my_kg.add((subject, SCHEMA.name, Literal(row[1])))
        my_kg.add((subject, SCHEMA.description, Literal(row[2])))
        if row[3].strip():
            my_kg.add((subject, SCHEMA.isbn, Literal(row[3])))
        if row[4].strip():
            my_kg.add((subject, SCHEMA.gtin13, Literal(row[4])))
        # A blank page count used to crash int(); it is now simply omitted.
        if row[5].strip():
            my_kg.add((subject, SCHEMA.numberOfPages, Literal(int(row[5]), datatype=XSD.integer)))
        # Up to three authors.
        # NOTE(review): schema:Person is a class rather than a literal datatype; the
        # typed-literal modelling is kept as-is so the generated model does not change.
        for i in [6, 7, 8]:
            if row[i].strip():
                my_kg.add((subject, SCHEMA.author, Literal(row[i], datatype=SCHEMA.Person)))

        # Rating: one blank node per book, labelled so repeated runs reuse the same node.
        rating_node = BNode("rating_" + book_key)
        my_kg.add((rating_node, RDF.type, SCHEMA.AggregateRating))
        my_kg.add((rating_node, SCHEMA.ratingValue, Literal(row[9], datatype=XSD.float)))
        my_kg.add((rating_node, SCHEMA.ratingCount, Literal(row[10], datatype=XSD.integer)))
        my_kg.add((subject, SCHEMA.aggregateRating, rating_node))

        # Reviews: the count may contain thousands separators; a blank count is omitted
        # instead of crashing int().
        reviews_node = BNode("reviews_" + book_key)
        my_kg.add((reviews_node, RDF.type, SCHEMA.Review))
        review_count = row[11].replace(",", "").strip()
        if review_count:
            my_kg.add((reviews_node, SCHEMA.commentCount, Literal(int(review_count), datatype=XSD.integer)))
        my_kg.add((subject, SCHEMA.review, reviews_node))

        if row[12].strip():
            my_kg.add((subject, SCHEMA.publisher, Literal(row[12], datatype=SCHEMA.Organization)))
        if row[13].strip():
            my_kg.add((subject, SCHEMA.datePublished, Literal(row[13], datatype=SCHEMA.Date)))
        # Format: a single-word value becomes a schema.org IRI, anything else a plain literal.
        if row[14].strip():
            if len((row[14].split(" "))) == 1:
                my_kg.add((subject, SCHEMA.bookFormat, SCHEMA[row[14]]))
            else:
                my_kg.add((subject, SCHEMA.bookFormat, Literal(row[14])))
        if row[15].strip():
            my_kg.add((subject, SCHEMA.inLanguage, SCHEMA[row[15]]))

In [152]:
# Declare the project-specific properties used for Barnes & Noble data.

# myns:salesRank — an integer-valued property of books, specialising dbpedia:rank.
sales_rank = MYNS["salesRank"]
for _predicate, _value in (
    (RDF.type, RDF.Property),
    (RDFS.subPropertyOf, DBPD.rank),
    (RDFS.domain, SCHEMA.Book),
    (RDFS.range, XSD.integer),
):
    my_kg.add((sales_rank, _predicate, _value))

# One price property per edition type, each specialising schema:price.
# NOTE(review): XML Schema defines no `dollar` datatype; the IRI is kept because the
# price literals created by the Barnes & Noble loading cell use the same one.
for price in ["paperbackPrice", "hardcoverPrice", "nookBookPrice", "audiobookPrice"]:
    price_property = MYNS[price]
    my_kg.add((price_property, RDF.type, RDF.Property))
    my_kg.add((price_property, RDFS.subPropertyOf, SCHEMA.price))
    my_kg.add((price_property, RDFS.domain, SCHEMA.Book))
    my_kg.add((price_property, RDFS.range, XSD.dollar))

In [153]:
# Add one schema:Book per matched Barnes & Noble row to the graph.
# Columns are addressed by position; the comments name each column after the
# predicate it feeds (presumed CSV layout — verify against the file header).
# newline='' lets the csv module handle line endings itself, as its documentation requires.
with open("barnes_and_nobles.csv", encoding='utf-8', errors="replace", newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    for row in csv_reader:
        if len(row) <= 1:
            continue
        # Keep only books that take part in a predicted match (this also drops
        # any row whose first field is not a matched id, such as a header).
        if row[0] not in noble_books_id:
            continue
        # Short rows are padded with empty fields so every index up to 16 exists.
        if len(row) < 17:
            row = row + [""] * (17 - len(row))
        # `book_key` rather than `id`, so the builtin id() is not shadowed.
        book_key = f"noble_{row[0]}"
        # Subject IRI: ISBN-based when the ISBN13 column is non-blank, project-local otherwise.
        if row[6].strip():
            subject = URIRef(ISBN[row[6]])
        else:
            subject = MYNS[book_key]

        my_kg.add((subject, RDF.type, SCHEMA['Book']))
        my_kg.add((subject, SCHEMA.archivedAt, Literal("https://www.barnesandnoble.com/")))
        my_kg.add((subject, SCHEMA.name, Literal(row[1])))
        if row[6].strip():
            my_kg.add((subject, SCHEMA.gtin13, Literal(row[6])))
        if row[8].strip():
            my_kg.add((subject, SCHEMA.numberOfPages, Literal(int(row[8]), datatype=XSD.integer)))
        # Up to three authors.
        # NOTE(review): schema:Person is a class rather than a literal datatype; the
        # typed-literal modelling is kept as-is so the generated model does not change.
        for i in [2, 3, 4]:
            if row[i].strip():
                my_kg.add((subject, SCHEMA.author, Literal(row[i], datatype=SCHEMA.Person)))

        # Rating: added only when column 11 (used as ratingCount) is non-blank.
        # NOTE(review): column 12 (used as ratingValue) is not checked separately.
        if row[11].strip():
            rating_node = BNode("rating_" + book_key)
            my_kg.add((rating_node, RDF.type, SCHEMA.AggregateRating))
            my_kg.add((rating_node, SCHEMA.ratingValue, Literal(row[12], datatype=XSD.float)))
            my_kg.add((rating_node, SCHEMA.ratingCount, Literal(row[11], datatype=XSD.integer)))
            my_kg.add((subject, SCHEMA['aggregateRating'], rating_node))

        if row[5].strip():
            my_kg.add((subject, SCHEMA.publisher, Literal(row[5], datatype=SCHEMA.Organization)))
        if row[7].strip():
            my_kg.add((subject, SCHEMA.datePublished, Literal(row[7], datatype=SCHEMA.Date)))
        if row[9].strip():
            my_kg.add((subject, SCHEMA.size, Literal(row[9], datatype=SCHEMA.SizeSpecification)))
        if row[10].strip():
            my_kg.add((subject, MYNS.salesRank, Literal(int(row[10].replace(",", "")), datatype=XSD.integer)))

        # Prices: one custom property per edition type. The "$" sign and any thousands
        # separators are removed before parsing, so a value such as "$1,299.00" no longer crashes float().
        for column, price_property in (
            (13, MYNS.paperbackPrice),
            (14, MYNS.hardcoverPrice),
            (15, MYNS.nookBookPrice),
            (16, MYNS.audiobookPrice),
        ):
            price_text = row[column].strip()
            if price_text:
                amount = float(price_text.strip("$").replace(",", ""))
                my_kg.add((subject, price_property, Literal(amount, datatype=XSD.dollar)))
        

In [154]:
def _book_subject(record, local_prefix, record_id):
    """Return the graph subject for ``record``.

    Uses the same rule as the cells that load the books into the graph: an
    ISBN-based IRI when the raw ``ISBN13`` value is non-blank, otherwise a
    project-local IRI built from ``local_prefix`` and ``record_id``.
    (Assumes the positional ISBN13 column read by those cells is the CSV's
    ``ISBN13`` column — verify against the file headers.)
    """
    isbn13 = record.raw_object['ISBN13']
    if isbn13.strip():
        return ISBN[isbn13]
    return MYNS[local_prefix + record_id]


# Link every predicted match in both directions with rdfs:seeAlso.
# The previous int()-based test inside a bare `except:` could pick a different
# subject than the loading cells for a non-blank, non-numeric ISBN13, and it
# also hid any unrelated error.
for r1_id, r2_id in book_matchings_id:
    r1_subject = _book_subject(good_ds.get_record(r1_id), "good_", r1_id)
    r2_subject = _book_subject(noble_ds.get_record(r2_id), "noble_", r2_id)
    my_kg.add((r1_subject, RDFS.seeAlso, r2_subject))
    my_kg.add((r2_subject, RDFS.seeAlso, r1_subject))

In [155]:
# Write the finished graph to disk in Turtle syntax.
my_kg.serialize('Ruijie_Rao_model.ttl', format="turtle")

<Graph identifier=N4f6a8586e873425a899379da34fb8302 (<class 'rdflib.graph.Graph'>)>