In [45]:
# Utilities
import re
import os
# Preprocessing
from html import unescape
from fuzzywuzzy import fuzz
# Data management
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("./data/book.txt", 
                   sep='\t', 
                   lineterminator='\n', 
                   header=None, 
                   names=["source", "isbn", "title", "authors"])

In [3]:
df.head()

Unnamed: 0,source,isbn,title,authors
0,eCampus.com,201853949,"The art Of Computer Programming, Fascicle 3: G...",Not Available\r
1,Indoo.com,201853949,"Art of Computer Programming, Volume 4, Fascicl...","Knuth, Donald E.\r"
2,textbookxdotcom,201853949,"The 'art Of Computer Programming, Fascicle 3 G...",\r
3,A1Books,201853949,"The Art of Computer Programming, Volume 4, Fas...","Knuth, Donald E.\r"
4,textbooksNow,201853949,Art of Computer Programming,Knuth\r


In [4]:
df.describe()

Unnamed: 0,source,isbn,title,authors
count,33971,33971,33968,33971
unique,895,1265,11095,9627
top,A1Books,321263588,Modern Database Management,\r
freq,2403,159,90,713


# Preprocessing

In [10]:
def clean(df, gs=False):
    if gs:
        df["authors"] = df["authors"].str.lower()
        df["authors"] = df["authors"].str.replace('[^\w\s]|\\r','')
        df.authors = df.authors.replace('\s+', ' ', regex=True)
        df.authors = df.authors.replace(r'\b\w\b', ' ', regex=True)
    else:
        # Unescape HTML left over from scraping
        df.authors = df.authors.apply(unescape)
        df.title = df.title.apply(lambda x: unescape(str(x)))
        # Lowercase 
        df["authors"] = df["authors"].str.lower()
        df["title"] = df["title"].str.lower()
        df["source"] = df["source"].str.lower()
        # Removes parenthesis from author list
        df["authors"] = df["authors"].str.replace('\((.*?)\)','')
        # Replaces | in author list with spaces
        df["authors"] = df["authors"].str.replace('\|',' ')
        # Special characters removal (removes all special characters left)
        df["title"] = df["title"].str.replace('[^\w\s]|\\r','')
        df["authors"] = df["authors"].str.replace('[^\w\s]|\\r','')
        # Uniformize missing values
        df["authors"] = df["authors"].str.replace('not available','')
        df["authors"] = df["authors"].replace(r'^\s*$', np.nan, regex=True)
        # Remove doubled whitespaces
        df.authors = df.authors.replace('\s+', ' ', regex=True)
        df.title = df.title.replace('\s+', ' ', regex=True)
        # Remove single characters in authors names
        df.authors = df.authors.replace(r'\b\w\b', ' ', regex=True)

In [11]:
clean(df)

In [14]:
df.describe()

Unnamed: 0,source,isbn,title,authors
count,33971,33971,33971,33227
unique,894,1265,7195,7222
top,a1books,321263588,computer networking and the internet,knuth donald
freq,2403,159,108,133


In [24]:
sum(df.duplicated())

7508

In [33]:
df = df.drop_duplicates()

# Data Quality Metrics

### Completeness
Attribute/Table completeness

NOTE: Closed World assumption

In [35]:
# Null values
null_count = df.isnull().sum()
print(null_count)

source       0
isbn         0
title        0
authors    602
dtype: int64


In [36]:
# Attribute completeness
rows = df.shape[0]
attributes = list(df)
attribute_compl = {}

for attribute in attributes:
    attribute_compl[attribute] = 1 - (null_count[attribute] / rows)

attribute_compl

{'source': 1.0, 'isbn': 1.0, 'title': 1.0, 'authors': 0.9772512564712995}

In [37]:
# Table completeness
table_compl = 1 - (null_count.sum() / (len(attributes)*rows))
print(table_compl)

0.9943128141178249


### Accuracy
For each source, matching of ISBN and authors list to Golden Standard

In [38]:
gs = pd.read_csv("./data/book_golden.txt", 
                   sep='\t', 
                   lineterminator='\n', 
                   header=None, 
                   names=["isbn", "authors"])
clean(gs, gs=True)
gs

Unnamed: 0,isbn,authors
0,9780073516677,oleary timothy oleary linda
1,9780072999389,yacht carol crosson susan
2,9780072922363,hutton david
3,9780072843996,haag stephen perry james sosinsky barrie est...
4,9780072466850,reddy
...,...,...
95,0122290631,eberly david
96,0120887975,celko joe
97,0120455994,aiken peter allen david
98,1555581994,cheek matthew


In [39]:
def compute_accuracy(source, gs):
    # Computes source accuracy by matching the golden standard rows to the source df, 
    # computing ISBN and Authors accuracy
        
    isbn_match_count = 0
    authors_match_count = 0
    authors_match_percentage = 0
    
    for gs_isbn, gs_authors in zip(gs.isbn, gs.authors):
        select = source[source.isbn == gs_isbn]
        select = select.dropna()
        if not select.empty:
            isbn_match_count += 1
            for authors in select.authors:
                tokens = authors.split()
                gs_tokens = gs_authors.split()
                match = fuzz.token_sort_ratio(tokens, gs_tokens)
                authors_match_percentage += match
                authors_match_count += 1
                
    acc = {
        "isbn": isbn_match_count/len(source.isbn),
        "authors": authors_match_percentage/authors_match_count
    }
    
    return acc

In [13]:
# Unique sources
sources = df.source.unique()

source_accuracy = []

if not os.path.exists("./data/source_accuracy.csv"):
    for source in sources:
        selected_source = df[df.source == source]
        acc = compute_accuracy(df, gs)
        acc["source"] = source
        source_accuracy.append(acc)
    
    with open('data/source_accuracy.csv', 'w') as f:
        f.write("{}, {}, {}".format("source", "isbn coverage", "authors accuracy"))
        for e in source_accuracy:
            f.write("{}, {}, {}".format(e["source"], e["isbn"], e["authors"]))

### Coverage
How much data of the dataset each source covers

In [15]:
source_coverage = {}
unique_isbn = len(df.isbn.unique())

for source in sources:
    selected_source = df[df.source == source]
    source_coverage[source] = len(selected_source.isbn.unique()) / unique_isbn

print(unique_isbn)
print(source_coverage)

1265
{'ecampus.com': 0.23873517786561266, 'indoo.com': 0.5154150197628459, 'textbookxdotcom': 0.6, 'a1books': 0.8656126482213439, 'textbooksnow': 0.2656126482213439, 'paperbackworld.de': 0.4608695652173913, 'caiman': 0.8102766798418972, 'movies with a smile': 0.108300395256917, 'marando.de versandbuchhandlung': 0.15889328063241107, 'aha-buch': 0.2458498023715415, 'versandantiquariat robert a. mueller': 0.32885375494071145, 'morgenstundt buch & kunst': 0.2276679841897233, 'alphacraze.com': 0.1241106719367589, 'odeon books': 0.1541501976284585, "powell's  books": 0.4324110671936759, 'quartermelon': 0.5154150197628459, 'browns books': 0.27509881422924903, 'books down under': 0.36996047430830037, 'swoop': 0.07509881422924901, 'thebookcom': 0.3739130434782609, 'thesaintbookstore': 0.25375494071146243, "mellon's books": 0.21818181818181817, 'blackwell online': 0.3264822134387352, 'stratford books': 0.4276679841897233, 'the book depository': 0.36363636363636365, 'englishbookservice.com gti gm

# Data Fusion

## TruthFinder implementation

In [53]:
df["trustworthiness"] = 0.9
df["fact_confidence"] = 0

In [69]:
for obj in df["isbn"].unique():
    idx = df["isbn"] == obj
    data = df[idx]
    print(data)

                                     source        isbn  \
0                               ecampus.com  0201853949   
1                                 indoo.com  0201853949   
2                           textbookxdotcom  0201853949   
3                                   a1books  0201853949   
4                              textbooksnow  0201853949   
5                         paperbackworld.de  0201853949   
6                                    caiman  0201853949   
7                       movies with a smile  0201853949   
8            marando.de versandbuchhandlung  0201853949   
9                                  aha-buch  0201853949   
10     versandantiquariat robert a. mueller  0201853949   
11                morgenstundt buch & kunst  0201853949   
12                           alphacraze.com  0201853949   
13                              odeon books  0201853949   
14                          powell's  books  0201853949   
15                             quartermelon  0201853949 

                               source        isbn  \
1132                brandnewtextbooks  0321197860   
1134               textbooks for less  0321197860   
1135                           htbook  0321197860   
1138             california textbooks  0321197860   
1139                      sandy chong  0321197860   
1140                       alinonline  0321197860   
1142                  bookworms, inc.  0321197860   
1143           collegebooksdirect.com  0321197860   
1144                     textbooksnow  0321197860   
1147                        indoo.com  0321197860   
1148                          a1books  0321197860   
1149                  textbookxdotcom  0321197860   
1150                           caiman  0321197860   
1151                       bobs books  0321197860   
1152                       dvd legacy  0321197860   
1153                    players quest  0321197860   
1155                       bobs books  0321197860   
1156                thesaintbookstore  0321197

                                    source        isbn  \
2921                    best bargain books  0201657961   
2922                             owlsbooks  0201657961   
2923               nationwide book traders  0201657961   
2924               nationwide book traders  0201657961   
2925          bookbyte / norwest textbooks  0201657961   
2926                  college book service  0201657961   
2927                      bunches of books  0201657961   
2928                           ecampus.com  0201657961   
2929                   price cut books ltd  0201657961   
2931                        monarchy books  0201657961   
2932                               a1books  0201657961   
2933                       textbookxdotcom  0201657961   
2934                mediasell (g.i.v. mbh)  0201657961   
2935                     paperbackworld.de  0201657961   
2936  versandantiquariat robert a. mueller  0201657961   
2937             morgenstundt buch & kunst  0201657961   
2938          

                            source        isbn  \
4231                   lucky books  0201710145   
4232                brian t stoval  0201710145   
4233            best bargain books  0201710145   
4234  bookbyte / norwest textbooks  0201710145   
4237              bunches of books  0201710145   
4238               oak creek books  0201710145   
4239             blumenkraft books  0201710145   
4240                   tranceworks  0201710145   
4241                  samkat books  0201710145   
4242                   ecampus.com  0201710145   
4243                     indoo.com  0201710145   
4246               textbookxdotcom  0201710145   
4247                bookmantra.com  0201710145   
4249                opoe-abe books  0201710145   
4250                       a1books  0201710145   
4252                  gunars store  0201710145   
4253                 gunter koppon  0201710145   
4254            limelight bookshop  0201710145   

                                              tit

                    source        isbn  \
5549           louis morin  0201548879   
5550          textbooksnow  0201548879   
5551        bookmantra.com  0201548879   
5552  www.textbooksrus.com  0201548879   
5553             indoo.com  0201548879   
5554       textbookxdotcom  0201548879   
5555         papamedia.com  0201548879   
5556     revaluation books  0201548879   

                                                 title  \
5549  osp an environment for operating system projects   
5550             osp environment for operating systems   
5551  osp an environment for operating system projects   
5552  osp an environment for operating system projects   
5553  osp an environment for operating system projects   
5554  osp an environment for operating system projects   
5555  osp an environment for operating system projects   
5556  osp an environment for operating system projects   

                           authors  trustworthiness  fact_confidence  
5549  kifer michael smolka 

                               source        isbn  \
7216               best bargain books  0201379368   
7217               more books, please  0201379368   
7218                   bookmantra.com  0201379368   
7219                 balkanika online  0201379368   
7220                        indoo.com  0201379368   
7221                           caiman  0201379368   
7222                 paperbackshop-us  0201379368   
7223                          a1books  0201379368   
7224                  annex books inc  0201379368   
7225                 blackwell online  0201379368   
7226                    papamedia.com  0201379368   
7227  englishbookservice.com gti gmbh  0201379368   
7228                paperbackworld.de  0201379368   
7229                revaluation books  0201379368   
7230               limelight bookshop  0201379368   

                                                  title         authors  \
7216  essential wininet developing applications usin...  aaron skonnard   
7

                                       source        isbn  \
8342                    bargainbookstores.com  0321356705   
8345                         bunches of books  0321356705   
8346                       best bargain books  0321356705   
8347                              ecampus.com  0321356705   
8350                             textbooksnow  0321356705   
8351                                indoo.com  0321356705   
8352                                  a1books  0321356705   
8353                      movies with a smile  0321356705   
8354                                   caiman  0321356705   
8355                          textbookxdotcom  0321356705   
8358                           alphacraze.com  0321356705   
8360                              bookholders  0321356705   
8361                         paperbackshop-us  0321356705   
8362                               dvd legacy  0321356705   
8363                            players quest  0321356705   
8364                    

                                       source        isbn  \
9511                                indoo.com  0201615983   
9512                       lds heritage books  0201615983   
9513                                  a1books  0201615983   
9514                          textbookxdotcom  0201615983   
9515                        paperbackworld.de  0201615983   
9516                       books2anywhere.com  0201615983   
9517                                   caiman  0201615983   
9518                       cobu gmbh & co. kg  0201615983   
9519     versandantiquariat robert a. mueller  0201615983   
9520                                 aha-buch  0201615983   
9521           marando.de versandbuchhandlung  0201615983   
9522                morgenstundt buch & kunst  0201615983   
9523                    ensight book services  0201615983   
9524                               thebookcom  0201615983   
9526                           mellon's books  0201615983   
9527                    

                      source        isbn  \
10875  bargainbookstores.com  0201379511   
10876     best bargain books  0201379511   
10877           beagle books  0201379511   
10879              owlsbooks  0201379511   
10880                a1books  0201379511   
10881       paperbackshop-us  0201379511   
10882                 caiman  0201379511   
10883         bookmantra.com  0201379511   
10885      thesaintbookstore  0201379511   
10887          papamedia.com  0201379511   
10888     monroe vista group  0201379511   
10889           quartermelon  0201379511   
10890      revaluation books  0201379511   
10891        stratford books  0201379511   
10892     limelight bookshop  0201379511   

                                          title                      authors  \
10875  bgp4 interdomain routing in the internet               john   stewart   
10876  bgp4 interdomain routing in the internet               stewart john     
10877  bgp4 interdomain routing in the internet        

                  source        isbn  \
11918       textbooksnow  0201751585   
11919     opoe-abe books  0201751585   
11920       browns books  0201751585   
11921          indoo.com  0201751585   
11922            a1books  0201751585   
11923    textbookxdotcom  0201751585   
11924       quartermelon  0201751585   
11925    stratford books  0201751585   
11926  revaluation books  0201751585   
11927  thesaintbookstore  0201751585   

                                                   title  \
11918                                 java by dissection   
11919                        java by dissectionwc primer   
11920                                 java by dissection   
11921  java by dissection the essentials of java prog...   
11922  java by dissection the essentials of java prog...   
11923  java by dissection the essentials of java prog...   
11924  java by dissection the essentials of java prog...   
11925  java by dissection the essentials of java prog...   
11926  java by diss

                   source        isbn  \
13070               avept  0201705028   
13071    bunches of books  0201705028   
13072         ecampus.com  0201705028   
13073      luv them books  0201705028   
13074      bookmantra.com  0201705028   
13075     textbookxdotcom  0201705028   
13076              caiman  0201705028   
13077             a1books  0201705028   
13078    paperbackshop-us  0201705028   
13079       papamedia.com  0201705028   
13080        quartermelon  0201705028   
13081   revaluation books  0201705028   
13082     stratford books  0201705028   
13083  limelight bookshop  0201705028   

                                                   title  \
13070  jndi api tutorial and reference building direc...   
13071  jndi api tutorial and reference building direc...   
13072  the jndi api tutorial and reference building d...   
13073  jndi api tutorial and reference building direc...   
13074  jndi tutorial and reference guide foundations ...   
13075  the jndi api tuto

                                     source        isbn  \
13941                 bargainbookstores.com  0321268636   
13942                              gotbooks  0321268636   
13943                    best bargain books  0321268636   
13944                collegebooksdirect.com  0321268636   
13946                      bunches of books  0321268636   
13948                                 the e  0321268636   
13949                             happybook  0321268636   
13950                           ecampus.com  0321268636   
13952                         mildredsbooks  0321268636   
13953                             indoo.com  0321268636   
13954                                caiman  0321268636   
13955                       textbookxdotcom  0321268636   
13956                               a1books  0321268636   
13957                gulls nest books, inc.  0321268636   
13958                       powell's  books  0321268636   
13959                        opoe-abe books  0321268636 

                                        source        isbn  \
15726                             textbooksnow  0321269675   
15727                       best bargain books  0321269675   
15728                                indoo.com  0321269675   
15729                                   caiman  0321269675   
15730                            ashleyjohnson  0321269675   
15731                          powell's  books  0321269675   
15732                                  a1books  0321269675   
15733                          textbookxdotcom  0321269675   
15734                      movies with a smile  0321269675   
15736                            players quest  0321269675   
15737                           mellon's books  0321269675   
15738                               thebookcom  0321269675   
15740                         paperbackshop-us  0321269675   
15741                morgenstundt buch & kunst  0321269675   
15742     versandantiquariat robert a. mueller  0321269675   
15744   

                        source        isbn                         title  \
17011     jr trading/bookjoint  0201745763  pair programming illuminated   
17012       best bargain books  0201745763  pair programming illuminated   
17013  shirley's book services  0201745763  pair programming illuminated   
17014            aaa textbooks  0201745763  pair programming illuminated   
17015         bunches of books  0201745763  pair programming illuminated   
17016                  kitabay  0201745763  pair programming illuminated   
17017              ecampus.com  0201745763  pair programming illuminated   
17019           opoe-abe books  0201745763  pair programming illuminated   
17020          textbookxdotcom  0201745763  pair programming illuminated   
17021            papamedia.com  0201745763  pair programming illuminated   
17022        revaluation books  0201745763  pair programming illuminated   
17023       limelight bookshop  0201745763  pair programming illuminated   

           

                                 source           isbn  \
18151  d & a worldwide textbook service  9780073101569   
18153       covenant international inc.  9780073101569   
18154                         jessebook  9780073101569   
18155                     sunmark store  9780073101569   
18156              www.textbooksrus.com  9780073101569   
18157                            htbook  9780073101569   
18160                       jefeducator  9780073101569   
18161                   lgtextbooks.com  9780073101569   
18162                    save_you_a_lot  9780073101569   
18164                        rk sellers  9780073101569   
18165            studentbookstoreglobal  9780073101569   
18166                       sandy chong  9780073101569   
18167                    avidbookseller  9780073101569   
18168                        bobs books  9780073101569   
18169                  books down under  9780073101569   
18171                      textbooksnow  9780073101569   
18175      boo

                  source           isbn  \
19245          indoo.com  9780072971293   
19246            a1books  9780072971293   
19247        ecampus.com  9780072971293   
19250    textbookxdotcom  9780072971293   
19251      papamedia.com  9780072971293   
19252  revaluation books  9780072971293   

                                         title            authors  \
19245  simnet for office 2003 standard edition  triad interactive   
19246  simnet for office 2003 standard edition  triad interactive   
19247         simnet for office 2003 comb stnd                NaN   
19250  simnet for office 2003 standard edition  triad interactive   
19251  simnet for office 2003 standard edition  triad interactive   
19252  simnet for office 2003 standard edition  triad interactive   

       trustworthiness  fact_confidence  
19245              0.9                0  
19246              0.9                0  
19247              0.9                0  
19250              0.9                0  
1925

                                        source           isbn  \
20452                             browns books  9780072261943   
20453                       the book den, abaa  9780072261943   
20454                                indoo.com  9780072261943   
20455                                   caiman  9780072261943   
20457                      the book depository  9780072261943   
20458                                  a1books  9780072261943   
20459                          textbookxdotcom  9780072261943   
20460                             quartermelon  9780072261943   
20461                        thesaintbookstore  9780072261943   
20462                               thebookcom  9780072261943   
20463                        kayleighbug books  9780072261943   
20464                              kayleighbug  9780072261943   
20465                                    swoop  9780072261943   
20466                            papamedia.com  9780072261943   
20468                    

                                     source        isbn  \
21417                     the bookman, inc.  155558280X   
21418                          sandra gudac  155558280X   
21419                       powell's  books  155558280X   
21420       vielbuch.de - onlineantiquariat  155558280X   
21422                               a1books  155558280X   
21423                           boox2relyon  155558280X   
21424  versandantiquariat robert a. mueller  155558280X   
21425                         papamedia.com  155558280X   
21426                     revaluation books  155558280X   
21427                    limelight bookshop  155558280X   

                                                   title       authors  \
21417  corporate portals empowered with xml and web s...  guruge anura   
21418  corporate portals empowered with xml and web s...        guruge   
21419             corporate portals empowered with xml w  guruge anura   
21420  corporate portals empowered with xml and web s.

                                     source        isbn  \
22523                               a1books  0120884828   
22525                                caiman  0120884828   
22526                      paperbackshop-us  0120884828   
22527                    books2anywhere.com  0120884828   
22528                       powell's  books  0120884828   
22529                       powell's  books  0120884828   
22530                      blackwell online  0120884828   
22531                          quartermelon  0120884828   
22532                          browns books  0120884828   
22533                          quartermelon  0120884828   
22534                        lakeside books  0120884828   
22536  versandantiquariat robert a. mueller  0120884828   
22537                     paperbackworld.de  0120884828   
22538                   the book depository  0120884828   
22540                            bobs books  0120884828   
22541                      books down under  0120884828 

                                source        isbn  \
23625                the bookman, inc.  155860779X   
23626                          a1books  155860779X   
23627        silicon valley fine books  155860779X   
23629        dotcom liquidators / dc 1  155860779X   
23630  vielbuch.de - onlineantiquariat  155860779X   
23631                      boox2relyon  155860779X   
23632             sarl culture-factory  155860779X   
23633              the book depository  155860779X   
23634                paperbackworld.de  155860779X   
23635                    papamedia.com  155860779X   
23636                revaluation books  155860779X   

                                                   title  \
23625  developing ipbased services solutions for serv...   
23626  developing ipbased services solutions for serv...   
23627  developing ipbased services solutions for serv...   
23629  developing ipbased services solutions for serv...   
23630  developing ip based services hardcover by mo

                                     source        isbn  \
24869                             livrenoir  1558604820   
24870                        bookmantra.com  1558604820   
24871                                caiman  1558604820   
24872                               a1books  1558604820   
24874                       book lovers usa  1558604820   
24876                       annex books inc  1558604820   
24877                    cobu gmbh & co. kg  1558604820   
24878                         papamedia.com  1558604820   
24879                              aha-buch  1558604820   
24880  versandantiquariat robert a. mueller  1558604820   
24881                     paperbackworld.de  1558604820   
24882                     revaluation books  1558604820   
24883                    limelight bookshop  1558604820   

                                                   title              authors  \
24869  a complete guide to db2 universal database ibm...    chamberlin donald   
24870  a co

                                     source        isbn  \
25659                      mega media depot  0125184069   
25660                               a1books  0125184069   
25661                   movies with a smile  0125184069   
25662                                caiman  0125184069   
25664                        bookmantra.com  0125184069   
25665                        alphacraze.com  0125184069   
25668                          quartermelon  0125184069   
25669                          browns books  0125184069   
25671                  college book service  0125184069   
25672                       book lovers usa  0125184069   
25673           the critical eye used books  0125184069   
25674                     henry's biz books  0125184069   
25675                       annex books inc  0125184069   
25676                       stratford books  0125184069   
25677                          quartermelon  0125184069   
25678                          quartermelon  0125184069 

KeyboardInterrupt: 