In [1]:
from pymongo import MongoClient
from thefuzz import fuzz, process
from joblib import Parallel, delayed
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
from unidecode import unidecode

In [2]:
client = MongoClient()
scopus = client["scopus_colombia"]["stage"]
openalex = client["openalexco"]["works"]

In [6]:
openalexdata = list(openalex.find({"doi":{"$ne":None},"title":{"$ne":None}},{"title":1,"doi":1,"publication_year":1,"host_venue":1}))

In [35]:
def parse_string(text):
    text = unidecode(text.lower())
    text = re.sub( r'[\$_\^]','', re.sub(r'\\\w+','',text ))
    return str(text)

def __colav_similarity(title1,title2,journal1,journal2,year1,year2, ratio_thold=90, partial_thold=95,low_thold=80,verbose=0):

    label = False
    
    #Se revisa si los años y las revistas coinciden
    journal_check = False
    if journal1 and journal2:
        if fuzz.partial_ratio(unidecode(journal1.lower()),unidecode(journal2.lower()))>ratio_thold:
            journal_check=True
    year_check=False
    if year1 and year2:
        if year1==year2:
            year_check = True
    
    length_check=False
    if len(title1.split())>3 and len(title2.split())>3:
        length_check=True
        
    #Si son pocas palabras y no hay por lo menos revista o año para revisar, se descarta de uan vez
    if length_check == False and (journal_check == False or year_check == False):
        return label
    
    if verbose==5:
        if journal_check: print("Journals are the same")
        if year_check: print("Years are the same")
    
    ratio = fuzz.ratio(title1, title2)
    if verbose==5: print("Initial ratio: ",ratio)
    if ratio > ratio_thold and length_check: #Comparación "directa"
        label = True
    if label == False:
        #Comparaciones cuando el título viene en varios idiomas
        title1_list=title1.split("[")
        title2_list=title2.split("[")
        if min([len(item) for item in title1_list]) > 10 and min([len(item) for item in title2_list]) > 10:
            for title in title1_list:
                tmp_title,ratio=process.extractOne(title,title2_list,scorer=fuzz.ratio)
                if ratio > ratio_thold:
                    label=True
                    break
            #if verbose==5: print("ratio over list: ",ratio)
            if label==False:
                for title in title1_list:
                    tmp_title,ratio=process.extractOne(title,title2_list,scorer=fuzz.partial_ratio)
                    if ratio > partial_thold:
                        label=True
                        break
                    elif ratio > low_thold:
                        if journal_check and year_check:
                            label=True
                            break
                #if verbose==5: print("partial ratio over list: ",ratio)
    
    #Partial ratio section
    if label == False:
        ratio = fuzz.partial_ratio(title1, title2) #Cuando la comparación "directa" falla, relajamos el scorer
        #if verbose==5: print("partial ratio: ",ratio)
                
        if ratio > partial_thold and length_check: #si el score supera el umbral (que debería ser mayor al umbral del ratio)
            label=True
        elif ratio > low_thold: #si no lo supera pero sigue siendo un valor alto, revisa el año y la revista
            if journal_check and year_check:
                label=True
             
    return label

def colav_similarity(title1,title2,journal1,journal2,year1,year2,ratio_thold=90, partial_thold=95, low_thold=80,use_regex=True):
    '''
    Compare two papers to know if they are the same or not.

    It uses the title, year and journal names of both papers to compare them in an somewhat elaborated way. Titles are compared using various algorithms of string comparison from thefuzz library with diferent levels of tolerance if its within a range determined by some threshold variables. The years must be the same, though we've seen an erro of +/- one year. Journal names are also compared using thefuzz's algorithms
    
    parameters
    ----------
        title1: str title of one of the papers
        title2: str title of the other paper
        journal1: str name of the journal in which one of the papers was published
        journal2: str name of the journal in which the other paper was published
        year1: int year in which one of the papers was published
        year2: int year in which the other paper was published
        ratio_thold: int threshold to compare through ratio function in thefuzz library
        partial_ratio_thold: int threshold to compare throughpartial_ratio function in thefuzz library
        low_thold: int threshold to discard some results with lower score values
        use_regex: bool Uses a regex to clean the titles
        
    Returns
    -------
    label: bool true when the papers are (potentially) the same.
    '''
    title1 = unidecode(title1.lower())
    title2 = unidecode(title2.lower())
    
    if year1: year1=int(year1)
    if year2: year2=int(year2)
    
    label = False
    
    if not use_regex:
        label = __colav_similarity(title1,title2,journal1,journal2, year1,year2,ratio_thold, partial_thold,low_thold,translation=translate)
    elif use_regex:
        label = __colav_similarity(parse_string(title1), parse_string(title2),journal1,journal2, year1,year2,ratio_thold, partial_thold,low_thold)
    return label

In [31]:
def compare_mongo(oa_reg,title_comparison="direct",verbose=0):
    '''
    Takes a openalex's register and compares it to a scopus register to report if its the same paper or not.

    It takes an openalex register and finds a match through a text search in Scopus' database with the openalex's title. Then, it compares the titles, years and journals' names (depending on the chosen algorithm) to report its contribution to the confusion matrix

    Parameters
    ----------
        oa_reg: dict OpenAlex register
        title_comparison: an algorithm to compare the two registers (direct, ratio,token_set_ratio, colav)
        verbose: the quantity of messages that the function will print on screen
        
    Returns
    -------
        conf: list The contribution of the comparison to a matrix of confusion
        
    '''
    oa_doi = oa_reg["doi"].replace("https://doi.org/","").lower()
    conf=[0,0,0,0] # [vp,fp,vn,fn]
    #client = MongoClient()
    #scopus = client["scopus_colombia"]["stage"]
    scopus_reg = list(scopus.find({"$text":{"$search":oa_reg["title"].lower(),"$caseSensitive":False},"DOI":{"$ne":None}}).limit(1))
    if scopus_reg:
        scopus_reg = scopus_reg[0]
        if "DOI" in scopus_reg.keys():
            if isinstance(scopus_reg["DOI"],str):
                try:
                    comparison=None
                    #if fuzz.ratio(scopus_reg["Title"].lower(),oa_reg["title"].lower())>99:
                    if title_comparison=="direct":
                        comparison=scopus_reg["Title"].lower()==oa_reg["title"].lower()
                    elif title_comparison=="ratio":
                        comparison=fuzz.ratio(scopus_reg["Title"].lower(),oa_reg["title"].lower())==100
                    elif title_comparison=="token_set_ratio":
                        comparison=fuzz.token_set_ratio(scopus_reg["Title"].lower(),oa_reg["title"].lower())==100
                    elif title_comparison=="colav":
                        comparison=colav_similarity(
                            oa_reg["title"],
                            scopus_reg["Title"],
                            oa_reg["host_venue"]["display_name"].lower(),
                            scopus_reg["Source title"].lower(),
                            oa_reg["publication_year"],
                            scopus_reg["Year"]
                        )
                    if comparison:
                        if oa_doi==scopus_reg["DOI"].lower():
                            conf[0]+=1
                        else:
                            conf[1]+=1
                    else:
                        if oa_doi==scopus_reg["DOI"].lower():
                            conf[3]+=1
                        else:
                            conf[2]+=1
                except Exception as e:
                    if verbose>4:
                        print(e)
                        print("\t{}\n\t{}".format(scopus_reg["Title"],oa_reg["title"]))
                        print("\t{}\n\t{}".format(oa_reg["doi"],scopus_reg["DOI"]))

    return conf

In [36]:
colav_similarity?

[0;31mSignature:[0m
[0mcolav_similarity[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mtitle1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtitle2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mjournal1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mjournal2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0myear1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0myear2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mratio_thold[0m[0;34m=[0m[0;36m90[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mpartial_thold[0m[0;34m=[0m[0;36m95[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlow_thold[0m[0;34m=[0m[0;36m80[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muse_regex[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Compare two papers to know if they are the same or not.

It uses the title, year and journal names of both papers to compare them in an somewhat elaborated way. Titles are compared using various algorithms 

In [16]:
%%timeit -n 1 -r 1
#Direct equality in the comparison of titles
# [vp,fp,vn,fn]
conf=[0,0,0,0] # [vp,fp,vn,fn]
result=Parallel(n_jobs=20,backend="threading",verbose=0)(delayed(compare_mongo)(reg,"direct") for reg in openalexdata)
for res in result:
    for i in range(len(conf)):
        conf[i]+=res[i]
print("TP: {}. \tFN: {}".format(conf[0],conf[-1]))
print("FP: {}. \tTN: {}".format(conf[1],conf[-2]))
acc=(conf[0]+conf[-2])/np.sum(conf)
print("Accuracy: {:.2f}".format(acc))
print("Precision: {:.2f}".format(conf[0]/(conf[0]+conf[1])))
print("Recall: {:.2f}".format(conf[0]/(conf[0]+conf[-1])))

TP: 2245. 	FN: 2216
FP: 76. 	TN: 144402
Accuracy: 0.98
Precision: 0.97
Recall: 0.50
1min 27s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [17]:
%%timeit -n 1 -r 1
#fuzz ratio at 100
# [vp,fp,vn,fn]
#fuzz token_set_ratio at 100
conf=[0,0,0,0] # [vp,fp,vn,fn]
result=Parallel(n_jobs=20,backend="threading",verbose=0)(delayed(compare_mongo)(reg,"ratio") for reg in openalexdata)
for res in result:
    for i in range(len(conf)):
        conf[i]+=res[i]
print("TP: {}. \tFN: {}".format(conf[0],conf[-1]))
print("FP: {}. \tTN: {}".format(conf[1],conf[-2]))
acc=(conf[0]+conf[-2])/np.sum(conf)
print("Accuracy: {:.2f}".format(acc))
print("Precision: {:.2f}".format(conf[0]/(conf[0]+conf[1])))
print("Recall: {:.2f}".format(conf[0]/(conf[0]+conf[-1])))

TP: 2431. 	FN: 2030
FP: 76. 	TN: 144402
Accuracy: 0.99
Precision: 0.97
Recall: 0.54
1min 28s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [13]:
%%timeit -n 1 -r 1
#fuzz token_set_ratio at 100
conf=[0,0,0,0] # [vp,fp,vn,fn]
result=Parallel(n_jobs=20,backend="threading",verbose=0)(delayed(compare_mongo)(reg,"token_set_ratio") for reg in openalexdata)
for res in result:
    for i in range(len(conf)):
        conf[i]+=res[i]
print("TP: {}. \tFN: {}".format(conf[0],conf[-1]))
print("FP: {}. \tTN: {}".format(conf[1],conf[-2]))
acc=(conf[0]+conf[-2])/np.sum(conf)
print("Accuracy: {:.2f}".format(acc))
print("Precision: {:.2f}".format(conf[0]/(conf[0]+conf[1])))
print("Recall: {:.2f}".format(conf[0]/(conf[0]+conf[-1])))

TP: 4301. 	FN: 160
FP: 865. 	TN: 143613
Accuracy: 0.99
Precision: 0.83
Recall: 0.96
1min 32s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [23]:
%%timeit -n 1 -r 1
conf=[0,0,0,0] # [vp,fp,vn,fn]
result=Parallel(n_jobs=20,backend="threading",verbose=0)(delayed(compare_mongo)(reg,"colav") for reg in openalexdata)
for res in result:
    for i in range(len(conf)):
        conf[i]+=res[i]
print("TP: {}. \tFN: {}".format(conf[0],conf[-1]))
print("FP: {}. \tTN: {}".format(conf[1],conf[-2]))
acc=(conf[0]+conf[-2])/np.sum(conf)
print("Accuracy: {:.3f}".format(acc))
print("Precision: {:.3f}".format(conf[0]/(conf[0]+conf[1])))
print("Recall: {:.3f}".format(conf[0]/(conf[0]+conf[-1])))

TP: 4368. 	FN: 33
FP: 449. 	TN: 140493
Accuracy: 0.997
Precision: 0.907
Recall: 0.993
1min 55s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
