In [1]:
from mohan.Similarity import Similarity
from mohan.ColavSimilarity import parse_doi, parse_string
from joblib import Parallel, delayed
import pickle
import numpy as np
from mohan.ColavSimilarity import ColavSimilarity, parse_doi, parse_string
from elasticsearch import Elasticsearch, __version__ as es_version
from sklearn import metrics
import matplotlib.pyplot as plt
from pymongo import MongoClient
import copy
from IPython.display import JSON

In [None]:
openalex = list(MongoClient()["openalexco"]["works"].find({'doi': {"$ne": None}}))

In [21]:

from mohan.ColavSimilarity import ColavSimilarity, parse_string
from elasticsearch import Elasticsearch, __version__ as es_version
from elasticsearch.helpers import bulk


class Similarity:
    def __init__(self, es_index, es_uri: str = "http://localhost:9200",
                 es_auth: tuple = ('elastic', 'colav'),
                 es_req_timeout: int = 120):
        """
        Initialize the Similarity class.
        Parameters:
        -----------
        es_index: str 
                name of the index
        es_uri: str 
                uri of the elastic search server
        es_auth: tuple 
                authentication for the elastic search server
        es_req_timeout: int 
                elastic search request timeout
        """
        auth = es_auth
        if es_version[0] < 8:
            self.es = Elasticsearch(
                es_uri, http_auth=auth, timeout=es_req_timeout)
        else:
            self.es = Elasticsearch(
                es_uri, basic_auth=auth, timeout=es_req_timeout)
        self.es_index = es_index
        self.es_req_timeout = es_req_timeout

    def ensure_index(self, mapping: dict = None, recreate: bool = False):
        """
        Create an index.
        Parameters:
        -----------
        index_name: str name of the index
        mapping: dict mapping of the index
        recreate: bool whether to recreate the index or not

        """
        if recreate:
            if self.es.indices.exists(index=self.es_index):
                self.delete_index(self.es_index)
        if mapping:
            self.es.indices.create(index=self.es_index, body=mapping)
        else:
            self.es.indices.create(index=self.es_index)

    def delete_index(self, index_name: str):
        """
        Delete an index.
        Parameters:
        -----------
        index_name: str name of the index
        """
        self.es.indices.delete(index=index_name)

    def insert_work(self, _id: str, work: dict):
        """
        Insert a work into the index.
        work should have a dict structure like the next one.
        work = {"title": "title of the work",
                "authors": "authors of the work",
                "source": "source of the work",
                "year": "year of the work",
                "volume": "volume of the work",
                "issue": "issue of the work",
                "page_start": "page start of the work",
                "page_end": "page end of the work"}
        every value is a string, including the year, volume, issue, page_start and page_end.

        Additional fields such as doi, pmid, pmcid, etc. can be added to the work dict if needed,
        but the search is over the previous fields.

        Parameters:
        -----------
        _id: str id of the work (ex: mongodb id as string)
        work: dict work to be inserted
        """
        return self.es.index(index=self.es_index,  id=_id, document=work)

    def search_work(self, title: str, source: str, year: str,
                    volume: str, issue: str, page_start: str, page_end: str, 
                    use_es_thold: bool = False, es_thold_low: int = 0, es_thold_high: int = 120,
                    ratio_thold: int = 90, partial_thold: int = 95, low_thold: int = 80, parse_title: bool = True):
        """
        Compare two papers to know if they are the same or not.
        Parameters:
        -----------
        title: str 
                title of the paper
        source: str 
                name of the journal in which the paper was published
        year: int 
                year in which the paper was published
        volume: int 
                volume of the journal in which the paper was published
        issue: int 
                issue of the journal in which the paper was published
        page_start: int 
                first page of the paper
        page_end: int 
                last page of the paper
        use_es_thold: bool
                whether to use the elastic search score threshold or not
        es_thold_low: int
                elastic search score threshold to discard some results with lower score values
        es_thold_high: int
                elastic search score threshold to return the best hit
        ratio_thold: int 
                threshold to compare through ratio function in thefuzz library
        partial_ratio_thold: int 
                threshold to compare through partial_ratio function in thefuzz library
        low_thold: int
                threshold to discard some results with lower score values
        es_request_timeout: int
                elastic search request timeout
        parse_title: bool
                whether to parse the title or not (parse title helps to improve the results)

        Returns:
        --------
        record: dict when the papers are (potentially) the same, None otherwise.
        """
        if not isinstance(title, str):
            title = ""

        if not isinstance(source, str):
            source = ""

        if isinstance(volume, int):
            volume = str(volume)

        if isinstance(issue, int):
            issue = str(issue)

        if isinstance(page_start, int):
            page_start = str(page_start)

        if isinstance(page_end, int):
            page_end = str(page_end)

        if not isinstance(volume, str):
            volume = ""

        if not isinstance(issue, str):
            issue = ""

        if not isinstance(page_start, str):
            page_start = ""

        if not isinstance(page_end, str):
            page_end = ""
        if parse_title:
            title = parse_string(title)
        # body = {
        #     "query": {
        #         "bool": {
        #             "should": [
        #                 {"match": {"title": {"query":  title,"boost": 0.9}}},
        #                 {"match": {"source":  source}},
        #                 {"term":  {"year": year}},
        #                 {"term":  {"volume": volume}},
        #                 {"term":  {"issue": issue}},
        #                 {"term":  {"page_start": page_start}},
        #                 {"term":  {"page_end": page_end}},
        #             ],
        #         }
        #     },
        #     "size": 20,
        # }
        body = {
            "query": {
                "bool": {
                    "should": [
                        {"match": { "title":  {
                                 "query": title,
                                 "operator": "OR"
                              }}},
                        #{"match": {"title": {"query":  title,"boost": 1}}},
                        # {"match": {"source":  source}},
                        { "match": { "source":  {
                                 "query": source,
                                 "operator": "AND"
                              }}},
                        {"term":  {"year": year}},
                        {"term":  {"volume": volume}},
                        {"term":  {"issue": issue}},
                        {"term":  {"page_start": page_start}},
                        {"term":  {"page_end": page_end}},
                    ]
                }
            },
            "size": 20
        }
        res = self.es.search(index=self.es_index, **body)
        if res["hits"]["total"]["value"] != 0:
            best_hit = res["hits"]["hits"][0]
            if use_es_thold:
                if best_hit["_score"] >= es_thold_high:
                    return best_hit

            # for i in res["hits"]["hits"]:
            #     value = ColavSimilarity(title, i["_source"]["title"],
            #                             source, i["_source"]["source"],
            #                             year, i["_source"]["year"],
            #                             ratio_thold=ratio_thold, partial_thold=partial_thold, low_thold=low_thold)
            #     if value:
            #         return i
            return None
        else:
            return None

    def insert_bulk(self, entries: list, refresh=True):
        """
        Insert a bulk of works into the index.
        Parameters:
        -----------
        entries: list 
                list of works to be inserted
        """
        return bulk(self.es, entries, index=self.es_index, refresh=refresh, request_timeout=self.es_req_timeout)


In [22]:
s = Similarity("openalex_scratch",es_uri="172.19.31.8")

In [39]:
s.ensure_index(recreate=True)
def search(i):
    title = parse_string(i["title"])
    source = i["host_venue"]["display_name"]
    year = i["publication_year"]
    volume = i["biblio"]["volume"] 
    issue = i["biblio"]["issue"] 
    first_page = i["biblio"]["first_page"]
    last_page = i["biblio"]["last_page"]
    
    res = s.search_work(title,source,year,volume,issue,first_page,last_page,
                        use_es_thold = True, es_thold_low = 0, es_thold_high = 150)
    if res is not None:
        return (i,res)
    else:
        work = {}
        work["title"] = parse_string(i["title"])
        work["source"] = i["host_venue"]["display_name"]
        work["year"] = i["publication_year"]
        work["volume"] = i["biblio"]["volume"] 
        work["issue"] = i["biblio"]["issue"] 
        work["first_page"] = i["biblio"]["first_page"]
        work["last_page"] = i["biblio"]["last_page"]
        work["doi"] = i["doi"]
        _id = str(i["_id"])
        s.insert_work(_id,work)
        return None

In [40]:
results = Parallel(n_jobs=72,backend='threading',verbose=2)(delayed(search)(i) for i in openalex)

[Parallel(n_jobs=72)]: Using backend ThreadingBackend with 72 concurrent workers.
[Parallel(n_jobs=72)]: Done  18 tasks      | elapsed:    0.4s
[Parallel(n_jobs=72)]: Done 221 tasks      | elapsed:    1.0s
[Parallel(n_jobs=72)]: Done 504 tasks      | elapsed:    2.0s
[Parallel(n_jobs=72)]: Done 869 tasks      | elapsed:    3.1s
[Parallel(n_jobs=72)]: Done 1314 tasks      | elapsed:    4.6s
[Parallel(n_jobs=72)]: Done 1841 tasks      | elapsed:    6.4s
[Parallel(n_jobs=72)]: Done 2448 tasks      | elapsed:    8.5s
[Parallel(n_jobs=72)]: Done 3137 tasks      | elapsed:   10.8s
[Parallel(n_jobs=72)]: Done 3906 tasks      | elapsed:   13.2s
[Parallel(n_jobs=72)]: Done 4757 tasks      | elapsed:   16.1s
[Parallel(n_jobs=72)]: Done 5688 tasks      | elapsed:   19.1s
[Parallel(n_jobs=72)]: Done 6701 tasks      | elapsed:   22.5s
[Parallel(n_jobs=72)]: Done 7794 tasks      | elapsed:   26.1s
[Parallel(n_jobs=72)]: Done 8969 tasks      | elapsed:   29.9s
[Parallel(n_jobs=72)]: Done 10224 tasks 

In [37]:
res = [i for i in results if i is not None]

In [38]:
#2655 1.302
# 294 no colav simi th = 100
print(len(res))
len(res)*100/len(openalex)

46


0.02256417299853333

In [None]:
with open('openalex_fails_noauthors_rt90_pt92_lt81.pkl', 'wb') as f:
    pickle.dump({"fails":res}, f)

In [27]:
res[0][0]["_id"] = str(res[0][0]["_id"])

In [28]:
JSON(res[0][0])

<IPython.core.display.JSON object>

In [29]:
JSON(res[0][1])

<IPython.core.display.JSON object>

In [None]:
res2 = search(res[100]["_source"])

In [None]:
JSON(res2)

In [None]:
count = 0
for i in openalex:
    if "https://doi.org/10.1007/jhep01(2011)080" == i["doi"]:
        count+=1

In [None]:
count

In [None]:
len(openalex)