In [1]:
from mohan.Similarity import Similarity
from mohan.ColavSimilarity import parse_doi, parse_string
from joblib import Parallel, delayed
import pickle
import numpy as np
from mohan.ColavSimilarity import ColavSimilarity, parse_doi, parse_string
from elasticsearch import Elasticsearch, __version__ as es_version
from sklearn import metrics
import matplotlib.pyplot as plt
import time

In [2]:

from mohan.ColavSimilarity import ColavSimilarity, parse_string
from elasticsearch import Elasticsearch, __version__ as es_version
from elasticsearch.helpers import bulk


class Similarity:
    """
    Helper around a single Elasticsearch index of bibliographic works.

    It bundles index management (create / delete), insertion (single and
    bulk) and `search_work`, which queries the index for candidates and then
    filters them with `ColavSimilarity`.
    """

    def __init__(self, es_index, es_uri: str = "http://localhost:9200",
                 es_auth: tuple = ('elastic', 'colav'),
                 es_req_timeout: int = 120):
        """
        Initialize the Similarity class.
        Parameters:
        -----------
        es_index: str
                name of the index every method of this instance works on
        es_uri: str
                uri of the elastic search server
        es_auth: tuple
                (user, password) used to authenticate against the server
        es_req_timeout: int
                elastic search request timeout
        """
        # NOTE(review): the default credentials are hard-coded in the signature;
        # kept for backward compatibility, but callers should pass `es_auth`
        # explicitly (e.g. read from the environment).
        if es_version[0] < 8:
            self.es = Elasticsearch(
                es_uri, http_auth=es_auth, timeout=es_req_timeout)
        else:
            # The v8 client deprecates `http_auth`/`timeout` in favour of
            # `basic_auth`/`request_timeout`.
            self.es = Elasticsearch(
                es_uri, basic_auth=es_auth, request_timeout=es_req_timeout)
        self.es_index = es_index
        self.es_req_timeout = es_req_timeout

    @staticmethod
    def _is_missing(value):
        """Return True for None and for float NaN (typical markers of a missing cell)."""
        # NaN is the only float that differs from itself.
        return value is None or (isinstance(value, float) and value != value)

    @staticmethod
    def _as_text(value):
        """Return ints as their decimal string, strings unchanged, anything else as ""."""
        if isinstance(value, int):
            return str(value)
        return value if isinstance(value, str) else ""

    def create_index(self, mapping: dict = None, recreate: bool = False):
        """
        Create the index `self.es_index`.
        Parameters:
        -----------
        mapping: dict
                mapping of the index; when falsy the index is created without one
        recreate: bool
                when True, an already existing index is deleted first
        """
        if recreate and self.es.indices.exists(index=self.es_index):
            self.delete_index(self.es_index)
        if mapping:
            self.es.indices.create(index=self.es_index, body=mapping)
        else:
            self.es.indices.create(index=self.es_index)

    def delete_index(self, index_name: str = None):
        """
        Delete an index.
        Parameters:
        -----------
        index_name: str
                name of the index; defaults to the index this instance is bound to
        """
        if index_name is None:
            index_name = self.es_index
        self.es.indices.delete(index=index_name)

    def insert_work(self, _id: str, work: dict):
        """
        Insert a work into the index.
        work should have a dict structure like the next one.
        work = {"title": "title of the work",
                "authors": "authors of the work",
                "source": "source of the work",
                "year": "year of the work",
                "volume": "volume of the work",
                "issue": "issue of the work",
                "page_start": "page start of the work",
                "page_end": "page end of the work"}
        every value is a string, including the year, volume, issue, page_start and page_end.

        Additional fields such as doi, pmid, pmcid, etc. can be added to the work dict if needed,
        but the search is over the previous fields.

        Parameters:
        -----------
        _id: str id of the work (ex: mongodb id as string)
        work: dict work to be inserted

        Returns:
        --------
        the response of the elastic search client's `index` call
        """
        return self.es.index(index=self.es_index, id=_id, document=work)

    def search_work(self, title: str, source: str, year: str,
                    volume: str, issue: str, page_start: str, page_end: str,
                    use_es_thold: bool = False, es_thold_low: int = 10, es_thold_high: int = 120,
                    ratio_thold: int = 90, partial_thold: int = 95, low_thold: int = 80, parse_title: bool = True):
        """
        Search the index for the work that matches the given bibliographic data.
        Parameters:
        -----------
        title: str
                title of the paper (non-string values are treated as "")
        source: str
                name of the journal in which the paper was published
                (non-string values are treated as "")
        year: str or int
                year in which the paper was published; when None or NaN the
                year is left out of the elastic search query
        volume: str or int
                volume of the journal in which the paper was published
        issue: str or int
                issue of the journal in which the paper was published
        page_start: str or int
                first page of the paper
        page_end: str or int
                last page of the paper
                (volume, issue, page_start and page_end: ints are converted to
                str, any other non-string value is treated as "")
        use_es_thold: bool
                whether to use the elastic search score threshold or not
        es_thold_low: int
                when use_es_thold is set and the best hit scores below this
                value, None is returned without further comparison
        es_thold_high: int
                when use_es_thold is set and the best hit scores at least this
                value, the best hit is returned without further comparison
        ratio_thold: int
                forwarded to ColavSimilarity as `ratio_thold`
        partial_thold: int
                forwarded to ColavSimilarity as `partial_thold`
        low_thold: int
                forwarded to ColavSimilarity as `low_thold`
        parse_title: bool
                whether to run the title through `parse_string` before searching

        Returns:
        --------
        record: dict, the elastic search hit, when the papers are (potentially)
        the same, None otherwise.
        """
        title = title if isinstance(title, str) else ""
        source = source if isinstance(source, str) else ""
        volume = self._as_text(volume)
        issue = self._as_text(issue)
        page_start = self._as_text(page_start)
        page_end = self._as_text(page_end)

        if parse_title:
            title = parse_string(title)

        should = [
            {"match": {"title": {"query": title, "operator": "OR"}}},
            {"match": {"source": {"query": source, "operator": "AND"}}},
        ]
        # A term clause with a null/NaN value is not a valid query, so a
        # missing year simply does not contribute to the score.
        if not self._is_missing(year):
            should.append({"term": {"year": year}})
        should.extend([
            {"term": {"volume": volume}},
            {"term": {"issue": issue}},
            {"term": {"page_start": page_start}},
            {"term": {"page_end": page_end}},
        ])
        body = {
            "query": {"bool": {"should": should}},
            "size": 20,
        }

        res = self.es.search(index=self.es_index, **body)
        hits = res["hits"]["hits"]
        if res["hits"]["total"]["value"] == 0 or not hits:
            return None

        if use_es_thold:
            best_hit = hits[0]
            if best_hit["_score"] < es_thold_low:
                return None
            if best_hit["_score"] >= es_thold_high:
                return best_hit

        # Candidates are checked in the order elastic search returned them;
        # the first one ColavSimilarity accepts (truthy result) is returned.
        for hit in hits:
            candidate = hit["_source"]
            if ColavSimilarity(title, candidate["title"],
                               source, candidate["source"],
                               year, candidate["year"],
                               ratio_thold=ratio_thold, partial_thold=partial_thold, low_thold=low_thold):
                return hit
        return None

    def insert_bulk(self, entries: list, refresh=True):
        """
        Insert a bulk of works into the index.
        Parameters:
        -----------
        entries: list
                list of works to be inserted
        refresh: bool
                forwarded to the elastic search `bulk` helper

        Returns:
        --------
        the result of the elastic search `bulk` helper
        """
        return bulk(self.es, entries, index=self.es_index, refresh=refresh, request_timeout=self.es_req_timeout)


In [8]:
# Client bound to the "openalex_parsed" index. The URI spells out scheme and
# port (the same form as the class default) because the Elasticsearch 8 client
# rejects a bare host name; http on port 9200 is what a bare host meant for
# the 7.x client.
s = Similarity("openalex_parsed", es_uri="http://172.19.31.8:9200")

In [4]:
# Load the pickled evaluation dataset. Later cells read data["p"] and
# data["n"] (presumably the positive and negative examples — confirm against
# the code that produced the file).
# NOTE(review): pickle.load can execute arbitrary code; only load files from a
# trusted source.
# The context manager closes the file even if unpickling fails.
with open('dataset_full.pkl', 'rb') as dataset_file:
    data = pickle.load(dataset_file)

In [12]:

# Grid search over the three thresholds forwarded to Similarity.search_work
# (ratio_thold, partial_thold, low_thold). For every combination, N_REPEATS
# balanced samples are drawn from `data`, each record is searched in the index
# and the false-negative / false-positive rates (in percent) are recorded.
# Each combination took roughly 135 s in the recorded run, so the full sweep
# (10 * 10 * 5 combinations) is a long-running cell.
SEED = 42           # fixed seed so the sampled evaluation sets are reproducible
N_REPEATS = 15      # evaluation rounds per threshold combination
SAMPLE_SIZE = 1000  # records drawn from each of data["p"] and data["n"] per round
N_JOBS = 72         # concurrent search threads

rng = np.random.default_rng(SEED)


def _draw(records, size):
    """Return up to `size` records picked at random, without modifying `records`."""
    picked = rng.permutation(len(records))[:size]
    return [records[j] for j in picked]


hpo = []
for rt in range(90, 100):
    for pt in range(90, 100):
        for lt in range(80, 85):
            print(f"new opts  rt= {rt} pt={pt} lt={lt}")
            start = time.time()

            fns = []
            fps = []
            for repeat in range(N_REPEATS):
                if repeat % 5 == 0:
                    print(repeat)

                # Records from data["p"] are labelled True, records from data["n"] False.
                scopus = [(rec, True) for rec in _draw(data["p"], SAMPLE_SIZE)]
                scopus += [(rec, False) for rec in _draw(data["n"], SAMPLE_SIZE)]
                # Interleave both classes so the request stream is mixed.
                scopus = [scopus[j] for j in rng.permutation(len(scopus))]

                results = Parallel(n_jobs=N_JOBS, backend='threading', verbose=0)(
                    delayed(s.search_work)(
                        rec['Title'], rec['Source title'], rec['Year'],
                        rec['Volume'], rec['Issue'], rec['Page start'], rec['Page end'],
                        use_es_thold=False, es_thold_low=0, es_thold_high=180,
                        ratio_thold=rt, partial_thold=pt, low_thold=lt)
                    for rec, _ in scopus)

                actual = [label for _, label in scopus]
                # search_work returns a hit when it found a match and None otherwise.
                predicted = [hit is not None for hit in results]

                # With labels=[True, False] the matrix reads
                # [[TP, FN], [FP, TN]] (rows: actual, columns: predicted).
                confusion_matrix = metrics.confusion_matrix(
                    actual, predicted, labels=[True, False])
                fn = confusion_matrix[0][1] * 100 / (confusion_matrix[0][0] + confusion_matrix[0][1])
                fp = confusion_matrix[1][0] * 100 / (confusion_matrix[1][0] + confusion_matrix[1][1])
                fns.append(fn)
                fps.append(fp)

            opt = {"rt": rt, "pt": pt, "lt": lt, "fns": fns, "fps": fps}
            hpo.append(opt)
            stop = time.time()
            print(f"run time = {stop-start} sec")

new opts  rt= 90 pt=90 lt=80
0
5
10
run time = 135.0704004764557 sec
new opts  rt= 90 pt=90 lt=81
0
5
10
run time = 135.46503472328186 sec
new opts  rt= 90 pt=90 lt=82
0
5
10
run time = 134.83519506454468 sec
new opts  rt= 90 pt=90 lt=83
0
5
10
run time = 134.08307147026062 sec
new opts  rt= 90 pt=90 lt=84
0
5
10
run time = 135.12980675697327 sec
new opts  rt= 90 pt=91 lt=80
0
5
10
run time = 134.96414136886597 sec
new opts  rt= 90 pt=91 lt=81
0
5
10
run time = 135.7135729789734 sec
new opts  rt= 90 pt=91 lt=82
0
5
10
run time = 134.86275100708008 sec
new opts  rt= 90 pt=91 lt=83
0
5
10
run time = 134.68985867500305 sec
new opts  rt= 90 pt=91 lt=84
0
5
10
run time = 135.40024399757385 sec
new opts  rt= 90 pt=92 lt=80
0
5
10
run time = 135.67176938056946 sec
new opts  rt= 90 pt=92 lt=81
0
5
10
run time = 136.32193422317505 sec
new opts  rt= 90 pt=92 lt=82
0
5
10
run time = 136.09518718719482 sec
new opts  rt= 90 pt=92 lt=83
0
5
10
run time = 136.5523579120636 sec
new opts  rt= 90 pt=92 

In [13]:
# Write the grid-search results list `hpo` to hpo.pkl.
with open('hpo.pkl', 'wb') as hpo_file:
    pickle.dump(hpo, hpo_file)

In [26]:
# Among all grid points in `hpo`, pick the one with the lowest mean
# false-negative rate and the one with the lowest mean false-positive rate.
# min() returns the first minimal element, so both *_opts names are bound even
# when the very first grid point is the best one.
min_fns_opts = min(hpo, key=lambda opt: np.mean(opt["fns"]))
min_fps_opts = min(hpo, key=lambda opt: np.mean(opt["fps"]))
min_fns = np.mean(min_fns_opts["fns"])
min_fps = np.mean(min_fps_opts["fps"])
    
    

In [30]:
# Show the grid point (thresholds and per-run rates) with the lowest mean false-negative rate.
min_fns_opts

{'rt': 90,
 'pt': 92,
 'lt': 81,
 'fns': [0.4,
  0.5,
  0.3,
  1.0,
  1.1,
  1.3,
  0.4,
  0.8,
  0.7,
  0.8,
  0.2,
  0.3,
  0.2,
  0.8,
  0.2],
 'fps': [5.5,
  6.6,
  6.2,
  6.3,
  5.5,
  6.7,
  7.0,
  6.1,
  7.4,
  6.7,
  7.0,
  7.2,
  7.0,
  5.8,
  6.7]}

In [29]:
# Show the grid point (thresholds and per-run rates) with the lowest mean false-positive rate.
min_fps_opts

{'rt': 99,
 'pt': 98,
 'lt': 82,
 'fns': [1.4,
  1.2,
  1.6,
  1.5,
  1.2,
  2.4,
  1.5,
  1.4,
  1.0,
  1.2,
  1.7,
  1.5,
  1.4,
  1.6,
  1.4],
 'fps': [5.2,
  5.7,
  6.2,
  5.0,
  4.9,
  4.5,
  4.5,
  5.3,
  4.9,
  4.8,
  4.4,
  4.6,
  4.2,
  4.3,
  5.0]}

In [None]:
# Histogram of the false-negative rates (%) stored in `fns`.
# NOTE(review): `fns` is whatever an earlier cell left bound; after the grid
# search it holds the rates of the last threshold combination only — confirm
# that is the intended series.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(fns, bins=10, alpha=0.5, density=False)
ax.set(title="False-negative rate per run",
       xlabel="False-negative rate (%)",
       ylabel="Number of runs")
plt.show()

In [None]:
# Histogram of the false-positive rates (%) stored in `fps`.
# NOTE(review): `fps` is whatever an earlier cell left bound; after the grid
# search it holds the rates of the last threshold combination only — confirm
# that is the intended series.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(fps, bins=10, alpha=0.5, density=False)
ax.set(title="False-positive rate per run",
       xlabel="False-positive rate (%)",
       ylabel="Number of runs")
plt.show()

In [None]:
# Mean of the false-negative rates stored in `fns`.
# Notes kept from earlier experiments (presumably the mean obtained with each
# setup — TODO confirm):
#   0.71 -> no Elasticsearch score threshold
#   0.69 -> boost 1, size = 5
np.mean(fns)

In [None]:
# Standard deviation of the false-negative rates stored in `fns`.
np.std(fns)

In [None]:
# Mean of the false-positive rates stored in `fps`.
np.mean(fps)

In [None]:
# Standard deviation of the false-positive rates stored in `fps`.
np.std(fps)

In [None]:
# Store the current `fns` and `fps` lists together in one pickle file.
rates = {"fns": fns, "fps": fps}
with open('similarity_fns_fps_noauthors_ratio_thold95.pkl', 'wb') as rates_file:
    pickle.dump(rates, rates_file)