In [222]:
import pandas as pd
from elasticsearch import Elasticsearch
import math

In [4]:
citation_meta = pd.read_json('meta.jsonl', lines=True)

In [29]:
citation_meta.head()

Unnamed: 0,abstract,authors,cited_by,references,title,year,corpus_id,doc_id
0,PROBLEM STATEMENT\nPelvic girdle pain (PGP) is...,"[40572137, 3675075, 48815127]","[e7e7a7fc07f516fd39b4b0cb7ff3a2acfe837c1a, e87...","[420604a2d0161cd5b5d2df75dd6252f224c8b055, e78...",Pelvic Girdle Pain during or after Pregnancy: ...,2013.0,1417926,0003aa77bdefc1c75f9d2ba732635c132fc0c863
1,Routers must perform packet classification at ...,"[23633340, 1688025, 1746289]","[76d64770fc8f032d047b034650c666e2c731c87e, bfb...","[0e541308cc7c5ce8574bab03c090b6a0c5c6355b, 1a1...",Packet Classification Using Tuple Space Search,1999.0,207574370,0007181efc556fd1fcda2642e9bd85dd0f0c32d6
2,The data of interest are assumed to be represe...,"[1869497, 3143096, 1746676]","[f354b0103c4bc8cb14bed77a27f5f4ffe580efdb, 7bf...","[d68725804eadecf83d707d89e12c5132bf376187, 57b...",Bayesian Compressive Sensing,2008.0,206797074,000c009765a276d166fc67595e107a9bc44f230d
3,Semantic image segmentation is an essential co...,"[3408089, 36665147, 11983029, 1789756]","[c2c0fda9b4e2a12fd4069ab545e90ec4a197e66d, 1fa...","[981fef7155742608b8b6673f4a9566158b76cd67, 942...",Full-Resolution Residual Networks for Semantic...,2017.0,1873339,000f90380d768a85e2316225854fc377c079b5c4
4,We introduce the social study of bullying to t...,"[1729642, 2610963, 1832364, 3009549]","[6618b4dbea9cc0a229e603c0326eac957c420ac4, 49b...","[639c1ec9edcbca7aa80ab56a52487def431aed5e, 899...",Learning from Bullying Traces in Social Media,2012.0,9912528,00111610254bfb8ec16428501c2ca68dcf817474


In [6]:
import json

# Read the JSONL file into a list
with open('meta.jsonl', 'r') as f:
    data = [json.loads(line) for line in f]

# Print the list of dictionaries
print(data[0])

{'abstract': "PROBLEM STATEMENT\nPelvic girdle pain (PGP) is a common condition during or after pregnancy with pain and disability as most important symptoms. These symptoms have a wide range of clinical presentation. Most doctors perceive pregnancy related pelvic girdle pain (PPGP) as 'physiologic' or 'expected during pregnancy', where no treatment is needed. As such women with PPGP mostly experience little recognition. However, many scientific literature describes PPGP as being severe with considerable levels of pain and disability and socio-economic consequences in about 20% of the cases.\n\n\nOBJECTIVES\nWe aimed to (1) inform the gynecologist/obstetrician about the etiology, diagnosis, risk factors, and treatment options of PPGP and (2) to make a proposition for an adequate clinical care path.\n\n\nMETHODS\nA systematic search of electronic databases and a check of reference lists for recent researches about the diagnosis, etiology, risk factors and treatment of PPGP.\n\n\nRESULTS

In [7]:
# Delete corpus_id in doc to avoid Elasticsearch parsing error. 
# Some corpus_ids is the same as doc_ids, which makes them in different data type from other corpus_ids. 
# This may cause paring error during ElasticSearch indexing.
for dictionary in data:
    del(dictionary['corpus_id'])

In [28]:
es = Elasticsearch(hosts=["http://localhost:9200"])
es.info()

ObjectApiResponse({'name': 'Jamie-Suns-MacBook-Pro.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'yGzXocvqRKafPLJDOdcQTg', 'version': {'number': '8.5.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': 'c94b4700cda13820dad5aa74fae6db185ca5c304', 'build_date': '2022-10-24T16:54:16.433628434Z', 'build_snapshot': False, 'lucene_version': '9.4.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [9]:
# The total num of docs exceeds 140k, so it is more effective to index only the candidate docs. 
# es.indices.delete(index='scidoc_citation')
# for doc in data:
#     es.index(index="scidoc_citation", 
#              id = doc['doc_id'], 
#              document = doc
#             )

In [63]:
es.count(index="scidoc_citation")

ObjectApiResponse({'count': 142009, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

In [13]:
es.count(index="s2_doc")

ObjectApiResponse({'count': 8541, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

In [31]:
citation_prediction = pd.read_json('test_qrel.jsonl', lines=True)

In [32]:
citation_prediction.head()

Unnamed: 0,query_id,cand_id,score
0,78495383450e02c5fe817e408726134b3084905d,632589828c8b9fca2c3a59e97451fde8fa7d188d,1
1,78495383450e02c5fe817e408726134b3084905d,86e87db2dab958f1bd5877dc7d5b8105d6e31e46,1
2,78495383450e02c5fe817e408726134b3084905d,2a047d8c4c2a4825e0f0305294e7da14f8de6fd3,1
3,78495383450e02c5fe817e408726134b3084905d,506172b0e0dd4269bdcfe96dda9ea9d8602bbfb6,1
4,78495383450e02c5fe817e408726134b3084905d,51317b6082322a96b4570818b7a5ec8b2e330f2f,1


In [33]:
def get_query_candidates(df):
    positive_candidates = {}
    all_candidates = {}
    for query, query_df in df.groupby('query_id'):
        positive_candidates[query] = query_df[query_df['score'] == 1]['cand_id'].tolist()
        all_candidates[query] = query_df['cand_id'].tolist()
    return positive_candidates, all_candidates

In [34]:
# Key: query_id Value: positive_candidates, all_candidates 
positive_candidates, all_candidates = get_query_candidates(citation_prediction)

In [50]:
# Get text of queries. Quert id is the id of the query paper. 
# Key: query_id Value: query text
test_queries_id = citation_prediction['query_id'].unique().tolist()
test_queries_text = {}
for query_id in test_queries_id:
    temp = citation_meta[citation_meta['doc_id']==query_id]
    # Some query papers don't have abstract
    if temp['abstract'].values:
        text = temp['title'].values[0] + temp['abstract'].values[0]
    test_queries_text[query_id] = text

In [66]:
def search_BM25(query, doc_ids):
    MAX_SIZE = 5
    payload = {
                "bool": {
                  "must": [
                        {
                          "ids": {
                            "values": doc_ids
                          }
                        }
                  ],
                  "should": [
                    {
                      "multi_match": {
                        "query": query,
                        "fields": ["title", "abstract"]
                      }
                    }
                  ]
                }
              }
    resp = es.search(index="scidoc_citation", query=payload, size=MAX_SIZE)
    result = []
    # Extract info needed for testing and display. 
    for item in resp['hits']['hits']:
        result.append(item['_id'])
    return result

In [67]:
search_BM25(str(test_queries_text['78495383450e02c5fe817e408726134b3084905d']),all_candidates['78495383450e02c5fe817e408726134b3084905d'])

['86e87db2dab958f1bd5877dc7d5b8105d6e31e46',
 '632589828c8b9fca2c3a59e97451fde8fa7d188d',
 '22fc3af1fb55d48f3c03cd96f277503e92541c60',
 '12f107016fd3d062dff88a00d6b0f5f81f00522d',
 'c9d41f115eae5e03c5ed45c663d9435cb66ec942']

In [85]:
test_queries_text['78495383450e02c5fe817e408726134b3084905d']

'A Direct Search Method to solve Economic Dispatch Problem with Valve-Point EffectScarcity of energy resources, increasing power generation cost and ever-growing demand for electric energy necessitates optimal economic dispatch in today’s power systems. The main objective of economic dispatch is to reduce the total power generation cost, while satisfying various equality and inequality constraints. Traditionally in economic dispatch problems, the cost function for generating units has been approximated as a quadratic function which doesn’t provide accurate results. Moreover, to obtain accurate fuel cost, valve-point effect in thermal power plant has to be taken into account. The inclusion of valve-point effect makes the modeling of the fuel cost functions of generating units more practical. In this paper a new evolutionary algorithm called Pattern Search Technique, has been employed to solve economic dispatch problem with the valve-point effect. Using this technique the non-linear cost

In [70]:
def calculate_map(relevant_docs, retrieval_results):
    """
    Calculates the mean average precision (MAP) given a list of relevant documents
    and a list of retrieval results.
    
    Inputs:
    relevant_docs: A list of lists, where each inner list contains the relevant
                          document IDs for a query.
    retrieval_results: A list of lists, where each inner list contains the
                              document IDs retrieved by a model for a query.
    
    return: The mean average precision (MAP).
    """
    average_precision = 0
    num_queries = len(relevant_docs)
    
    for i in range(num_queries):
        query_rel_docs = relevant_docs[i]
        query_ret_results = retrieval_results[i]
        
        precision_at_k = []
        num_rel_docs = 0
        
        for k in range(len(query_ret_results)):
            if query_ret_results[k] in query_rel_docs:
                num_rel_docs += 1
                precision_at_k.append(num_rel_docs / (k + 1))
        
        if len(precision_at_k) > 0:
            average_precision += sum(precision_at_k) / len(precision_at_k)
    
    mean_average_precision = average_precision / num_queries
    return mean_average_precision

In [83]:
def final_result(test_queries, positive_candidates, all_candidates):
    retrieval_results = []
    relevant_docs = []
    for query_id, query_text in test_queries.items():
        retrieved = search_BM25(query_text, all_candidates[query_id])
        retrieval_results.append(retrieved)
        relevant_docs.append(positive_candidates[query_id])
    score = calculate_map(relevant_docs, retrieval_results)
    return score

In [84]:
import time
start = time.time()
BM25_score = final_result(test_queries_text, positive_candidates, all_candidates)
end = time.time()
print("BM25 MAP score is: ", BM25_score)
print("Total time used is ", end - start)

BM25 MAP score is:  0.9156875000000022
Total time used is  32.19122672080994


### Re-rank with harmonic centrality

In [173]:
import time
def calculate_harmonic_centralities(all_candidates, candidates_edges_dict):
    result = []
    for query_id, candidates in all_candidates.items():
        start = time.time()
        nodes = candidates
        edges = candidates_edges_dict[query_id]
        
        # Create a graph from the list of edges
        G = nx.Graph(edges)
    
        # Calculate harmonic centralities for each node in the input node list
        harmonic_centralities = nx.harmonic_centrality(G, nodes)
        result.append(harmonic_centralities)
        
        end = time.time()
        print(query_id, " done in {}.\n".format(end - start))
        
    return result

In [None]:
# ### Calculate candidates_edges_dict
# candidates_edges_dict = {}
# i = 0
# for query_id, candidates in all_candidates.items():
#     edges = []
#     for candidate in candidates:
#         temp = citation_meta[citation_meta['doc_id'] == candidate]
#         citations = temp['cited_by'].values[0]
#         references = temp['references'].values[0]
#         for paper in citations:
#             if paper in candidates:
#                 edges.append((paper, candidate))
        
#         for paper in references:
#             if paper in candidates:
#                 edges.append((candidate, paper))
#     candidates_edges_dict[query_id] = edges
#     print(i, ' done.')
#     i += 1

In [172]:
## A new version of candidates_edges_dict
## It doesn't exclude edges with nodes which are not candidate papers. 
candidates_edges_dict_no_exclusion = {}
for query_id, candidates in all_candidates.items():
    edges = []
    for candidate in candidates:
        temp = citation_meta[citation_meta['doc_id'] == candidate]
        citations = temp['cited_by'].values[0]
        references = temp['references'].values[0]
        for paper in citations:
                edges.append((paper, candidate))
        
        for paper in references:
                edges.append((candidate, paper))
    candidates_edges_dict_no_exclusion[query_id] = edges        

In [176]:
import copy
candidates_edges_dict_no_exclusion = copy.deepcopy(candidates_edges_dict)

In [166]:
# harmonic_centralities = calculate_harmonic_centralities(all_candidates, candidates_edges_dict)

In [None]:
# The calculation here with only one process will take about 49 hours. 
# Use ten programs to calculate faster and store the results in json files. 
# Can create a parallel computing program in the future. 
harmonic_centralities_no_exclusion = calculate_harmonic_centralities(all_candidates, candidates_edges_dict_no_exclusion)

In [201]:
def calculate_harmonic_centralities_missed(all_candidates, candidates_edges_dict, query_ids):
    result = []        
    start = time.time()
    for i in [99, 199, 299, 399, 499, 599, 699, 799, 899, 999]:
        query_id = query_ids[i]
        candidates = all_candidates[query_id]
        nodes = candidates
        edges = candidates_edges_dict[query_id]

        # Create a graph from the list of edges
        G = nx.Graph(edges)

        # Calculate harmonic centralities for each node in the input node list
        harmonic_centralities = nx.harmonic_centrality(G, nodes)
        result.append(harmonic_centralities)

        end = time.time()
        print(i, " ", query_id, " done in {}.\n".format(end - start))
        
    return result

In [203]:
query_ids = citation_prediction['query_id'].unique()
harmonic_centralities_no_exclusion_missed = calculate_harmonic_centralities_missed(all_candidates, candidates_edges_dict, query_ids)

99   f754cab548f2c209ea7d932084ef768b92b27614  done in 272.2202181816101.

199   6a40ffc156aea0c9abbd92294d6b729d2e5d5797  done in 299.5810580253601.

299   1b9ce6abc0f3024b88fcd4dbd0c10cf5bcf7d38d  done in 338.6756942272186.

399   6affd37b83d4fca0d0e54e5d75433a74cb142671  done in 348.21721029281616.

499   57ad8dfb71589360810da83efce67b4cab2ff380  done in 782.3319251537323.

599   ce00fc554965ea7b187dfa93292013f019d67d39  done in 783.0188040733337.

699   2a8bcf35e6b5b3910eea160b3a1fb3e6bcb3966e  done in 1515.6600120067596.

799   3388d516fc26536423b03e1d93a3b62358a6a35f  done in 1597.1662590503693.

899   40cdd0de8d05c496ca6658d3d7c45bea028de6be  done in 1597.3775870800018.

999   89e58773fa59ef5b57f229832c2a1b3e3efff37e  done in 1605.3648509979248.



In [211]:
type(harmonic_centralities_no_exclusion_missed[0])

dict

In [212]:
type(temp_centralities)

list

In [213]:
harmonic_centralities_no_exclusion = []
j = 0
for i in range(99, 1099, 100):
    file_name = 'harmonic_centralities_no_exclusion_' + str(i) + '.json' 
        temp_centralities = json.load(f)
        harmonic_centralities_no_exclusion += temp_centralities
        harmonic_centralities_no_exclusion += [harmonic_centralities_no_exclusion_missed[j]]
        j += 1

In [215]:
temp_18 = json.dumps(harmonic_centralities_no_exclusion)

# open file for writing, "w" 
f = open("harmonic_centralities_no_exclusion_total.json","w")

# write json object to file
f.write(temp_18)

# close file
f.close()

In [220]:
maximum = 0
minimum = 10
total = 0
for score_dict in harmonic_centralities_no_exclusion:
    for key, value in score_dict.items():
        if value > maximum:
            maximum = value
        if value < minimum:
            minimum = value
        total += value
print(maximum, minimum)

42338.666666721714 1.0


In [223]:
transformed_harmonic_centralities = []
for score_dict in harmonic_centralities_no_exclusion:
    temp_harmonic = {}
    for key, value in score_dict.items():
        temp_harmonic[key] = math.log(value + 3 * 10 ** -2)
    transformed_harmonic_centralities.append(temp_harmonic)

In [225]:
def search_BM25_rerank(query, doc_ids):
    MAX_SIZE = 30
    payload = {
                "bool": {
                  "must": [
                        {
                          "ids": {
                            "values": doc_ids
                          }
                        }
                  ],
                  "should": [
                    {
                      "multi_match": {
                        "query": query,
                        "fields": ["title", "abstract"]
                      }
                    }
                  ]
                }
              }
    resp = es.search(index="scidoc_citation", query=payload, size=MAX_SIZE)
    result = {}
    # Extract info needed for testing and display. 
    for item in resp['hits']['hits']:
        result[item['_id']] = item['_score']
    return result

In [216]:
def rerank_harmonic_old(result, harmonic_centralities):
    for i in range(len(result)):
        papers = result[i]
        if harmonic_centralities[i]:
            id_score = [('', 0) for _ in range (10)]
            for i in range(len(papers)):
                doc_id = papers[i]
                score = 0
#                 print(doc_id)
                if doc_id in harmonic_centralities[i]:
                    score = harmonic_centralities[i][doc_id]
                id_score[i] = (doc_id, score)
            id_score = sorted(id_score, key=lambda x: x[1])
            result[i] = [id_score[i][0] for i in range(5)] 
        else:
            result[i] = papers[:5]
    return result

In [226]:
def rerank_harmonic(BM25_retrieval_results, harmonic_centralities):
    results = []
    for alpha in range(1, 10):
        alpha /= 10
        epoch_result = []
        for i in range(len(BM25_retrieval_results)):
            doc_id_score_dict = BM25_retrieval_results[i]
            id_score = [('', 0) for _ in range (len(doc_id_score_dict))]
            for j, (doc_id, BM25_score) in enumerate(doc_id_score_dict.items()):
                if doc_id in harmonic_centralities[i]:
                    total_score = alpha*harmonic_centralities[i][doc_id]+(1-alpha)*BM25_score
                else: 
                    total_score = (1-alpha) * BM25_score
                id_score[j] = (doc_id, total_score)
            id_score = sorted(id_score, key=lambda x: x[1], reverse=True)
            epoch_result.append([id_score[i][0] for i in range(5)]) 
        results.append(epoch_result)
    return results

In [160]:
def rerank_final_result_old(test_queries, positive_candidates, all_candidates, candidates_edges_dict):
        retrieval_results = []
        relevant_docs = []
        for query_id, query_text in test_queries.items():
            retrieved = search_BM25(query_text, all_candidates[query_id])
            retrieval_results.append(retrieved)
            relevant_docs.append(positive_candidates[query_id])
        harmonic_centralities = calculate_harmonic_centralities(all_candidates, candidates_edges_dict)
        print(len(retrieval_results), len(harmonic_centralities))
        retrieval_results = rerank_harmonic(retrieval_results, harmonic_centralities)
        score = calculate_map(relevant_docs, retrieval_results)
        return score

In [227]:
def rerank_final_result(test_queries, positive_candidates, all_candidates, transformed_harmonic_centralities):
    retrieval_results = []
    relevant_docs = []
    scores = []
    for query_id, query_text in test_queries.items():
        retrieved = search_BM25_rerank(query_text, all_candidates[query_id])
        retrieval_results.append(retrieved)
        relevant_docs.append(positive_candidates[query_id])
    retrieval_results = rerank_harmonic(retrieval_results, transformed_harmonic_centralities)
    for retrieval_result in retrieval_results:
        score = calculate_map(relevant_docs, retrieval_result)
        scores.append(score)
    return scores

In [228]:
start = time.time()
rerank_result = rerank_final_result(test_queries_text, positive_candidates, all_candidates, transformed_harmonic_centralities)
end = time.time()
# print("Re-ranking model MAP score is: ", rerank_result)
print("Total time used is ", end - start)

Total time used is  27.18969702720642


In [229]:
rerank_result

[0.9154138888888909,
 0.9170083333333354,
 0.9209361111111131,
 0.9252500000000018,
 0.9290361111111127,
 0.9321513888888914,
 0.9381888888888915,
 0.9457680555555578,
 0.9589555555555578]