## BM25 Queries ##

In [1]:
import linecache
from typing import List, Tuple
import spacy
import gensim
from gensim import corpora
from gensim.summarization.bm25 import BM25

In [2]:
def querytransform(query: str, nlp) -> str:
    """Lemmatize a free-text query and drop its punctuation.

    Args:
        query: Raw query text.
        nlp: Callable that tokenizes ``query`` into tokens exposing
            ``lemma_`` and ``is_punct`` (a spaCy pipeline in this notebook).

    Returns:
        The lemmas of all non-punctuation tokens joined by single spaces.
    """
    lemmas = []
    for token in nlp(query):
        if token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [3]:
def querydoc2corpus(
    query: str,
    dictname: gensim.corpora.dictionary.Dictionary,
) -> List[Tuple[int, int]]:
    """Convert a whitespace-delimited query into a bag-of-words vector.

    Args:
        query: Query text whose tokens are separated by whitespace
            (e.g. the output of ``querytransform``).
        dictname: Dictionary whose ``doc2bow`` maps the tokens to the
            bag-of-words representation.

    Returns:
        Whatever ``dictname.doc2bow`` produces for the split tokens; annotated
        as a list of ``(int, int)`` pairs.
    """
    tokens = query.split()
    bag_of_words = dictname.doc2bow(tokens)
    return bag_of_words

In [21]:
def queryresults(sims: List[Tuple[int, float]], source: str, nresults=20) -> None:
    """Print the leading ``nresults`` entries of a ranked result list.

    Args:
        sims: ``(document index, score)`` pairs, already in the order in
            which they should be shown.
        source: Path of a text file read with ``linecache``; the text for
            document index ``i`` is taken from line ``i + 1`` of this file.
        nresults: Maximum number of entries to print.

    Returns:
        None. One header line is written to stdout, followed by one entry per
        result showing the index, the score to four decimals, and at most the
        first 500 characters of the document's line.
    """
    print('Document', '\t', 'Score', '\t\t', 'Text')
    top_hits = sims[0:nresults]
    for doc_index, doc_score in top_hits:
        score_field = '\t\t{:.4f}'.format(doc_score)
        # Document indices are 0-based; linecache line numbers start at 1.
        line_text = linecache.getline(source, doc_index + 1)
        snippet = line_text[0:500]
        print(doc_index, score_field, '\t', snippet, '\n')

In [5]:
def average_idf(bm25: gensim.summarization.bm25.BM25) -> float:
    """Return the arithmetic mean of the values stored in ``bm25.idf``.

    Args:
        bm25: Object exposing an ``idf`` mapping whose values can be
            converted to ``float``.

    Returns:
        The sum of the IDF values divided by the number of entries.

    Raises:
        ZeroDivisionError: If ``bm25.idf`` is empty.
    """
    idf_by_term = bm25.idf
    total_idf = sum(map(float, idf_by_term.values()))
    return total_idf / len(idf_by_term)

In [6]:
def similarities(scores: List[float]) -> List[Tuple[int, float]]:
    """Rank positions of ``scores`` from highest to lowest score.

    Args:
        scores: One score per position.

    Returns:
        A list of ``(index, score)`` pairs sorted by score in descending
        order; entries with equal scores keep their original relative order.
    """
    ranked = list(enumerate(scores))
    # list.sort is stable, so ties stay in ascending index order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

### Similarities from BM25 Scores ###

**Preprocess the Corpus**

In [7]:
# File path constants (relative paths, resolved against the working directory).
# Saved gensim Dictionary, loaded with corpora.Dictionary.load.
TERMDICTFILE = '../models/titleabstract.dict'
# Corpus file read with corpora.MmCorpus.
TERMCORPUSFILE = '../models/titleabstract_corpus.mm'

In [8]:
# spaCy pipeline passed to querytransform to lemmatize queries.
# NOTE(review): the 'en' shortcut name appears to be accepted only by older
# spaCy releases — confirm against the installed version.
nlp = spacy.load('en')

In [9]:
# Term dictionary used by querydoc2corpus (via doc2bow) to encode queries.
term_dict = corpora.Dictionary.load(TERMDICTFILE)

In [10]:
# Corpus read from TERMCORPUSFILE; this is what the BM25 model is built over.
corpus = corpora.MmCorpus(TERMCORPUSFILE)

In [11]:
# Build the BM25 model over the whole corpus.
# NOTE(review): gensim's BM25 is documented for tokenized documents (lists of
# str). Documents from an MmCorpus are presumably lists of (term_id, count)
# pairs, so each pair would be treated as one "token" and a query term would
# only match documents with the same count — TODO confirm this is intended.
bm_25_obj = BM25(corpus)

**Query the Corpus**

In [26]:
# Free-text query to rank the corpus against; edit this to run another search.
query = 'multilayer ceramic capacitor'

In [27]:
# Lemmatize the query and strip punctuation before dictionary lookup.
query_transform = querytransform(query, nlp)

In [28]:
# Display the transformed query as a sanity check.
query_transform

'multilayer ceramic capacitor'

In [29]:
# Encode the query as bag-of-words, score it against the BM25 model, then
# rank (document index, score) pairs from best to worst.
# NOTE(review): get_scores is called with an explicit average IDF; this
# two-argument form is presumably specific to older gensim releases — confirm.
sims = similarities(
    bm_25_obj.get_scores(
        querydoc2corpus(query_transform, term_dict),
        average_idf(bm_25_obj),
    )
)

In [30]:
# Print the 15 best-scoring documents with the start of each document's text.
# Assumes line i + 1 of this text file corresponds to corpus document i —
# TODO confirm the file and the corpus were built in the same order.
queryresults(sims, '../intermediate/titleabstract.txt', nresults=15)

Document 	 Score 		 Text
9149 		32.6444 	 Mounting substrate A multilayer ceramic capacitor connected to an output electrode and an input electrode of a mounting substrate includes a laminated body. In the laminating direction of the laminated body, the shortest distance from an outer first internal electrode to the surface of an external electrode on the side closer to a first principal surface, and the shortest distance from an outer second internal electrode to the surface of an external electrode on the side closer to a second princ 

13985 		32.6444 	 Composite perovskite powder, preparation method thereof, and paste composition for internal electrode having the same There are provided a composite perovskite powder, a preparation method thereof, and a paste composition for an internal electrode having the same, the composite perovskite powder capable of preventing ions from being eluted from an aqueous system at the time of synthesis while being ultra-atomized, such that when the 