In [1]:
import pandas as pd
import numpy as np
from spacy_sentence_bert import load_model
from spacy import load
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
from sklearn.metrics import dcg_score

## Preprocessing CISI data and moving the abstracts into BERT Vector Space

In [2]:
bert = load_model('en_stsb_bert_base')

In [3]:
# # BERT VECTOR SPACE EXAMPLE

# doc1 = bert('hello how are you buddy?')
# doc2 = bert('hello buddy')
# doc1.similarity(doc2)

In [4]:
nlp = load('en_core_web_sm')  # We can use this to lemmatize words later

# This helper function returns True if the word is a stopword
def filter_stopword(word):
    if word not in stopwords.words('english'):
        return True
    
    
# This helper function returns True if the word is a punctuation    
def filter_punctuation(word):
    if word not in punctuation:
        return True

# Preprocess a text by doing lowercase, removing stopwords, removing punctuations, and lemmatization    
def preprocess(text):
    text = text.lower()
    text = word_tokenize(text)
    text = list(filter(filter_stopword, text))
    text = list(filter(filter_punctuation, text))
    text = nlp(' '.join(text))
    text = ' '.join([word.lemma_ for word in text])
    
    return text

In [5]:
queries = pd.read_csv('data/cisi-query.csv')
queries

Unnamed: 0,qid,query
0,1,What problems and concerns are there in makin...
1,2,"How can actually pertinent data, as opposed t..."
2,3,What is information science? Give definition...
3,4,Image recognition and any other methods of au...
4,5,What special training will ordinary researche...
5,6,What possibilities are there for verbal commu...
6,7,Describe presently working and planned system...
7,8,Describe information retrieval and indexing i...
8,9,What possibilities are there for automatic gr...
9,10,The use of abstract mathematics in informatio...


In [6]:
texts = pd.read_csv('data/cisi-all.csv')
texts

Unnamed: 0,docid,abstract
0,1,The present study is a history of the DEWEY D...
1,2,This report is an analysis of 6300 acts of us...
2,3,The relationships between the organization an...
3,4,The establishment of nine new universities in...
4,5,Although the use of games in professional edu...
...,...,...
1455,1456,"Forrester, J.W. Over the last several decades..."
1456,1457,One of the most significant aspects of the ev...
1457,1458,The patent laws confer on a patentee power to...
1458,1459,This book considers the basic aspects of this...


In [7]:
texts['abstract'] = texts['abstract'].apply(preprocess)
texts

Unnamed: 0,docid,abstract
0,1,present study history dewey decimal classifica...
1,2,report analysis 6300 act use 104 technical lib...
2,3,relationships organization control writings or...
3,4,establishment nine new university 1960 's prov...
4,5,although use game professional education becom...
...,...,...
1455,1456,forrester j.w last several decade interest eco...
1456,1457,one significant aspect evolution librarianship...
1457,1458,patent law confer patentee power exclude other...
1458,1459,book consider basic aspect complex problem his...


In [8]:
texts['abstract'] = texts['abstract'].apply(bert)
texts

Unnamed: 0,docid,abstract
0,1,"(present, study, history, dewey, decimal, clas..."
1,2,"(report, analysis, 6300, act, use, 104, techni..."
2,3,"(relationships, organization, control, writing..."
3,4,"(establishment, nine, new, university, 1960, '..."
4,5,"(although, use, game, professional, education,..."
...,...,...
1455,1456,"(forrester, j.w, last, several, decade, intere..."
1456,1457,"(one, significant, aspect, evolution, libraria..."
1457,1458,"(patent, law, confer, patentee, power, exclude..."
1458,1459,"(book, consider, basic, aspect, complex, probl..."


## Ranking by calculating similarity

In [9]:
def rank_docs(query, df):
    # Moving the query into bert vector space
    query = bert(query)
    
    # Computing similarity score of all docs with query
    df2 = df.copy()
    df2['score'] = df2['abstract'].apply( lambda row: query.similarity(row) )
    df2.sort_values('score', ascending=False, inplace=True)
    
    print(df2[:10])
    return df2

In [11]:
query = input('Enter query: ')
query = preprocess(query)

ranked = rank_docs(query, texts)

Enter query: laws for patients
      docid                                           abstract     score
525     526  (use, on, -, line, information, retrieval, sys...  0.302778
414     415  (choose, grow, number, medical, paramedical, j...  0.282808
190     191  (medlearn, orientation, medline, develop, educ...  0.269313
902     903  (know, 3,071, county, parishe, 59, county, equ...  0.266361
1248   1249  (kind, librarianship, partake, general, elemen...  0.261727
195     196  (basic, journal, list, forty, -, eight, journa...  0.258532
86       87  (ratcliff, w.w, ., getchell, m.e, zeller, k., ...  0.257931
623     624  (use, medical, library, member, faculty, medic...  0.255021
193     194  (lancaster, f.w, paper, describe, possible, cr...  0.251917
401     402  (major, purpose, study, describe, analyze, eva...  0.248950


In [12]:
query = queries['query'][0]
print(query)
query = preprocess(query)

ranked_2 = rank_docs(query, texts)
ranked_2

 What problems and concerns are there in making up descriptive titles? What difficulties are involved in automatically retrieving articles from approximate titles? What is the usual relevance of the content of articles to their titles?
      docid                                           abstract     score
1223   1224  (text, processing, problem, automatic, transla...  0.739487
41       42  (analysis, problem, define, mutual, relevancy,...  0.727194
570     571  (together, increase, shortage, qualified, abst...  0.705363
428     429  (since, many, alerting, information, service, ...  0.699777
643     644  (indexing, document, among, crucial, step, pre...  0.696409
1135   1136  (essential, difference, datum, retrieval, syst...  0.694219
804     805  (bibliographic, citation, attach, technical, d...  0.693395
1053   1054  (experiment, describe, attempt, derive, quanti...  0.686762
491     492  (purpose, information, -, retrieval, system, p...  0.685486
1128   1129  (discuss, possibility

Unnamed: 0,docid,abstract,score
1223,1224,"(text, processing, problem, automatic, transla...",0.739487
41,42,"(analysis, problem, define, mutual, relevancy,...",0.727194
570,571,"(together, increase, shortage, qualified, abst...",0.705363
428,429,"(since, many, alerting, information, service, ...",0.699777
643,644,"(indexing, document, among, crucial, step, pre...",0.696409
...,...,...,...
1331,1332,"(baboon, adapt, variety, habitat, range, west,...",0.206405
1433,1434,"(catalogue, source, experiment, carry, united,...",0.161728
930,931,"(scheme, circulate, nonfiction, book, without,...",0.161616
929,930,"(century, public, library, service, progress, ...",0.152685


## Evaluation using DCG

In [13]:
ground_truth = pd.read_csv('data/cisi-rel.csv')
ground_truth

Unnamed: 0,docid,unknown-1,unknown-2,qid
0,28,0,0.0,1
1,35,0,0.0,1
2,38,0,0.0,1
3,42,0,0.0,1
4,43,0,0.0,1
...,...,...,...,...
3109,422,0,0.0,111
3110,448,0,0.0,111
3111,485,0,0.0,111
3112,503,0,0.0,111


In [14]:
# Retrieveing only ground truth of qid 1
ground_truth_qid_1 = ground_truth[ground_truth['qid']==1] 

# Retrieving docids of qid1 from ground truth and converting into proper 2D shape
docs_actual_qid_1 = np.array([ ground_truth_qid_1['docid'].to_list() ])
docs_relevant_orig = docs_actual_qid_1.shape[1]

# Retrieving the same amount of docids as in ground truth for qid1 from ranked documents (calculated) and converting into proper 2D shape
ranked_docids = ranked_2['docid'][:docs_relevant_orig]
docs_ranked_qid_1 = np.array( [ranked_docids.to_list()] )

# Calculating dcg score between actual docs of qid1 and calculated ranked docs of qid1
dcg_score(docs_actual_qid_1,  docs_ranked_qid_1)

6452.825409919786