###Import and preprocessing

In [1]:
#! pip install python-terrier

!pip install --upgrade python-terrier

!pip install --upgrade git+https://github.com/Georgetown-IR-Lab/OpenNIR
!pip install --upgrade git+https://github.com/terrierteam/pyterrier_t5

Collecting python-terrier
  Downloading python-terrier-0.7.2.tar.gz (95 kB)
[?25l[K     |███▍                            | 10 kB 21.0 MB/s eta 0:00:01[K     |██████▉                         | 20 kB 12.8 MB/s eta 0:00:01[K     |██████████▎                     | 30 kB 8.9 MB/s eta 0:00:01[K     |█████████████▊                  | 40 kB 7.7 MB/s eta 0:00:01[K     |█████████████████▏              | 51 kB 5.5 MB/s eta 0:00:01[K     |████████████████████▋           | 61 kB 5.6 MB/s eta 0:00:01[K     |████████████████████████        | 71 kB 5.6 MB/s eta 0:00:01[K     |███████████████████████████▌    | 81 kB 6.3 MB/s eta 0:00:01[K     |███████████████████████████████ | 92 kB 6.3 MB/s eta 0:00:01[K     |████████████████████████████████| 95 kB 1.7 MB/s 
Collecting wget
  Downloading wget-3.2.zip (10 kB)
Collecting pyjnius~=1.3.0
  Downloading pyjnius-1.3.0-cp37-cp37m-manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 33.6 MB/s 
[?25hCollectin

In [2]:
#libraries

import pyterrier as pt
#from pyterrier import IndexingType
import pandas as pd
import regex as re
import string 
from nltk.stem.porter import *
from nltk.tokenize import WordPunctTokenizer

In [3]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [4]:
covid19_synonyms_nospace = ['covid19',
                    'covid2019',
                    'coronavirusdisease19',
                    'coronavirusdisease2019',
                    'coviddisease2019',
                    'coviddisease19',
                    'sarscov2', 
                    '2019ncov',
                    'ncov2019',
                    'coronavirus2019',
                    'coronavirus2',
                    'wuhanpneumonia',
                    'wuhanvirus',
                    'wuhancoronavirus',
                    'novelcoronavirus',
                    'novelcovid'
                    ]

covid19_synonyms_space =[
                         'covid 19',
                         'covid 2019',
                         'coronavirus disease 19',
                         'ncov 2019',
                         'n cov2019',
                         'n cov 2019',
                         '2019n cov',
                         '2019 ncov',
                         '2019 n cov',
                         'coronavirus 2019',
                         'coronavirus 2',
                         'wuhan pneumonia',
                         'wuhan virus',
                         'wuhan coronavirus',
                         'novel coronavirus',
                         'novel covid',
                         'covid disease 2019',
                         'coronavirus disease 2019',
                         'covid disease 19',
                         'sars cov 2',
                         'sars cov2',
                         'sarscov 2'
]

def covid_preprocess(collection):
  for s in covid19_synonyms_space:
    collection['title'] = collection['title'].str.replace(s, s.replace(" ", ""))
    collection['text'] = collection['text'].str.replace(s, s.replace(" ", ""))
  return collection

In [5]:
#defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation+'‐'+'′'+'–'+'‘'+'’'+'“'+'”'])
    return punctuationfree

import nltk
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')

def remove_stopwords(text):
  tokenized_text = WordPunctTokenizer().tokenize(text)

  tokenized_text_without_stopwords = []
  for token in tokenized_text:
    if token.lower() not in stop_words:
      tokenized_text_without_stopwords.append(token)
  return " ".join(tokenized_text_without_stopwords)

def tokenization(text):
  tokenized_text = WordPunctTokenizer().tokenize(text)
  return tokenized_text

def covid_preprocess(text):
  for s in covid19_synonyms_space:
    text = text.replace(s, s.replace(" ", ""))
  return text

def remove_number_after_space(text):
  text = re.sub('\s(\d)+', ' ', text)
  return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
def preprocessing_first_part(t):
  t = t.apply(lambda x: x.lower())
  t = t.apply(lambda x: re.sub('-', ' ', x))
  t = t.apply(lambda x: remove_punctuation(x))
  t = t.apply(lambda x: remove_stopwords(x))
  #covid pre-processing
  t = t.apply(lambda x: covid_preprocess(x))
  t = t.apply(lambda x: remove_number_after_space(x))
  return t

In [7]:
#load datasets and init pyterrier

if not pt.started():
  pt.init()

root_dir = '/content/gdrive/MyDrive'
base_dir = root_dir + '/Progetto IR/data'

collection = pd.read_pickle(base_dir + "/collection.pkl")
qrels = pd.read_pickle(base_dir + "/qrels.pkl")
queries = pd.read_pickle(base_dir + "/queries.pkl")


terrier-assemblies 5.6 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.6 jar not found, downloading to /root/.pyterrier...
Done
PyTerrier 0.7.2 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)


In [8]:
collection['text_preprocess'] = preprocessing_first_part(collection['text'])
collection['title_preprocess'] = preprocessing_first_part(collection['title'])

In [9]:
queries['adhoc_preprocess'] = preprocessing_first_part(queries['query_adhoc'])
queries['desc_preprocess'] = preprocessing_first_part(queries['query_desc'])
queries['nar_preprocess'] = preprocessing_first_part(queries['query_nar'])

In [10]:
queries.head()

Unnamed: 0,index,qid,query,query_adhoc,query_desc,query_nar,adhoc_preprocess,desc_preprocess,nar_preprocess
0,0,1,,coronavirus origin,what is the origin of COVID-19,seeking range of information about the SARS-Co...,coronavirus origin,origin covid19,seeking range information sarscov2 viruss orig...
1,1,2,,coronavirus response to weather changes,how does the coronavirus respond to changes in...,seeking range of information about the SARS-Co...,coronavirus response weather changes,coronavirus respond changes weather,seeking range information sarscov2 virus viabi...
2,2,3,,coronavirus immunity,will SARS-CoV2 infected people develop immunit...,seeking studies of immunity developed due to i...,coronavirus immunity,sarscov2 infected people develop immunity cros...,seeking studies immunity developed due infecti...
3,3,4,,how do people die from the coronavirus,what causes death from Covid-19?,Studies looking at mechanisms of death from Co...,people die coronavirus,causes death covid19,studies looking mechanisms death covid19
4,4,5,,animal models of COVID-19,what drugs have been active against SARS-CoV o...,Papers that describe the results of testing d...,animal models covid19,drugs active sars cov sarscov2 animal studies,papers describe results testing drugs bind spi...


In [11]:
from pyterrier.measures import *
import onir_pt
from textblob import TextBlob 
nltk.download('averaged_perceptron_tagger')
import nltk
nltk.download('punkt')

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

###Indexing

In [12]:
collection.head()

Unnamed: 0,index,docno,text,title,doi,date,text_preprocess,title_preprocess
0,0,ug7v899j,OBJECTIVE: This retrospective chart review des...,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,2001-07-04,objective retrospective chart review describes...,clinical features culture proven mycoplasma pn...
1,1,02tnwd4m,Inflammatory diseases of the respiratory tract...,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,2000-08-15,inflammatory diseases respiratory tract common...,nitric oxide pro inflammatory mediator lung di...
2,2,ejv2xln0,Surfactant protein-D (SP-D) participates in th...,Surfactant protein-D and pulmonary host defense,10.1186/rr19,2000-08-25,surfactant protein sp participates innate resp...,surfactant protein pulmonary host defense
3,3,2b73a28n,Endothelin-1 (ET-1) is a 21 amino acid peptide...,Role of endothelin-1 in lung disease,10.1186/rr44,2001-02-22,endothelin et amino acid peptide diverse bi...,role endothelin lung disease
4,4,9785vg6d,Respiratory syncytial virus (RSV) and pneumoni...,Gene expression in epithelial cells in respons...,10.1186/rr61,2001-05-11,respiratory syncytial virus rsv pneumonia viru...,gene expression epithelial cells response pneu...


Different combination of properties and indexing are tried. The output of the best configuration is shown.

The results of the all executions are stored in 'data/reduction' folder.

In [13]:
# We have tried different combinations of properties
pt.set_property("termpipelines", "")
pt.set_property("termpipelines", "Stopwords")
pt.set_property("termpipelines", "PorterStemmer")

pd_indexer = pt.DFIndexer("/pd_index", overwrite = True)
#indexref = pd_indexer.index( collection["title_preprocess"], collection["docno"])collection["text_preprocess"],
indexref = pd_indexer.index(collection["text_preprocess"], collection["title_preprocess"],  collection["docno"])

15:12:09.749 [main] WARN org.terrier.structures.indexing.Indexer - Indexed 54891 empty documents
15:12:09.849 [main] ERROR org.terrier.structures.indexing.Indexer - Could not finish MetaIndexBuilder: 
java.io.IOException: Key 8lqzfj2e is not unique: 37597,11755
For MetaIndex, to suppress, set metaindex.compressed.reverse.allow.duplicates=true
	at org.terrier.structures.collections.FSOrderedMapFile$MultiFSOMapWriter.mergeTwo(FSOrderedMapFile.java:1374)
	at org.terrier.structures.collections.FSOrderedMapFile$MultiFSOMapWriter.close(FSOrderedMapFile.java:1308)
	at org.terrier.structures.indexing.BaseMetaIndexBuilder.close(BaseMetaIndexBuilder.java:321)
	at org.terrier.structures.indexing.classical.BasicIndexer.createDirectIndex(BasicIndexer.java:346)
	at org.terrier.structures.indexing.Indexer.index(Indexer.java:369)


In [14]:
index = pt.IndexFactory.of(indexref)
print(index.getCollectionStatistics().toString())
print(index.getLexicon()["virus"].toString())

Number of documents: 192509
Number of terms: 172659
Number of postings: 11265707
Number of fields: 0
Number of tokens: 17002062
Field names: []
Positions:   false

term160 Nt=14707 TF=30276 maxTF=2147483647 @{0 15275232 4}


###Query Reduction 

In [15]:
def reset_queries():
  queries_adhoc = queries[['qid', 'adhoc_preprocess']]
  queries_adhoc.rename(columns = {"adhoc_preprocess" : "query"}, inplace=True)

  queries_desc = queries[['qid', 'desc_preprocess']]
  queries_desc.rename(columns = {"desc_preprocess" : "query"}, inplace=True)

  queries_narrative = queries[['qid', 'nar_preprocess']]
  queries_narrative.rename(columns = {"nar_preprocess" : "query"}, inplace=True)
  return queries_adhoc, queries_desc, queries_narrative


In [16]:
queries_adhoc = queries[['qid', 'adhoc_preprocess']]
queries_adhoc.rename(columns = {"adhoc_preprocess" : "query"}, inplace=True)

queries_desc = queries[['qid', 'desc_preprocess']]
queries_desc.rename(columns = {"desc_preprocess" : "query"}, inplace=True)

queries_narrative = queries[['qid', 'nar_preprocess']]
queries_narrative.rename(columns = {"nar_preprocess" : "query"}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [17]:
# Help function to analyze POS tagging
def print_tags(text): 

  result = TextBlob(text)
  print('\n',text)
  
  for e in result.tags:
    print(e)
  return text

###Experiments without query reduction

In [18]:
root_dir = '/content/gdrive/MyDrive'
base_dir = root_dir + '/Progetto IR/data/reduction/text_title_indexes/stop+stem/'

In [19]:
from pyterrier.measures import *

tfidf = pt.BatchRetrieve(index, wmodel="TF_IDF")
bm25 = pt.BatchRetrieve(index, wmodel="BM25")
dir = pt.BatchRetrieve(index, wmodel="DirichletLM")
dph = pt.BatchRetrieve(index, wmodel="DPH")

results = pt.Experiment(
    [tfidf, bm25, dir, dph],
    queries_desc,
    qrels,
    names = ["tf-idf", "bm25", "DirichletLM", "DPH"],
    eval_metrics=[P@5,P@10,'ndcg'])
display(results)
results.to_csv(base_dir + "base_desc.csv", index = False)

results = pt.Experiment(
    [tfidf, bm25, dir, dph],
    queries_narrative,
    qrels,
    names = ["tf-idf", "bm25", "DirichletLM", "DPH"],
    eval_metrics=[P@5,P@10, 'ndcg'])
display(results)
results.to_csv(base_dir + "base_nar.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.688,0.654,0.398215
1,bm25,0.68,0.636,0.401054
2,DirichletLM,0.512,0.526,0.358168
3,DPH,0.672,0.636,0.380484


Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.6,0.562,0.305087
1,bm25,0.608,0.544,0.311195
2,DirichletLM,0.38,0.348,0.236719
3,DPH,0.556,0.524,0.284363


###Experiments with query reduction

Help function to execute experiment given a query.

In [20]:
def experiment(q):
  results = pt.Experiment(
    [tfidf, bm25, dir, dph],
    q,
    qrels,
    names = ["tf-idf", "bm25", "DirichletLM", "DPH"],
    eval_metrics=[P@5,P@10,'ndcg'])
  display(results)
  return results

####Word high frequency removal

In [21]:
def remove_terms_frequency(text): 
  count = 0
  tokens = text.split()
  new = []
  for word in tokens:
    if word in index.getLexicon():
      if index.getLexicon()[word].getFrequency() > 15000:
        new.append('')
        count = count + 1
        print(word)
      else:
        new.append(word)
  print('Parole rimosse:', count)
  return ' '.join(new)




In [22]:
#Reset to original queries
queries_adhoc, queries_desc, queries_narrative = reset_queries()

# Modify queries
queries_desc['query'] = queries_desc['query'].apply(lambda x: remove_terms_frequency(x))
queries_narrative['query'] = queries_narrative['query'].apply(lambda x: remove_terms_frequency(x))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


covid19
Parole rimosse: 1
Parole rimosse: 0
sarscov2
develop
Parole rimosse: 2
death
covid19
Parole rimosse: 2
cov
sarscov2
Parole rimosse: 2
covid19
Parole rimosse: 1
detect
Parole rimosse: 1
covid19
Parole rimosse: 1
covid19
Parole rimosse: 1
impact
spread
covid19
Parole rimosse: 3
Parole rimosse: 0
Parole rimosse: 0
Parole rimosse: 0
covid19
Parole rimosse: 1
Parole rimosse: 0
Parole rimosse: 0
Parole rimosse: 0
covid19
Parole rimosse: 1
type
covid19
Parole rimosse: 2
risk
covid19
Parole rimosse: 2
Parole rimosse: 0
covid19
Parole rimosse: 1
covid19
Parole rimosse: 1
covid19
Parole rimosse: 1
predict
Parole rimosse: 1
covid19
Parole rimosse: 1
covid19
Parole rimosse: 1
covid19
Parole rimosse: 1
sarscov2
human
drug
Parole rimosse: 3
treatment
covid19
Parole rimosse: 2
differ
Parole rimosse: 1
sarscov2
Parole rimosse: 1
covid19
Parole rimosse: 1
term
covid19
Parole rimosse: 2
new
public
covid19
Parole rimosse: 3
protein
sarscov2
Parole rimosse: 2
result
sarscov2
Parole rimosse: 2
covi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


#####Description Queries

In [23]:
#results = experiment(queries_desc)

#####Narrative Queries

In [24]:
#results = experiment(queries_narrative)

####JJ - JJS - JJR Removal

In [25]:
def remove_adjective(text): 
  new = []

  result = TextBlob(text)
  print('\n',text)
  
  for e in result.tags:
    if e[1] == 'JJ' or e[1] == 'JJR' or e[1] == 'JJS':
      new.append('')
      print(e)
    else:
      new.append(e[0])
 
  return ' '.join(new)

In [26]:
#Reset to original queries
queries_adhoc, queries_desc, queries_narrative = reset_queries()

# Modify queries
queries_desc['query'] = queries_desc['query'].apply(lambda x: remove_adjective(x))
queries_narrative['query'] = queries_narrative['query'].apply(lambda x: remove_adjective(x))


 origin covid19


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,



 coronavirus respond changes weather

 sarscov2 infected people develop immunity cross protection possible
('possible', 'JJ')

 causes death covid19

 drugs active sars cov sarscov2 animal studies
('active', 'JJ')
('sarscov2', 'JJ')

 types rapid testing covid19 developed
('rapid', 'JJ')

 serological tests detect antibodies coronavirus
('serological', 'JJ')

 lack testing availability led underreporting true incidence covid19
('underreporting', 'JJ')
('true', 'JJ')

 covid19 affected canada

 social distancing impact slowing spread covid19
('social', 'JJ')

 guidelines triaging patients infected coronavirus

 best practices hospitals home maintaining quarantine
('best', 'JJS')

 transmission routes coronavirus

 evidence related covid19 super spreaders
('covid19', 'JJ')
('super', 'JJ')

 long coronavirus live outside body
('coronavirus', 'JJ')
('live', 'JJ')

 long coronavirus remain stable surfaces
('stable', 'JJ')

 clinical trials available coronavirus
('clinical', 'JJ')
('availab

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


#####Description Queries

In [27]:
results = experiment(queries_desc)
results.to_csv(base_dir + "JJ - JJS - JJR Removal_desc.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.576,0.546,0.333267
1,bm25,0.572,0.548,0.337298
2,DirichletLM,0.432,0.426,0.28956
3,DPH,0.552,0.56,0.319671


#####Narrative Queries

In [28]:
results = experiment(queries_narrative)
results.to_csv(base_dir + "JJ - JJS - JJR Removal_nar.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.46,0.458,0.26425
1,bm25,0.448,0.448,0.268732
2,DirichletLM,0.316,0.316,0.200397
3,DPH,0.408,0.42,0.241581


####JJS - JJR Removal

In [29]:
def remove_JJR_JJS(text): 
  new = []

  result = TextBlob(text)
  print('\n',text)
  
  for e in result.tags:
    if e[1] == 'JJR' or e[1] == 'JJS':
      new.append('')
      print(e)
    else:
      new.append(e[0])
 
  return ' '.join(new)

In [30]:
#Reset to original queries
queries_adhoc, queries_desc, queries_narrative = reset_queries()

# Modify queries
queries_desc['query'] = queries_desc['query'].apply(lambda x: remove_JJR_JJS(x))
queries_narrative['query'] = queries_narrative['query'].apply(lambda x: remove_JJR_JJS(x))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """



 origin covid19

 coronavirus respond changes weather

 sarscov2 infected people develop immunity cross protection possible

 causes death covid19

 drugs active sars cov sarscov2 animal studies

 types rapid testing covid19 developed

 serological tests detect antibodies coronavirus

 lack testing availability led underreporting true incidence covid19

 covid19 affected canada

 social distancing impact slowing spread covid19

 guidelines triaging patients infected coronavirus

 best practices hospitals home maintaining quarantine
('best', 'JJS')

 transmission routes coronavirus

 evidence related covid19 super spreaders

 long coronavirus live outside body

 long coronavirus remain stable surfaces

 clinical trials available coronavirus

 best masks preventing infection covid19
('best', 'JJS')

 type hand sanitizer needed destroy covid19

 patients taking angiotensin converting enzyme inhibitors ace increased risk covid19

 mortality rates overall specific populations

 cardiac com

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [31]:
queries_narrative.head()

Unnamed: 0,qid,query
0,1,seeking range information sarscov2 viruss orig...
1,2,seeking range information sarscov2 virus viabi...
2,3,seeking studies immunity developed due infecti...
3,4,studies looking mechanisms death covid19
4,5,papers describe results testing drugs bind spi...


#####Description Queries

In [32]:
results = experiment(queries_desc)
results.to_csv(base_dir + "JJS - JJR Removal_desc.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.696,0.664,0.399647
1,bm25,0.684,0.65,0.402808
2,DirichletLM,0.52,0.536,0.359265
3,DPH,0.672,0.648,0.38276


#####Narrative Queries

In [33]:
results = experiment(queries_narrative)
results.to_csv(base_dir + "JJS - JJR Removal_nar.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.588,0.558,0.304304
1,bm25,0.596,0.542,0.310499
2,DirichletLM,0.376,0.346,0.236817
3,DPH,0.544,0.52,0.283411


####RB - MD - PRP - CD - FW - DT Removal

In [34]:
def remove_RB_MD_PRP_CD_FW_DT(text): 
  new = []

  result = TextBlob(text)
  print('\n',text)
  
  for e in result.tags:
    if e[1] == 'RB' or  e[1] == 'MD' or  e[1] == 'PRP' or  e[1] == 'CD' or  e[1] == 'FW' or  e[1] == 'DT':
      new.append('')
      print(e)
    else:
      new.append(e[0])
 
  return ' '.join(new)

In [35]:
#Reset to original queries
queries_adhoc, queries_desc, queries_narrative = reset_queries()

# Modify queries
queries_desc['query'] = queries_desc['query'].apply(lambda x: remove_RB_MD_PRP_CD_FW_DT(x))
queries_narrative['query'] = queries_narrative['query'].apply(lambda x: remove_RB_MD_PRP_CD_FW_DT(x))


 origin covid19

 coronavirus respond changes weather

 sarscov2 infected people develop immunity cross protection possible

 causes death covid19

 drugs active sars cov sarscov2 animal studies

 types rapid testing covid19 developed

 serological tests detect antibodies coronavirus

 lack testing availability led underreporting true incidence covid19

 covid19 affected canada

 social distancing impact slowing spread covid19

 guidelines triaging patients infected coronavirus

 best practices hospitals home maintaining quarantine

 transmission routes coronavirus

 evidence related covid19 super spreaders

 long coronavirus live outside body
('long', 'RB')

 long coronavirus remain stable surfaces
('long', 'RB')

 clinical trials available coronavirus

 best masks preventing infection covid19

 type hand sanitizer needed destroy covid19

 patients taking angiotensin converting enzyme inhibitors ace increased risk covid19

 mortality rates overall specific populations

 cardiac compl

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


#####Description Queries

In [36]:
results = experiment(queries_desc)
results.to_csv(base_dir + "RB - MD - PRP - CD - FW - DT Removal_desc.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.688,0.658,0.399195
1,bm25,0.684,0.64,0.402
2,DirichletLM,0.512,0.528,0.358752
3,DPH,0.676,0.64,0.381425


#####Narrative Queries

In [37]:
results = experiment(queries_narrative)
results.to_csv(base_dir + "RB - MD - PRP - CD - FW - DT Removal_nar.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.608,0.562,0.305072
1,bm25,0.608,0.546,0.310823
2,DirichletLM,0.384,0.352,0.237223
3,DPH,0.564,0.532,0.284941


####RB - MD - PRP - CD - FW - DT - JJS - JJR Removal

In [38]:
def remove_RB_MD_PRP_CD_FW_DT_JJS_JJR(text): 
  new = []

  result = TextBlob(text)
  print('\n',text)
  
  for e in result.tags:
    if e[1] == 'RB' or  e[1] == 'MD' or  e[1] == 'PRP' or  e[1] == 'CD' or  e[1] == 'FW' or  e[1] == 'DT' or  e[1] == 'JJS' or  e[1] == 'JJR':
      new.append('')
      print(e)
    else:
      new.append(e[0])
 
  return ' '.join(new)

In [39]:
#Reset to original queries
queries_adhoc, queries_desc, queries_narrative = reset_queries()

# Modify queries
queries_desc['query'] = queries_desc['query'].apply(lambda x: remove_RB_MD_PRP_CD_FW_DT_JJS_JJR(x))
queries_narrative['query'] = queries_narrative['query'].apply(lambda x: remove_RB_MD_PRP_CD_FW_DT_JJS_JJR(x))


 origin covid19

 coronavirus respond changes weather

 sarscov2 infected people develop immunity cross protection possible

 causes death covid19

 drugs active sars cov sarscov2 animal studies

 types rapid testing covid19 developed

 serological tests detect antibodies coronavirus

 lack testing availability led underreporting true incidence covid19

 covid19 affected canada

 social distancing impact slowing spread covid19

 guidelines triaging patients infected coronavirus

 best practices hospitals home maintaining quarantine
('best', 'JJS')

 transmission routes coronavirus

 evidence related covid19 super spreaders

 long coronavirus live outside body
('long', 'RB')

 long coronavirus remain stable surfaces
('long', 'RB')

 clinical trials available coronavirus

 best masks preventing infection covid19
('best', 'JJS')

 type hand sanitizer needed destroy covid19

 patients taking angiotensin converting enzyme inhibitors ace increased risk covid19

 mortality rates overall spec

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


#####Description Queries

In [40]:
results = experiment(queries_desc)
results.to_csv(base_dir + "RB - MD - PRP - CD - FW - DT - JJS - JJR Removal_desc.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.696,0.668,0.400626
1,bm25,0.688,0.654,0.403755
2,DirichletLM,0.52,0.538,0.359849
3,DPH,0.676,0.652,0.383701


**This is the best result obtained for the description queries**

#####Narrative Queries

In [41]:
results = experiment(queries_narrative)
results.to_csv(base_dir + "RB - MD - PRP - CD - FW - DT - JJS - JJR Removal_nar.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.596,0.558,0.304249
1,bm25,0.596,0.544,0.31008
2,DirichletLM,0.38,0.35,0.23732
3,DPH,0.552,0.53,0.283988


####RB - MD - PRP - CD - FW - DT - JJS - JJR - VBG - VBD Removal

In [42]:
def remove_RB_MD_PRP_CD_FW_DT_JJS_JJR_VBG_VBD(text): 
  new = []

  result = TextBlob(text)
  print('\n',text)
  
  for e in result.tags:
    if e[1] == 'RB' or  e[1] == 'MD' or  e[1] == 'PRP' or  e[1] == 'CD' or  e[1] == 'FW' or  e[1] == 'DT' or  e[1] == 'JJS' or  e[1] == 'JJR' or  e[1] == 'VBG' or  e[1] == 'VBD':
      new.append('')
      print(e)
    else:
      new.append(e[0])
 
  return ' '.join(new)

In [43]:
#Reset to original queries
queries_adhoc, queries_desc, queries_narrative = reset_queries()

# Modify queries
queries_desc['query'] = queries_desc['query'].apply(lambda x: remove_RB_MD_PRP_CD_FW_DT_JJS_JJR_VBG_VBD(x))
queries_narrative['query'] = queries_narrative['query'].apply(lambda x: remove_RB_MD_PRP_CD_FW_DT_JJS_JJR_VBG_VBD(x))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """



 origin covid19

 coronavirus respond changes weather

 sarscov2 infected people develop immunity cross protection possible
('infected', 'VBD')

 causes death covid19

 drugs active sars cov sarscov2 animal studies

 types rapid testing covid19 developed
('testing', 'VBG')
('developed', 'VBD')

 serological tests detect antibodies coronavirus

 lack testing availability led underreporting true incidence covid19
('testing', 'VBG')
('led', 'VBD')

 covid19 affected canada
('affected', 'VBD')

 social distancing impact slowing spread covid19
('distancing', 'VBG')
('slowing', 'VBG')

 guidelines triaging patients infected coronavirus
('triaging', 'VBG')

 best practices hospitals home maintaining quarantine
('best', 'JJS')
('maintaining', 'VBG')

 transmission routes coronavirus

 evidence related covid19 super spreaders

 long coronavirus live outside body
('long', 'RB')

 long coronavirus remain stable surfaces
('long', 'RB')

 clinical trials available coronavirus

 best masks preventi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


#####Description Queries

In [44]:
results = experiment(queries_desc)
results.to_csv(base_dir + "RB - MD - PRP - CD - FW - DT - JJS - JJR - VBG - VBD Removal_desc.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.68,0.628,0.385813
1,bm25,0.66,0.624,0.38771
2,DirichletLM,0.512,0.5,0.344932
3,DPH,0.644,0.62,0.370452


#####Narrative Queries

In [45]:
results = experiment(queries_narrative)
results.to_csv(base_dir + "RB - MD - PRP - CD - FW - DT - JJS - JJR - VBG - VBD Removal_nar.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.62,0.568,0.308587
1,bm25,0.604,0.566,0.315221
2,DirichletLM,0.42,0.408,0.266051
3,DPH,0.592,0.558,0.28931


####VB - VBG - VBD Removal

In [46]:
def remove_VB_VBG_VBD(text): 
  new = []

  result = TextBlob(text)
  print('\n',text)
  
  for e in result.tags:
    if e[1] == 'VB' or e[1] == 'VBG' or e[1] == 'VBD':
      new.append('')
      print(e)
    else:
      new.append(e[0])
 
  return ' '.join(new)

In [47]:
#Reset to original queries
queries_adhoc, queries_desc, queries_narrative = reset_queries()

# Modify queries
queries_desc['query'] = queries_desc['query'].apply(lambda x: remove_VB_VBG_VBD(x))
queries_narrative['query'] = queries_narrative['query'].apply(lambda x: remove_VB_VBG_VBD(x))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """



 origin covid19

 coronavirus respond changes weather

 sarscov2 infected people develop immunity cross protection possible
('infected', 'VBD')

 causes death covid19

 drugs active sars cov sarscov2 animal studies

 types rapid testing covid19 developed
('testing', 'VBG')
('developed', 'VBD')

 serological tests detect antibodies coronavirus

 lack testing availability led underreporting true incidence covid19
('testing', 'VBG')
('led', 'VBD')

 covid19 affected canada
('affected', 'VBD')

 social distancing impact slowing spread covid19
('distancing', 'VBG')
('slowing', 'VBG')

 guidelines triaging patients infected coronavirus
('triaging', 'VBG')

 best practices hospitals home maintaining quarantine
('maintaining', 'VBG')

 transmission routes coronavirus

 evidence related covid19 super spreaders

 long coronavirus live outside body

 long coronavirus remain stable surfaces

 clinical trials available coronavirus

 best masks preventing infection covid19
('preventing', 'VBG')

 t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


#####Description Queries

In [48]:
results = experiment(queries_desc)
results.to_csv(base_dir + "VB - VBG - VBD Removal_desc.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.672,0.612,0.381283
1,bm25,0.652,0.604,0.383781
2,DirichletLM,0.5,0.488,0.342418
3,DPH,0.632,0.602,0.364791


#####Narrative Queries

In [49]:
results = experiment(queries_narrative)
results.to_csv(base_dir + "VB - VBG - VBD Removal_nar.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.62,0.564,0.308971
1,bm25,0.616,0.556,0.315444
2,DirichletLM,0.424,0.418,0.26594
3,DPH,0.608,0.55,0.288925


####RB - MD - PRP - CD - FW - DT - JJS - JJR - VBG - VBD - VB Removal

In [50]:
def remove_RB_MD_PRP_CD_FW_DT_JJS_JJR_VBG_VBD_VB(text): 
  new = []

  result = TextBlob(text)
  print('\n',text)
  
  for e in result.tags:
    if e[1] == 'RB' or  e[1] == 'MD' or  e[1] == 'PRP' or  e[1] == 'CD' or  e[1] == 'FW' or  e[1] == 'DT' or  e[1] == 'JJS' or  e[1] == 'JJR' or  e[1] == 'VBG' or  e[1] == 'VBD' or  e[1] == 'VB':
      new.append('')
      print(e)
    else:
      new.append(e[0])
 
  return ' '.join(new)

In [51]:
#Reset to original queries
queries_adhoc, queries_desc, queries_narrative = reset_queries()

# Modify queries
queries_desc['query'] = queries_desc['query'].apply(lambda x: remove_RB_MD_PRP_CD_FW_DT_JJS_JJR_VBG_VBD_VB(x))
queries_narrative['query'] = queries_narrative['query'].apply(lambda x: remove_RB_MD_PRP_CD_FW_DT_JJS_JJR_VBG_VBD_VB(x))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """



 origin covid19

 coronavirus respond changes weather

 sarscov2 infected people develop immunity cross protection possible
('infected', 'VBD')

 causes death covid19

 drugs active sars cov sarscov2 animal studies

 types rapid testing covid19 developed
('testing', 'VBG')
('developed', 'VBD')

 serological tests detect antibodies coronavirus

 lack testing availability led underreporting true incidence covid19
('testing', 'VBG')
('led', 'VBD')

 covid19 affected canada
('affected', 'VBD')

 social distancing impact slowing spread covid19
('distancing', 'VBG')
('slowing', 'VBG')

 guidelines triaging patients infected coronavirus
('triaging', 'VBG')

 best practices hospitals home maintaining quarantine
('best', 'JJS')
('maintaining', 'VBG')

 transmission routes coronavirus

 evidence related covid19 super spreaders

 long coronavirus live outside body
('long', 'RB')

 long coronavirus remain stable surfaces
('long', 'RB')

 clinical trials available coronavirus

 best masks preventi

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


#####Description Queries

In [52]:
results = experiment(queries_desc)
results.to_csv(base_dir + "RB - MD - PRP - CD - FW - DT - JJS - JJR - VBG - VBD - VB Removal_desc.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.68,0.626,0.384021
1,bm25,0.66,0.622,0.385984
2,DirichletLM,0.512,0.498,0.344196
3,DPH,0.648,0.616,0.36871


#####Narrative Queries

In [53]:
results = experiment(queries_narrative)
results.to_csv(base_dir + "RB - MD - PRP - CD - FW - DT - JJS - JJR - VBG - VBD - VB Removal_nar.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.62,0.572,0.308951
1,bm25,0.604,0.566,0.315368
2,DirichletLM,0.42,0.416,0.267754
3,DPH,0.596,0.564,0.289663


####RB - MD - PRP - CD - FW - DT - VBG - VBD - VB Removal

In [54]:
def remove_RB_MD_PRP_CD_FW_DT_VBG_VBD_VB(text): 
  new = []

  result = TextBlob(text)
  print('\n',text)
  
  for e in result.tags:
    if e[1] == 'RB' or  e[1] == 'MD' or  e[1] == 'PRP' or  e[1] == 'CD' or  e[1] == 'FW' or  e[1] == 'DT' or  e[1] == 'VBG' or  e[1] == 'VBD' or  e[1] == 'VB':
      new.append('')
      print(e)
    else:
      new.append(e[0])
 
  return ' '.join(new)

In [55]:
#Reset to original queries
queries_adhoc, queries_desc, queries_narrative = reset_queries()

# Modify queries
queries_desc['query'] = queries_desc['query'].apply(lambda x: remove_RB_MD_PRP_CD_FW_DT_VBG_VBD_VB(x))
queries_narrative['query'] = queries_narrative['query'].apply(lambda x: remove_RB_MD_PRP_CD_FW_DT_VBG_VBD_VB(x))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """



 origin covid19

 coronavirus respond changes weather

 sarscov2 infected people develop immunity cross protection possible
('infected', 'VBD')

 causes death covid19

 drugs active sars cov sarscov2 animal studies

 types rapid testing covid19 developed
('testing', 'VBG')
('developed', 'VBD')

 serological tests detect antibodies coronavirus

 lack testing availability led underreporting true incidence covid19
('testing', 'VBG')
('led', 'VBD')

 covid19 affected canada
('affected', 'VBD')

 social distancing impact slowing spread covid19
('distancing', 'VBG')
('slowing', 'VBG')

 guidelines triaging patients infected coronavirus
('triaging', 'VBG')

 best practices hospitals home maintaining quarantine
('maintaining', 'VBG')

 transmission routes coronavirus

 evidence related covid19 super spreaders

 long coronavirus live outside body
('long', 'RB')

 long coronavirus remain stable surfaces
('long', 'RB')

 clinical trials available coronavirus

 best masks preventing infection cov

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


#####Description Queries

In [56]:
results = experiment(queries_desc)
results.to_csv(base_dir + "RB - MD - PRP - CD - FW - DT - VBG - VBD - VB Removal_desc.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.672,0.616,0.382263
1,bm25,0.656,0.608,0.384727
2,DirichletLM,0.5,0.49,0.343002
3,DPH,0.636,0.606,0.365732


#####Narrative Queries

In [57]:
results = experiment(queries_narrative)
results.to_csv(base_dir + "RB - MD - PRP - CD - FW - DT - VBG - VBD - VB Removal_nar.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.628,0.572,0.309584
1,bm25,0.616,0.566,0.31583
2,DirichletLM,0.432,0.422,0.267811
3,DPH,0.608,0.568,0.29062


**This is the best result obtained for the narrative queries**

####MD - VBG - VBD - VB Removal

In [58]:
def remove_MD_VBG_VBD_VB(text): 
  new = []

  result = TextBlob(text)
  print('\n',text)
  
  for e in result.tags:
    if e[1] == 'MD' or  e[1] == 'VBG' or  e[1] == 'VBD' or  e[1] == 'VB':
      new.append('')
      print(e)
    else:
      new.append(e[0])
 
  return ' '.join(new)
  

In [59]:
#Reset to original queries
queries_adhoc, queries_desc, queries_narrative = reset_queries()

# Modify queries
queries_desc['query'] = queries_desc['query'].apply(lambda x: remove_MD_VBG_VBD_VB(x))
queries_narrative['query'] = queries_narrative['query'].apply(lambda x: remove_MD_VBG_VBD_VB(x))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """



 origin covid19

 coronavirus respond changes weather

 sarscov2 infected people develop immunity cross protection possible
('infected', 'VBD')

 causes death covid19

 drugs active sars cov sarscov2 animal studies

 types rapid testing covid19 developed
('testing', 'VBG')
('developed', 'VBD')

 serological tests detect antibodies coronavirus

 lack testing availability led underreporting true incidence covid19
('testing', 'VBG')
('led', 'VBD')

 covid19 affected canada
('affected', 'VBD')

 social distancing impact slowing spread covid19
('distancing', 'VBG')
('slowing', 'VBG')

 guidelines triaging patients infected coronavirus
('triaging', 'VBG')

 best practices hospitals home maintaining quarantine
('maintaining', 'VBG')

 transmission routes coronavirus

 evidence related covid19 super spreaders

 long coronavirus live outside body

 long coronavirus remain stable surfaces

 clinical trials available coronavirus

 best masks preventing infection covid19
('preventing', 'VBG')

 t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


#####Description Queries

In [60]:
results = experiment(queries_desc)
results.to_csv(base_dir + "MD - VBG - VBD - VB Removal_desc.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.672,0.612,0.381283
1,bm25,0.652,0.604,0.383781
2,DirichletLM,0.5,0.488,0.342418
3,DPH,0.632,0.602,0.364791


#####Narrative Queries

In [61]:
results = experiment(queries_narrative)
results.to_csv(base_dir + "MD - VBG - VBD - VB Removal_nar.csv", index = False)

Unnamed: 0,name,P@5,P@10,ndcg
0,tf-idf,0.62,0.572,0.308956
1,bm25,0.616,0.56,0.315334
2,DirichletLM,0.424,0.42,0.266083
3,DPH,0.608,0.556,0.289219
