# Extracting the files

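Make sure that you have uploaded the zipped file to Google Drive before executing anything, and that Google Drive is already mounted at `/content/gdrive` (run the "Mounting content from Google Drive" cell further down first, since the commands below read from and write to that path). Here the files are extracted (the output is cleared to save space). This cell should be executed only once.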

In [None]:
!mkdir '/content/gdrive/My Drive/cord19_data'
!tar -xvzf  '/content/gdrive/My Drive/comm_use_subset.tar.gz' -C '/content/gdrive/My Drive/cord19_data'

# Installing the Transformers

The output is cleared to save space.

In [None]:
!pip install -U sentence-transformers

# Mounting content from Google Drive.

In [None]:
# Mount Google Drive at /content/gdrive; every path used in this notebook lives under it.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Question 1: Title Retrieval

# Preprocessing

In [None]:
import json
from sentence_transformers import util, SentenceTransformer, CrossEncoder
import os
import time
from pprint import pprint

#Getting the path of the CORD-19 folder.
cord19_path = '/content/gdrive/My Drive/cord19_data/comm_use_subset'

#The relevant sections of every article are collected in memory first, so that
#final.json is only (re)written once every article has been parsed successfully.
result = []

#sorted() makes the article order (and therefore the passage order) the same on every run.
for filename in sorted(os.listdir(cord19_path)):

  #Skipping anything that is not an article file (e.g. sub-folders or stray files).
  if not filename.endswith('.json'):
    continue

  #Opening a file from the folder and loading the JSON data.
  with open(os.path.join(cord19_path, filename), 'r', encoding='utf-8') as f:
    article = json.load(f)

  #Every section of 'Abstract' and 'Body Text' gets ' p_id:<paper_id>\n ' appended:
  #'\n' is later used to split the text into passages and 'p_id:' to find the paper again.
  marker = ' p_id:' + article['paper_id'] + '\n '

  #Each article is inserted into the list as one dictionary.
  result.append({
      'paper_id': article['paper_id'],
      'abstract': ''.join(elem['text'] + marker for elem in article['abstract']),
      'body_text': ''.join(elem['text'] + marker for elem in article['body_text']),
  })

#Finally the list with all the JSON's is written out to final.json.
with open('/content/gdrive/My Drive/final.json', 'w') as outfile:
  json.dump(result, outfile)

Important note: If final.json has not been created after executing the cell above, execute that cell again. The file is occasionally not written on the first attempt; the cause has not been identified.

# Getting the passages

In [None]:
import torch

#Getting each passage by separating them with the '\n' character and adding them in the passages list.
passages = []
with open('/content/gdrive/My Drive/final.json', 'r') as fin:
  data = json.load(fin)

for d in data:
  for section in ('abstract', 'body_text'):
    for p in d[section].split("\n"):
      #Every section ends with '\n ', so split() always yields a whitespace-only tail
      #(and '' for an empty abstract). Such pieces, and pieces that consist of nothing
      #but the ' p_id:<paper_id>' marker, carry no text to retrieve and are skipped.
      if p.split('p_id:')[0].strip():
        passages.append(p)

# Loading the pre-trained models.

In [None]:
#Bi-encoders: used to embed the passages and the questions.
bi_encoder_1, bi_encoder_2, bi_encoder_3 = [
    SentenceTransformer(model_name)
    for model_name in ('stsb-roberta-large',
                       'msmarco-distilroberta-base-v2',
                       'msmarco-roberta-base-v2')
]

#Cross-encoders: used to score [question, passage] pairs.
cross_encoder_1, cross_encoder_2, cross_encoder_3 = [
    CrossEncoder(model_name)
    for model_name in ('cross-encoder/stsb-TinyBERT-L-4',
                       'cross-encoder/ms-marco-TinyBERT-L-4',
                       'cross-encoder/ms-marco-electra-base')
]

100%|██████████| 1.31G/1.31G [00:47<00:00, 27.8MB/s]
100%|██████████| 305M/305M [00:12<00:00, 24.3MB/s]
100%|██████████| 463M/463M [00:16<00:00, 27.4MB/s]


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=647.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=57436041.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=517.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=612.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=57436041.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=541.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=730.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=438022601.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=316.0, style=ProgressStyle(description_…




# Defining our questions.

In [None]:
#The questions every model pair is evaluated on, one per line.
#NOTE(review): 'coronoviruses' / 'Coronovirus' look like misspellings; they are left
#unchanged because the recorded outputs were produced with these exact strings.
questions = [
    'What are the coronoviruses?',
    'What was discovered in Wuhan in December 2019?',
    'What is Coronovirus Disease 2019?',
    'What is COVID-19?',
    'What is caused by SARS-COV2?',
    'How is COVID-19 spread?',
    'Where was COVID-19 discovered?',
    'How does coronavirus spread?',
    'How can the spread of COVID-19 be prevented?',
    'How many coronaviruses are there?',
    'Is COVID-19 related to SARS?',
    'What are the symptoms of COVID-19?',
    'How can COVID-19 be cured?',
    'How many cases of COVID-19?',
    'Is COVID-19 a pandemic?',
    'What is the financial impact of COVID-19?',
    'Where does the name coronavirus come from?',
]

# Search function used by every model.

In [None]:
def search(query, corpus_embeddings, bi_encoder, cross_encoder):
  """Print the title (or paper id) of the article whose passage best matches `query`.

  The bi-encoder retrieves the 3 passages closest to the query, the cross-encoder
  re-scores them, and the article of the best re-scored passage is looked up on disk.

  Args:
    query: The question, as a string.
    corpus_embeddings: Embeddings of the global `passages` list; entry i must belong
      to passages[i], because hits are mapped back through their 'corpus_id'.
    bi_encoder: Model whose encode() embeds the query; presumably the same model that
      produced corpus_embeddings.
    cross_encoder: Model whose predict() scores [query, passage] pairs.

  Returns:
    None. The question, the title/paper id and the cross-encoder score are printed.

  Raises:
    Whatever open()/json.load() raise when the article file of the best passage
    cannot be read.
  """
  #Transforming the query and moving it to wherever the corpus embeddings live,
  #instead of assuming that a GPU is available.
  question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
  if hasattr(corpus_embeddings, 'device'):
    question_embedding = question_embedding.to(corpus_embeddings.device)

  #Searching relevant passages in the corpus.
  #Semantic_search returns a list within a list. We need the inner list here.
  results = util.semantic_search(question_embedding, corpus_embeddings, top_k=3)[0]

  print("Question:", query)

  #A passage without the 'p_id:' marker (e.g. a blank one) cannot be traced back to
  #a paper, so such hits are dropped instead of failing later with an IndexError.
  results = [hit for hit in results if 'p_id:' in passages[hit['corpus_id']]]
  if not results:
    print('No relevant passage found.')
    print()
    return

  #The results carry a bi-encoder score, however we prefer the score of the cross encoder.
  #NOTE(review): the passages still contain the ' p_id:<paper_id>' marker when they are scored.
  cross_scores = cross_encoder.predict([[query, passages[hit['corpus_id']]] for hit in results])
  for hit, cross_score in zip(results, cross_scores):
    hit['cross_score'] = cross_score

  #The hit with the highest cross-encoder score wins (the first one on ties).
  best = max(results, key=lambda hit: hit['cross_score'])

  #The marker is appended at the end of a passage, so the id follows its last occurrence.
  passage = passages[best['corpus_id']].replace("\n", " ")
  paper_id = passage.rsplit('p_id:', 1)[1].strip()

  #Opening the file using the id in order to get the title. If there's no title, it prints the paper id instead.
  fil = '/content/gdrive/My Drive/cord19_data/comm_use_subset'+'/'+paper_id+'.json'
  with open(fil,'r') as fin:
    article = json.load(fin)

  if article['metadata']['title'] == '':
    print('Found in Paper ID: {} Score: {}'.format(article['paper_id'], best['cross_score']))
  else:
    print('Found in Title: {} Score: {}'.format(article['metadata']['title'], best['cross_score']))
  print()



# Model 1: ‘stsb-roberta-large’ transformer and ‘cross-encoder/stsb-TinyBERT-L-4’ cross-encoder.


In [None]:
#Getting the embeddings for the first model.
#With convert_to_tensor=True encode() already returns a tensor, so it is moved to the GPU
#directly; wrapping it in torch.tensor() again copies every embedding and emits a warning.
corpus_embeddings_1 = bi_encoder_1.encode(passages, convert_to_tensor=True, show_progress_bar=True).cuda()

HBox(children=(FloatProgress(value=0.0, description='Batches', max=10037.0, style=ProgressStyle(description_wi…




  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
#Running the title search of the first model pair on each question in turn.
for question in questions:
  search(question, corpus_embeddings_1, bi_encoder_1, cross_encoder_1)

Question: What are the coronoviruses?
Found in Title: Structure and Inhibition of the SARS Coronavirus Envelope Protein Ion Channel Score: 0.2700018882751465

Question: What was discovered in Wuhan in December 2019?
Found in Title: Note from the editors: Don't stop thinking about tomorrow Eurosurveillance editorial team Score: 0.24593226611614227

Question: What is Coronovirus Disease 2019?
Found in Title: pathogens Emergence of Novel Coronavirus 2019-nCoV: Need for Rapid Vaccine and Biologics Development Score: 0.49327367544174194

Question: What is COVID-19?
Found in Title: Systematic Comparison of Two Animal-to-Human Transmitted Human Coronaviruses: SARS-CoV-2 and SARS-CoV Score: 0.40332740545272827

Question: What is caused by SARS-COV2?
Found in Title: Systematic Comparison of Two Animal-to-Human Transmitted Human Coronaviruses: SARS-CoV-2 and SARS-CoV Score: 0.4778614342212677

Question: How is COVID-19 spread?
Found in Title: Systematic Comparison of Two Animal-to-Human Transmit

# Model 2: ‘msmarco-distilroberta-base-v2’ transformer and ‘cross-encoder/ms-marco-TinyBERT-L-4’ cross-encoder.


In [None]:
#Getting the embeddings for the second model.
#With convert_to_tensor=True encode() already returns a tensor, so it is moved to the GPU
#directly; wrapping it in torch.tensor() again copies every embedding and emits a warning.
corpus_embeddings_2 = bi_encoder_2.encode(passages, convert_to_tensor=True, show_progress_bar=True).cuda()

HBox(children=(FloatProgress(value=0.0, description='Batches', max=10037.0, style=ProgressStyle(description_wi…




  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
#Running the title search of the second model pair on each question in turn.
for question in questions:
  search(question, corpus_embeddings_2, bi_encoder_2, cross_encoder_2)

Question: What are the coronoviruses?
Found in Title: Detection and Characterization of Distinct Alphacoronaviruses in Five Different Bat Species in Denmark Score: 0.0626264214515686

Question: What was discovered in Wuhan in December 2019?
Found in Title: Epidemiological Identification of A Novel Pathogen in Real Time: Analysis of the Atypical Pneumonia Outbreak in Wuhan Score: 0.9783265590667725

Question: What is Coronovirus Disease 2019?
Found in Title: pathogens Emergence of Novel Coronavirus 2019-nCoV: Need for Rapid Vaccine and Biologics Development Score: 0.657667875289917

Question: What is COVID-19?
Found in Title: Early epidemiological analysis of the coronavirus disease 2019 outbreak based on crowdsourced data: a population- level observational study Score: 0.7819083333015442

Question: What is caused by SARS-COV2?
Found in Title: Reverse Genetics of SARS-Related Coronavirus Using Vaccinia Virus-Based Recombination Score: 0.9848011136054993

Question: How is COVID-19 spread

# Model 3: ‘msmarco-roberta-base-v2’ transformer and ‘cross-encoder/ms-marco-electra-base’ cross-encoder.


In [None]:
#Getting the embeddings for the third model.
#With convert_to_tensor=True encode() already returns a tensor, so it is moved to the GPU
#directly; wrapping it in torch.tensor() again copies every embedding and emits a warning.
corpus_embeddings_3 = bi_encoder_3.encode(passages, convert_to_tensor=True, show_progress_bar=True).cuda()

HBox(children=(FloatProgress(value=0.0, description='Batches', max=10037.0, style=ProgressStyle(description_wi…




  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
#Running the title search of the third model pair on each question in turn.
for question in questions:
  search(question, corpus_embeddings_3, bi_encoder_3, cross_encoder_3)

Question: What are the coronoviruses?
Found in Title: Case Report Neurological Complications of Middle East Respiratory Syndrome Coronavirus: A Report of Two Cases and Review of the Literature Score: 0.003970595076680183

Question: What was discovered in Wuhan in December 2019?
Found in Paper ID: fd28e6d03eef27b0454f13ca539dc1498242a4c2 Score: 0.9935118556022644

Question: What is Coronovirus Disease 2019?
Found in Title: Consensus statement The species Severe acute respiratory syndrome- related coronavirus: classifying 2019-nCoV and naming it SARS-CoV-2 Coronaviridae Study Group of the International Committee on Taxonomy of Viruses* Score: 0.8691845536231995

Question: What is COVID-19?
Found in Paper ID: af000c5a8e181550fd16291e5d4f0f70ca9161a1 Score: 0.9742367267608643

Question: What is caused by SARS-COV2?
Found in Title: Human Coronaviruses: Insights into Environmental Resistance and Its Influence on the Development of New Antiseptic Strategies Score: 0.9557797312736511

Question

# Comparisons
A detailed comparison can be found in the README file.

# Question 2: Getting the passage.

# Search function to get the passage.

In [None]:
def pass_search(query, corpus_embeddings, bi_encoder, cross_encoder):
  """Print the best answer passage for `query`, its article, and up to two alternates.

  The bi-encoder retrieves the 3 passages closest to the query and the cross-encoder
  re-scores them. The best passage is printed together with the title (or paper id)
  of its article; the remaining passages are printed as alternate answers.

  Args:
    query: The question, as a string.
    corpus_embeddings: Embeddings of the global `passages` list; entry i must belong
      to passages[i], because hits are mapped back through their 'corpus_id'.
    bi_encoder: Model whose encode() embeds the query; presumably the same model that
      produced corpus_embeddings.
    cross_encoder: Model whose predict() scores [query, passage] pairs.

  Returns:
    None. Everything is printed.

  Raises:
    Whatever open()/json.load() raise when the article file of the best passage
    cannot be read.
  """
  #Transforming the query and moving it to wherever the corpus embeddings live,
  #instead of assuming that a GPU is available.
  question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
  if hasattr(corpus_embeddings, 'device'):
    question_embedding = question_embedding.to(corpus_embeddings.device)

  #Searching relevant passages in the corpus.
  #Semantic_search returns a list within a list. We need the inner list here.
  results = util.semantic_search(question_embedding, corpus_embeddings, top_k=3)[0]

  print("Question:", query)

  #A passage without the 'p_id:' marker (e.g. a blank one) is neither an answer nor
  #traceable to a paper, so such hits are dropped instead of failing with an IndexError.
  results = [hit for hit in results if 'p_id:' in passages[hit['corpus_id']]]
  if not results:
    print('No relevant passage found.')
    print()
    return

  #The results carry a bi-encoder score, however we prefer the score of the cross encoder.
  #NOTE(review): the passages still contain the ' p_id:<paper_id>' marker when they are scored.
  cross_scores = cross_encoder.predict([[query, passages[hit['corpus_id']]] for hit in results])
  for hit, cross_score in zip(results, cross_scores):
    hit['cross_score'] = cross_score

  #Sorting the results by the cross-encoder scores.
  results = sorted(results, key=lambda hit: hit['cross_score'], reverse=True)

  def split_passage(hit):
    #Removing the '\n' and separating the text from the paper id. The marker is
    #appended at the end of a passage, so the split is made at its last occurrence.
    return passages[hit['corpus_id']].replace("\n", " ").rsplit('p_id:', 1)

  answer_text, paper_id = split_passage(results[0])

  #Opening the file using the id in order to get the title. If there's no title, it prints the paper id instead.
  fil = '/content/gdrive/My Drive/cord19_data/comm_use_subset'+'/'+paper_id.strip()+'.json'
  with open(fil,'r') as fin:
    article = json.load(fin)

  if article['metadata']['title'] == '':
    print('Found in Paper ID: {} Score: {}'.format(article['paper_id'], results[0]['cross_score']))
  else:
    print('Found in Title: {} Score: {}'.format(article['metadata']['title'], results[0]['cross_score']))
  print()

  #Printing the first answer with the highest score.
  print("Answer: {} Score: {}".format(answer_text, results[0]['cross_score']))
  print()

  #Getting the second and third most relevant answers, as far as they exist.
  print("Alternate answers: ")
  for label, hit in zip(("Second Answer", "Third Answer"), results[1:3]):
    print("{}: {} Score: {}".format(label, split_passage(hit)[0], hit['cross_score']))
  print()

In [None]:
#Printing the answer passages of the third model pair for each question in turn.
for question in questions:
  pass_search(question, corpus_embeddings_3, bi_encoder_3, cross_encoder_3)

Question: What are the coronoviruses?
Found in Title: Case Report Neurological Complications of Middle East Respiratory Syndrome Coronavirus: A Report of Two Cases and Review of the Literature Score: 0.003970595076680183

Answer:  Coronaviruses are a family of enveloped, single-stranded, positive-sense RNA viruses that are prevalent in bats and can affect many other species including humans. The name corona denotes the crown-like appearance of the surface projections of the virus under the electron microscope. They may cause respiratory, gastrointestinal, hepatic, and neurological diseases in various species [1] . They are grouped into four different genera which are alpha, beta, gamma, and delta coronaviruses. There are six types of coronaviruses that afflict humans and thus are called human coronaviruses (HCoV) which are HCoV-229E, HCoV-OC43, HCoV-NL63, HCoV-HKU1, SARS-CoV, and MERS-CoV (Table 1 ) [6] . Bats are thought to be the natural reservoir of coronaviruses, and the viruses ca