In [1]:
import spacy
import numpy as np

In [None]:
!python3 -m spacy download en_core_web_lg

In [3]:
# Load the `en_core_web_sm` spaCy pipeline; `nlp` is the callable that
# tokenize_text() uses to parse query strings. The model package must
# already be installed in the kernel's environment for this load to succeed.
nlp = spacy.load('en_core_web_sm')

In [5]:
import json
import os

# Read the pre-computed TF-IDF dataset from the current working directory.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
# decoding does not depend on the platform's default locale encoding; the
# document titles printed by this notebook include non-ASCII characters.
path = os.path.join('.', 'tf_idf_dataset.json')
with open(path, encoding='utf-8') as f:
  dataset = json.load(f)

In [20]:
# One token list per document, in dataset order.
tokens = [doc['tokenized_text'] for doc in dataset]

# The lexicon is every distinct token seen in any document, sorted so that
# each term has a stable position.
lexicon = sorted({term for doc_tokens in tokens for term in doc_tokens})

In [22]:
def term_to_index(term):
  """Return the position of ``term`` in ``lexicon``, or -1 if it is absent."""
  try:
    position = lexicon.index(term)
  except ValueError:
    # list.index signals "not found" with ValueError; map it to the sentinel.
    position = -1
  return position

def index_to_term(index):
  """Return the lexicon term stored at position ``index``.

  Raises:
    IndexError: if ``index`` is negative or past the end of ``lexicon``.
      Negative positions are rejected instead of wrapping around, because
      ``term_to_index`` reports a missing term as -1 and that sentinel
      would otherwise silently resolve to the last term of the lexicon.
  """
  if index < 0:
    raise IndexError(f'lexicon index out of range: {index}')
  return lexicon[index]

<h1>Building the inverted index</h1>

In [25]:
# Inverted index, filled in by the next cell: term -> list of postings.
index = {}

In [26]:
# Build the inverted index: each term maps to a list of (title, tf-idf)
# postings, one per document whose weight for that term is non-zero.
#
# `lexicon` holds each term exactly once, so a term's enumerate() position is
# the same value term_to_index(term) would return. Taking it from enumerate()
# avoids a linear list scan per term, which made this loop quadratic in the
# vocabulary size.
# assumes doc['tf_idf'] is a vector aligned with `lexicon` order — TODO confirm
for term_index, term in enumerate(lexicon):
  index[term] = [
    (doc['title'], doc['tf_idf'][term_index])
    for doc in dataset
    if doc['tf_idf'][term_index] != 0
  ]

In [37]:
from spacy.tokens.token import Token

def is_stop_word(token: Token):
  """Return the ``is_stop`` flag of the given spaCy token."""
  stop_flag = token.is_stop
  return stop_flag

def is_punc_or_sym(token: Token):
  """Return True when the token's ``pos_`` tag is 'PUNCT' or 'SYM'."""
  return token.pos_ in ('PUNCT', 'SYM')
  
def is_punc_or_sym_or_stop(token: Token):
  """Return a truthy value if the token is a stop word, punctuation or a symbol."""
  stop = is_stop_word(token)
  # Only consult the part-of-speech check when the stop-word check is falsy.
  return stop if stop else is_punc_or_sym(token)

def tokenize_text(input_string: str):
  """Turn a string into a list of lower-cased lemmas.

  The string is parsed with the module-level ``nlp`` pipeline; tokens for
  which ``is_punc_or_sym_or_stop`` is truthy are dropped, and the lemma of
  every remaining token is lower-cased.
  """
  return [
    token.lemma_.lower()
    for token in nlp(input_string)
    if not is_punc_or_sym_or_stop(token)
  ]

In [77]:
from collections import defaultdict

def search(text):
  """Rank documents against a free-text query using the inverted index.

  The query is tokenized with ``tokenize_text``. Every posting stored in
  ``index`` under a query token adds its score to that document's running
  total; tokens that are not in the index contribute nothing.

  Returns:
    A list of ``(title, total_score)`` tuples, highest total first.
  """
  totals = {}
  for query_term in tokenize_text(text):
    for title, score in index.get(query_term, ()):
      # Start each title at 0.0 so totals are always floats.
      totals[title] = totals.get(title, 0.0) + score

  return sorted(totals.items(), key=lambda pair: pair[1], reverse=True)

In [79]:
# Example query. As the last expression in the cell, the list returned by
# search() is displayed as the cell output.
search('symptoms of swine flu')

[('Swine influenza', 0.25686456251576006),
 ('Spanish flu', 0.06863563392473176),
 ('Cholera', 0.023204159217788324),
 ('HIV/AIDS', 0.021681106296157427),
 ('Pandemic', 0.02127047928297263),
 ('Unified Victim Identification System', 0.012478681179343943),
 ('COVID-19 pandemic', 0.010882570795939484)]

In [70]:
# Free-text queries used to exercise search() in the next cell.
test_queries = [
  'black death',
  'zoonotic diseases',
  'swine flu',
  'cholera transmission',
  'classification of viruses',
  'economic impact of pandemics',
  'pandemic prevention organizations',
  'spread of infectious diseases',
  'prevention of viral infections',
  'common symptoms of coronavirus',
]

In [81]:
# Print the ranked (title, score) hits for every test query, with a blank
# separator block after each query.
for query in test_queries:
  print(f'query: {query}\n')
  for title, score in search(query):
    print(f'{title}, score: {score}')
  print('\n\n')

query: black death

Pandemic, score: 0.04679883867323402
Cholera, score: 0.014218202730960522
Antonine Plague, score: 0.012935357371776113
Epidemiology of HIV/AIDS, score: 0.011507709233754
Bills of mortality, score: 0.008777563930848076
Spanish flu, score: 0.008559216569384194
1929–1930 psittacosis pandemic, score: 0.008077007185193536
Pandemic Severity Assessment Framework, score: 0.007891754726817538
HIV/AIDS, score: 0.00664248081253368
COVID-19 pandemic, score: 0.0050011701466459975
Swine influenza, score: 0.0045273750801216395



query: zoonotic diseases

Swine influenza, score: 0.03429575303180508
Disease X, score: 0.02311941979912861
Pandemic, score: 0.0220685370809864
Pandemic prevention, score: 0.013486328216158356
HIV/AIDS, score: 0.013121832858964889
Targeted immunization strategies, score: 0.0105545177343848
Science diplomacy and pandemics, score: 0.010221217174351597
HIV/AIDS in Yunnan, score: 0.008443614187507841
Cholera, score: 0.008024922574904145
Antonine Plague, score

In [76]:
from collections import defaultdict

# Scratch check of the accumulation pattern: summing the values of repeated
# keys into a defaultdict whose missing entries start at 0.0.
my_dic = [
  ('a', 22), ('a', 28),
  ('b', 32), ('b', 28),
  ('c', 32), ('c', 38),
]

dd = defaultdict(float)
for key, amount in my_dic:
  dd[key] = dd[key] + amount

print(dd)

defaultdict(<class 'float'>, {'a': 50.0, 'b': 60.0, 'c': 70.0})
