In [1]:
# Mount Google Drive at /content/drive; the data and pickle paths used by
# later cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import nltk
import math
import re
import csv
import pickle
import numpy as np
from collections import defaultdict
import pandas as pd

Cleaning up the documents

In [3]:
def separate_doc(path):
    """Read a document collection file and split it into per-record chunks.

    The file content is split on the literal ".I " marker. The first piece
    (everything before the first marker) is returned unchanged as a string;
    every following piece is split once on "\\n.S" into a list of one or two
    strings (the text before and after that marker).

    Args:
        path: Path of the collection file to read.

    Returns:
        A list whose element 0 is a string and whose remaining elements are
        lists of one or two strings.
    """
    with open(path, 'r') as doc_collection:
        raw_text = doc_collection.read()
    preamble, *records = raw_text.split(".I ")
    # maxsplit=1 keeps any later "\n.S" occurrences inside the second part.
    return [preamble] + [record.split("\n.S", maxsplit=1) for record in records]

def doc_preprocess(txt):
  """Normalise raw text into a list of lowercase tokens.

  Steps:
    1. Lowercase the text and replace every run of non-word characters
       and underscores with a single space.
    2. Remove every token that consists only of digits. Tokens that mix
       letters and digits (e.g. "abc123") are kept.
    3. Split on whitespace.

  Args:
    txt: The text to tokenise.

  Returns:
    A list of token strings (possibly empty).
  """
  # Raw strings keep \W, \d and \b as regex escapes instead of (invalid)
  # string escapes, which newer Python versions warn about.
  txt = re.sub(r'[\W_]+', ' ', txt.lower())
  # \b is zero-width, so it does not consume the space shared by two
  # neighbouring numbers; every standalone number is removed, including
  # consecutive ones such as "1 2 3".
  txt = re.sub(r'\b\d+\b', ' ', txt)
  return txt.split()

def remove_stopwords(txt, stopwords):
  """Drop stopwords from whitespace-separated text.

  Matching is case-insensitive: this function runs on text that has not
  been lowercased yet, so a capitalised stopword such as "The" must be
  removed here or it would reach the index as "the".

  Args:
    txt: The text to filter; it is split on whitespace.
    stopwords: Iterable of stopword strings.

  Returns:
    The remaining words joined by single spaces.
  """
  # Build the lookup once per call: set membership is O(1), whereas the
  # original scanned the whole stopword list for every token.
  stopword_set = {stopword.lower() for stopword in stopwords}
  kept_words = [word for word in txt.split() if word.lower() not in stopword_set]
  return ' '.join(kept_words)

def id_and_content(docs, stopwords):
  """Turn split collection records into a {document id: token list} mapping.

  For each record (every element of `docs` after index 0) the id is taken
  from the first part of the record and the content from the second part.
  The content has stopwords removed and is then tokenised with
  `doc_preprocess`.

  Args:
    docs: Sequence of records; each record used here is expected to be a
      sequence of two strings. Element 0 of `docs` is skipped.
    stopwords: Iterable of stopword strings passed to `remove_stopwords`.

  Returns:
    A dict mapping the integer document id to its list of tokens.

  Raises:
    ValueError: If the id slice of a record cannot be converted to int.
  """
  clean_docs = {}
  # Iterate through the final record inclusive; the original loop stopped
  # at `length - 1` and therefore silently dropped the last document.
  for record in docs[1:]:
    try:
      # The id is everything after the first 9 characters of the first part.
      # NOTE(review): presumably this skips a fixed-width sequence number
      # and the ".U" tag - confirm against the collection format.
      doc_id = record[0][9:]
      txt = record[1]
    except (IndexError, TypeError):
      # Record has no second part (no content section) - skip it.
      continue
    txt = remove_stopwords(txt, stopwords)
    txt = doc_preprocess(txt)
    clean_docs[int(doc_id)] = txt
  return clean_docs

Making the index and inverted index

In [4]:
def file_index(list_of_words):
  """Build a positional index for every document.

  Args:
    list_of_words: Mapping of document id to its list of tokens.

  Returns:
    A dict mapping each document id to a dict of
    {token: [positions at which the token occurs in that document]}.
  """
  all_docs = {}
  for doc_id, tokens in list_of_words.items():
    positions_by_token = {}
    for position, token in enumerate(tokens):
      # Create the position list on first sight of a token, then append.
      positions_by_token.setdefault(token, []).append(position)
    all_docs[doc_id] = positions_by_token
  return all_docs

def complete_inverted_index(indexx):
  """Invert a per-document positional index into a per-word index.

  Args:
    indexx: Mapping of document id to {word: positions}.

  Returns:
    A plain dict mapping each word to {document id: positions}. The
    position objects are the same objects held by `indexx` (not copies).
  """
  full_inv_index = {}
  for doc_id, positions_by_word in indexx.items():
    for word, positions in positions_by_word.items():
      full_inv_index.setdefault(word, {})[doc_id] = positions
  return full_inv_index

Parsing through and cleaning up the queries

In [5]:
def parse_queries(files):
  """Parse a query file made of <top> ... </top> records.

  Inside each record:
    * a line containing "<num>" supplies the query number (the text after
      the first ':'),
    * a line containing "<title>" supplies the title (the text after the
      first '>'),
    * any other line longer than two characters that does not contain
      "<desc>" is stored as the description; if several such lines occur
      the last one wins.

  Args:
    files: Path of the query file.

  Returns:
    A dict mapping each query number to its description line.

  Raises:
    KeyError: If a record has no "<num>" line or no description line.
  """
  queries = []
  with open(files, 'r') as f:
    # Start with an empty record so stray text before the first <top> is
    # ignored instead of raising TypeError on None.
    current_query = {}
    for line in f:
      # Strip only the newline. Slicing off the last character blindly
      # would turn a final "</top>" without trailing newline into "</top"
      # and lose the last query.
      line = line.rstrip('\n')
      if '<top>' in line:
        current_query = {}
      elif '</top>' in line:
        queries.append(current_query)
        current_query = {}
      elif '<num>' in line:
        current_query['num'] = line.split(':')[1].strip()
      elif '<title>' in line:
        current_query['title'] = line.split('>')[1].strip()
      elif '<desc>' not in line and len(line) > 2:
        current_query['description'] = line
  return {entry['num']: entry['description'] for entry in queries}

def preprocessing_queries(query, stopwords):
  """Clean every query text in place.

  Each value of `query` is passed through `remove_stopwords` and then
  `doc_preprocess`, and the result replaces the original value.

  Args:
    query: Mapping of query id to query text. It is modified in place.
    stopwords: Iterable of stopword strings.

  Returns:
    The same `query` mapping object, now holding the preprocessed values.
  """
  for query_id in query:
    # Only existing keys are reassigned, so the dict size never changes
    # while it is being iterated.
    without_stopwords = remove_stopwords(query[query_id], stopwords)
    query[query_id] = doc_preprocess(without_stopwords)
  return query

In [6]:
def intersection_of_dq(doc, query):
  """Count the distinct query terms that occur in a document.

  Reads the module-level `index`, looking up `index[doc]` and using its
  keys as the document's terms.

  Args:
    doc: Document id; must be a key of `index`.
    query: Iterable of query terms.

  Returns:
    The number of distinct terms present in both the document and the query.
  """
  doc_terms = set(index[doc].keys())
  shared_terms = doc_terms.intersection(set(query))
  return len(shared_terms)

def scores_top_50(doc_score):
  """Return the 50 highest-scoring entries of a score mapping.

  Args:
    doc_score: Mapping of document id to score.

  Returns:
    A list of at most 50 (document id, score) tuples ordered by descending
    score. Entries with equal scores keep their order from `doc_score`.
  """
  ranked = list(doc_score.items())
  # The sort is stable, so ties stay in the mapping's iteration order.
  ranked.sort(key=lambda pair: pair[1], reverse=True)
  return ranked[:50]

Ranking algorithms

In [7]:
def boolean_ranking(docs, query):
  """Rank documents by how many distinct query terms they contain.

  Args:
    docs: Iterable of candidate document ids.
    query: Iterable of query terms.

  Returns:
    The result of `scores_top_50` applied to the per-document counts
    produced by `intersection_of_dq`.
  """
  overlap_by_doc = {doc: intersection_of_dq(doc, query) for doc in docs}
  return scores_top_50(overlap_by_doc)

def tf_ranking(docs, query):
  """Rank documents by normalised term frequency of the query terms.

  For each document the occurrence counts of all query terms are summed
  (a term repeated in `query` is counted once per repetition) and divided
  by `len(index[doc])`, i.e. the number of distinct terms stored for that
  document in the module-level `index`.

  Args:
    docs: Iterable of candidate document ids; each must be a key of `index`.
    query: Iterable of query terms.

  Returns:
    The result of `scores_top_50` applied to the per-document scores.

  Raises:
    ZeroDivisionError: If a document has an empty entry in `index`.
  """
  documents_score = {}
  for doc in docs:
    postings = index[doc]
    # Each postings value is the list of positions of that term.
    occurrences = sum(len(postings[term]) for term in query if term in postings)
    documents_score[doc] = occurrences / len(postings)
  return scores_top_50(documents_score)

def tf_idf_ranking(docs, query):
  """Rank documents by the summed TF-IDF weight of the query terms.

  For each document the score is

      sum over query terms t of  tf(t, doc) * idf(t)  /  len(index[doc])

  where tf is the number of positions stored for t in `index[doc]` and
  idf(t) = log(N / df(t)), with N the number of documents in the
  module-level `index` and df(t) the number of documents listed for t in
  the module-level `inverted_index`.

  Two defects of the previous implementation are fixed here:
    * The IDF values were summed over the whole query independently of
      the document and then multiplied with the summed TF, so every score
      was scaled by the same constant and the ranking was identical to
      plain TF. Each term's frequency is now weighted by its own IDF.
    * N was taken as `len(inverted_index)` (the vocabulary size) instead
      of the number of documents.

  Args:
    docs: Iterable of candidate document ids; each must be a key of `index`.
    query: Iterable of query terms.

  Returns:
    The result of `scores_top_50` applied to the per-document scores.
  """
  total_docs = len(index)

  # IDF depends only on the term, so compute it once per distinct query
  # term instead of once per (document, term) pair.
  idf_by_term = {}
  for term in set(query):
    postings = inverted_index.get(term)
    if postings:
      idf_by_term[term] = math.log(total_docs / len(postings))

  documents_score = {}
  for doc in docs:
    doc_terms = index[doc]
    # Same normaliser as tf_ranking: number of distinct terms in the document.
    len_document = len(doc_terms)
    weighted_freq = 0.0
    for term in query:
      if term in doc_terms and term in idf_by_term:
        weighted_freq += len(doc_terms[term]) * idf_by_term[term]
    documents_score[doc] = weighted_freq / len_document
  return scores_top_50(documents_score)

def custom_ranking(docs, query):
  """Re-rank the TF-IDF top results with a custom scoring function.

  The documents returned by `tf_idf_ranking` are re-scored as
  `2 * tf_idf - log(tf_idf)` and sorted again via `scores_top_50`.

  The TF-IDF value returned by `tf_idf_ranking` is reused directly; the
  previous version recomputed the same quantity with a copy of that
  function's body.

  NOTE(review): 2x - ln(x) is not monotonic (it decreases for x < 0.5 and
  increases afterwards), so low TF-IDF documents can outrank higher ones.
  Confirm this is the intended behaviour.

  Args:
    docs: Iterable of candidate document ids.
    query: Iterable of query terms.

  Returns:
    A list of at most 50 (document id, custom score) tuples, ordered by
    descending custom score.
  """
  doc_score = {}
  for doc, tf_idf in tf_idf_ranking(docs, query):
    if tf_idf <= 0:
      # log() is undefined for non-positive values; such a document cannot
      # be scored by this formula, so it is left out rather than crashing
      # the whole run with a ValueError.
      continue
    doc_score[doc] = (tf_idf * 2) - math.log(tf_idf)
  return scores_top_50(doc_score)

Main

In [None]:
# Download the NLTK stopword corpus and extend the English stopword list
# with extra dotted tokens (presumably the collection's field tags - confirm).
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(['.U', '.S','.M','.T','.P','.W','.M','.I'])  # NOTE(review): '.M' is listed twice

# Split the collection into records and preprocess them into
# {document id: token list}; the indexes are built from this in a later cell.
docs = separate_doc('/content/drive/MyDrive/CSE 272/HW 1_Search Engine/ohsumed.88-91')
cleaned_docs = id_and_content(docs, stopwords)


In [9]:
# Build the per-document positional index, then invert it into a
# word -> {document id: positions} mapping.
full_index = file_index(cleaned_docs)
full_inverted_index = complete_inverted_index(full_index)

In [10]:
# Save the index to disk in pickle (binary) format.
# The `with` block closes the file even if pickle.dump raises.
with open("/content/drive/MyDrive/CSE 272/HW 1_Search Engine/index.pkl", "wb") as index_binary:
  pickle.dump(full_index, index_binary)

In [11]:
# Save the inverted index to disk in pickle (binary) format.
# The `with` block closes the file even if pickle.dump raises.
with open("/content/drive/MyDrive/CSE 272/HW 1_Search Engine/inverted_index.pkl", "wb") as inverted_index_binary:
  pickle.dump(full_inverted_index, inverted_index_binary)

In [9]:
# Load the saved indexes back from disk. The files were previously left
# open; the `with` blocks close them as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code - only load files that this
# notebook wrote itself.
with open("/content/drive/MyDrive/CSE 272/HW 1_Search Engine/index.pkl", "rb") as index_file:
  index = pickle.load(index_file)

with open("/content/drive/MyDrive/CSE 272/HW 1_Search Engine/inverted_index.pkl", "rb") as inv_index_file:
  inverted_index = pickle.load(inv_index_file)

In [10]:
# Parse the query file into {query number: description}, then preprocess
# each description into a token list.
# preprocessing_queries modifies `queries` in place and returns it, so
# `all_queries` and `queries` refer to the same dict; re-running this cell
# without re-parsing would feed token lists back into the preprocessing.
queries = parse_queries('/content/drive/MyDrive/CSE 272/HW 1_Search Engine/query.ohsu.1-63')
all_queries = preprocessing_queries(queries, stopwords)

Creating the log files for each ranking algorithm

In [11]:
def FTQ(query):
  """Collect every document that contains at least one query term.

  Looks each term up in the module-level `inverted_index` and gathers the
  document ids listed for it.

  Args:
    query: Iterable of query terms.

  Returns:
    A list of distinct document ids (in set iteration order).
  """
  matching_docs = set()
  for term in query:
    if term not in inverted_index:
      continue
    matching_docs.update(set(inverted_index[term]))
  return list(matching_docs)

def print_ranking_algos(algorithm):
  """Run one ranking algorithm over all queries and write the results to a file.

  For every entry of the module-level `queries` mapping, the candidate
  documents are collected with `FTQ`, ranked with the selected algorithm,
  and written one line per result as tab-separated fields: query id, the
  literal "Q0", document id, an empty field, 1-based rank, score and the
  algorithm name.

  Args:
    algorithm: One of 'Boolean', 'Tf', 'Tf_idf' or 'Custom'. It is also
      used as the name of the output file (relative to the current
      working directory).

  Raises:
    KeyError: If `algorithm` is not one of the supported names.
  """
  ranking_function = {
      'Boolean': boolean_ranking,
      'Tf': tf_ranking,
      'Tf_idf': tf_idf_ranking,
      'Custom':custom_ranking,
  }
  # Resolve the ranker before opening the file, so an unknown name raises
  # KeyError without creating or truncating a file of that name.
  rank = ranking_function[algorithm]
  with open(algorithm, 'w') as f:
    for q_id, query in queries.items():
      candidates = FTQ(query)
      ranked = rank(candidates, query)
      for position, (doc_id, score) in enumerate(ranked, start=1):
        f.write(f"{q_id}\tQ0\t{doc_id}\t\t{position}\t{score}\t{algorithm}\n")

In [12]:
# Write one result file per ranking algorithm, in this order.
for algorithm_name in ('Boolean', 'Tf', 'Tf_idf', 'Custom'):
  print_ranking_algos(algorithm_name)