# Using boolean search / bag of words to search queries on TREC-DL 2020

## Downloading dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!mkdir collections

In [None]:
!mkdir collections/msmarco-passage

In [None]:
!wget https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz -P collections/msmarco-passage

--2023-03-04 23:13:15--  https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz
Resolving msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)... 20.150.34.4
Connecting to msmarco.blob.core.windows.net (msmarco.blob.core.windows.net)|20.150.34.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1057717952 (1009M) [application/gzip]
Saving to: ‘collections/msmarco-passage/collectionandqueries.tar.gz’


2023-03-04 23:16:12 (5.73 MB/s) - ‘collections/msmarco-passage/collectionandqueries.tar.gz’ saved [1057717952/1057717952]



In [None]:
!tar xvfz collections/msmarco-passage/collectionandqueries.tar.gz -C collections/msmarco-passage

collection.tsv
qrels.dev.small.tsv
qrels.train.tsv
queries.dev.small.tsv
queries.dev.tsv
queries.eval.small.tsv
queries.eval.tsv
queries.train.tsv


## Preprocessing documents

In [2]:
import pandas as pd

In [3]:
import nltk

nltk.download('stopwords')  # Download stopwords if not already downloaded

from nltk.corpus import stopwords

stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


In [5]:
def tokenize_and_remove_stopwords(text):
  tokens = text.split()
  tokens = [token.lower() for token in tokens if (token.lower() not in stop_words)]
  return tokens

In [6]:
from collections import defaultdict

In [11]:
#df_collection = pd.read_csv('collections/msmarco-passage/collection.tsv', sep='\t', header=None)

# set the chunk size
chunk_size = 1000
chunks = []
inverted_index = defaultdict(set)
full_text = ''

def process(row):
  tokenized_text = tokenize_and_remove_stopwords(row[1])
  doc_id = row[0]
  for token in tokenized_text:
    inverted_index[token].add(doc_id)

def process2(row, full_text):
  tokenized_text = tokenize_and_remove_stopwords(row[1])
  new_text = ' '.join(tokenized_text)
  full_text += new_text
  return full_text

chunk_id = 0
# iterate through the file in chunks
for chunk in pd.read_csv('collections/msmarco-passage/collection.tsv', sep='\t', header=None, chunksize=chunk_size):
  # process the chunk here
  if (chunk_id % 1000) == 0:
    print(f'Processing chunk {chunk_id}')
  for index, row in chunk.iterrows():
    #full_text = process2(row, full_text)
    process(row)
  del(chunk)
  chunk_id += 1
  

Processing chunk 0
Processing chunk 1000
Processing chunk 2000
Processing chunk 3000
Processing chunk 4000
Processing chunk 5000
Processing chunk 6000
Processing chunk 7000
Processing chunk 8000


In [12]:
len(inverted_index)

6706455

In [13]:
import pickle

with open('inverted_index.pkl', 'wb') as f:
    pickle.dump(inverted_index, f)

In [27]:
#Antes da criação da matriz abaixo, 16GB de RAM estavam sendo ocupados
import sys

object_size = sys.getsizeof(inverted_index)
print(object_size/1000000) #335MB

335.544416


In [25]:
#import numpy as np

#inverted_index_matrix = np.zeros((len(inverted_index), 8_841_823), dtype=bool)

In [26]:
#object_size = sys.getsizeof(inverted_index_matrix)
#print(object_size/1_000_000_000) #335MB