In [None]:
import pandas as pd
import BRetrieval as bre
import time

# SOME PERFORMANCE TESTS
# 1. Which phase of building the IR system takes the most time?
# 2. Which kind of queries takes the most time?
# 3. Do I improve efficiency with Btree in a meaningful way?


# Build the retrieval system under test.
# The news-summary CSV is read with the 'latin' codec and the system is
# constructed over its 'text' column.
input_column = 'text'
data = pd.read_csv('dataset/news_summary.csv', encoding='latin')

# NOTE(review): the meaning of the third argument (1000) is defined by
# BRetrieval.BRetrievalSystem and is not visible here — confirm.
brs = bre.BRetrievalSystem(data, input_column, 1000)

In [None]:
# 1
# Time of each build phase: preprocessing, positional inverted index, btree.
# The builder methods are called again here (re-assigning the attributes they
# produce) so that every phase can be timed on its own. Each phase is run
# 5 times and the total time of the 5 runs is printed.
# time.perf_counter() is used for the intervals: unlike time.time() it is
# monotonic and uses the highest-resolution clock available.
start = time.perf_counter()
for _run in range(5):
    brs.data = brs.preprocessing()
end = time.perf_counter()
print("PREPROCESSING TIME ELAPSED: " + str(end - start))
print('')

start = time.perf_counter()
for _run in range(5):
    brs.inv_index_pos = brs.inv_index_pos_builder()
end = time.perf_counter()
print("BUILDING POSITIONAL INVERTED INDEX TIME ELAPSED: " + str(end - start))
print('')

start = time.perf_counter()
for _run in range(5):
    brs.btree = brs.btree_builder()
end = time.perf_counter()
print("BUILDING BTREE TIME ELAPSED: " + str(end - start))
print('')


# Result observed by the author: building the positional inverted index takes the longest.


In [None]:
# 2

# The term 'zulpikar' is known to be among the last terms of this index,
# which is why it is used here (to increase complexity).

# Assemble one big query: 50 copies of the term prefixed with '^' (NOT),
# chained with ' | ' (OR, chosen because it has higher complexity than AND),
# followed by one final plain copy of the term.
piece = 'zulpikar'
negated_terms = ['^' + piece] * 50
query = ' | '.join(negated_terms + [piece])

# Answer the query 5 times (same repetition count as the build phases) and
# time the whole batch. time.perf_counter() is monotonic and high-resolution,
# which makes it the appropriate clock for measuring an interval.
# `start` and `end` are left in the namespace for the reporting cell below.
start = time.perf_counter()
for _run in range(5):
    brs.print_result(query)
end = time.perf_counter()
# The result should be all the documents in the dataset.
# Of course the program will print a lot (the calls that print are inside the timed region).


In [None]:
# Report the time measured for the 5 repetitions of the plain-term query.
elapsed_text = str(end - start)
print("QUERY TIME ELAPSED IN ANSWERING QUERY (normal terms): " + elapsed_text)
print()

# The query was repeated 5 times, like the other phases.
# Building the inverted index still takes longer than answering the query.

In [None]:
# What happens with wildcards?
# Same big query as before, but the term is wrapped in '*' wildcards:
# 50 copies prefixed with '^' (NOT), chained with ' | ' (OR, chosen because it
# has higher complexity than AND), followed by one final plain copy.
piece = '*zulpikar*'
negated_terms = ['^' + piece] * 50
query = ' | '.join(negated_terms + [piece])

# Answer the wildcard query 5 times and time the whole batch.
# time.perf_counter() is monotonic and high-resolution, which makes it the
# appropriate clock for measuring an interval.
# `start` and `end` are left in the namespace for the reporting cell below.
start = time.perf_counter()
for _run in range(5):
    brs.print_result(query)
end = time.perf_counter()
# The result should be all the documents in the dataset.
# (Of course the program will print a lot; the calls that print are inside the timed region.)


In [None]:
# Report the time measured for the 5 repetitions of the wildcard query.
elapsed_text = str(end - start)
print("QUERY TIME ELAPSED IN ANSWERING QUERY (wildcards): " + elapsed_text)
print()

In [None]:
# 3
# TIME OF: BTREE SEARCH VS LINEAR SEARCH (the lookup is repeated 50 times)

# Empty frame handed to search_key on every call.
# NOTE(review): presumably the frame the search uses for its result — confirm against BRetrieval.
new_dataset = pd.DataFrame(columns=['term','docId','rotations'])


# time.perf_counter() is monotonic and high-resolution: the right clock for short intervals.
start = time.perf_counter()
for _run in range(50):
    term = brs.btree.search_key('zulpikar', new_dataset) #btree search (log complexity)
end = time.perf_counter()

print('Term found?')
# The search result is bound to `term`; check that the entry it returned is the requested one.
print(term['term'] == 'zulpikar')

timebtree = end - start
print("QUERY TIME ELAPSED IN BTREE SEARCH: " + str(timebtree))
print('')

In [None]:
# Linear scan over the 'term' column of the positional inverted index,
# repeated 50 times so that it matches the 50 btree lookups.
column = brs.inv_index_pos['term']
i = 0

# time.perf_counter() is monotonic and high-resolution: the right clock for short intervals.
start = time.perf_counter()
for _run in range(50):
    # Restart from the first term on every repetition. The repetition counter
    # must be a separate variable: sharing `i` between both loops would make
    # repetition k begin its scan at index k instead of 0.
    i = 0
    while (i < len(column) and column[i]!='zulpikar'): #linear search into the inverted index (linear complexity)
        i+=1
end = time.perf_counter()

print('Term found?')
# i == len(column) means the scan reached the end without a match; guard the
# lookup so that a miss prints False instead of raising on an out-of-range index.
print(i < len(column) and column[i] == 'zulpikar')

timeindex = end - start
print("QUERY TIME ELAPSED IN INVERTED INDEX SEARCH: " + str(timeindex))
print('')

In [None]:
#Conclusions:

#1 Building of the Inverted Index spends the most time, but of course preprocessing phase "takes its time"
#2 Time for answering wildcard queries increases a lot
#3 Searching linearly in the inverted index takes considerably longer. Using the Btree increases the efficiency a lot

#Possible (future) improvements? 
#Of course keep the Btree to improve efficiency, choosing the right t (n = 2t-1)
#Try to find less expensive ways to build the inverted index and reduce preprocessing phase if it takes too much time. 