In [21]:
import pandas as pd


In [22]:
# Labelled (expected) data: the "data" sheet of data.xlsx.
labelled_data = pd.read_excel(
    "data.xlsx",
    sheet_name="data",
)
# Query results produced by running the search; the first spreadsheet column
# is used as the row index.
query_results = pd.read_excel(
    "query_results_bert_OG_NOV.xlsx",
    index_col=0,
)

In [23]:
def removeExtension(files, extension=".txt"):
    """Strip a trailing file extension from every string entry of ``files``.

    Only a suffix is removed: ``"a.txt.bak.txt"`` becomes ``"a.txt.bak"``;
    occurrences of the extension elsewhere in the name are left alone.
    Non-string entries (numbers, NaN, None, ...) are left untouched.

    @files: mutable, integer-indexable sequence (e.g. a list) of file names.
            It is modified in place.
    @extension: suffix to remove, including the dot. Defaults to ".txt".

    Returns the same ``files`` object, so both ``removeExtension(x)`` and
    ``x = removeExtension(x)`` work.
    """
    # An empty extension would make the slice below wipe the whole name.
    if not extension:
        return files

    for i in range(len(files)):
        name = files[i]
        if isinstance(name, str) and name.endswith(extension):
            files[i] = name[:-len(extension)]
    return files

In [24]:
import os

# Absolute path of the folder that holds the text corpus.
directory = os.path.abspath('''./../test_dataset_text''')

# Name of every entry in that folder (os.listdir returns names only, in
# arbitrary order), passed through removeExtension so the ".txt" is dropped.
# NOTE(review): sub-directories and hidden files, if any exist in the folder,
# are listed as well - confirm the folder only contains the corpus documents.
fileNames = removeExtension([entry for entry in os.listdir(directory)])

In [25]:
def findDiff(arr1, arr2):
    """Return the elements of ``arr1`` that are not found in ``arr2``.

    @arr1: iterable with file names; its order is preserved in the result.
    @arr2: file names to exclude. May be a pandas Series/Index, a NumPy array,
           or any plain iterable such as a list or tuple.

    Returns a new list; neither input is modified.
    """
    # pandas objects expose their data through the ``.values`` attribute.
    # On a dict ``.values`` is a method, so callables are ignored here.
    values = getattr(arr2, "values", arr2)
    if callable(values):
        values = arr2

    # ``tolist()`` (NumPy) converts elements to plain Python objects; anything
    # else is simply materialised as a list.
    excluded = values.tolist() if hasattr(values, "tolist") else list(values)

    return [item for item in arr1 if item not in excluded]

In [1]:
def _percentage(numerator, denominator):
    """Return numerator/denominator as a percentage, or 0.0 if the denominator is 0.

    A zero denominator means the metric is undefined for that query; 0.0 is
    recorded so that one such query does not abort the whole evaluation.
    """
    if denominator == 0:
        return 0.0
    return numerator / denominator * 100


#stores accuracy of every query
accuracies = []
#stores precision of every query
precisions = []
#stores true positive rate of every query
tp_rates = []
#stores false positive rate of every query
fp_rates = []
#stores true negative rate of every query
tn_rates = []
#stores false negative rate of every query
fn_rates = []

#loop through the results of every query (one column of query_results per query)
for query, results in query_results.items():

    # A query without a labelled column has nothing to be scored against.
    # Skip it rather than record metrics computed from all-zero counts.
    if query not in labelled_data.columns:
        print("Skipping query without labelled data: ", query)
        continue

    print("========> Query: ", query)
    labelled_results_arr = labelled_data[query].values.tolist()

    # Per the original notebook comment, the last element of the labelled
    # column holds how many docs are expected for the query.
    # @topN is the larger of that count and 10; int() guards against the count
    # arriving as a float, which cannot be used as a slice bound.
    topN = int(max(labelled_results_arr[-1], 10))

    # first topN results, selected by position
    top_results = results.iloc[:topN]

    #remove extensions from filenames
    labelled_results_arr = removeExtension(labelled_results_arr)

    # Retrieved results: found in the labelled data -> true positive,
    # otherwise -> false positive.
    true_positive = sum(1 for result in top_results if result in labelled_results_arr)
    false_positive = len(top_results) - true_positive

    # Files which are not in the topN results: found in the labelled data ->
    # false negative, otherwise -> true negative.
    other_files = findDiff(fileNames, top_results)
    false_negative = sum(1 for file in other_files if file in labelled_results_arr)
    true_negative = len(other_files) - false_negative

    total = true_positive + false_positive + true_negative + false_negative

    print("True positives: ", true_positive)
    print("False positives: ", false_positive)
    print("True negative: ", true_negative)
    print("False negative: ", false_negative)
    print("Total: ", total)

    # Accuracy is the ratio of correctly predicted observations to the total observations.
    # Formula: (TP+TN)/(TP+FP+FN+TN)
    accuracy = _percentage(true_positive + true_negative, total)
    print("Accuracy: ", accuracy, "%")

    # Precision is the ratio of correctly predicted positive observations to
    # the total predicted positive observations.
    # Formula: TP/(TP+FP)
    precision = _percentage(true_positive, true_positive + false_positive)
    print("Precision: ", precision, "%")

    accuracies.append(accuracy)
    precisions.append(precision)

    #True Positive rate: TP/(TP+FN)
    tp_rates.append(_percentage(true_positive, true_positive + false_negative))

    #True Negative rate: TN/(TN+FP)
    tn_rates.append(_percentage(true_negative, true_negative + false_positive))

    #False Positive rate: FP/(FP+TN)
    fp_rates.append(_percentage(false_positive, false_positive + true_negative))

    #False Negative rate: FN/(FN+TP)
    fn_rates.append(_percentage(false_negative, false_negative + true_positive))

In [2]:
import statistics as st

# Mean of each per-query metric list, taken in the same order as the names on
# the left-hand side.
avg_accuracy, avg_precision, avg_tp, avg_tn, avg_fp, avg_fn = map(
    st.mean,
    (accuracies, precisions, tp_rates, tn_rates, fp_rates, fn_rates),
)

# Report every average as a percentage, one line per metric.
for _label, _value in (
    ("Average accuracy: ", avg_accuracy),
    ("Average precision: ", avg_precision),
    ("Average True positive rate: ", avg_tp),
    ("Average True negative rate: ", avg_tn),
    ("Average False positive rate: ", avg_fp),
    ("Average False negative rate: ", avg_fn),
):
    print(_label, _value, "%")
