In [4]:
# loads the optimised model for creating vector embeddings
import pickle

# load the model
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this
# file must only ever come from a trusted source.
# The `with` block closes the file handle as soon as the model is loaded.
with open("./../../BERT/roberta_optimised", 'rb') as model_file:
    bert_optimised = pickle.load(model_file)


In [9]:
# Reads the text documents from the directory, encodes the content into vectors
import numpy as np
import os

# define directory
directory = os.path.abspath('''./../../Dataset/Final dataset/all_documents_text''')

# lists to store the content and the path of every file; the two lists are
# index-aligned (fileNames[k] is the path of the text in fileContents[k])
fileContents = []
fileNames = []

# iterate through all files; os.listdir returns entries in arbitrary order,
# so sort them to make the run reproducible
for file in sorted(os.listdir(directory)):
    file_path = os.path.join(directory, file)

    # skip sub-directories, open() would fail on them
    if not os.path.isfile(file_path):
        continue

    # read in the contents of the file; `with` closes it even if read() fails
    with open(file_path, "r", encoding="utf8") as f:
        fileContent = f.read()

    # append path and content together, only after a successful read,
    # so the two lists can never get out of step
    fileNames.append(file_path)
    fileContents.append(fileContent)

# check the number of files that have been read from the directory
print("Number of files read: ", len(fileNames))


Number of files read:  3027


In [14]:
# Connects to the database and inserts all the new documents
# into the database so that they can be searched through the database.
# This section is only run when new documents are to be added to the database.
# Currently, there is no option of adding files from the front-end. We add them
# manually from the backend. In the future, there can be an admin option that
# calls this section to insert the new documents into the database.

import django
import os.path

# set the environment variables needed for access
# setdefault only applies 'rest.settings' when DJANGO_SETTINGS_MODULE is not
# already present in the environment
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'rest.settings')
# Django refuses synchronous ORM calls when it detects a running event loop,
# which is the case inside a Jupyter kernel; this flag turns that check off
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
# must run before any model is imported (the model imports follow this call)
django.setup()

from django.db.models import Max 
from docs.models import doc_information, doc_embedding

# Finds the corresponding extension for the text file
# specified after looking it up in the all_documents folder
# which contains all the original documents
def findExtension(name, doc_dir="./../../Dataset/Final dataset/all_documents"):
    """Return the extension of the original document that matches a text file.

    @name: name of the text file, without its extension
    @doc_dir: folder holding the original documents; defaults to the
              all_documents folder of the dataset
    returns the extension (including the leading dot) of the first entry in
    doc_dir whose base name equals name, or an empty string if none matches.
    An entry that has no extension also yields an empty string.
    Raises whatever os.listdir raises when doc_dir cannot be listed.
    """
    for doc in os.listdir(doc_dir):
        # split once and unpack, instead of calling splitext twice per file
        doc_name, ext_name = os.path.splitext(doc)
        if doc_name == name:
            return ext_name
    return ""

# base URL that populateDatabase prepends to "<document name><extension>"
# to build the filepath stored for each document
# NOTE(review): points at a local development server - confirm before using elsewhere
new_dir = "http://127.0.0.1:8080/"


# populates the database using the file content and information
# received in the previous section. It checks if the file already
# exists in the database and records the file if it doesn't already exist
# new_dir is the address of the server where the documents are
# hosted and can be accessed
def populateDatabase(cutsize=1000):
    """Store every not-yet-recorded document together with its embeddings.

    Reads the module-level fileNames / fileContents lists (index-aligned path
    and text of each file), new_dir, bert_optimised and the doc_information /
    doc_embedding models.

    For each file whose base name (without extension) is not yet present as a
    document_name, a doc_information row is created with a 300-character
    preview, and the stripped text is cut into consecutive sections of
    `cutsize` characters. Each section is encoded with bert_optimised and the
    raw bytes of the resulting vector are saved as one doc_embedding row.

    @cutsize: number of characters per embedded section (default 1000)
    returns nothing; progress is reported with print()
    Raises ValueError when fileNames and fileContents differ in length.
    """
    # imported here so the function carries its own dependency; django itself
    # is already imported and set up by the time this function is called
    from django.db import transaction

    # a length mismatch means the notebook state is stale (e.g. one of the
    # lists was reassigned by another cell); fail loudly instead of half-way
    if len(fileNames) != len(fileContents):
        raise ValueError("fileNames and fileContents have different lengths; "
                         "re-run the cell that reads the documents")

    # names of all documents currently in the table, collected once so each
    # existence check is a set lookup instead of a scan over every row
    existing_names = {doc.document_name for doc in doc_information.objects.all()}

    # input the docs into the database
    entryCount = 0
    totalFound = 0
    for path_of_file, raw_content in zip(fileNames, fileContents):
        # extract the name of the file from the path
        name_of_file = os.path.splitext(os.path.basename(path_of_file))[0]

        # the document already exists: count it and move on
        if name_of_file in existing_names:
            totalFound += 1
            print("Already Found: ", totalFound)
            continue

        # find the content and create the preview
        file_content = raw_content.strip()
        file_preview = file_content[:300] + "..."

        # find the path
        new_path = new_dir + name_of_file + findExtension(name_of_file)

        # divide into sections of cutsize characters; the final section holds
        # the remainder. A text whose length is an exact multiple of cutsize
        # gets no trailing empty section. An empty text still yields a single
        # (empty) section so that every document owns at least one embedding.
        sections = [file_content[start:start + cutsize]
                    for start in range(0, len(file_content), cutsize)] or [""]

        # embed every section before touching the database, so a failing
        # encode() leaves nothing behind for this document
        embeddings = [bert_optimised.encode(section).tobytes() for section in sections]

        # save the document and all of its embeddings as one unit: without
        # this, a crash part-way would leave a document with missing
        # embeddings that the next run skips as "already found"
        with transaction.atomic():
            new_doc_information = doc_information(document_name = name_of_file, filepath=new_path, preview = file_preview)
            new_doc_information.save()
            for embedding in embeddings:
                new_doc_embedding = doc_embedding(doc_id = new_doc_information, embeddings = embedding)
                new_doc_embedding.save()

        # remember the name so a second file with the same base name is not inserted twice
        existing_names.add(name_of_file)

        entryCount += 1
        print("Entered: ", entryCount)
        print("Number of embeddings: ", len(embeddings))

populateDatabase()

In [8]:
# gets the individual embedded vectors of every document from the database,
# stacks them into one np array per document and stores that array in
# embedding_dictionary (doc_id -> array of embeddings) for easy access
#
# fileNames is deliberately NOT reset in this cell: populateDatabase() indexes
# it, and clearing it here would break any later call of that function.

# get all the documents information and embeddings
all_doc_information = doc_information.objects.all()
all_doc_embedding = doc_embedding.objects.all()

# define empty dictionary to store information from database
# for easy access
embedding_dictionary = {}

# ids of documents that have no embedding rows at all
docs_without_embeddings = []

# for every document in database
for doc in all_doc_information:
    # get all the embeddings for that document and convert each one
    # from binary to an np array of type float32
    doc_vectors = [np.frombuffer(embedding.embeddings, dtype = "float32")
                   for embedding in all_doc_embedding.filter(doc_id=doc.doc_id)]

    # a document without embeddings would become an empty 1-D array, which
    # the distance computation in the search cells cannot handle; leave it out
    if not doc_vectors:
        docs_without_embeddings.append(doc.doc_id)
        continue

    # convert the whole embeddings list into an np array
    # which is needed for comparison
    embedding_dictionary[doc.doc_id] = np.array(doc_vectors)

if docs_without_embeddings:
    print("Documents skipped because they have no embeddings: ", docs_without_embeddings)
    

In [None]:
# This section is used for testing on the backend
# it will not be executed during running the semantic search
# rather, it emulates the functionality of the searching queries many different times
# with different thresholds to find the optimal threshold for actual search

import scipy
import time
import numpy as np

# filters the distances in a given array according to a threshold
def filterDistances(arr, t):
    """Keep only the distances that lie strictly below a threshold.

    @arr: iterable of distances to filter
    @t: threshold; an element is kept only if it is strictly smaller than t
    returns a new list with the kept elements, or arr itself (untouched)
    when no element is below the threshold
    """
    below_threshold = [distance for distance in arr if distance < t]
    return below_threshold if below_threshold else arr

# `import scipy` on its own does not guarantee that the scipy.spatial
# sub-package is loaded, so import the function that is needed explicitly
from scipy.spatial.distance import cdist

# define query
queries= ["Machine learning developments in the industry",
         "What are some of the cybersecurity risks faced by the company?",
         "How likely are we to go into financial crisis?",
         "Does technology improve company performance?",
         "How to implement robotic process automation?",
         "How to utilise machine learning?",
         "What are the major risks of the company?",
         "What was the financial performance?",
         "What are the accounting practices of the company?",
         "What are the new trends in artificial intelligence?",
         "What is our business strategy?"]

# Number of closest documents (by average cosine distance) kept for each query
number_top_matches = 10 #@param {type: "number"}

# candidate thresholds: 0.00, 0.01, ..., 1.00
thresholds = np.linspace(0,1,101)

# Neither the query embedding nor the query-to-section distances depend on the
# threshold, so compute them once up front instead of once per threshold.
# (assumes bert_optimised.encode returns the same vector for the same text)
# distances_per_query[q][d] holds the distances of query q to EVERY EMBEDDING
# of document d, in the iteration order of embedding_dictionary
distances_per_query = []
for query in queries:
    query_embedding = bert_optimised.encode(query)
    distances_per_query.append([
        cdist([query_embedding], doc_embeddings, "cosine")[0]
        for doc_embeddings in embedding_dictionary.values()
        if len(doc_embeddings) > 0  # cdist cannot handle a document without embeddings
    ])

averages_using_different_thresholds = []
for t in thresholds:
    average_using_same_threshold_different_queries = []
    for per_document_distances in distances_per_query:
        # average distance of EVERY DOCUMENT from the query, using only the
        # distances under the threshold (or all of them if none is under it)
        results = []
        for distances in per_document_distances:
            kept_distances = filterDistances(distances, t)
            results.append(sum(kept_distances)/len(kept_distances))

        # sorted results of all documents for that query, best matches only
        results = sorted(results)[:number_top_matches]

        # the average result of the top SEARCH RESULTS using
        # that threshold for that query
        result_average = sum(results)/len(results)
        average_using_same_threshold_different_queries.append(result_average)

    average_of_all_queries = sum(average_using_same_threshold_different_queries)/len(average_using_same_threshold_different_queries)
    averages_using_different_thresholds.append(average_of_all_queries)

# the threshold with the lowest average distance over all queries
min_threshold = thresholds[np.argmin(averages_using_different_thresholds)]
print("Threshold: ", min_threshold)


In [13]:
import matplotlib.pyplot as plt
import numpy as np


# thresholds go on the x axis, the matching averages on the y axis
x, y = thresholds, averages_using_different_thresholds

# draw on the current axes (created on demand) through the explicit Axes API
ax = plt.gca()
ax.plot(x, y)

# label both axes and give the graph a title
ax.set_xlabel('thresholds')
ax.set_ylabel('averages of top 10 results')
ax.set_title('Average-Threshold graph')

# render the figure
plt.show()

In [12]:
import scipy



# define query
query = 'assessment of performance of Vodafone'
query = query.strip()
# cosine-distance cut-off that run_query reads as a module-level global;
# this fixed value replaces whatever the threshold-testing cell assigned
# to min_threshold
min_threshold = 0.51


# filters the distances in a given array according to a threshold
def filterDistances(arr, t):
    """Select the distances that are strictly smaller than a threshold.

    @arr: iterable of distances to filter
    @t: threshold used for the comparison
    returns a list of all elements below t; if there are none,
    the original arr is handed back unchanged
    """
    kept = list(filter(lambda value: value < t, arr))
    if kept:
        return kept
    return arr

def run_query(query, threshold=None, number_top_matches=50):
    """Rank the stored documents against a query and return the best matches.

    For every document in embedding_dictionary the cosine distance between the
    query embedding and each of the document's embeddings is computed. The
    distances are reduced with filterDistances and averaged, and documents are
    ordered by that average (smallest first).

    @query: the search text
    @threshold: distance cut-off passed to filterDistances; when None the
                module-level min_threshold is used
    @number_top_matches: how many documents to return (50 for the test dataset)
    returns a list whose first element is the query itself, followed by the
    document_name of each of the best-matching documents, best match first
    """
    # `import scipy` on its own does not guarantee that scipy.spatial is
    # loaded, so import the function that is needed explicitly
    from scipy.spatial.distance import cdist

    if threshold is None:
        threshold = min_threshold

    query_embedding = bert_optimised.encode(query)

    results = []
    for doc_id, doc_embeddings in embedding_dictionary.items():
        # a document without embeddings cannot be scored (cdist would fail)
        if len(doc_embeddings) == 0:
            continue

        # distance of query from every embedding of this document
        distances = cdist([query_embedding], doc_embeddings, "cosine")[0]
        distances = filterDistances(distances, threshold)

        average_distance = sum(distances)/len(distances)
        results.append((doc_id, average_distance))

    # sort by average distance, closest documents first
    results.sort(key=lambda scored_doc: scored_doc[1])

    # the query leads the list so that each result can be traced back to it
    response_docs = [query]
    for doc_id, _ in results[:number_top_matches]:
        doc = doc_information.objects.get(doc_id=doc_id)
        response_docs.append(doc.document_name)

    return response_docs

# run the sample query defined at the top of this cell; the call is the last
# expression of the cell, so the returned list is shown as the cell output
run_query(query)

In [10]:
import pandas as pd
import xlsxwriter
# reads the queries from the excel file; index_col=0 turns the first column
# into the index, and the index values are what gets passed to run_query
directory_excel = os.path.abspath('''./../confusion_matrix/data2.xlsx''')
queries = pd.read_excel(directory_excel, index_col=0)

# runs the queries, collecting each result list in all_results
# Only the index is needed, so iterate over it directly instead of building a
# Series per row with iterrows(); the comprehension also keeps its loop
# variable out of the notebook namespace, so `query` is not overwritten
all_results = [run_query(query_text) for query_text in queries.index]

D:\nazim\STUDIES\HKU\FYP\confusion_matrix\data2.xlsx


In [11]:
# Saves the query results as a spreadsheet. Every entry of all_results is one
# list (the query followed by its matches); after transposing, each query
# occupies one column with the query text in the first row.
directory_excel = os.path.abspath("./../confusion_matrix/query_results.xlsx")
df = pd.DataFrame(all_results).T
df.to_excel(directory_excel)