In [1]:
#pip install s3fs
#pip install faiss
#pip install pickle4
#https://towardsdatascience.com/how-to-build-a-semantic-search-engine-with-transformers-and-faiss-dcbea307a0e8

In [1]:
import pandas as pd
import s3fs
import numpy as np


In [2]:
import torch
from sentence_transformers import SentenceTransformer

In [3]:
import faiss
import numpy as np
import pickle

In [4]:
def vector_search(query, model, index, num_results=10):
    """Tranforms query to vector using a pretrained, sentence-level 
    DistilBERT model and finds similar vectors using FAISS.
    Args:
        query (str): User query that should be more than a sentence long.
        model (sentence_transformers.SentenceTransformer.SentenceTransformer)
        index (`numpy.ndarray`): FAISS index that needs to be deserialized.
        num_results (int): Number of results to return.
    Returns:
        D (:obj:`numpy.array` of `float`): Distance between results and query.
        I (:obj:`numpy.array` of `int`): Paper ID of the results.
    
    """
    vector = model.encode(list(query))
    D, I = index.search(np.array(vector).astype("float32"), k=num_results)
    return D, I

In [5]:

def id2details(df, I, column):
    """Returns the paper titles based on the paper index."""
    return [list(df[df.id == idx][column]) for idx in I[0]]


In [6]:
# Instantiate the sentence-level DistilBERT
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')


In [41]:
# This section reads from the files, encodes the content into vectors

import os
# directory = '''D:\\nazim\\STUDIES\\HKU\\FYP'''
directory = './text_files/txt'
# list to store content and names
fileContents = []
fileNames = [] 

arr =  []
count = 0 
for file in os.listdir(directory):
    if file.endswith(".txt"):
#         print("File: ", os.path.join(directory, file))
        fileNames.append(os.path.join(directory, file))

        # read in the contents of the file
        f = open(os.path.join(directory, file), "r", encoding="utf8")
        fileContent = f.read()
        if len(fileContent) > 10000:
            fileContent = fileContent[:10000]
        fileContents.append(fileContent)
        f.close()
        arr.append([count, file, fileContent])
        count+=1


# print( arr)
print("Number of files read: ", len(fileContents))

# # Each sentence is encoded as a 1-D vector with 768 columns
sentences = fileContents
sentence_embeddings = model.encode(sentences)


# print('Sample BERT embedding vector - length', len(sentence_embeddings[0]))

# print('Sample BERT embedding vector - note includes negative values', sentence_embeddings[0])

Number of files read:  1004


In [42]:
df = pd.DataFrame(arr, columns = ['id','Name', 'Content']) 
print (df.head())

   id                  Name                                            Content
0   0   NASDAQ_WDC_1996.txt                                    WD 1996 Annu...
1   1  NASDAQ_PANL_2017.txt                                           2017 ...
2   2  NASDAQ_ATVI_2018.txt                                        2018 Ann...
3   3     NYSE_VHC_2016.txt                                     TABLE OF CO...
4   4  NASDAQ_ADBE_2010.txt                                                ...


In [43]:
# Step 1: Change data type
embeddings = np.array([embedding for embedding in sentence_embeddings]).astype("float32")

# Step 2: Instantiate the index
index = faiss.IndexFlatL2(embeddings.shape[1])

# Step 3: Pass the index to IndexIDMap
index = faiss.IndexIDMap(index)

# Step 4: Add vectors and their IDs
index.add_with_ids(embeddings, df.id.values)

print(f"Number of vectors in the Faiss index: {index.ntotal}")

Number of vectors in the Faiss index: 1004


In [59]:
user_query = "Protel"

In [60]:
D, I = vector_search([user_query], model, index, num_results=10)
print(f'L2 distance: {D.flatten().tolist()}\n\nMAG paper IDs: {I.flatten().tolist()}')

L2 distance: [210.44383239746094, 210.6932373046875, 212.3014373779297, 224.2246551513672, 226.56423950195312, 226.76766967773438, 227.11349487304688, 228.408203125, 231.75613403320312, 233.5384521484375]

MAG paper IDs: [207, 500, 151, 217, 423, 867, 345, 836, 767, 637]


In [40]:
df.head()

Unnamed: 0,id,Name,Content
0,0,Business strategy.txt,The Company’s objective is to be the leading g...
1,1,Company dividend.txt,I am delighted to welcome you as\nshareholders...
2,2,tax.txt,"On October 22, 2004, the American Jobs Creatio..."


In [61]:
id2details(df, I, 'Name')

[['NASDAQ_VSAT_2002.txt'],
 ['NASDAQ_XLNX_2000.txt'],
 ['NASDAQ_VSAT_2001.txt'],
 ['NYSE_DOX_2002.txt'],
 ['NASDAQ_ANSS_2001.txt'],
 ['NYSE_AMT_2015.txt'],
 ['NYSE_AMD_1999.txt'],
 ['NYSE_AMT_2005.txt'],
 ['NYSE_USM_2013.txt'],
 ['NYSE_USM_1999.txt']]