In [14]:
!pip install scikit-learn

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/33/cb/0e41fad5b30fd66925e47952ddc720d078bdfd8397584a4873ec1cf590ca/scikit_learn-1.3.0-cp38-cp38-macosx_10_9_x86_64.whl.metadata
  Downloading scikit_learn-1.3.0-cp38-cp38-macosx_10_9_x86_64.whl.metadata (11 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Obtaining dependency information for threadpoolctl>=2.0.0 from https://files.pythonhosted.org/packages/81/12/fd4dea011af9d69e1cad05c75f3f7202cdcbeac9b712eea58ca779a72865/threadpoolctl-3.2.0-py3-none-any.whl.metadata
  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Downloading scikit_learn-1.3.0-cp38-cp38-macosx_10_9_x86_64.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Installing collected packages: threadpoolctl, scikit-le

In [17]:
import os
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import shutil

In [18]:
def print_view(TYPE, query_embedding, chunk_embeddings):
    print(f"Inspecting {TYPE} METHOD...")
    
    if TYPE == "MEAN_MAX":
        print("Finding the most simlar words in the chunk for each query word...\n")

        sim = cosine_similarity(query_embedding, chunk_embeddings)

        print("Position\tQuery\t\t\tChunk\t\tsim_score")
        for i, s, qt in zip(np.argmax(sim, axis=1), np.max(sim, axis=1), query_tokens):
            if len(chunk_tokens[i]) < 7 and len(qt) < 8:
                print(f"     {i}) \t{qt}\t\t-->\t{chunk_tokens[i]} \t\t{s}")
            elif len(chunk_tokens[i]) < 3:
                print(f"     {i}) \t{qt}\t-->\t{chunk_tokens[i]} \t\t{s}")
            elif len(chunk_tokens[i]) >= 7 and len(qt) < 8:
                print(f"     {i}) \t{qt}\t\t-->\t{chunk_tokens[i]} \t{s}")
            else:
                print(f"     {i}) \t{qt}\t-->\t{chunk_tokens[i]} \t{s}")

        print(f"\nnp.mean(np.max(sim, axis=1))\tsimilarity score between query and {filename} is {np.mean(np.max(sim, axis=1))}")
        print(f"\nnp.mean(sim)\t\t\tsimilarity score between query and {filename} is {np.mean(sim)}")
    
    elif TYPE == "COMBINE_MEAN":
        similarity = cosine_similarity(np.mean(query_embedding, axis=0).reshape(1, -1),
                                       np.mean(chunk_embeddings, axis=0).reshape(1, -1))
        similarity = np.mean(similarity) # Get the single value out of the array
        
        print(f"\nThe average query embedding and average {filename} embedding is {similarity}")
    
    elif TYPE == "MEAN_MEAN":
        sim = cosine_similarity(query_embedding, chunk_embeddings)
        print(f"\nnp.mean(sim) similarity score between query and {filename} is {np.mean(sim)}")
        
    else:
        print(f"TYPE {TYPE} not found")

#### Find the candidate documents

##### Get the data from the query JSON file
- Either specify the filename or the code will get the most recently written file
- specify tokens and embeddings 
    - "tokenized_query" | "tokenized_query_search" | "tokenized_query_search_less_sw"
    - "query_embedding" | "query_embedding_search" | "query_embedding_search_less_sw"

In [11]:
# Specify the directory path containing query JSON files
query_dir = os.path.join("..", 'query')
query_fname = ''

# Specify the tokens and embeddings
# TOKENS_EMBEDDINGS = "query"
# TOKENS_EMBEDDINGS = "query_search"
TOKENS_EMBEDDINGS = "query_search_less_sw"

if TOKENS_EMBEDDINGS == "query":
    TOKENS = "tokenized_query"
    EMBEDDINGS = "query_embedding"
    
elif TOKENS_EMBEDDINGS == "query_search":
    TOKENS = "tokenized_query_search"
    EMBEDDINGS = "query_embedding_search"
    
elif TOKENS_EMBEDDINGS == "query_search_less_sw":
    TOKENS = "tokenized_query_search_less_sw"
    EMBEDDINGS = "query_embedding_search_less_sw"

In [12]:
# Get the latest query JSON file if the file name is not specified
if query_fname == '': 
    # Get a list of all files in the directory
    query_files = os.listdir(query_dir)

    # Filter out directories and get only files
    query_files = [file for file in query_files if os.path.isfile(os.path.join(query_dir, file))]

    # Sort the files by modification time (latest first)
    query_files.sort(key=lambda x: os.path.getmtime(os.path.join(query_dir, x)), reverse=True)

    # Get the latest file
    query_fname = os.path.join(query_dir, query_files[0])

# Open and read the JSON file
with open(query_fname, 'r') as json_file:
    query_data = json.load(json_file)

# Now 'data' contains the content of the JSON file as a Python dictionary or list
query_embedding = np.array(query_data[EMBEDDINGS])
query_tokens = np.array(query_data[TOKENS])

# remove the paddings from the query
query_embedding = np.array([emb for emb, token in zip(query_embedding, query_tokens) if token != '[PAD]'])
#print(query_data['query_embedding'])

##### Create the ouput directory for the candidate doc to be saved to for this query
- If exist, delete

In [13]:
# Instatiate output path
query_file_basename = os.path.basename(query_fname)
query_file_basename = os.path.splitext(query_file_basename)[0]
candidate_docs_fpath = os.path.join("..", 'candidate_docs', query_file_basename)

# Check if the directory exists
if not os.path.exists(candidate_docs_fpath):
    # Create the directory
    os.makedirs(candidate_docs_fpath)
    print(f"Directory '{candidate_docs_fpath}' created.")
else:
    print(f"Directory '{candidate_docs_fpath}' already exists.")
    # Remove all files and subdirectories within the directory
    for item in os.listdir(candidate_docs_fpath):
        item_path = os.path.join(candidate_docs_fpath, item)
        if os.path.isfile(item_path):
            os.remove(item_path)
        elif os.path.isdir(item_path):
            shutil.rmtree(item_path)
    print(f"Contents of directory '{candidate_docs_fpath}' deleted.")

Directory '../candidate_docs/71f39bd6e1740e98a1137e7410d0bef56c40510fd934b7153c5e48428ccb0993' already exists.
Contents of directory '../candidate_docs/71f39bd6e1740e98a1137e7410d0bef56c40510fd934b7153c5e48428ccb0993' deleted.


##### Get the candidate documents 
- Specify number of candidates to get (top_N)
- Specify directory containing the documents to inspect (parsed_docs_dir)
- Specify similarity score method
    - 'MEAN_MEAN': Get the similarity score for each word in the query for a cadidate document by taking the 'average' value of all cosine similarity scores between the word embedding and every word in the candidate docuemnt.  The overall similarity score between the query and each candidate document is the average of the 'average' similarity scores for each word in the query. 
    - 'MEAN_MAX': Get simlarity score for each word in the query for a candidate document by taking the 'maximum' value of all cosine similarity scores between the word embedding and every word in the candidate docuemnt.  The overall similarity score between the query and each candidate document is the average of the 'maximum' similarity scores for each word in the query.
    - 'COMBINE_MEAN': The overall simlarity between a query and a candidate document is the cosime simiilarity beteen the mean of the word embedding of the combined query and the mean word embedding of the document

In [30]:
top_N = 10

# Specify the directory path containing JSON files
parsed_docs_dir = os.path.join("..", "data", 'parsed_cleaned_pdfs')

# METHOD = 'MEAN_MEAN'
# METHOD = 'MEAN_MAX'
METHOD = 'COMBINE_MEAN'

In [31]:
# List to store cosine similarity scores and corresponding document filenames
similarity_scores = []

# Iterate through JSON files in the directory
print(f"Similarity score for method {METHOD} and query {TOKENS_EMBEDDINGS}...")
for filename in os.listdir(parsed_docs_dir):
    if filename.endswith('.json'):
        file_path = os.path.join(parsed_docs_dir, filename)
        with open(file_path, 'r') as json_file:
            data = json.load(json_file)
            chunk_embeddings = np.array(data['token_embeddings'])
            chunk_tokens = np.array(data['tokens'])
            
            # remove the paddingsand unknown tokens from the query
            chunk_embeddings = np.array([emb for emb, token in zip(chunk_embeddings, chunk_tokens) if token not in ['[PAD]', '[UNK]']])
            
            # Calculate cosine similarity between query_embedding and chunk_embeddings METHOD = 'MEAN_MAX'
            if METHOD == 'MEAN_MAX':
                similarity = cosine_similarity(query_embedding, chunk_embeddings)
                similarity = np.mean(np.max(similarity, axis=1))
                
            if METHOD == 'MEAN_MEAN':
                similarity = cosine_similarity(query_embedding, chunk_embeddings)
                similarity = np.mean(similarity)
            
            if METHOD == 'COMBINE_MEAN':
                similarity = cosine_similarity(np.mean(query_embedding, axis=0).reshape(1, -1),
                                               np.mean(chunk_embeddings, axis=0).reshape(1, -1))
                similarity = np.mean(similarity) # Get the single value out of the array
                
            # Store similarity score and filename
            similarity_scores.append((similarity, filename))

# Sort the similarity_scores in descending order based on the similarity score
similarity_scores.sort(reverse=True)

# Get the top 10 candidate documents
top_N_candidates = similarity_scores[:top_N]

# Print the top 10 candidate documents and their similarity scores and copy the files to the candidate docs filepath
for similarity, filename in top_N_candidates:
    print(f"Filename: {filename}, Similarity Score: {similarity}")
    shutil.copy(os.path.join(parsed_docs_dir, filename), candidate_docs_fpath)

Similarity score for method COMBINE_MEAN and query query_search_less_sw...
Filename: 60.json, Similarity Score: 0.9930290757843465
Filename: 5.json, Similarity Score: 0.9926981103769587
Filename: 87.json, Similarity Score: 0.9924822151449495
Filename: 67.json, Similarity Score: 0.9917651729497864
Filename: 22.json, Similarity Score: 0.9901810321509855
Filename: 31.json, Similarity Score: 0.9899576060217182
Filename: 1.json, Similarity Score: 0.98959222350876
Filename: 104.json, Similarity Score: 0.9895214527432853
Filename: 118.json, Similarity Score: 0.9889303445753911
Filename: 25.json, Similarity Score: 0.988912538215978


##### Inspect the similarity scores for individual documents
- Compare similarity scores of top_N documnets with expected answer
- Specify JSON file that was the expected most similar (filename)

In [22]:
print(query_tokens)
query_embedding[1]

['contact' 'manifold']


array([ 0.08611306,  0.06981767, -0.07748413, -0.21296933,  0.10189874,
       -0.17047402, -0.0307296 ,  0.14866479, -0.05853148, -0.06200052,
        0.01911396, -0.16916719, -0.25312904, -0.06233565,  0.04801829,
        0.07684015, -0.20716001, -0.10781612,  0.04202482, -0.40216216,
       -0.05065397,  0.05891727,  0.28971151, -0.01422235, -0.21400681,
       -0.09241208,  0.05955841, -0.17314345,  0.10572264, -0.00863505,
        0.17209627,  0.00451031,  0.2816233 ,  0.13853519, -0.01751108,
       -0.03531506, -0.04603184,  0.0927382 ,  0.12588786,  0.02446925,
        0.18023463, -0.29273313,  0.16953057,  0.10797884,  0.08130017,
        0.04149495,  0.00642308, -0.150754  , -0.01899703,  0.14096196,
       -0.0109875 ,  0.26667914,  0.12112354,  0.0389389 , -0.06663281,
        0.14744867,  0.17541929, -0.07368433,  0.11198942, -0.05517326,
        0.13588025,  0.17368898, -0.23118338,  0.06512529,  0.03348943,
        0.02768652,  0.14706722,  0.23580159, -0.12327185,  0.23

In [27]:

chunk_tokens[11]

'contact'

In [25]:
# Get data for all methods

filename = "2.json"
file_path = os.path.join(parsed_docs_dir, filename)
with open(file_path, 'r') as json_file:
    data = json.load(json_file)
    chunk_embeddings = np.array(data['token_embeddings'])
    chunk_tokens = np.array(data['tokens'])
    
    # remove the paddings from the query
    chunk_embeddings = np.array([emb for emb, token in zip(chunk_embeddings, chunk_tokens) if token not in ['[PAD]', '[UNK]']])

In [26]:
print_view(METHOD, query_embedding, chunk_embeddings)

Inspecting MEAN_MAX METHOD...
Finding the most simlar words in the chunk for each query word...

Position	Query			Chunk		sim_score
     11) 	contact		-->	contact 	0.9999999999999999
     12) 	manifold	-->	manifold 	1.0000000000000002

np.mean(np.max(sim, axis=1))	similarity score between query and 2.json is 1.0

np.mean(sim)			similarity score between query and 2.json is 0.8726485485710108


#### Below code if for testing

In [98]:
from gensim.models import Word2Vec
# Load the trained Word2Vec model
model = Word2Vec.load("word2vec_model.bin")

# Access the embedding of a word
embedding = model.wv[chunk_tokens[2]]
print(embedding)

KeyError: "Key '##var' not present"

In [27]:
# Prooving that element 2 of cosine_similarity output corresponds to cosine_similarity of token 1 in query and token 2 in chunk
num = 0
den_a = 0
den_b = 0 
for a, b in zip(query_embedding[0].reshape(-1, 1), chunk_embeddings[71].reshape(-1, 1)):
    num += a*b
    den_a += a**2
    den_b += b**2
    
den = np.sqrt(den_a) * np.sqrt(den_b)
print(num[0], den[0])
print(num[0]/den)

4.87479382671848 4.88086295121961
[0.99875655]


In [28]:
for a, b in zip(query_embedding[0].reshape(-1, 1), chunk_embeddings[71].reshape(-1, 1)):
    print(a, b)

[0.00979525] [0.02950395]
[0.1801499] [0.23714575]
[-0.11282597] [-0.14750037]
[-0.16389379] [-0.20449817]
[0.12019866] [0.16305223]
[-0.19132757] [-0.24321704]
[0.03743042] [0.04248741]
[0.36185887] [0.45494699]
[-0.18961333] [-0.2351187]
[-0.08775483] [-0.11479293]
[-0.03145975] [-0.03520138]
[-0.43530247] [-0.50552255]
[-0.28460452] [-0.34083328]
[-0.01264222] [-0.02899849]
[0.11428754] [0.14555983]
[-0.02115079] [-0.01415997]
[-0.10840299] [-0.12329137]
[-0.17891493] [-0.20056075]
[0.11752865] [0.12509882]
[-0.79247826] [-0.95711064]
[0.0254809] [0.02256134]
[0.17190607] [0.19424245]
[0.37160259] [0.44927961]
[0.00258394] [0.00096512]
[-0.26391593] [-0.31885713]
[-0.13601364] [-0.15838972]
[-0.06832439] [-0.08720339]
[-0.30661389] [-0.36412066]
[0.06496375] [0.08291019]
[-0.00426452] [0.00857907]
[0.2630567] [0.31602088]
[0.06148953] [0.06348988]
[0.30864471] [0.36563432]
[-0.08815695] [-0.10643951]
[-0.03167176] [-0.05894051]
[0.18723206] [0.21207635]
[0.01943905] [0.01611895]
[0.