# Embedding Classes

In [1]:
import os
import sys
import pandas as pd

# Remove pandas' display caps on column width, row count and column count
# (None means "no limit" for each of these options).
for display_option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [53]:
# Debugging aid: uncomment to display the current working directory
# os.getcwd()

In [2]:
# Main folder: base directory that the project sub-folder is joined onto later.
# Set the LLM_TOOLBOX_REPOS_DIR environment variable to point at your own
# checkout location instead of editing the notebook; the original path is
# kept as the default so existing setups keep working.
local_path = os.environ.get('LLM_TOOLBOX_REPOS_DIR', '/Users/paulaleonova/repos/')


### Set Up

In [3]:
# To import the embedding classes, their folder must first be added to
# Python's module search path (sys.path).
folder_name = "llm-toolbox/embeddings/utils"
utils_path = os.path.join(local_path, folder_name)

# Only append when missing, so re-running this cell does not pile up
# duplicate sys.path entries.
if utils_path not in sys.path:
    sys.path.append(utils_path)

In [5]:
# Collect the names of the files and folders inside the utilities directory
utils_dir = os.path.join(local_path, folder_name)
contents = os.listdir(utils_dir)

# Show what was found
print(contents)

['embeddingsUtils.py', 'hfTextEmbedder.py']


In [6]:
%%capture
!pip install transformers

In [7]:
%%capture
!pip install torch

In [9]:
# Project-local modules; presumably resolved through the utils folder that an
# earlier cell appends to sys.path.
# NOTE(review): the directory listing output earlier in this notebook shows
# 'embeddingsUtils.py' and 'hfTextEmbedder.py', which do not match the module
# names imported here — confirm the file names on disk and re-run the listing.
from hf_text_embedder import HuggingFaceTextEmbedder
from embeddings_helper import EmbeddingUtilities

Set the autoreload mode to automatically reload all modules:

In [10]:
# Load IPython's autoreload extension, then select mode 2
# (automatically reload all modules).
%load_ext autoreload
%autoreload 2

## Embeddings

### HuggingFaceTextEmbedder

#### Sentence Embeddings

In [11]:
# Instantiate the HuggingFaceTextEmbedder with the all-MiniLM-L6-v2
# sentence-transformers model
hf_embedder = HuggingFaceTextEmbedder(
    hf_model_id='sentence-transformers/all-MiniLM-L6-v2',
)

# Input texts that will be embedded and scored against the label texts
input_text_list = [
    "This is an example sentence.",
    "Another example for demonstration.",
    "Apples and bananas.",
    # ... add further input texts here
]

# Label (reference) texts that each input text is compared with
label_text_list = [
    "This is an example sentence slightly modified.",
    "This is an example sentence with minor edits but more than the one above.",
    "Another example for demonstration.",
    "I would like to make a smoothie.",
    "I like to eat strawberries.",
    # ... add further label texts here
]

# # Alternatively, here are some sample input and label embeddings dictionaries
# input_embeddings_dict = {
#     "doc1": [np.random.rand(300)],
#     "doc2": [np.random.rand(300)],
#     # ...
# }
# label_embeddings_dict = {
#     "label1": [np.random.rand(300)],
#     "label2": [np.random.rand(300)],
#     # ...
# }

In [12]:
# Embed both text lists with the same embedder; each call returns a dictionary
# keyed by the text
embed_texts = hf_embedder.generate_text_embeddings_dictionary
input_embeddings_dict = embed_texts(input_text_list)
label_embeddings_dict = embed_texts(label_text_list)

In [13]:
# Peek at the first 10 components of the embedding stored for one input text
input_embeddings_dict['Another example for demonstration.'][0][:10]

array([-0.01415277,  0.01300772,  0.0293526 ,  0.03415604,  0.00625622,
       -0.0603661 ,  0.01226946,  0.05472851,  0.0136147 ,  0.00260575],
      dtype=float32)

In [14]:
# Build a frame with one column per text, flip it to one row per text, then
# move the text keys out of the index into an 'index' column
temp_asset_embeddings_df = (
    pd.DataFrame.from_dict(input_embeddings_dict)
    .transpose()
    .reset_index()
)
temp_asset_embeddings_df.head()

Unnamed: 0,index,0
0,This is an example sentence.,"[0.098124586, 0.0678127, 0.06252319, 0.09508479, 0.03664758, -0.003984667, 0.0074776215, -0.013231551, 0.06288383, 0.02249555, 0.07269578, -0.031274315, 0.046355095, -0.012554482, 0.04781471, -0.004910291, 0.049419984, -0.06410924, -0.09696582, 0.03288875, 0.054104373, 0.035328604, 0.033050552, 0.014699387, -0.033430703, -0.02561576, -0.05079216, 0.07325452, 0.11027398, -0.029661827, -0.06755711, -0.030571532, 0.039560232, 0.045476023, 0.015996117, 0.038550448, -0.010954033, 0.08483571, -0.044287052, -0.0067964722, 0.009425666, 5.0782135e-05, 0.00130356, -0.011969791, 0.013645197, -0.08417428, -0.00016514162, 0.00548381, 0.025615087, -0.03154522, -0.10734476, -0.045787808, -0.091175, -0.0025105136, 0.017998427, 0.049401615, 0.006184894, 0.05979631, 0.027002633, -0.016122261, -0.018149786, -0.023634886, -0.09489714, 0.06621632, 0.14922751, 0.02433875, 0.0012101572, 0.006072045, -0.09917042, 0.085058175, 0.030826125, 0.02918668, -0.023955978, -0.010559309, -0.07215206, -0.03525638, 0.033346567, -0.043929502, 0.11975523, 0.0886474, -0.09591898, -0.05908002, -0.008292761, 0.038169134, 0.049799677, -0.0326821, 0.019900993, -0.10837854, 0.014679056, 0.018117832, -0.06131783, -0.08975036, 0.04922166, -0.020870993, 0.00405509, -0.0354302, -0.05657778, -0.048313808, -0.024660246, 0.07359178, ...]"
1,Another example for demonstration.,"[-0.014152773, 0.013007717, 0.0293526, 0.03415604, 0.0062562204, -0.0603661, 0.012269463, 0.05472851, 0.013614697, 0.0026057486, 0.11796647, -0.027089372, 0.07698586, 0.06198628, 0.029565705, -0.056617614, 0.03782706, 0.041235413, -0.060186274, 0.023291046, 0.017145708, -0.063070126, 0.0700303, 0.04362374, -0.04302809, -0.014636709, -0.006448502, 0.086068444, 0.11518197, -0.03237694, -0.0134964045, -0.0045142206, -0.025575861, 0.0018403247, 0.02044835, 0.062755495, 0.042582184, 0.07908137, -0.010691491, -0.0054031797, 0.010071245, -0.082168885, 0.019714111, -0.024979178, 0.004405457, -0.009945068, 0.07508384, 0.06369879, 0.026956625, -0.08783795, -0.046781644, -0.059882328, -0.054540582, -0.08834467, -0.012291625, -0.048845243, 0.011480626, 0.04714837, -0.0018438045, -0.01919519, 0.100144766, -0.019375224, -0.0457732, 0.06336669, 0.08143835, -0.0124925, -0.001725475, 0.041042794, -0.001529728, 0.12939063, 0.0936588, 0.0077793803, -0.06494012, -0.0010249561, -0.043041687, -0.06316234, -0.026067587, 0.039360423, -0.0034083512, -0.027148174, -0.08091353, -0.0844933, -0.036735807, 0.027338076, -0.033189323, 0.058425758, -0.01783426, 0.010297409, -0.056801934, -0.0034585358, -0.037749704, -0.0061463225, -0.09409349, -0.027467424, -0.039021686, -0.08808721, -0.022146022, -0.051426917, -0.050076984, 0.100279085, ...]"
2,Apples and bananas.,"[0.0065595256, 0.013384603, -0.0016563231, 0.07659334, -0.024109604, 0.056960937, 0.04504295, -0.040181436, -0.024177287, 0.02658395, 0.0609222, -0.078352295, 0.002021311, -0.056416884, 0.055212103, 0.049849223, 0.008344827, 0.0063252132, -0.06319301, -0.054161053, 0.051340625, 0.04194075, -0.037720114, -0.009208805, 0.038883932, 0.11359963, 0.004070775, -0.036165174, -0.035352603, -0.07100055, -0.030598167, -0.06761354, 0.040163137, 0.03989932, -0.022909634, -0.016908828, 0.09672891, -0.100502476, 0.0014447625, -0.035038907, 0.021288317, 0.059758067, 0.048544183, 0.020242501, 0.0048312917, 0.036496505, 0.0659347, 0.003125649, 0.086225644, -0.0051938137, 0.014606154, -0.0027698695, -0.086457886, -0.00030967878, -0.0025725302, -0.040149815, -0.028997488, -0.003895286, 0.082568884, 0.06883922, 0.03069305, -0.07903387, -0.031162951, 0.050554473, -0.017181113, -0.07554498, -0.09712153, 0.06591997, -0.034368295, 0.01779498, -0.012579435, 0.03852538, 0.05460457, 0.07715764, -0.07371081, 0.05743588, 0.08379526, -0.088326424, -0.056322195, -0.03149911, -0.09074801, -0.024256757, -0.03583845, -0.008091285, 0.024067061, 0.041889187, -0.058306724, -0.011963186, -0.06557511, 0.071914926, -0.05317682, -0.020122139, 0.10973178, 0.029571498, 0.014680431, -0.024002472, 0.0075219492, -0.11391444, -0.00030092822, 0.12709738, ...]"


### EmbeddingUtilities()

#### Cosine Similarity
Compute the cosine similarity between two dictionaries whose keys are the texts and whose values are the corresponding embeddings.

In [15]:
# Helper object that provides the similarity utilities
embedding_util = EmbeddingUtilities()

# Build the similarity table from the input and label embedding dictionaries
cosine_similarity_df = embedding_util.generate_cosine_similarity_df(
    input_embeddings_dict, label_embeddings_dict
)

# Show the label matches for a single input text, best score first
(
    cosine_similarity_df
    .loc[lambda frame: frame['InputText'] == "Apples and bananas."]
    .sort_values("score", ascending=False)
)
# cosine_similarity_df.head()


Unnamed: 0,InputText,LabelText,score
4,Apples and bananas.,I like to eat strawberries.,0.450567
3,Apples and bananas.,I would like to make a smoothie.,0.264535
2,Apples and bananas.,Another example for demonstration.,0.167473
0,Apples and bananas.,This is an example sentence slightly modified.,0.136158
1,Apples and bananas.,This is an example sentence with minor edits but more than the one above.,0.094383


#### Append Additional Information

In [16]:
# Example additional data for the first two input texts
input_input = [
    {
        "DocText": input_text_list[0],
        "AdditionalInfo1": "info1",
        "ID": 1,
    },
    {
        "DocText": input_text_list[1],
        "AdditionalInfo1": "info3",
        "ID": 2,
    },
]

# Example additional "database" data for the first three label texts
database_input = [
    {
        "LabelText": label_text_list[0],
        "AdditionalInfo3": "info5",
        "code": 822431,
    },
    {
        "LabelText": label_text_list[1],
        "AdditionalInfo3": "info7",
        "code": 982291,
    },
    {
        "LabelText": label_text_list[2],
        "AdditionalInfo3": "info7",
        "code": 122235,
    },
]

# Attach the extra columns from both record lists to the similarity table
embedding_cross_detail_df = embedding_util.append_extra_column_info(
    input_data=input_input,
    input_text_col="DocText",
    db_input=database_input,
    db_text_col="LabelText",
    embedding_xdf=cosine_similarity_df,
)

In [17]:
# Preview the first rows of the similarity table with the extra columns attached
embedding_cross_detail_df.head()

Unnamed: 0,InputText,LabelText,score,DocText,AdditionalInfo1,ID,AdditionalInfo3,code
0,This is an example sentence.,This is an example sentence slightly modified.,0.822041,This is an example sentence.,info1,1.0,info5,822431.0
1,This is an example sentence.,This is an example sentence with minor edits but more than the one above.,0.715309,This is an example sentence.,info1,1.0,info7,982291.0
2,This is an example sentence.,Another example for demonstration.,0.518319,This is an example sentence.,info1,1.0,info7,122235.0
3,This is an example sentence.,I would like to make a smoothie.,0.045088,This is an example sentence.,info1,1.0,,
4,This is an example sentence.,I like to eat strawberries.,0.134639,This is an example sentence.,info1,1.0,,


#### Find the Highest Matching Pairs*
\* *Only the first instance is returned (there might be others with the same score).*

In [18]:
# Isolate the highest cosine-scoring label for each input text.
# NOTE(review): judging by the displayed result, the helper returns the score in
# a 'MaxScore' column rounded to 3 decimals — confirm against the helper's source.
top_embedding_cross_df = embedding_util.isolate_highest_cosine_scoring_embedding_pair(cosine_similarity_df)


In [19]:
# Display the best-matching label found for each input text
top_embedding_cross_df

Unnamed: 0,InputText,LabelText,MaxScore
1,Another example for demonstration.,Another example for demonstration.,1.0
0,This is an example sentence.,This is an example sentence slightly modified.,0.822
2,Apples and bananas.,I like to eat strawberries.,0.451
