In [24]:
from sentence_transformers import SentenceTransformer

# Load the model (will download if not already cached)
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

texts = ["How do I get a replacement Medicare card?",
        "What is the monthly premium for Medicare Part B?",
        "How do I terminate my Medicare Part B (medical insurance)?",
        "How do I sign up for Medicare?",
        "Can I sign up for Medicare Part B if I am working and have health insurance through an employer?",
        "How do I sign up for Medicare Part B if I already have Part A?",
        "What are Medicare late enrollment penalties?",
        "What is Medicare and who can get it?",
        "How can I get help with my Medicare Part A and Part B premiums?",
        "What are the different parts of Medicare?",
        "Will my Medicare premiums be higher because of my higher income?",
        "What is TRICARE ?",
        "Should I sign up for Medicare Part B if I have Veterans' Benefits?"]

# Generate embeddings
embeddings = model.encode(texts)

print(f"Generated embeddings for {len(texts)} texts")
print(f"Embedding dimension: {embeddings.shape[1]}")
print(f"Shape: {embeddings.shape}")

# Display first few dimensions of the first embedding
print(f"\nFirst embedding (first 10 dimensions): {embeddings[0][:10]}")

output = embeddings

Generated embeddings for 13 texts
Embedding dimension: 384
Shape: (13, 384)

First embedding (first 10 dimensions): [-0.02388943  0.05525852 -0.01165491 -0.03341427 -0.01226057 -0.02487277
 -0.01266338  0.02534589  0.01850851 -0.08350813]


In [23]:

import pandas as pd
df_embeddings = pd.DataFrame(output)
print(df_embeddings)

         0         1         2         3         4         5         6    \
0  -0.023889  0.055259 -0.011655 -0.033414 -0.012261 -0.024873 -0.012663   
1  -0.012688  0.046874 -0.010502 -0.020384 -0.013361  0.042322  0.016628   
2   0.000494  0.119412  0.005229 -0.092734  0.007773 -0.005325  0.034506   
3  -0.029711  0.023298 -0.057041 -0.012183 -0.013710  0.029796  0.063739   
4  -0.025628  0.070389 -0.017380 -0.056567  0.028576  0.052823  0.067063   
5  -0.022656  0.021160  0.005105 -0.046494  0.009074  0.041495  0.054268   
6  -0.002911  0.060791 -0.009176 -0.006133  0.040492  0.036594  0.002054   
7  -0.080526  0.059888 -0.048847 -0.040176 -0.063342  0.041848  0.119045   
8  -0.034388  0.072501  0.014440 -0.036695  0.014019  0.063070  0.034683   
9  -0.005964  0.025044 -0.003182 -0.025243 -0.039823 -0.012772  0.044713   
10 -0.039008 -0.010609 -0.007383 -0.050190 -0.002518 -0.041641  0.026969   
11 -0.095983 -0.063012 -0.116906 -0.059075 -0.051323 -0.003439  0.018687   
12 -0.011600

In [12]:
df_embeddings.to_csv("embeddings.csv", index=False)


In [None]:
import torch
from datasets import load_dataset
# Approach 1 - manually upload and fetch the embeddings from hg hub
source_embeddings = load_dataset('vector27/embeddingsFAQTest')
print(source_embeddings["train"][0])


embeddings.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/13 [00:00<?, ? examples/s]

{'0': -0.023889432, '1': 0.055258524, '2': -0.011654906, '3': -0.03341427, '4': -0.012260566, '5': -0.024872767, '6': -0.012663384, '7': 0.025345892, '8': 0.018508505, '9': -0.083508134, '10': -0.093019985, '11': 0.014486255, '12': -0.017410949, '13': -0.08834371, '14': -0.0044790884, '15': -0.04632591, '16': -0.013193871, '17': 0.03538181, '18': 0.062311143, '19': 0.048589665, '20': -0.05911841, '21': 0.054135375, '22': -0.06439692, '23': 0.03402402, '24': 0.0066363798, '25': 0.035917036, '26': -0.06783765, '27': -0.017735293, '28': -0.012721834, '29': 0.04646242, '30': 0.10864364, '31': 0.023821417, '32': -0.026996382, '33': 0.03717397, '34': 0.09759814, '35': -0.027030129, '36': -0.04542985, '37': 0.031817343, '38': -0.033746302, '39': -0.015198476, '40': -0.021535622, '41': 0.014811232, '42': -0.02089187, '43': 0.06885717, '44': 0.05017417, '45': -0.024727616, '46': -0.06276789, '47': 0.04828718, '48': 0.082910255, '49': 0.0792205, '50': -0.042626597, '51': -0.04397193, '52': -0.01

In [26]:
dataset_embeddings = torch.from_numpy(source_embeddings["train"].to_pandas().to_numpy()).to(torch.float)

  dataset_embeddings = torch.from_numpy(source_embeddings["train"].to_pandas().to_numpy()).to(torch.float)


IndexError: too many indices for tensor of dimension 2

In [None]:
import numpy as np
# Approach 2 - embeddings from model can be used directly and converted into tensors
source_embeddings = torch.from_numpy(embeddings).to(torch.float)                                                                                           
print(f"Local embeddings shape: {source_embeddings.shape}")                                                                                 
print(f"First embedding (first 10 dimensions): {source_embeddings[0][:10]}") 

Local embeddings shape: torch.Size([13, 384])
First embedding (first 10 dimensions): tensor([-0.0239,  0.0553, -0.0117, -0.0334, -0.0123, -0.0249, -0.0127,  0.0253,
         0.0185, -0.0835])


In [16]:
query = ['How can medicare help me with my health?']
query_embedding = model.encode(query)
tensor_query = torch.FloatTensor(query_embedding)

In [27]:
from sentence_transformers.util import semantic_search
matches = semantic_search(tensor_query, source_embeddings, top_k=5)
print(matches)

[[{'corpus_id': 7, 'score': 0.7293230891227722}, {'corpus_id': 8, 'score': 0.704310953617096}, {'corpus_id': 3, 'score': 0.6988481283187866}, {'corpus_id': 9, 'score': 0.6667883396148682}, {'corpus_id': 10, 'score': 0.6479672193527222}]]


In [28]:
for i in range(len(matches[0])):
    print(texts[matches[0][i]['corpus_id']])

What is Medicare and who can get it?
How can I get help with my Medicare Part A and Part B premiums?
How do I sign up for Medicare?
What are the different parts of Medicare?
Will my Medicare premiums be higher because of my higher income?
