Notebook introduces the Embeddings and their use cases 

It all starts with Data, in this case the data is taken from the https://faq.ssa.gov/en-US/

Note, Embedding is a process of converting a word or a number into a vector of certain dimensions
Tokenizer and Embedding models are not same. They are different. 

Tokenizers are functions written in python that take a corpus of data and returns a dictionary-id map. Based on which the tokenizers, work on the sentences.

Embedding models are Neural Networks coded in Torch/TF/Jax/Flax that are used for creating vectors 

In [2]:
# we have to work with sentence transformers library. 

from sentence_transformers import SentenceTransformer, util

model_id = "sentence-transformers/all-MiniLM-L6-v2"
model_embedding = SentenceTransformer(model_id)

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [4]:
texts = ["How do I get a replacement Medicare card?",
        "What is the monthly premium for Medicare Part B?",
        "How do I terminate my Medicare Part B (medical insurance)?",
        "How do I sign up for Medicare?",
        "Can I sign up for Medicare Part B if I am working and have health insurance through an employer?",
        "How do I sign up for Medicare Part B if I already have Part A?",
        "What are Medicare late enrollment penalties?",
        "What is Medicare and who can get it?",
        "How can I get help with my Medicare Part A and Part B premiums?",
        "What are the different parts of Medicare?",
        "Will my Medicare premiums be higher because of my higher income?",
        "What is TRICARE ?",
        "Should I sign up for Medicare Part B if I have Veterans' Benefits?"]

In [11]:
model_embedding.to('cuda')

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [5]:
texts_tokenized = model_embedding.tokenize(texts=texts[0],)

In [6]:
texts_tokenized

{'input_ids': tensor([[ 101, 1044,  102],
         [ 101, 1051,  102],
         [ 101, 1059,  102],
         [ 101,  102,    0],
         [ 101, 1040,  102],
         [ 101, 1051,  102],
         [ 101,  102,    0],
         [ 101, 1045,  102],
         [ 101,  102,    0],
         [ 101, 1043,  102],
         [ 101, 1041,  102],
         [ 101, 1056,  102],
         [ 101,  102,    0],
         [ 101, 1037,  102],
         [ 101,  102,    0],
         [ 101, 1054,  102],
         [ 101, 1041,  102],
         [ 101, 1052,  102],
         [ 101, 1048,  102],
         [ 101, 1037,  102],
         [ 101, 1039,  102],
         [ 101, 1041,  102],
         [ 101, 1049,  102],
         [ 101, 1041,  102],
         [ 101, 1050,  102],
         [ 101, 1056,  102],
         [ 101,  102,    0],
         [ 101, 1049,  102],
         [ 101, 1041,  102],
         [ 101, 1040,  102],
         [ 101, 1045,  102],
         [ 101, 1039,  102],
         [ 101, 1037,  102],
         [ 101, 1054,  102],
 

In [12]:
texts_embed01 = model_embedding.encode(texts[0], convert_to_tensor=True)
texts_embed01.shape

torch.Size([384])

In [13]:
texts_embed02 = model_embedding.encode(texts[1], convert_to_tensor=True)
texts_embed02.shape

torch.Size([384])

In [14]:
from sentence_transformers.util import pytorch_cos_sim

In [15]:
similarity = pytorch_cos_sim(texts_embed01, texts_embed02)
similarity

tensor([[0.4886]], device='cuda:0')

In [16]:
embedding_texts = model_embedding.encode(texts)
embedding_texts.shape

(13, 384)

In [17]:
from pandas import DataFrame

embed_df = DataFrame(embedding_texts)

In [18]:
embed_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,344,345,346,347,348,349,350,351,352,353,354,355,356,357,358,359,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383
0,-0.023889,0.055259,-0.011655,-0.033414,-0.012261,-0.024873,-0.012663,0.025346,0.018508,-0.083508,-0.09302,0.014486,-0.017411,-0.088344,-0.004479,-0.046326,-0.013194,0.035382,0.062311,0.04859,-0.059118,0.054135,-0.064397,0.034024,0.006636,0.035917,-0.067838,-0.017735,-0.012722,0.046462,0.108644,0.023821,-0.026996,0.037174,0.097598,-0.02703,-0.04543,0.031817,-0.033746,-0.015198,...,-0.045291,0.118322,0.054848,-0.040015,0.098105,0.022277,-0.030813,-0.005176,0.049103,0.045938,-0.023188,-0.027573,-0.040576,0.016116,0.02501,-0.058007,0.047965,0.117957,-0.008974,-0.013361,0.020989,-0.0252,-0.006896,-0.021131,0.005462,0.064137,0.026008,-0.02985,-0.011776,0.00309,-0.161688,-0.046426,0.006004,0.005281,-0.003342,0.027754,0.020411,0.005778,0.034098,-0.006889
1,-0.012688,0.046874,-0.010502,-0.020384,-0.013361,0.042322,0.016628,-0.004099,-0.002607,-0.010188,-0.044768,0.019365,0.031505,-0.118893,0.01985,0.035861,0.034993,-0.083673,0.056933,0.057396,-0.057795,-0.005447,0.003423,0.014473,0.146743,-0.053123,0.003083,0.030637,0.055512,0.043963,0.047002,0.044337,0.020708,-0.004741,-0.008704,-0.039581,-0.063424,-0.011725,-0.090585,-0.045387,...,-0.063684,0.099501,0.002105,0.042053,0.054385,-0.017293,-0.00745,0.034746,-0.000616,-0.050755,-0.040021,0.014303,0.025885,-0.062788,0.040704,-0.028741,0.069934,-0.024656,0.06453,0.014862,0.030004,-0.010374,-0.09046,-0.062121,-0.01513,-0.003932,0.075132,0.052699,0.020436,0.024714,-0.061594,-0.020717,-0.009082,-0.02926,-0.066253,0.065257,0.013229,-0.023103,-0.002785,0.010474
2,0.000494,0.119412,0.00523,-0.092734,0.007773,-0.005325,0.034506,-0.051981,-0.006265,-0.00611,-0.079471,0.036207,-0.00971,-0.081195,-0.001876,-0.013249,-0.042756,0.004501,-0.007266,0.100785,-0.002075,0.042169,-0.023942,0.098594,0.072433,-0.002734,0.016057,0.00572,-0.026609,-0.013365,0.097391,0.01028,-0.016172,-0.003942,0.034441,-0.013009,-0.10954,-0.019242,-0.003607,-0.060187,...,-0.015841,0.088835,-0.022281,0.007992,0.04476,-0.002664,-0.015018,-0.024615,0.043037,0.046402,-0.074185,0.007321,0.012401,-0.004225,0.040887,-0.013238,0.086007,0.130728,0.009953,0.053924,0.037271,-0.037933,-0.00412,-0.041604,-0.048431,0.110611,0.038085,-0.016102,-0.011424,-0.00941,-0.108326,-0.049646,-0.073399,-0.029898,-0.102734,0.062121,0.034605,0.016877,-0.023861,0.005264
3,-0.029711,0.023298,-0.057041,-0.012183,-0.01371,0.029796,0.063739,0.001101,-0.045124,-0.040748,-0.131671,0.000674,0.032849,-0.048718,-0.016917,-0.04001,-0.003435,-0.000405,0.049092,0.057811,0.007957,-0.01472,-0.055192,0.029432,0.086543,-0.034207,-0.004638,-0.006953,-0.017902,0.089433,0.138466,-0.004411,-0.012209,0.027505,0.056866,-0.016538,-0.03082,0.005954,-0.056146,-0.004276,...,-0.02335,0.117718,0.058016,0.007543,0.053195,0.029278,-0.005433,0.046559,-0.008911,-0.013223,-0.073022,-0.018384,-0.001908,-0.026813,0.075265,-0.090822,0.035911,0.121485,0.071004,-0.025873,-0.021903,0.062796,-0.012797,-0.006417,0.017931,0.035687,0.033231,0.021569,0.100695,-0.047331,-0.117682,0.031924,0.000854,0.0202,-0.020666,-0.005167,0.03837,0.003617,0.033993,-0.010255
4,-0.025628,0.070389,-0.01738,-0.056567,0.028576,0.052823,0.067063,-0.052617,-0.054702,-0.11623,-0.126143,0.038227,0.011085,-0.027623,0.086316,0.0057,0.013502,0.001248,0.03837,0.087459,-0.060004,0.007136,-0.052758,-0.003477,0.079192,-0.030614,0.03455,0.065704,-0.011732,0.051478,0.095803,-0.019129,-0.036677,0.015641,0.036194,-0.058811,-0.035086,0.022795,-0.081846,-0.027348,...,-0.075405,0.129256,-0.058059,-0.01965,0.10145,0.003209,-0.012665,0.038677,0.021085,-0.004969,-0.021644,-0.070017,0.060121,-0.107323,0.001019,-0.093465,0.087102,0.094227,0.080545,0.032137,-0.011176,-0.064559,-0.031923,-0.051013,-0.017872,0.017034,0.061883,0.052157,0.101039,-0.056417,-0.118145,0.013343,-0.055188,-0.032723,0.008436,0.019169,0.048212,-0.040412,0.083346,0.026855
5,-0.022656,0.02116,0.005105,-0.046494,0.009074,0.041495,0.054268,-0.024185,-0.013483,-0.075966,-0.090702,-0.029076,0.045339,-0.077989,0.047003,-0.01883,-0.031521,-0.022798,0.021713,0.057836,-0.051639,-0.014933,-0.029978,0.02325,0.087391,-0.062931,-0.00042,0.062464,-0.021476,0.035335,0.125799,0.029123,-0.037065,0.013791,0.057291,-0.072491,-0.044007,0.026902,-0.039566,-0.066453,...,-0.077669,0.099516,-0.011076,-0.007306,0.062561,0.006845,-0.005897,0.007084,0.010039,0.003088,-0.000738,-0.014339,-0.00231,-0.035318,0.033689,-0.050801,0.076678,0.09998,0.07201,0.044336,0.028311,0.001274,-0.067214,-0.064206,-0.031583,0.06006,0.076265,0.012245,0.071965,-0.010519,-0.10011,0.01075,-0.031469,-0.004822,0.039657,0.026384,0.045514,0.059089,-0.017509,0.007166
6,-0.002911,0.060791,-0.009176,-0.006133,0.040493,0.036594,0.002054,-0.031345,0.031806,-0.023495,0.071992,0.048723,0.081783,-0.050864,-0.005711,-0.080416,-0.01225,-0.003741,-0.029289,0.052237,-0.010236,0.037758,-0.079403,0.124539,0.091983,-0.010715,0.034181,-0.016364,-0.023802,0.015979,-0.060006,0.040025,-0.029828,0.017246,0.017604,-0.004945,-0.012642,0.005651,-0.064422,-0.001107,...,-0.037479,0.120514,0.092009,0.150646,0.05924,0.016865,-0.015192,0.032755,0.074319,0.0063,-0.098705,-0.016977,-0.04784,-0.077831,0.031058,-0.0236,0.030114,-0.007999,0.037392,-0.022385,0.026635,-0.019759,-0.097564,0.022126,-0.026906,-0.008749,-0.033806,0.028241,-0.001251,-0.003584,-0.028763,-0.060458,-0.018598,-0.040189,-0.031486,-0.018299,0.002286,-0.07342,0.016235,-0.000244
7,-0.080526,0.059888,-0.048847,-0.040176,-0.063342,0.041848,0.119045,0.010652,-0.030095,-0.004561,-0.07515,0.081693,0.003867,-0.084236,-0.0619,-0.02171,0.010616,-0.023371,0.03094,0.093385,-0.03637,0.04271,-0.061342,0.052395,0.041366,0.008109,-0.061988,-0.035993,-0.004243,0.071631,0.100317,0.0053,0.006457,0.049251,-0.039963,0.021823,-0.021824,0.033236,-0.022382,0.009573,...,-0.036822,0.103294,0.086007,0.000951,0.036378,0.036222,-0.036158,0.012988,0.004459,0.041679,-0.089641,-0.028039,0.027155,-0.080964,0.054563,-0.134662,0.005126,0.086044,0.044157,0.023074,-0.026023,-0.024532,-0.02176,-0.052582,0.015607,0.022571,0.046028,0.050643,0.054423,-0.083213,-0.144566,0.020404,0.023088,0.005077,-0.055645,-0.007675,0.050791,-0.005989,0.134562,0.034817
8,-0.034388,0.072501,0.01444,-0.036695,0.014019,0.06307,0.034683,-0.014531,-0.059862,-0.045383,-0.055213,-0.034528,0.00927,-0.095072,0.036745,0.025977,0.013696,0.004641,-0.044114,0.063383,-0.088903,0.013146,-0.03782,0.023436,0.079054,0.02817,-0.02684,0.012249,0.032541,-0.019416,0.079922,-0.04345,-0.04865,-0.00617,0.047211,-0.0036,-0.06654,0.031916,-0.052208,-0.04867,...,-0.026549,0.120332,-0.020662,-0.007842,0.052714,0.005838,-0.021314,-0.019987,0.016647,-0.036486,-0.018713,0.007056,0.013114,-0.034846,0.019419,-0.048089,0.070016,0.015946,0.055659,0.041075,0.049812,-0.037412,-0.01456,-0.032269,-0.040533,0.04331,0.072315,0.006942,0.030646,0.013022,-0.114763,-0.035894,-0.019877,-0.033375,-0.030168,0.039412,0.044993,0.000578,-0.025124,0.034191
9,-0.005964,0.025044,-0.003182,-0.025243,-0.039823,-0.012772,0.044713,0.014535,-0.038213,-0.041149,-0.05854,0.070492,-0.029789,-0.046087,-0.016301,-0.080821,0.030458,-0.014638,0.012796,0.120223,-0.032289,0.035957,-0.018771,0.06087,0.000829,0.037492,0.004634,0.005595,-0.000582,-0.020706,0.063955,0.027098,0.031915,0.017982,0.007558,0.045427,0.023558,0.037546,-0.043077,-0.012915,...,-0.012387,0.085,0.074588,0.018098,0.027723,0.073802,-0.010719,0.027924,0.027842,-0.001941,-0.052277,0.019475,0.04263,-0.044101,0.061573,-0.064164,0.077146,-0.030594,0.061598,0.050569,0.029921,-0.06405,-0.025672,0.022948,0.001914,-0.00496,0.032083,0.061701,0.011159,-0.078794,-0.057621,0.021594,0.048983,-0.044541,-0.030137,0.006779,0.054854,0.029937,0.070214,0.041565


In [19]:
embed_df['texts'] = texts

In [21]:
embed_df.to_csv("embed_text.csv",index=False)