In [1]:
import torch
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the best available compute device: a CUDA GPU first, then the
# Apple-silicon MPS backend, and finally the CPU as the universal fallback.
#
# The MPS check goes through `torch.backends.mps.is_available()`, the documented
# availability probe. `torch.mps.is_available` is missing on some PyTorch
# builds, where calling it raises AttributeError. The getattr guard additionally
# keeps the cell working on builds that ship without the MPS backend module.
_mps_backend = getattr(torch.backends, "mps", None)

if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [4]:
# Checkpoint identifier passed to both `from_pretrained` calls below, so the
# tokenizer and the encoder are always loaded from the same checkpoint.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the encoder first, then move it onto the device selected earlier.
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

In [5]:
# Two sample passages to embed. Each literal is split across lines with
# implicit string concatenation purely for readability; the resulting strings
# are unchanged.
texts = [
    (
        "Guidelines for PM-Surya Ghar: Muft Bijli Yojana "
        "Central Financial Assistance to Residential Consumers "
        "Annexure 2 "
        "Model Draft Agreement between Consumer & Vendor for installation of "
        "grid connected rooftop solar (RTS) project under "
        "PM – Surya Ghar: Muft Bijli Yojana"
    ),
    (
        "Model Draft Agreement between Consumer & Vendor for installation of "
        "grid connected rooftop solar (RTS) project under "
        "PM – Surya Ghar: Muft Bijli Yojana "
        "This agreement is executed on -------(Day)------(Month)-------(Year) "
        "for design, supply, installation, commissioning and "
        "5 -year comprehensive maintenance of RTS project/system "
        "along with warranty under PM Surya Ghar: Muft Bijli Yojana"
    ),
]

In [31]:
# Tokenize both passages in one batch and return PyTorch tensors.
# padding=True pads the shorter sequence so the batch is rectangular;
# truncation=True asks the tokenizer to cut inputs that are too long for it.
encoded = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

In [7]:
# Inspect the tokenizer output (the displayed mapping includes 'input_ids').
encoded

{'input_ids': tensor([[  101, 11594,  2005,  7610,  1011,  7505,  3148,  1043,  8167,  1024,
         14163,  6199, 12170,  3501,  3669, 10930, 18803,  2430,  3361,  5375,
          2000,  5647, 10390, 17827,  5397,  1016,  2944,  4433,  3820,  2090,
          7325,  1004, 21431,  2005,  8272,  1997,  8370,  4198, 23308,  5943,
          1006, 19387,  2015,  1007,  2622,  2104,  7610,  1516,  7505,  3148,
          1043,  8167,  1024, 14163,  6199, 12170,  3501,  3669, 10930, 18803,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2944,  4433,  3820,  2090,  7325,  1004, 21431,  2005,  8272,
          1997,  8370,  4198, 23308,  5943,  1006, 

In [32]:
# Rebuild the batch as a plain dict whose tensors live on the same device as
# the model, so the forward pass does not hit a device mismatch.
encoded = dict(
    (name, tensor.to(device)) for name, tensor in encoded.items()
)

In [None]:
# Print every entry of the encoded batch, one "key<TAB>value" line per entry.
for k in encoded:
    v = encoded[k]
    print(f"K= {k}\tv= {v}")

K= input_ids	v= tensor([[  101, 11594,  2005,  7610,  1011,  7505,  3148,  1043,  8167,  1024,
         14163,  6199, 12170,  3501,  3669, 10930, 18803,  2430,  3361,  5375,
          2000,  5647, 10390, 17827,  5397,  1016,  2944,  4433,  3820,  2090,
          7325,  1004, 21431,  2005,  8272,  1997,  8370,  4198, 23308,  5943,
          1006, 19387,  2015,  1007,  2622,  2104,  7610,  1516,  7505,  3148,
          1043,  8167,  1024, 14163,  6199, 12170,  3501,  3669, 10930, 18803,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2944,  4433,  3820,  2090,  7325,  1004, 21431,  2005,  8272,
          1997,  8370,  4198, 23308,  5943,  1006

In [33]:
# Run the encoder on the batch. Gradient tracking is disabled because this is
# inference only; no backward pass is performed on the result.
with torch.no_grad():
    # The encoded batch is unpacked into keyword arguments for the model call.
    model_output = model(**encoded)
# Display the raw model output object.
model_output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 5.0849e-02,  4.6682e-01,  3.4965e-02,  ..., -1.7503e-02,
          -6.1857e-01, -7.8995e-04],
         [ 2.0915e-01,  2.4021e-01, -1.1188e-01,  ...,  8.3694e-01,
          -1.2226e+00,  7.9146e-02],
         [-2.3448e-01,  3.2407e-01, -1.1586e-01,  ..., -6.6054e-02,
          -9.1410e-01,  3.7478e-01],
         ...,
         [-5.9528e-02, -1.0862e-01,  1.4144e-01,  ...,  1.1799e-01,
          -4.5429e-01,  1.3107e-01],
         [-5.5251e-02, -1.0866e-01,  2.0537e-01,  ...,  1.3604e-01,
          -4.9495e-01,  1.7644e-01],
         [-3.2592e-02, -1.1134e-01,  2.2980e-01,  ...,  1.2167e-01,
          -4.8687e-01,  2.0339e-01]],

        [[-1.4590e-01,  4.8120e-01, -1.3290e-01,  ...,  1.2646e-01,
          -5.1550e-01, -8.9822e-02],
         [-1.3510e+00, -9.3279e-01,  1.6952e-01,  ..., -4.4607e-01,
          -5.6073e-01,  1.7208e-01],
         [-1.0723e-01,  4.7482e-01, -5.5054e-01,  ..., -2.0620e-01,
          -9.

In [34]:
# Per-token hidden states from the encoder's final layer, looked up by field
# name on the model output.
token_embeddings = model_output["last_hidden_state"]

# Shape check (displayed below as the cell result).
token_embeddings.shape

torch.Size([2, 108, 384])

In [26]:
# Show the per-token embeddings for the first item in the batch.
token_embeddings[0]

tensor([[ 5.0849e-02,  4.6682e-01,  3.4965e-02,  ..., -1.7503e-02,
         -6.1857e-01, -7.8995e-04],
        [ 2.0915e-01,  2.4021e-01, -1.1188e-01,  ...,  8.3694e-01,
         -1.2226e+00,  7.9146e-02],
        [-2.3448e-01,  3.2407e-01, -1.1586e-01,  ..., -6.6054e-02,
         -9.1410e-01,  3.7478e-01],
        ...,
        [-5.9528e-02, -1.0862e-01,  1.4144e-01,  ...,  1.1799e-01,
         -4.5429e-01,  1.3107e-01],
        [-5.5251e-02, -1.0866e-01,  2.0537e-01,  ...,  1.3604e-01,
         -4.9495e-01,  1.7644e-01],
        [-3.2592e-02, -1.1134e-01,  2.2980e-01,  ...,  1.2167e-01,
         -4.8687e-01,  2.0339e-01]], device='mps:0')

In [35]:
# Append a trailing singleton axis to the attention mask so it broadcasts
# against the token embeddings along the hidden dimension.
attention_mask = encoded["attention_mask"][..., None]

# Shape check (displayed below as the cell result).
attention_mask.shape

torch.Size([2, 108, 1])

In [37]:
# Zero out padded positions with the mask, then add up the remaining token
# vectors along the sequence axis. This is the numerator of the mean pooling.
embeddings = torch.sum(token_embeddings * attention_mask, dim=1)
embeddings

tensor([[-4.9749e-02,  1.0749e+01,  2.0887e+00, -1.2629e+00,  1.4339e+00,
         -4.0034e+00,  3.2769e+00,  6.7968e+00, -9.2159e+00, -2.8835e+00,
          6.1804e+00, -1.7985e+00,  5.4182e+00,  3.5218e+00,  1.3250e+01,
          7.9383e+00,  1.1041e+01, -3.7302e+00,  7.5643e+00,  7.4031e+00,
          6.2734e+00, -1.0979e+01, -6.9708e+00, -1.1484e+01,  4.6534e+00,
         -3.6134e-01,  1.1643e+00,  5.9481e+00,  5.1336e-02,  9.8654e+00,
         -1.3589e+00,  1.5164e+01,  2.4749e+00, -3.4483e-01,  7.2589e+00,
          1.5264e+01, -6.3676e+00, -1.4482e+00, -1.3299e+00, -8.5861e+00,
          3.0847e-01, -1.2648e+01, -2.6365e+00, -1.2402e+01,  6.5210e+00,
         -7.7899e+00, -3.0830e+00,  6.3381e+00, -5.8505e+00,  5.4251e+00,
          3.3568e+00, -4.0407e+00,  6.3824e-01,  4.6530e+00,  1.7273e+00,
         -5.4925e+00, -1.2215e+01, -1.4519e+00,  4.3434e+00, -4.9172e+00,
         -7.5749e+00,  8.3233e-01, -1.7214e+01, -1.5015e+00,  8.4319e+00,
         -8.7537e+00, -6.8238e+00,  8.

In [39]:
# Number of real (non-padding) tokens per text: the mask summed over the
# sequence axis.
print(torch.sum(attention_mask, dim=1))

tensor([[ 61],
        [108]], device='mps:0')


In [38]:
# Mean pooling: average the token vectors of each text over its real
# (non-padding) tokens to get one sentence embedding per text.
#
# The mean is recomputed from `token_embeddings` and `attention_mask` rather
# than dividing `embeddings` in place. The in-place form
# (`embeddings = embeddings / counts`) is not idempotent: running the cell a
# second time divides the already-averaged values again and silently corrupts
# the result.
pooling_mask = attention_mask.to(token_embeddings.dtype)

# Real-token count per text, clamped away from zero so a row consisting only
# of padding cannot trigger a division by zero.
token_counts = pooling_mask.sum(dim=1).clamp(min=1e-9)

embeddings = (token_embeddings * pooling_mask).sum(dim=1) / token_counts
embeddings

tensor([[-8.1556e-04,  1.7621e-01,  3.4241e-02, -2.0703e-02,  2.3507e-02,
         -6.5629e-02,  5.3720e-02,  1.1142e-01, -1.5108e-01, -4.7270e-02,
          1.0132e-01, -2.9484e-02,  8.8823e-02,  5.7734e-02,  2.1721e-01,
          1.3014e-01,  1.8099e-01, -6.1150e-02,  1.2401e-01,  1.2136e-01,
          1.0284e-01, -1.7998e-01, -1.1427e-01, -1.8827e-01,  7.6285e-02,
         -5.9237e-03,  1.9086e-02,  9.7511e-02,  8.4157e-04,  1.6173e-01,
         -2.2277e-02,  2.4860e-01,  4.0572e-02, -5.6530e-03,  1.1900e-01,
          2.5023e-01, -1.0439e-01, -2.3742e-02, -2.1802e-02, -1.4076e-01,
          5.0569e-03, -2.0735e-01, -4.3221e-02, -2.0331e-01,  1.0690e-01,
         -1.2770e-01, -5.0541e-02,  1.0390e-01, -9.5910e-02,  8.8936e-02,
          5.5030e-02, -6.6241e-02,  1.0463e-02,  7.6279e-02,  2.8317e-02,
         -9.0041e-02, -2.0024e-01, -2.3802e-02,  7.1203e-02, -8.0610e-02,
         -1.2418e-01,  1.3645e-02, -2.8220e-01, -2.4614e-02,  1.3823e-01,
         -1.4350e-01, -1.1187e-01,  1.