In [23]:
from transformers import AutoTokenizer, AutoModel
import torch

# Tokenizer and encoder are pulled from one checkpoint id so the two cannot drift apart.
URDU_BERT_CHECKPOINT = "mwz/UrduBert"
tokenizer = AutoTokenizer.from_pretrained(URDU_BERT_CHECKPOINT)
model = AutoModel.from_pretrained(URDU_BERT_CHECKPOINT)

def get_embedding(text, pooling_type="cls"):
    """Embed ``text`` using the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: Input handed straight to ``tokenizer`` (padding and truncation
            are enabled, tensors are returned in PyTorch format).
        pooling_type: ``"cls"`` keeps the hidden state at sequence position 0;
            ``"mean"`` averages the hidden states over the positions where the
            attention mask is 1, so padding does not contribute.

    Returns:
        The pooled tensor after ``squeeze(0)``. For a single input this has
        shape [hidden_size]; ``squeeze(0)`` only removes a leading dimension
        of size 1, so a batch of several texts stays [batch_size, hidden_size].

    Raises:
        ValueError: If ``pooling_type`` is neither ``"cls"`` nor ``"mean"``.
    """
    # Validate first: a typo in pooling_type should fail immediately instead of
    # after a full tokenization + forward pass.
    if pooling_type not in ("cls", "mean"):
        raise ValueError("pooling_type must be either 'cls' or 'mean'.")

    # Tokenize the input text and convert to PyTorch tensors
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Inference only, so no gradients are tracked during the forward pass
    with torch.no_grad():
        outputs = model(**inputs)

    # Shape: [batch_size, sequence_length, hidden_size]
    last_hidden_state = outputs.last_hidden_state

    if pooling_type == "cls":
        # Position 0 is the sequence-start token (the load warning reports a
        # RobertaModel, so presumably `<s>` rather than a literal [CLS] - confirm).
        cls_embedding = last_hidden_state[:, 0, :]  # Shape: [batch_size, hidden_size]
        return cls_embedding.squeeze(0)

    # pooling_type == "mean": masked average over real (non-padding) tokens.
    mask = inputs['attention_mask'].unsqueeze(-1)  # Shape: [batch_size, sequence_length, 1]
    token_sum = torch.sum(last_hidden_state * mask, dim=1)
    token_count = torch.sum(mask, dim=1)
    mean_pooling = token_sum / token_count
    return mean_pooling.squeeze(0)

Some weights of RobertaModel were not initialized from the model checkpoint at mwz/UrduBert and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# NOTE(review): in the cells shown here `mean_pooling` is only assigned as a local
# inside get_embedding, so this cell presumably relied on a notebook-level variable
# from an earlier kernel state (this is In[3], shown after In[23]) - confirm, or
# replace with an explicit get_embedding(<text>, pooling_type="mean") call.
mean_pooling

tensor([[-0.1206,  0.5497,  0.6291, -0.6541, -0.1881, -0.6269, -0.0763,  0.7037,
          0.8596, -0.1699,  0.1436,  0.6277, -0.7546, -0.7827,  0.1517, -0.6952,
         -1.4332, -0.6462,  0.3964, -0.2134,  1.0889, -0.6394,  1.2095,  1.6724,
          0.7817, -0.4934, -0.1990, -0.2254, -0.1559,  0.5399, -0.6145, -0.5507,
          0.2027,  1.1732,  0.4317,  0.7345, -0.8598, -0.8404,  0.4689,  0.4179,
         -1.0180, -0.3925,  0.2917, -0.5717, -0.7609,  0.5619, -0.5860,  0.0342,
         -0.6626,  1.1468,  0.7482,  1.1459, -0.1212,  0.0868,  0.2955, -0.6337,
         -0.0282,  0.6349,  0.2171, -0.1347, -0.4041,  0.1823, -0.1671,  0.3206,
         -0.8347, -1.7990,  0.8173, -0.7448, -0.1045, -0.1358,  0.2773,  0.0843,
          0.8245,  0.8428, -0.8805, -0.6240,  0.3297,  0.6343, -0.4761,  0.8343,
          0.6483, -0.4176, -0.1682,  0.9472, -0.0649,  0.2426,  0.6476,  0.2449,
          0.0163, -0.3859,  0.0501,  0.2681,  0.2552,  0.0155,  0.5071, -0.6122,
         -0.9226, -0.4496, -

In [4]:
import anthropic
from dotenv import load_dotenv
from datasets import load_from_disk
import pandas as pd

# Load settings from a .env file; the `True` shown as this cell's output is the call's return value.
# Presumably this provides credentials for the `anthropic` import above - TODO confirm.
load_dotenv()

True

In [5]:
# Location of the saved dataset, relative to this notebook's working directory.
COMPLETE_DATASET_DIR = "../../data/complete_dataset/"
dataset_dict = load_from_disk(COMPLETE_DATASET_DIR)

# Print the splits with their features and row counts.
print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['question_eng', 'question_urdu', 'context_eng', 'context_urdu', 'answer_eng', 'answer_urdu', 'context_index', '__index_level_0__'],
        num_rows: 495
    })
    validation: Dataset({
        features: ['question_eng', 'question_urdu', 'context_eng', 'context_urdu', 'answer_eng', 'answer_urdu', 'context_index', '__index_level_0__'],
        num_rows: 124
    })
})


In [10]:
# Show the training split's feature names and row count.
dataset_dict['train']

Dataset({
    features: ['question_eng', 'question_urdu', 'context_eng', 'context_urdu', 'answer_eng', 'answer_urdu', 'context_index', '__index_level_0__'],
    num_rows: 495
})

In [15]:
# Context id of every validation row. A notebook cell renders only its last
# expression, so this cell inspects `context_index` alone; view `context_urdu`
# in a separate cell if it is needed.
# The recorded output repeats some ids (e.g. 182, 172, 226), so several rows
# can share one context.
dataset_dict['validation']['context_index']

[253,
 271,
 209,
 266,
 67,
 167,
 265,
 13,
 229,
 255,
 275,
 50,
 207,
 218,
 125,
 79,
 106,
 257,
 99,
 163,
 45,
 284,
 276,
 242,
 23,
 171,
 137,
 30,
 117,
 187,
 166,
 305,
 71,
 22,
 147,
 62,
 66,
 193,
 237,
 152,
 282,
 101,
 252,
 172,
 85,
 285,
 182,
 232,
 182,
 83,
 74,
 42,
 177,
 190,
 160,
 215,
 80,
 254,
 231,
 219,
 304,
 227,
 214,
 246,
 294,
 98,
 172,
 161,
 168,
 306,
 65,
 226,
 186,
 291,
 104,
 19,
 111,
 226,
 274,
 115,
 14,
 0,
 84,
 99,
 79,
 179,
 183,
 284,
 251,
 148,
 134,
 92,
 188,
 259,
 65,
 157,
 4,
 208,
 222,
 220,
 109,
 121,
 264,
 302,
 62,
 71,
 150,
 250,
 166,
 234,
 128,
 6,
 195,
 201,
 153,
 257,
 154,
 59,
 300,
 159,
 15,
 64,
 77,
 189]

In [18]:
# Map context_index -> Urdu context text across both splits.
# Train rows are visited before validation rows, so when an index occurs more
# than once the text from the last row seen is the one that is kept.
all_contexts = {
    row['context_index']: row['context_urdu']
    for split_name in ('train', 'validation')
    for row in dataset_dict[split_name]
}

In [21]:
# Ascending list of the distinct context ids collected above. The recorded
# output skips a few values (e.g. 100, 138, 139, 158), so ids are not contiguous.
sorted(all_contexts)

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
