# ParsBERT

### Libraries

In [3]:
# !pip install transformers
# ! pip install nltk
import nltk
# Download NLTK's 'punkt' package; the value displayed below the cell is this call's return value.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# ! wget https://raw.githubusercontent.com/language-ml/2-LM-embedding-projects/main/problem3/evaluation_IR.yml -P ./data
# ! wget https://github.com/language-ml/2-LM-embedding-projects/raw/main/problem3/doc_collection.zip -P ./data
# ! unzip ./data/doc_collection.zip -d ./data

### Read data

Set the path to the data directory in the `PATH` variable.

In [5]:
# Data directory, kept without a trailing slash so files can be addressed as f"{PATH}/<name>".
PATH = './data/IR_dataset/'.rstrip('/')

Read the `.txt` files into a list named `docs` (one string per document).

In [25]:
from nltk.tokenize import sent_tokenize

# Read the numbered files 0.txt .. 3257.txt from PATH, keeping each file's
# full text as a single string in `docs` (index in the list == file number).
docs = []
for doc_id in range(3258):
    doc_path = f"{PATH}/{doc_id}.txt"
    with open(doc_path, mode='r', encoding='utf8') as handle:
        docs.append(handle.read())

In [26]:
# docs = [sent for doc in docs for sent in doc]

In [27]:
# Sanity check: number of documents that were loaded into `docs`.
len(docs)

3258

### Check GPU Availability

In [9]:
import torch
# Displays whether PyTorch reports a usable CUDA device in this runtime.
torch.cuda.is_available()

True

### Implementation

In [10]:
from transformers import AutoConfig, AutoTokenizer, AutoModel

# Checkpoint identifier handed to both from_pretrained calls below.
# NOTE(review): the saved output under this cell mentions a different checkpoint
# and model class, so it appears to come from an earlier run — re-run to refresh.
Model = 'm3hrdadfi/bert-zwnj-wnli-mean-tokens'
tokenizer = AutoTokenizer.from_pretrained(Model)
model = AutoModel.from_pretrained(Model)


Downloading:   0%|          | 0.00/434 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624M [00:00<?, ?B/s]

Some weights of the model checkpoint at HooshvareLab/bert-base-parsbert-uncased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initia

In [11]:
# NOTE(review): model.parameters() is a generator, so this prints the generator's
# repr under the "Sentence embeddings:" label rather than any embedding values.
print("Sentence embeddings:", model.parameters(), sep="\n")

Sentence embeddings:
<generator object Module.parameters at 0x7f217c41e750>


In [None]:
# Max Pooling - Take the max value over time for every dimension. 
def max_pooling(model_output, attention_mask):
    """Max-pool token embeddings over the sequence axis, ignoring padding.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings; the code indexes it as
            (batch, sequence, hidden) because it pools over dimension 1.
        attention_mask: Tensor of shape (batch, sequence); positions equal
            to 0 are treated as padding and excluded from the maximum.

    Returns:
        Tensor of shape (batch, hidden) with the per-dimension maximum over
        the non-padding tokens of each sequence.
    """
    token_embeddings = model_output[0]  # first element contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # Out-of-place fill: padding positions get a large negative value so they can
    # never win the max, and the caller's model output is left untouched.
    masked_embeddings = token_embeddings.masked_fill(input_mask_expanded == 0, -1e9)
    # torch.max over a dim returns (values, indices); keep the values.
    return torch.max(masked_embeddings, 1)[0]

In [12]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, weighted by the attention mask.

    Args:
        model_output: Indexable model output; element 0 holds the token embeddings.
        attention_mask: Mask tensor, broadcast to the embeddings' shape after a
            trailing axis is added; zero entries drop that token from the average.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask count
        (clamped to at least 1e-9 so an all-zero mask does not divide by zero).
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts

In [13]:
# Keep only documents with at most 511 whitespace-separated tokens.
# The loop variable is deliberately still called `sent`: after the loop it holds
# the last document, and the tokenizer cell that follows reads it.
out_docs = []
for sent in docs:
    if len(sent.split()) <= 511:
        out_docs.append(sent)

In [14]:
# Pick the sample document explicitly instead of relying on the variable leaked
# by the filtering loop (which leaves `sent` bound to the last element of `docs`).
sent = docs[-1]

# Without max_length the tokenizer warns and performs no truncation at all;
# 512 matches the limit used for the sentence-pair encoding later in the notebook.
tokens = tokenizer(sent, return_tensors='pt', padding=True, truncation=True, max_length=512)

# contextualized embedding (inference only, so no gradients are tracked)
with torch.no_grad():
    output_model = model(**tokens)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [15]:
# Display the raw model output produced by the previous cell.
output_model

QuestionAnsweringModelOutput([('start_logits',
                               tensor([[ 0.0557, -0.0551, -0.5949, -0.2008,  0.1399, -0.5041, -0.1848,  0.7521,
                                         0.1791,  0.1210,  0.2476,  0.3125,  0.4015, -0.1153,  0.1680, -0.7147,
                                        -0.5307, -0.5388]])),
                              ('end_logits',
                               tensor([[-0.2236, -0.3130, -0.9328, -0.9938, -0.2587, -0.3465, -0.3260, -0.8135,
                                        -1.0693, -0.0858, -0.2736, -0.3655, -0.2351, -0.2868,  0.4004, -0.6402,
                                        -0.4271, -0.4745]]))])

In [None]:
# Perform pooling. In this case, max pooling (masked by the attention mask).
# NOTE(review): the saved output of this cell is a RuntimeError — re-run after fixing upstream cells.
sentence_embeddings = max_pooling(output_model, tokens['attention_mask'])

RuntimeError: ignored

In [None]:
from transformers import pipeline

# Build a fill-mask pipeline from the same checkpoint id used for the tokenizer/model above.
fill = pipeline('fill-mask', model=Model, tokenizer=Model)
# Ask the pipeline to fill the [MASK] slot of a Persian sentence.
results = fill('تهران پایتخت [MASK] است.')
# Print the token string of the first returned candidate.
print(results[0]['token_str'])

In [None]:
# Report the shape of the pooled embeddings under a heading line.
print("Sentence embeddings:", sentence_embeddings.shape, sep="\n")

In [None]:
# Preview the first ten documents.
docs[:10]

In [None]:
# Collect the distinct non-empty pieces obtained by splitting every document on
# single spaces, then show how many there are.
unique_items = {
    piece
    for document in docs
    for piece in document.split(' ')
    if piece != ''
}
bag = list(unique_items)
bag_size = len(bag)
bag_size

In [None]:
import random

# Parallel lists describing next-sentence-prediction training pairs:
#   sentence_a[i], sentence_b[i] -> the pair; label[i] -> 0 = IsNextSentence, 1 = NotNextSentence
sentence_a = []
sentence_b = []
label = []

# `docs` holds each document as one raw string, so indexing it yields single
# characters. Split every document into sentences first so that the pairs below
# are built from real sentences.
docs_as_sentences = [sent_tokenize(doc) for doc in docs]

# Pool of every sentence in the corpus, used to draw the random "not next"
# sentence (a sentence pool rather than a bag of single words).
sentence_pool = [s for sentences in docs_as_sentences for s in sentences]

for sentences in docs_as_sentences:
    num_sentences = len(sentences)
    # A document needs at least two sentences to provide a (current, next) pair.
    if num_sentences > 1:
        start = random.randint(0, num_sentences - 2)
        sentence_a.append(sentences[start])
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            sentence_b.append(sentences[start + 1])
            label.append(0)
        else:
            # this is NotNextSentence
            sentence_b.append(random.choice(sentence_pool))
            label.append(1)

In [None]:
# Encode the sentence pairs as PyTorch tensors, padded/truncated to exactly 512 tokens.
inputs = tokenizer(
    sentence_a,
    sentence_b,
    padding='max_length',
    truncation=True,
    max_length=512,
    return_tensors='pt',
)

In [None]:
# List the fields the tokenizer produced for the sentence pairs.
inputs.keys()

In [None]:
# Attach the pair labels as an int64 column vector (one row per sentence pair).
inputs['labels'] = torch.LongTensor(label).unsqueeze(1)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model onto the selected device (the cell displays the returned module).
model.to(device)

In [None]:
# activate training mode
model.train()
# initialize optimizer
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are pinned to the values transformers.AdamW used by
# default, so the optimisation hyper-parameters stay the same.
optim = torch.optim.AdamW(model.parameters(), lr=5e-6, eps=1e-6, weight_decay=0.0)

In [None]:
class MeditationsDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of equally long, per-example encodings.

    `encodings` must support `.items()` and `['input_ids']`; each value must be
    indexable by an integer example index (tensors or nested lists both work).
    """

    def __init__(self, encodings):
        self.encodings = encodings

    @staticmethod
    def _copy_as_tensor(value):
        """Return an independent tensor copy of one example's field."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning;
            # clone().detach() is the recommended equivalent.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with one tensor per field for example `idx`."""
        return {key: self._copy_as_tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Number of examples, taken from the 'input_ids' field."""
        # Key access works for tokenizer outputs and plain dicts alike.
        return len(self.encodings['input_ids'])

In [None]:
# Wrap the tokenized sentence pairs (including their labels) in the dataset class defined above.
dataset = MeditationsDataset(inputs)

In [None]:
# Release cached GPU memory before building the batches.
torch.cuda.empty_cache()
# Shuffled mini-batches of 8 examples each.
loader = torch.utils.data.DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
)

In [None]:
from tqdm import tqdm  # for our progress bar

epochs = 10

# NOTE(review): the loop passes `labels` and reads `outputs.loss`, so `model`
# must be a task head that accepts labels — confirm the class loaded above does.
for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    # the description only depends on the epoch, so set it once per epoch
    loop.set_description(f'Epoch {epoch}')
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Segment ids tell the model which tokens belong to sentence A vs. B;
        # forward them when the tokenizer produced them (None otherwise).
        token_type_ids = batch.get('token_type_ids')
        if token_type_ids is not None:
            token_type_ids = token_type_ids.to(device)
        # process
        outputs = model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels,
        )
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_postfix(loss=loss.item())

In [None]:
# Print PyTorch's CUDA memory report in its full (non-abbreviated) form.
# device=None presumably selects the current device — TODO confirm.
print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [16]:
# Number of highest-scoring documents to keep per query (used by torch.topk below).
top_k = 5

In [18]:
!pip install -qU sentence-transformers

[K     |████████████████████████████████| 78 kB 2.9 MB/s 
[K     |████████████████████████████████| 1.2 MB 11.3 MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone


In [21]:
from sentence_transformers import models, SentenceTransformer, util

In [22]:
def load_st_model(model_name_or_path):
    """Assemble a SentenceTransformer that mean-pools a transformer's token embeddings.

    Args:
        model_name_or_path: Identifier or path passed straight to `models.Transformer`.

    Returns:
        A `SentenceTransformer` made of the transformer module followed by a
        pooling module with only mean-token pooling enabled (CLS and max
        pooling are switched off).
    """
    transformer = models.Transformer(model_name_or_path)
    embedding_dim = transformer.get_word_embedding_dimension()
    mean_pooler = models.Pooling(
        embedding_dim,
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=False,
        pooling_mode_max_tokens=False,
    )
    return SentenceTransformer(modules=[transformer, mean_pooler])

In [29]:
# Load the Sentence-Transformer (transformer + mean pooling, see load_st_model).
# The checkpoint id is the same string assigned to `Model` earlier in the notebook.
embedder = load_st_model('m3hrdadfi/bert-zwnj-wnli-mean-tokens')

In [30]:
# Embed every document in `docs` once, returned as a tensor, with a progress bar.
corpus_embeddings = embedder.encode(docs, convert_to_tensor=True, show_progress_bar=True)

Batches:   0%|          | 0/102 [00:00<?, ?it/s]

In [34]:
queries = [
  'آدولف هیتلر  شکست و مرگ'
]

In [35]:
# Score every corpus document against each query by cosine similarity.
# `top_results` is rebound on each iteration, so after the loop it holds the
# result for the last query only.
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True, show_progress_bar=True)
    similarity = util.pytorch_cos_sim(query_embedding, corpus_embeddings)
    # Row 0 holds the scores of this query against all documents; move to CPU.
    cos_scores = similarity[0].cpu()

    # torch.topk returns the top_k highest scores together with their indices
    top_results = torch.topk(cos_scores, k=top_k)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [36]:
# Show the top-k scores and the matching document indices for the last query.
top_results

torch.return_types.topk(values=tensor([0.8791, 0.8606, 0.8437, 0.8388, 0.8377]), indices=tensor([351, 343, 482, 342, 345]))