# Parsbert 

### Libraries

In [1]:
# !pip install transformers
# ! pip install nltk
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# ! wget https://raw.githubusercontent.com/language-ml/2-LM-embedding-projects/main/problem3/evaluation_IR.yml -P ./data
# ! wget https://github.com/language-ml/2-LM-embedding-projects/raw/main/problem3/doc_collection.zip -P ./data
# ! unzip ./data/doc_collection.zip -d ./data

### Read data

set path of data in `PATH` variable

In [3]:
PATH = './data/IR_dataset/'
PATH = PATH.rstrip('/')

store txt files into a list named `doc`

In [4]:
from nltk.tokenize import sent_tokenize

docs = []
for index in range(0, 3258):
    with open(f"{PATH}/{index}.txt", 'r', encoding='utf8') as file_reader:
      doc = file_reader.read()
      doc = sent_tokenize(doc)
      docs.append(doc)

In [5]:
docs = [sent for doc in docs for sent in doc]

In [6]:
len(docs)

67319

### Check GPU Availability

In [7]:
import torch
torch.cuda.is_available()

True

### Implementation

In [8]:
from transformers import AutoConfig, AutoTokenizer, BertForQuestionAnswering

Model = "HooshvareLab/bert-base-parsbert-uncased"
config = AutoConfig.from_pretrained(Model)
tokenizer = AutoTokenizer.from_pretrained(Model)
model = BertForQuestionAnswering.from_pretrained(Model)

Some weights of the model checkpoint at HooshvareLab/bert-base-parsbert-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initia

In [9]:
print("Sentence embeddings:")
print(model.parameters())

Sentence embeddings:
<generator object Module.parameters at 0x7fbb601ebc50>


In [10]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [11]:
out_docs = []
for sent in docs:
  if len(sent.split())>511:
    continue
  out_docs.append(sent)

In [12]:
tokens = tokenizer(sent, return_tensors='pt', padding=True, truncation=True)

# contextualized embedding
with torch.no_grad():
    output_model = model(**tokens)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [14]:
output_model

QuestionAnsweringModelOutput([('start_logits',
                               tensor([[-0.8131,  0.3454, -0.0252, -0.0262,  0.3414,  0.1872,  1.0214,  0.7793,
                                         1.2127,  0.5870,  0.0679, -0.2488,  0.1204,  0.4581,  0.2372,  0.1739,
                                        -0.0944, -0.1171]])),
                              ('end_logits',
                               tensor([[-0.5771, -1.1046, -0.1109,  0.0591,  0.0641,  0.2661, -0.3577,  0.7499,
                                        -0.4978, -1.4171, -0.9818, -0.3441, -0.8393,  0.1422,  0.4629,  0.0050,
                                         0.0696,  0.0614]]))])

In [13]:
# Perform pooling. In this case, mean pooling.
sentence_embeddings = mean_pooling(output_model, tokens['attention_mask'])

RuntimeError: ignored

In [None]:
from transformers import pipeline

fill = pipeline('fill-mask', model=Model, tokenizer=Model)
results = fill('تهران پایتخت [MASK] است.')
print(results[0]['token_str'])

In [None]:
print("Sentence embeddings:")
print(sentence_embeddings.shape)

In [None]:
docs[:10]

In [None]:
bag = list(set([item for sentence in docs for item in sentence.split(' ') if item != '']))
bag_size = len(bag)
bag_size

In [None]:
import random

sentence_a = []
sentence_b = []
label = []

for sentence in docs:
    num_sentences = len(sentence)
    if num_sentences > 1:
        start = random.randint(0, num_sentences-2)
        # 50/50 whether is IsNextSentence or NotNextSentence
        if random.random() >= 0.5:
            # this is IsNextSentence
            sentence_a.append(sentence[start])
            sentence_b.append(sentence[start+1])
            label.append(0)
        else:
            index = random.randint(0, bag_size-1)
            # this is NotNextSentence
            sentence_a.append(sentence[start])
            sentence_b.append(bag[index])
            label.append(1)

In [None]:
inputs = tokenizer(sentence_a, sentence_b, return_tensors='pt', max_length=512, truncation=True, padding='max_length')

In [None]:
inputs.keys()

In [None]:
inputs['labels'] = torch.LongTensor([label]).T

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# and move our model over to the selected device
model.to(device)

In [None]:
from transformers import AdamW

# activate training mode
model.train()
# initialize optimizer
optim = AdamW(model.parameters(), lr=5e-6)

In [None]:
class MeditationsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [None]:
dataset = MeditationsDataset(inputs)

In [None]:
torch.cuda.empty_cache()
loader = torch.utils.data.DataLoader(dataset, batch_size=8, shuffle=True)

In [None]:
from tqdm import tqdm  # for our progress bar

epochs = 10

for epoch in range(epochs):
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        print(input_ids.shape)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

In [None]:
print(torch.cuda.memory_summary(device=None, abbreviated=False))