Vardas, pavardė: Mėta Bambalaitė

LSP: 1813061

Antra užduotis, variantas T = 2

*   Modelis: Albert

Based on: https://huggingface.co/transformers/model_doc/albert.html

Mount to google drive

In [5]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Access google drive folder

In [6]:
%cd /content/gdrive/MyDrive/GMM-second

/content/gdrive/MyDrive/GMM-second


Import libraries and dependencies

In [9]:
import json
from pathlib import Path

!pip install sentencepiece
!pip install transformers

from transformers import AlbertTokenizer, AlbertModel
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from torch.utils.data import DataLoader
from transformers import AdamW, get_scheduler, pipeline
import torch
from tqdm.auto import tqdm



Download Squad v2.0 datasets

In [None]:
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O train-v2.json
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O dev-v2.json

Run model on Squad v2



In [7]:
# Read data from json file
def readData(path):
    path = Path(path)
    with open(path, 'rb') as f:
        squad_dict = json.load(f)

    contexts = []
    questions = []
    answers = []

    for group in squad_dict['data']: # navigating through groups
        for passage in group['paragraphs']: # then through paragraphs
            context = passage['context'] # getting the contectex message
            for qa in passage['qas']: # geting the question and answer
                question = qa['question'] # collecting questions
                for answer in qa['answers']: # collecting answers
                    contexts.append(context)
                    questions.append(question)
                    answers.append(answer)
   
    return contexts, questions, answers #returning the formed text

# Get character position at which the answer ends in the passage
def addEndIndexes(answers, contexts):
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # sometimes squad answers are off by a character or two – fix this
        if context[start_idx:end_idx] == gold_text:
            answer['answer_end'] = end_idx
        elif context[start_idx-1:end_idx-1] == gold_text:
            answer['answer_start'] = start_idx - 1
            answer['answer_end'] = end_idx - 1   # off by one character
        elif context[start_idx-2:end_idx-2] == gold_text:
            answer['answer_start'] = start_idx - 2
            answer['answer_end'] = end_idx - 2   # off by two characters


# Read data from file
train_contexts, train_questions, train_answers = readData('train-v2.json')
val_contexts, val_questions, val_answers = readData('dev-v2.json')

# Include character end positions and the corrected start positions to answers
addEndIndexes(train_answers, train_contexts)
addEndIndexes(val_answers, val_contexts)

# Tokenizing context/question pairs
tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')#AutoTokenizer.from_pretrained('twmkn9/albert-base-v2-squad2')
model = AutoModelForQuestionAnswering.from_pretrained('albert-base-v2')#AutoModelForQuestionAnswering.from_pretrained('twmkn9/albert-base-v2-squad2')

train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

# Convert character start/end positions to token start/end positions.
def addTokenPositions(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        if end_positions[-1] is None:
            end_positions[-1] = tokenizer.model_max_length

    # where the answer and context start and end
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

addTokenPositions(train_encodings, train_answers)
addTokenPositions(val_encodings, val_answers)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=684.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760289.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1312669.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=47376696.0, style=ProgressStyle(descrip…




Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForQuestionAnswering: ['predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.bias', 'predictions.decoder.bias', 'predictions.decoder.weight', 'predictions.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN t

Test tokenization

In [117]:
def show_answer(idx):
    print("Tokenized:", tokenizer.decode(train_encodings['input_ids'][idx][train_encodings['start_positions'][idx]: train_encodings['end_positions'][idx]]))
    print("Real:", train_answers[idx]['text'])
    print("Context:", train_contexts[idx])
    print("Questions:", train_questions[idx])

In [123]:
show_answer(7)

Tokenized: mathew know
Real: Mathew Knowles
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Questions: Who managed the Destiny's Child group?


Train model

In [18]:
class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

# use cuda if possible
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

# load model training parameters
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
# use standart Adam optimizer
optim = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler("linear", optimizer=optim, num_warmup_steps=0, num_training_steps=num_training_steps)
#Adding the progress bar.
progress_bar = tqdm(range(num_training_steps))

x = 0;

for epoch in range(2):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()
        progress_bar.update(1)
        if x>10000:
          break;
        x+=1;

model.eval()

model.save_pretrained("/content/gdrive/MyDrive/GMM-second")


HBox(children=(FloatProgress(value=0.0, max=130233.0), HTML(value='')))

Retrained model evaluation

In [19]:
model_pret = AutoModelForQuestionAnswering.from_pretrained('/content/gdrive/MyDrive/GMM-second')

tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')

nlp = pipeline('question-answering', model=model, tokenizer=tokenizer,device=0)
QA_input = {
    'question': 'Whats the largest city?',
    'context' : 'New Zealand (Māori: Aotearoa) is a sovereign island country in the southwestern Pacific Ocean. It has a total land area of 268,000 square kilometres (103,500 sq mi), and a population of 4.9 million. New Zealands capital city is Wellington, and its most populous city is Auckland.'
}

result = nlp(QA_input,topk = 3)
print(result)

[{'score': 0.4171760380268097, 'start': 270, 'end': 279, 'answer': 'Auckland.'}, {'score': 0.16530218720436096, 'start': 228, 'end': 279, 'answer': 'Wellington, and its most populous city is Auckland.'}, {'score': 0.0884709283709526, 'start': 228, 'end': 239, 'answer': 'Wellington,'}]


Validating with validation file

In [20]:
# read validation file
validationFile = open("task_2_test_set_questions.txt", "r")
data = validationFile.read()

# split list and format it
splittedToArray = data.split("\n")
splittedToArray = splittedToArray[:-1]
formattedList = []

for line in splittedToArray:
    context = line.split(",")[0]
    context.strip()
    question = line.split(",")[1]
    question.strip()
    formattedList.append({"question":question,"context":context})

nlp = pipeline('question-answering', model=model, tokenizer=tokenizer,device=0)
QA_input = formattedList

rez = nlp(QA_input,topk = 1)

validationFile = open("studentid.csv", "w")

resultText = ""

for value in rez:
    resultText += value["answer"] + "\n"

validationFile.write(resultText)
validationFile.close()