<a href="https://colab.research.google.com/github/oguzhanolm/Turkish_Bert_Question_Answering_Fine-Tuning/blob/main/Turkish_QA_case.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
import pandas as pd
from transformers import AutoModel, AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer

## Download and preprocess the dataset

### Flattening the dataset

In [None]:
# Remote JSON files of the Turkish reading-comprehension dataset; the "dev" file
# is loaded as the test split and the "train" file as the training split.
TEST_DATA_URL = 'https://raw.githubusercontent.com/okanvk/Turkish-Reading-Comprehension-Question-Answering-Dataset/master/data/2018%20%2B%202020%20veri%20k%C3%BCmesi/final_dev_data_v2.json'
TRAIN_DATA_URL = 'https://raw.githubusercontent.com/okanvk/Turkish-Reading-Comprehension-Question-Answering-Dataset/master/data/2018%20%2B%202020%20veri%20k%C3%BCmesi/final_train_data_v2.json'

test_data = pd.read_json(TEST_DATA_URL)
train_data = pd.read_json(TRAIN_DATA_URL)

In [None]:
# Remove the "version" column from both raw frames (in place).
for raw_frame in (test_data, train_data):
  del raw_frame["version"]

In [None]:
def flat_json(jsonData):
  """Flatten a nested QA structure into one row per question.

  Args:
    jsonData: Mapping-like object (for example the DataFrame returned by
      ``pd.read_json``) whose ``"data"`` entry iterates over articles. Each
      article holds a ``"paragraphs"`` list; each paragraph holds a
      ``"context"`` string and a ``"qas"`` list whose items carry a
      ``"question"`` and an ``"answers"`` list of
      ``{"text", "answer_start"}`` dicts.

  Returns:
    A DataFrame with the columns ``text``, ``question``, ``answer`` and
    ``answer_start``. Only the first answer of every question is kept;
    questions whose ``"answers"`` list is empty are skipped.
  """
  cols = ["text","question","answer","answer_start"]
  rows = []
  for article in jsonData["data"]:
    for paragraph in article["paragraphs"]:
      context = paragraph["context"]
      for qa in paragraph["qas"]:
        answers = qa["answers"]
        if not answers:
          # Nothing to extract for this question; indexing [0] would raise.
          continue
        first_answer = answers[0]
        rows.append([context, qa["question"], first_answer["text"], first_answer["answer_start"]])
  # Build the frame once; the column list keeps the schema even when rows is empty.
  return pd.DataFrame(rows, columns=cols)

In [None]:
# Replace each raw frame with its flattened one-row-per-question version.
test_data, train_data = flat_json(test_data), flat_json(train_data)

In [None]:
# Display the flattened test split.
test_data

### Labeling the start and end positions of the answer in the text

In [None]:
from warnings import catch_warnings
def add_end_idx(dataset):
  """Add an ``answer_end`` column and repair slightly misaligned answer starts.

  For every row the exclusive end offset is ``answer_start + len(answer)``.
  If the text at that span does not equal the answer, the span is shifted
  left by one and then two characters; the first shift that matches updates
  both ``answer_start`` and ``answer_end``. When nothing matches, the
  unshifted offsets are kept so that ``answer_end`` is always populated.

  Args:
    dataset: DataFrame with the columns ``text``, ``answer`` and
      ``answer_start``. It is not modified.

  Returns:
    A new DataFrame with the same index and columns as ``dataset`` plus
    ``answer_end``.
  """
  records = []
  # Iterate over the argument itself (not a notebook-level frame) so that every
  # split is processed from its own rows.
  for record in dataset.to_dict("records"):
    gold_text = record["answer"]
    start_idx = record["answer_start"]
    end_idx = start_idx + len(gold_text)
    context = record["text"]

    # Default to the unshifted span; overwritten below if a shifted span matches.
    record["answer_end"] = end_idx
    if context[start_idx:end_idx] != gold_text:
      for shift in (1, 2):
        # Guard against a negative slice start, which would wrap around the string.
        if start_idx >= shift and context[start_idx - shift:end_idx - shift] == gold_text:
          record["answer_start"] = start_idx - shift
          record["answer_end"] = end_idx - shift
          break
    records.append(record)

  columns = list(dataset.columns)
  if "answer_end" not in columns:
    columns.append("answer_end")
  return pd.DataFrame(records, index=dataset.index, columns=columns)

In [None]:
# Attach the answer_end column to both splits.
test_data, train_data = add_end_idx(test_data), add_end_idx(train_data)

In [None]:
# Display the test split after the answer_end column has been added.
test_data

Unnamed: 0,text,question,answer,answer_start,answer_end
0,Osman Bey 1258 yılında Söğüt’te doğdu. Osman B...,Osman Bey ne zaman doğmuştur?,1258,10,14
1,Osman Bey 1258 yılında Söğüt’te doğdu. Osman B...,Osman Bey nerede doğmuştur?,Söğüt’te,23,31
2,Osman Bey 1258 yılında Söğüt’te doğdu. Osman B...,1258 yılında kim doğmuştur?,Osman Bey,0,9
3,Osman Bey 1258 yılında Söğüt’te doğdu. Osman B...,Osman Bey hayatını nerede kaybetmiştir?,Bursa’da,67,75
4,Osman Bey 1258 yılında Söğüt’te doğdu. Osman B...,Osman Bey hayatını ne zaman kaybetmiştir?,1326’da,59,66
...,...,...,...,...,...
1325,"Sencer Divitçioğlu, 14 Şubat 1927'de İstanbul’...",Sencer Divitçioğlu nerede 1968-1969'da tekrar ...,Cambridge'de,924,936
1326,"Sencer Divitçioğlu, 14 Şubat 1927'de İstanbul’...",Sencer Divitçioğlu ne zaman Boğaziçi Üniversit...,1975'te,971,978
1327,"Sencer Divitçioğlu, 14 Şubat 1927'de İstanbul’...",Sencer Divitçioğlu ne zaman profesörlüğe yükse...,1976 yılında,1029,1041
1328,"Sencer Divitçioğlu, 14 Şubat 1927'de İstanbul’...",Sencer Divitçioğlu ne zaman Paris Üniversites...,1983 yılında,1065,1077


### Separating training and testing data

In [None]:
# Questions carry no character offsets, so leading whitespace can be removed safely.
test_questions = [q.lstrip() for q in test_data["question"]]
train_questions = [q.lstrip() for q in train_data["question"]]

# Contexts are passed through unchanged: answer_start / answer_end are character
# offsets into the text as loaded, and stripping leading whitespace would shift
# every offset of an affected row.
test_text = test_data["text"].tolist()
train_text = train_data["text"].tolist()

In [None]:
# Pretrained checkpoint that is fine-tuned below; tokenizer and model share the same id.
BASE_CHECKPOINT = "loodos/bert-base-turkish-uncased"

# NOTE(review): do_lower_case=False is passed although the checkpoint name says
# "uncased" — presumably the text is expected to be normalized elsewhere; confirm.
tokenizer = AutoTokenizer.from_pretrained(BASE_CHECKPOINT, do_lower_case=False)
model = AutoModelForQuestionAnswering.from_pretrained(BASE_CHECKPOINT)

Downloading:   0%|          | 0.00/161 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/260k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at loodos/bert-base-turkish-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized f

In [None]:
from transformers import default_data_collator

# Alias for the library's default collator. None of the cells visible in this
# notebook reference it afterwards.
data_collator = default_data_collator

In [None]:
# Encode every (context, question) pair with the same truncation/padding settings.
encode_options = dict(truncation=True, padding=True)
test_encodings = tokenizer(test_text, test_questions, **encode_options)
train_encodings = tokenizer(train_text, train_questions, **encode_options)

In [None]:
def add_token_positions(encodings, answers, truncated_position=None):
    """Convert character-level answer spans into token-level start/end positions.

    Args:
        encodings: Object exposing ``char_to_token(batch_index, char_index)``,
            which returns a token index or ``None`` when the character is not
            covered by any token.
        answers: Table-like object whose ``'answer_start'`` and ``'answer_end'``
            entries are iterable in row order. ``answer_end`` is treated as an
            exclusive character offset.
        truncated_position: Position stored when no token can be found for the
            answer (presumably because the span was truncated away). Defaults
            to ``tokenizer.model_max_length`` of the notebook-level tokenizer.

    Returns:
        ``(start_positions, end_positions)``: two lists of ints, one entry per row.
    """
    if truncated_position is None:
        truncated_position = tokenizer.model_max_length

    start_positions = []
    end_positions = []
    # Iterate positionally so that a non-default DataFrame index cannot misalign rows.
    char_spans = zip(list(answers['answer_start']), list(answers['answer_end']))
    for i, (char_start, char_end) in enumerate(char_spans):
        char_start, char_end = int(char_start), int(char_end)

        start_token = encodings.char_to_token(i, char_start)

        # answer_end is exclusive, so the answer's last character sits at
        # char_end - 1. Looking up char_end itself would select the *next* token
        # whenever the answer is directly followed by a non-space character.
        # Walk back (bounded by the answer start) past characters with no token.
        end_token = None
        for char_idx in range(char_end - 1, char_start - 1, -1):
            end_token = encodings.char_to_token(i, char_idx)
            if end_token is not None:
                break

        start_positions.append(truncated_position if start_token is None else start_token)
        end_positions.append(truncated_position if end_token is None else end_token)

    return start_positions, end_positions

In [None]:
train_encodings["start_positions"],train_encodings["end_positions"] = add_token_positions(train_encodings, train_data[["answer","answer_start","answer_end"]])
test_encodings["start_positions"],test_encodings["end_positions"] = add_token_positions(test_encodings,test_data[["answer","answer_start","answer_end"]])

## PyTorch Fine-tuning and Evaluation


In [None]:
import torch

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over column-oriented encodings.

    ``encodings`` is any mapping from field name to an indexable sequence with
    one entry per example (a tokenizer output or a plain dict). It must contain
    an ``"input_ids"`` field, which determines the dataset length.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        # One tensor per field for the example at position idx.
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        # Key access works for plain dicts as well as for tokenizer outputs.
        return len(self.encodings["input_ids"])

# Wrap both encoded splits so they can be fed to a DataLoader.
train_dataset, test_dataset = Dataset(train_encodings), Dataset(test_encodings)

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

# Fine-tuning hyperparameters.
NUM_EPOCHS = 6
TRAIN_BATCH_SIZE = 8
LEARNING_RATE = 5e-5

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# transformers.AdamW is deprecated in favour of the PyTorch optimizer. eps and
# weight_decay are set explicitly to the defaults of the deprecated class so the
# update rule stays the same.
optim = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)

train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)

for epoch in range(NUM_EPOCHS):
    # Training mode is (re)enabled at the start of every epoch.
    model.train()

    loop = tqdm(train_loader, leave=True)
    loop.set_description(f'Epoch {epoch}')
    for batch in loop:
        optim.zero_grad()

        # NOTE(review): only input_ids and attention_mask are forwarded; if the
        # tokenizer also emits token_type_ids they are not used here — confirm
        # this is intended.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)

        # With start/end positions supplied, the first output is the loss.
        loss = outputs[0]
        loss.backward()
        optim.step()

        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 167/167 [05:12<00:00,  1.87s/it, loss=2.19]
Epoch 1: 100%|██████████| 167/167 [05:11<00:00,  1.87s/it, loss=2.07]
Epoch 2: 100%|██████████| 167/167 [05:12<00:00,  1.87s/it, loss=0.686]
Epoch 3: 100%|██████████| 167/167 [05:11<00:00,  1.87s/it, loss=0.0859]
Epoch 4: 100%|██████████| 167/167 [05:11<00:00,  1.87s/it, loss=0.0986]
Epoch 5: 100%|██████████| 167/167 [05:11<00:00,  1.87s/it, loss=0.094]


In [None]:
# Persist the fine-tuned weights and the tokenizer files into the same directory.
# NOTE(review): the path lives under /content/drive — presumably Google Drive must
# be mounted first; no mount cell is visible in this notebook.
model_path = '/content/drive/MyDrive/models/loodos-bert-base-uncased-QA-fine-tuned'
model.save_pretrained(model_path)
# Last expression of the cell: its return value is what the cell displays.
tokenizer.save_pretrained(model_path)

('/content/drive/MyDrive/models/loodos-bert-base-uncased-QA-fine-tuned/tokenizer_config.json',
 '/content/drive/MyDrive/models/loodos-bert-base-uncased-QA-fine-tuned/special_tokens_map.json',
 '/content/drive/MyDrive/models/loodos-bert-base-uncased-QA-fine-tuned/vocab.txt',
 '/content/drive/MyDrive/models/loodos-bert-base-uncased-QA-fine-tuned/added_tokens.json',
 '/content/drive/MyDrive/models/loodos-bert-base-uncased-QA-fine-tuned/tokenizer.json')

In [None]:
model.eval()

val_loader = DataLoader(test_dataset, batch_size=16)

# Count matches over all predictions instead of averaging per-batch means, so a
# smaller final batch is not over-weighted.
correct = 0  # start/end predictions that equal the gold token position
total = 0    # predictions scored so far (one start and one end per example)

loop = tqdm(val_loader)

for batch in loop:

    with torch.no_grad():

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)

        # Most likely start / end token per example.
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)

        correct += (start_pred == start_true).sum().item()
        correct += (end_pred == end_true).sum().item()
        total += len(start_pred) + len(end_pred)

# Exact-match rate of token positions, pooled over start and end predictions.
acc = correct / total if total else float('nan')

100%|██████████| 84/84 [01:51<00:00,  1.33s/it]


In [None]:
# Show the accuracy value computed by the evaluation cell.
print(acc)

0.9122023809523809


In [None]:
# Gold vs. predicted token positions for the tensors left over from the most
# recently evaluated batch.
print("T/F\tstart\tend\n")
for gold_start, gold_end, guess_start, guess_end in zip(start_true, end_true, start_pred, end_pred):
    print(f"true\t{gold_start}\t{gold_end}\n"
          f"pred\t{guess_start}\t{guess_end}\n")

T/F	start	end

true	201	202
pred	201	202

true	7	12
pred	7	12



## Model Load & Manual Test

In [None]:
!pip install transformers

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Identifier of the published fine-tuned checkpoint; model and tokenizer load from the same id.
FINE_TUNED_CHECKPOINT = "oguzhanolm/loodos-bert-base-uncased-QA-fine-tuned"

model = AutoModelForQuestionAnswering.from_pretrained(FINE_TUNED_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(FINE_TUNED_CHECKPOINT)

In [None]:
from transformers import pipeline
# Question-answering pipeline built from the model and tokenizer loaded above.
nlp = pipeline(task='question-answering', tokenizer=tokenizer, model=model)

In [None]:
def ask(question,context):
  """Return the substring of ``context`` that the QA pipeline selects as the answer.

  The notebook-level ``nlp`` pipeline is called with the question and context;
  the ``"start"`` and ``"end"`` offsets of its result are used to slice the
  context.
  """
  prediction = nlp(question=question, context=context)
  answer_span = slice(prediction["start"], prediction["end"])
  return context[answer_span]

In [None]:
# Turkish context paragraph about Istanbul, used as input for the manual questions below.
istanbul="İstanbul, Türkiye'de Marmara Bölgesi'nde yer alan şehir ve Türkiye Cumhuriyeti Devletinin 81 ilinden biridir. Ülkenin nüfus bakımından en çok göç alan ve en kalabalık ilidir. Ekonomik, tarihî ve sosyo-kültürel açıdan önde gelen şehirlerden biridir. Şehir, iktisadi büyüklük açısından dünyada 34. sırada yer alır. Nüfuslarına göre şehirler listesinde belediye sınırları göz önüne alınarak yapılan sıralamaya göre Avrupa'da birinci, dünyada ise altıncı sırada yer almaktadır."

In [None]:
# soru = question, cevap = answer. The question asks which rank Istanbul holds
# in terms of size.
soru1 = "İstanbul büyüklük açısından kaçıncı sıradadır?"
cevap1 = ask(soru1,istanbul)
print(cevap1)

34.


In [None]:
# The question asks where Istanbul is located.
soru2 = "İstanbul nerede bulunur?"
cevap2 = ask(soru2,istanbul)
print(cevap2)

Türkiye'de Marmara Bölgesi'nde
