In [275]:
import os
import json
import pickle
import numpy as np
import pandas as pd

# Load the full Autocast question set and the competition test set.
# Context managers close the file handles as soon as parsing is done, and the
# explicit UTF-8 encoding avoids depending on the platform default.
with open('../competition/autocast_questions.json', encoding='utf-8') as questions_file:
    autocast_questions = json.load(questions_file)  # from the Autocast dataset
with open('../competition/autocast_competition_test_set.json', encoding='utf-8') as test_set_file:
    test_questions = json.load(test_set_file)

# Ids of the competition test questions; later cells use them to hold those questions out.
test_ids = [q['id'] for q in test_questions]

In [277]:
# Tabular views of the two JSON record lists (one row per question).
autocast_questions_df, test_questions_df = (
    pd.DataFrame(records) for records in (autocast_questions, test_questions)
)
# Single-column frame holding the competition test-set ids.
test_ids_df = pd.DataFrame(test_ids)

In [323]:
# Keep only the true/false questions (qtype 't/f').
autocast_true_false_df = autocast_questions_df.loc[autocast_questions_df['qtype'].eq('t/f')]

In [325]:
# Peek at one question text: .loc is label-based, so this reads index label 2 (not position 2).
autocast_true_false_df.loc[2, 'question']

'Will there be an initial public offering on either the Shanghai Stock Exchange or the Shenzhen Stock Exchange before 1 January 2016?'

In [332]:
# Collect question text, resolved answer and background for every true/false
# question that is usable for evaluation. `autocast_true_false_df` is already
# restricted to qtype 't/f', so no further type check is needed.
in_test_set = autocast_true_false_df['id'].isin(test_ids)  # competition test questions are held out
has_answer = (
    autocast_true_false_df['answer']
    .map(lambda value: value is not None)  # questions without an answer carry None
    .astype(bool)
)
eligible_df = autocast_true_false_df.loc[~in_test_set & has_answer]

question = eligible_df['question'].tolist()
answer = eligible_df['answer'].tolist()
context = eligible_df['background'].tolist()

In [335]:
# Assemble the three parallel lists into one table (columns: question, answer, context).
true_false_question_df = pd.DataFrame(dict(question=question, answer=answer, context=context))


In [337]:
# Preview the first rows of the assembled question/answer/context table.
true_false_question_df.head()

Unnamed: 0,question,answer,context
0,Will there be an initial public offering on ei...,yes,China suspended initial public offerings (IPOs...
1,Will the Export-Import Bank of the United Stat...,yes,The Export-Import Bank's authorization expired...
2,Will a trilateral meeting take place between C...,no,"A trilateral meeting of leaders from China, Ja..."
3,Will Iran release Jason Rezaian before 31 Octo...,yes,For details of the case involving Jason Rezaia...
4,Will North Korea launch a land based missile w...,yes,A launch for military or testing purposes woul...


In [338]:
# Persist the table as CSV in the current working directory.
# The row index is written as well (index=False is not passed), so readers of
# the file should expect an extra, unnamed leading column.
true_false_question_df.to_csv('true-false.csv')

In [339]:
# Class balance of the resolved answers (count per distinct label).
true_false_question_df['answer'].value_counts()

no     1140
yes     450
Name: answer, dtype: int64

In [340]:
# Total number of usable true/false questions.
true_false_question_df['answer'].shape[0]

1590

In [346]:
# Show the first rows of the question table again before running the model.
true_false_question_df.head()

Unnamed: 0,question,answer,context
0,Will there be an initial public offering on ei...,yes,China suspended initial public offerings (IPOs...
1,Will the Export-Import Bank of the United Stat...,yes,The Export-Import Bank's authorization expired...
2,Will a trilateral meeting take place between C...,no,"A trilateral meeting of leaders from China, Ja..."
3,Will Iran release Jason Rezaian before 31 Octo...,yes,For details of the case involving Jason Rezaia...
4,Will North Korea launch a land based missile w...,yes,A launch for military or testing purposes woul...


In [347]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the pre-trained BERT encoder and its tokenizer.
# NOTE: the loader warns that some BertForSequenceClassification weights are
# not initialised from this checkpoint, i.e. the classification head starts
# from random values. Predictions are therefore not meaningful until the
# model has been fine-tuned on labelled yes/no data.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# The yes/no prediction code reads exactly two class scores (index 0 and 1),
# so the label count is pinned explicitly instead of relying on the default.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [357]:
# Yes/no classifier wrapper around the loaded BERT model.
# `answers` accumulates one label per call, in call order.

answers = []
def predict_yes_no(question, context):
    """Classify one question/context pair as 'Yes' or 'No'.

    The pair is tokenized as a two-segment input (padded to the tokenizer's
    maximum length and truncated if too long), passed through the global
    ``model`` without gradient tracking, and the softmax of the logits is
    compared: class 0 maps to 'No', class 1 to 'Yes'. A tie yields 'Yes'.

    The label is appended to the module-level ``answers`` list (unchanged
    behaviour) and is additionally returned, so callers do not have to read
    the result back out of that global.

    Note: the labels are capitalised ('Yes'/'No'); normalise case before
    comparing them with labels that are spelled differently.

    Args:
        question: Question text (first segment).
        context: Background text paired with the question (second segment).

    Returns:
        str: 'Yes' or 'No'.
    """
    # Calling the tokenizer directly replaces the deprecated `encode_plus`.
    encoded = tokenizer(question, context, padding='max_length', truncation=True, return_tensors='pt')

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(encoded['input_ids'], attention_mask=encoded['attention_mask'])
        logits = outputs[0]
        # Batch of one -> squeeze to a flat vector of class probabilities.
        probabilities = torch.softmax(logits, dim=1).squeeze()

    label = 'No' if probabilities[0] > probabilities[1] else 'Yes'
    answers.append(label)
    return label

In [358]:
# Run the classifier over every usable true/false question.
# Start from an empty list so re-running this cell does not append duplicates.
answers.clear()
test_id_set = set(test_ids)  # O(1) membership instead of scanning the list per row

for index, row in autocast_true_false_df.iterrows():
    if row['id'] in test_id_set:  # skipping questions in the competition test set
        continue
    if row['answer'] is None:  # skipping questions without answer
        continue

    # Pass the fields straight through: binding them to `question`/`context`
    # would overwrite the lists of the same name built in an earlier cell.
    predict_yes_no(row['question'], row['background'])

print('Completed')


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Completed


In [359]:
# One row per collected prediction, in the order the predictions were made.
answers_df = pd.DataFrame(data=answers)

In [361]:
# Preview the first predicted labels.
answers_df.head()

Unnamed: 0,0
0,No
1,Yes
2,Yes
3,Yes
4,Yes


In [362]:
# Give the auto-numbered column 0 a readable name (modifies answers_df in place).
answers_df.rename({0: 'Answer'}, axis='columns', inplace=True)


In [363]:
# Display the predictions table with its renamed 'Answer' column.
answers_df

Unnamed: 0,Answer
0,No
1,Yes
2,Yes
3,Yes
4,Yes
...,...
1585,No
1586,No
1587,No
1588,Yes


In [366]:
# Ground-truth answers for exactly the questions that were predicted.
# The prediction loop skips competition test questions and questions whose
# answer is None, so the same filter is applied here; otherwise the rows would
# not line up one-to-one with the predictions.
scored_rows = (
    ~autocast_true_false_df['id'].isin(test_ids)
    & autocast_true_false_df['answer'].map(lambda value: value is not None).astype(bool)
)
# Reset to a 0..n-1 index so rows can be compared positionally with the predictions.
actual_answer_df = autocast_true_false_df.loc[scored_rows, ['answer']].reset_index(drop=True)

In [393]:
def brier_score(probabilities, answer_probabilities):
    """Return the Brier score of a forecast against the true outcome distribution.

    The score is the sum of squared differences between the two vectors,
    divided by 2.

    Args:
        probabilities: Forecast probabilities. Plain lists/tuples are converted
            to float arrays; any other input is used as passed.
        answer_probabilities: Target probabilities, same length as
            ``probabilities``. Lists/tuples are converted the same way.

    Returns:
        The scalar score; 0 means the forecast equals the target.
    """
    # Plain Python sequences do not support element-wise arithmetic, so lift
    # them to arrays; array-like inputs are left untouched.
    if isinstance(probabilities, (list, tuple)):
        probabilities = np.asarray(probabilities, dtype=float)
    if isinstance(answer_probabilities, (list, tuple)):
        answer_probabilities = np.asarray(answer_probabilities, dtype=float)
    return ((probabilities - answer_probabilities) ** 2).sum() / 2

In [None]:
def calibrated_random_baseline_model(question):
    """Near-uniform random forecast for a true/false question.

    Two random numbers are drawn; the class with the larger draw gets a tiny
    (1e-5) bonus on top of a weight of one, and the weights are normalised to
    sum to 1. Questions of any other type yield None.
    """
    if question['qtype'] != 't/f':
        return None
    draws = np.random.random(size=2)
    weights = np.ones(2)
    weights[np.argmax(draws)] += 1e-5  # nudge the randomly favoured class
    return weights / weights.sum()

In [395]:
# Debug variant of the yes/no classifier that also prints the class probabilities.
# Re-running this cell resets `answers` to an empty list.

answers = []
def predict_yes_no_v2(question, context):
    """Classify one question/context pair and expose the class probabilities.

    The pair is tokenized as a two-segment input (padded to the tokenizer's
    maximum length and truncated if too long) and passed through the global
    ``model`` without gradient tracking. Both softmax probabilities are
    printed, and 'No' (class 0 strictly larger) or 'Yes' (otherwise) is
    appended to the module-level ``answers`` list.

    The probability vector is also returned, so it can be scored rather than
    only read off the printed output.

    Args:
        question: Question text (first segment).
        context: Background text paired with the question (second segment).

    Returns:
        torch.Tensor: the two class probabilities, class 0 first.
    """
    # Calling the tokenizer directly replaces the deprecated `encode_plus`.
    encoded = tokenizer(question, context, padding='max_length', truncation=True, return_tensors='pt')

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(encoded['input_ids'], attention_mask=encoded['attention_mask'])
        logits = outputs[0]
        # Batch of one -> squeeze to a flat vector of class probabilities.
        probabilities = torch.softmax(logits, dim=1).squeeze()

    print(probabilities[0])
    print(probabilities[1])

    answers.append('No' if probabilities[0] > probabilities[1] else 'Yes')
    return probabilities

In [400]:
# Smoke-test predict_yes_no_v2 on the first few usable questions.
max_preview = 2  # number of predictions to print before stopping
test_id_set = set(test_ids)  # O(1) membership instead of scanning the list per row

# Count predictions actually made. Counting every visited row would let a
# skipped row step past the limit, and the loop would then run to the end.
count = 0
for index, row in autocast_true_false_df.iterrows():
    if row['id'] in test_id_set:  # skipping questions in the competition test set
        continue
    if row['answer'] is None:  # skipping questions without answer
        continue

    # Pass the fields straight through: binding them to `question`/`context`
    # would overwrite the lists of the same name built in an earlier cell.
    predict_yes_no_v2(row['question'], row['background'])
    count += 1
    if count >= max_preview:
        break
print('Completed')


tensor(0.5081)
tensor(0.4919)
tensor(0.4711)
tensor(0.5289)
Completed


array([0., 0.])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at