In [1]:
import os
import json
import pickle
import numpy as np
import pandas as pd

In [2]:
autocast_questions = json.load(open('../competition/autocast_questions.json')) # from the Autocast dataset
test_questions = json.load(open('../competition/autocast_competition_test_set.json'))
test_ids = [q['id'] for q in test_questions]

In [3]:
autocast_questions_df = pd.DataFrame(autocast_questions)

In [9]:
autocast_questions_df.head()
print(len(autocast_questions_df))

6532


In [23]:
# Create a new dataframe with three columns
mcq_df = pd.DataFrame(columns=["question", "choices", "answer"])
true_false_df = pd.DataFrame(columns=["question", "choices", "answer"])

# Iterate over the autocast_questions_df dataframe
for index, row in autocast_questions_df.iterrows():
    if row['id'] in test_ids: # skipping questions in the competition test set
        # print(row['id'])
        continue
    if row['answer'] is None: # skipping questions without answer
        continue
        
    if row['qtype'] == 'mc':
    # Extract the question and choices from the row
        
        question_mc = row["question"]
        id = row['id']
        background_mc = row['background']
        tags_mc = row['tags']
        choices_mc = row["choices"]
        answer_mc = row['answer']

        # Add a new row to the new dataframe
        new_row_mcq = {"question": question_mc, "id": id, "background": background_mc, "tags": tags_mc, "choices": choices_mc, "answer": answer_mc}
        mcq_df = pd.concat([mcq_df, pd.DataFrame([new_row_mcq])], ignore_index=True)
    
    # Extract the question and choices from the row
    elif row['qtype'] == 't/f':
        question_tf = row["question"]
        id = row['id']
        background_tf = row['background']
        tags_tf = row['tags']
        choices_tf = row["choices"]
        answer_tf = row['answer']

        # Add a new row to the new dataframe
        new_row_tf = {"question": question_tf, "id": id, "background": background_tf, "tags": tags_tf, "choices": choices_tf, "answer": answer_tf}
        true_false_df = pd.concat([true_false_df, pd.DataFrame([new_row_tf])], ignore_index=True)


In [24]:
mcq_df.head()
mcq_df.to_csv('../competition/mcq-data.csv')

In [25]:
true_false_df.head()
true_false_df.to_csv('../competition/true-false.csv')

In [27]:
true_false_df.head()
print(len(true_false_df))

1590


In [8]:
print(len(true_false_df[true_false_df['answer'] == 'yes']))
print(len(true_false_df[true_false_df['answer'] == 'no']))
print(len(true_false_df))

450
1140
1590


In [12]:
autocast_questions_df.loc[1, 'question'] 

"How many seats will the Justice and Development Party (AKP) win in Turkey's snap elections?"

In [13]:
np.random.random(size=len(autocast_questions_df.loc[1, 'choices'] ))

array([0.99264347, 0.27038891, 0.90435705])

## Create baseline models outputting random answers

## Get performance on the Autocast train set

Note that the Autocast dataset contains questions in the competition test set. Those should not be used.

In [91]:
import torch
from transformers import AutoTokenizer, AutoModelForMultipleChoice
# from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
model = AutoModelForMultipleChoice.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

In [14]:
def random_baseline_model(question):
    if question['qtype'] == 't/f':
        return np.random.random(size=2)
    elif question['qtype'] == 'mc':
        probs = np.random.random(size=len(question['choices']))
        return probs / probs.sum()
    elif question['qtype'] == 'num':
        return np.random.random()

def caliberated_tf_pred(question):

    # Define the question and choices
    actual_question = question['question']
    choice1 = question['choices'][0]
    choice2 = question['choices'][1]

    # Combine the question and choices into a single input string
    input_text = [f"{actual_question} {choice1} {choice2}"]
    
    # Tokenize the input string and convert to tensors
    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True, max_length= 256)

    # Make a prediction with the model
    outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=1)

    return probs.detach().flatten().numpy()

def caliberated_mcq_pred(question):

    # Define the question and choices
    actual_question = question['question']
    choices = question['choices']

    # Combine the question and choices into a single input string
    input_text = [f"{actual_question} {choice}" for choice in choices]
    # print(input_text)
    
    # Tokenize the input string and convert to tensors
    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True)

    # Make a prediction with the model
    outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=1)
    
    choice_probs = probs[:, 1].detach().flatten().numpy()

    return choice_probs

# def caliberated_mcq_pred(question):

#     # Define the question and choices
#     actual_question = question['question']
#     choices = question['choices']

#     # Combine the question and choices into a single input string
#     input_text = [actual_question] + choices
#     print(input_text)
    
#     # Tokenize the input string and convert to tensors
#     inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True)

#     # Make a prediction with the model
#     outputs = model(**inputs)
#     probs = torch.softmax(outputs.logits, dim=1)

#     return probs.detach().flatten().numpy()[:len(choices)]

def calibrated_random_baseline_model(question):
    if question['qtype'] == 't/f':
        # pred_idx = np.argmax(np.random.random(size=2))
        # pred = np.ones(2)
        # pred[pred_idx] += 1e-3
        # return pred / pred.sum()
        # return caliberated_tf_pred(question)
        return np.array([0.5, 0.5])
    elif question['qtype'] == 'mc':
        # pred_idx = np.argmax(np.random.random(size=len(question['choices'])))
        # pred = np.ones(len(question['choices']))
        # pred[pred_idx] += 1e-1
        # return pred / pred.sum()
        # return caliberated_mcq_pred(question)
        pred = np.ones(len(question['choices'])) / len(question['choices'])
        return pred

    elif question['qtype'] == 'num':
        return 0.415
        # return np.random.uniform(0.38, 0.41)

In [15]:
for question in autocast_questions:
    question1 = question['question']
    choice1 = question['choices'][0]
    choice2 = question['choices'][1]

    print(question1)
    print(choice1)
    print(choice2)
    break

What will the end-of-day closing value for the dollar against the renminbi be on 1 January 2016?
Less than 6.30
Between 6.30 and 6.35, inclusive


In [32]:
print(type(autocast_questions[1]['choices']))

<class 'list'>


In [16]:
calibrated_random_baseline_model(autocast_questions[100])

array([0.33333333, 0.33333333, 0.33333333])

In [17]:
def brier_score(probabilities, answer_probabilities):
    return ((probabilities - answer_probabilities) ** 2).sum() / 2

In [35]:
preds = []
answers = []
qtypes = []
for question in autocast_questions:
    if question['id'] in test_ids: # skipping questions in the competition test set
        continue
    if question['answer'] is None: # skipping questions without answer
        continue
    preds.append(calibrated_random_baseline_model(question))
    if question['qtype'] == 't/f':
        ans_idx = 0 if question['answer'] == 'no' else 1
        # print(len(question['choices']))
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        qtypes.append('t/f')
    elif question['qtype'] == 'mc':
        ans_idx = ord(question['answer']) - ord('A')
        print(len(question['choices']))
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
        qtypes.append('mc')
    elif question['qtype'] == 'num':
        ans = float(question['answer'])
        qtypes.append('num')
    answers.append(ans)
    

4
3
3
4
3
6
2
3
4
5
4
3
3
4
4
5
4
4
7
3
2
4
5
4
7
3
3
3
3
3
12
3
12
3
8
3
3
3
2
7
6
4
3
4
4
4
2
2
5
5
3
4
5
3
5
4
3
5
3
3
5
5
3
6
3
3
5
3
3
3
4
3
3
3
3
7
3
3
3
3
4
4
4
4
3
3
3
6
3
6
3
5
4
3
5
3
3
5
5
5
4
6
3
5
3
4
7
3
3
4
5
3
4
2
9
5
5
5
5
5
5
5
5
5
5
4
5
5
3
3
5
5
5
5
5
4
5
5
4
4
4
4
4
4
4
5
5
5
5
3
5
5
4
2
5
3
3
4
3
5
5
3
4
5
5
5
3
5
4
3
3
5
5
5
5
3
3
5
5
5
5
5
5
5
4
5
5
5
5
5
3
3
5
3
4
4
4
5
4
5
3
5
3
3
4
3
5
3
3
5
5
5
5
3
3
3
5
5
5
5
5
5
4
5
5
5
5
5
5
4
5
5
4
3
3
4
5
5
3
5
3
5
5
5
5
5
5
5
5
5
3
3
3
5
5
5
3
5
5
5
5
5
5
5
4
5
6
3
5
5
4
5
5
5
4
5
9
11
4
4
6
5
3
4
4
4
4
5
12
3
3
5
5
5
5
5
5
5
5
3
5
5
5
5
5
5
5
5
5
5
5
3
3
7
3
5
5
3
3
3
3
5
5
5
4
3
5
2
3
3
3
3
3
5
5
5
5
5
5
5
4
2
5
5
5
3
6
4
4
5
3
4
4
5
4
5
6
6
4
5
5
5
5
6
2
3
4
5
8
5
5
3
5
5
5
4
3
4
4
5
3
8
8
2
5
4
5
4
5
7
5
4
6
5
10
5
5
5
5
4
5
4
5
3
5
5
5
3
4
4
4
5
5
4
5
5
5
2
3
3
5
4
5
5
4
4
4
5
5
5
5
3
4
12
4
3
6
4
4
5
4
4
4
4
5
5
4
5
4
8
5
5
3
3
3
3
3
4
4
3
4
6
6
6
6
6
6
6
6
6
6
4
6
6
5
5
9
9
9
9
6
10
6
6
6
6
6
3
3
5
5
5
4
5
3
4
5

[array([0.25, 0.25, 0.25, 0.25]),
 array([0.33333333, 0.33333333, 0.33333333]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.33333333, 0.33333333, 0.33333333]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.25, 0.25, 0.25, 0.25]),
 array([0.5, 0.5]),
 array([0.33333333, 0.33333333, 0.33333333]),
 array([0.5, 0.5]),
 array([0.16666667, 0.16666667, 0.16666667, 0.16666667, 0.16666667,
        0.16666667]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.33333333, 0.33333333, 0.33333333]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.5, 0.5]),
 array([0.25, 0.25, 0.25, 0.25]),
 array([0.5, 0.5]),
 arr

## Evaluate the model

In [31]:
tf_results, mc_results, num_results = [],[],[]

for p, a, qtype in zip(preds, answers, qtypes):
    if qtype == 't/f':
        tf_results.append(brier_score(p, a))
    elif qtype == 'mc':
        mc_results.append(brier_score(p, a))
    else:
        num_results.append(np.abs(p - a))

print(f"T/F: {np.mean(tf_results)*100:.2f}, MCQ: {np.mean(mc_results)*100:.2f}, NUM: {np.mean(num_results)*100:.2f}")
print(f"Combined Metric: {(np.mean(tf_results) + np.mean(mc_results) + np.mean(num_results))*100:.2f}")

T/F: 25.00, MCQ: 38.05, NUM: 21.26
Combined Metric: 84.30


## Make predictions on test set

In [98]:
preds = []
for question in test_questions:
    preds.append(calibrated_random_baseline_model(question))

In [99]:
if not os.path.exists('submission'):
    os.makedirs('submission')

with open(os.path.join('submission', 'predictions.pkl'), 'wb') as f:
    pickle.dump(preds, f, protocol=2)

!cd submission && zip ../submission.zip ./* && cd ..

updating: predictions.pkl (deflated 79%)


In [74]:
list(preds)

df = pd.DataFrame({'preds': preds})
df.head()

Unnamed: 0,preds
0,"[0.5, 0.5]"
1,"[0.5, 0.5]"
2,"[0.5, 0.5]"
3,"[0.5, 0.5]"
4,"[0.2, 0.2, 0.2, 0.2, 0.2]"


In [None]:
from transformers import BertForSequenceClassification, BertTokenizer

# Load pre-trained BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-cased')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Example input
question = autocast_questions[2]['question']
choices = autocast_questions[2]['choices']
context = autocast_questions[2]['background']
tags = autocast_questions[2]['tags']

print(question)
print(choices)
print(context)
print(tags)
# Concatenate inputs
input_text = f'{question} {tags} {context} {" ".join(choices)}'

# Tokenize input
input_ids = tokenizer.encode(input_text, add_special_tokens=True, truncation = True, padding = 'max_length')

# Pad or truncate input
max_length = 128
input_ids = input_ids[:max_length] + [tokenizer.pad_token_id] * (max_length - len(input_ids))

# Convert to BERT input format
input_ids = torch.tensor([input_ids])
segment_ids = torch.zeros_like(input_ids)
attention_mask = (input_ids != tokenizer.pad_token_id).long()

# Pass through pre-trained BERT model
outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=segment_ids)
probs = torch.softmax(outputs.logits, dim=1)
print(probs.detach().flatten().numpy())
# Get predicted probabilities for yes and no
yes_prob = torch.softmax(outputs.logits, dim=1)[0, 0].item()
no_prob = torch.softmax(outputs.logits, dim=1)[0, 1].item()

print(f'Probability of "Yes": {yes_prob:.2f}')
print(f'Probability of "No": {no_prob:.2f}')


In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
model = BertForSequenceClassification.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Define some example true/false questions
questions = [
    "Is the sky blue?",
    "Are dogs reptiles?",
    "Is water wet?",
    "Do humans have three arms?",
    "Is the earth flat?"
]

# Tokenize the questions
inputs = tokenizer(questions, padding=True, truncation=True, return_tensors="pt")

# Generate predictions
outputs = model(inputs['input_ids'], token_type_ids=inputs['token_type_ids'], attention_mask=inputs['attention_mask'])

# Convert logits to probabilities
probs = torch.softmax(outputs[0], dim=1)

# Print probabilities for each question
for i, question in enumerate(questions):
    print(question)
    print(f"True: {probs[i][0]:.2f}, False: {probs[i][1]:.2f}")


| Column 1 | Column 2 | Column 3 |
|:--------:|:-------:|:--------:|
| Centered | Centered | Centered |
| Left-aligned | Right-aligned | Centered |
| Left-aligned | Right-aligned | Centered |
