In [1]:
import os
import json
import pickle
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

In [2]:
# Load the full Autocast question pool and the competition test set.
# Context managers make sure both file handles are closed once parsing finishes.
with open('../competition/autocast_questions.json') as questions_file:
    autocast_questions = json.load(questions_file)  # from the Autocast dataset
with open('../competition/autocast_competition_test_set.json') as test_file:
    test_questions = json.load(test_file)

# Ids of the competition test questions; later cells use them to skip those questions.
test_ids = [q['id'] for q in test_questions]

In [3]:
# Tabular view of the loaded question records (one row per question).
autocast_questions_df = pd.DataFrame(autocast_questions)
# Preview the first rows to see which columns are available.
autocast_questions_df.head()

Unnamed: 0,question,id,background,publish_time,close_time,tags,source_links,prediction_count,forecaster_count,answer,choices,status,qtype,crowd
0,What will the end-of-day closing value for the...,G1,Outcome will be determined by the end-of-day c...,2015-09-01 13:49:29.860000+00:00,2016-01-01 17:00:01+00:00,"[Finance, Economic Indicators]",[http://ftalphaville.ft.com/2015/08/17/2137329...,1549,385,D,"[Less than 6.30, Between 6.30 and 6.35, inclus...",Resolved,mc,"[{'timestamp': '2015-09-01 00:00:00+00:00', 'f..."
1,How many seats will the Justice and Developmen...,G2,The Justice and Development Party (AKP) failed...,2015-09-01 13:54:25.050000+00:00,2015-11-01 22:00:20+00:00,"[Elections and Referenda, Non-US Politics]",[http://www.al-monitor.com/pulse/originals/201...,567,194,A,"[A majority, A plurality, Not a plurality]",Resolved,mc,"[{'timestamp': '2015-09-01 00:00:00+00:00', 'f..."
2,Will there be an initial public offering on ei...,G4,China suspended initial public offerings (IPOs...,2015-09-01 13:58:30.138000+00:00,2015-11-30 14:00:15+00:00,[Finance],[http://atimes.com/2015/11/china-will-allow-su...,545,148,yes,"[yes, no]",Resolved,t/f,"[{'timestamp': '2015-09-01 00:00:00+00:00', 'f..."
3,Will the Export-Import Bank of the United Stat...,G5,The Export-Import Bank's authorization expired...,2015-09-01 14:02:21.242000+00:00,2015-12-04 14:00:25+00:00,"[Economic Policy, US Politics, US Policy]",[http://thehill.com/policy/finance/260118-week...,1000,379,yes,"[yes, no]",Resolved,t/f,"[{'timestamp': '2015-09-01 00:00:00+00:00', 'f..."
4,Will a trilateral meeting take place between C...,G6,"A trilateral meeting of leaders from China, Ja...",2015-09-01 14:04:41.470000+00:00,2015-12-31 23:00:11+00:00,[Foreign Policy],"[https://en.wikipedia.org/wiki/Li_Keqiang, htt...",946,385,no,"[yes, no]",Resolved,t/f,"[{'timestamp': '2015-09-01 00:00:00+00:00', 'f..."


In [4]:
# Build one frame per question type, each with the three columns used downstream.
# Filtering is vectorised: growing a DataFrame with pd.concat inside iterrows()
# re-copies the whole frame for every row.
columns = ["question", "choices", "answer"]

# Keep questions that are outside the competition test set and have an answer.
# notna() rejects both None and NaN, so rows missing an answer never reach training.
eligible_mask = (
    ~autocast_questions_df['id'].isin(test_ids)
    & autocast_questions_df['answer'].notna()
)
eligible_questions = autocast_questions_df[eligible_mask]

# Multiple-choice questions.
mcq_df = (
    eligible_questions
    .loc[eligible_questions['qtype'] == 'mc', columns]
    .reset_index(drop=True)
)

# True/false questions.
true_false_df = (
    eligible_questions
    .loc[eligible_questions['qtype'] == 't/f', columns]
    .reset_index(drop=True)
)


In [5]:
# Persist the multiple-choice split, then end the cell with the preview so the
# notebook actually renders it (only a cell's last expression is displayed).
mcq_df.to_csv('../competition/mcq-data.csv')
mcq_df.head()

In [6]:
# Persist the true/false split, then end the cell with the preview so the
# notebook actually renders it (only a cell's last expression is displayed).
true_false_df.to_csv('../competition/true-false.csv')
true_false_df.head()

In [7]:
# Read the true/false split back from the exact path it was exported to, so a
# fresh "Restart & Run All" never picks up a stale or missing file in the cwd.
df = pd.read_csv('../competition/true-false.csv')

In [8]:
# Remove the index column that the CSV round trip stored as 'Unnamed: 0'.
# Rebinding with errors='ignore' (rather than inplace=True) keeps this cell safe
# to re-run: a second execution is a no-op instead of a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.head()

In [9]:
 # Look at the raw question strings that will be fed to the tokenizer.
 df.question.values

array(['Will there be an initial public offering on either the Shanghai Stock Exchange or the Shenzhen Stock Exchange before 1 January 2016?',
       'Will the Export-Import Bank of the United States be re-authorized before 1 January 2016?',
       'Will a trilateral meeting take place between Chinese President Xi Jinping, Japanese Prime Minister Shinzo Abe, and South Korean President Park Geun-hye before 1 January 2016?',
       ...,
       'Will the US experience a 4th wave of COVID before June 1, 2021?',
       'Will Brazil have a 7-day rolling average above 2,500 COVID-19 deaths before 1 June 2021?',
       'Will Derek Chauvin be convicted of homicide by June 1 2021?'],
      dtype=object)

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

# Encode every question into a fixed 256-token window (padded or truncated).
# - The texts are passed as a plain list of strings rather than a pandas Series.
# - padding='max_length' alone controls padding; the deprecated
#   pad_to_max_length flag duplicated it and only produced a warning.
encoded_data = tokenizer(
    df['question'].tolist(),
    add_special_tokens=True,
    return_attention_mask=True,
    max_length=256,
    return_tensors='pt',
    truncation=True,
    padding='max_length',
)


In [11]:
# Turn the textual answers into class ids (yes -> 1, no -> 0) without writing
# into `df`, so the cell gives the same result every time it is run.
label_map = {'yes': 1, 'no': 0}
answer = df['answer'].map(label_map)

# Any value outside the mapping becomes NaN; fail loudly instead of training on it.
if answer.isna().any():
    unexpected = sorted(set(df.loc[answer.isna(), 'answer'].astype(str)))
    raise ValueError(f"Unexpected answer values in true/false data: {unexpected}")

answer = answer.to_numpy(dtype='int64')
labels = torch.tensor(answer)

In [12]:
# Sanity check: number of labels built from the answer column.
print(labels.shape)
# df['answer']

torch.Size([1590])


In [13]:
# Inspect the token-id tensor returned by the tokenizer
# (the trailing zeros in the output are presumably padding positions).
encoded_data['input_ids']

tensor([[ 101, 2097, 2045,  ...,    0,    0,    0],
        [ 101, 2097, 1996,  ...,    0,    0,    0],
        [ 101, 2097, 1037,  ...,    0,    0,    0],
        ...,
        [ 101, 2097, 1996,  ...,    0,    0,    0],
        [ 101, 2097, 4380,  ...,    0,    0,    0],
        [ 101, 2097, 7256,  ...,    0,    0,    0]])

In [14]:
input_ids = encoded_data['input_ids']
attention_masks = encoded_data['attention_mask']

# Bundle token ids, attention masks and labels so they are always indexed together.
dataset = torch.utils.data.TensorDataset(input_ids, attention_masks, labels)

# 80/20 train/validation split.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# A dedicated, seeded generator makes the split identical on every run, so
# validation numbers from different runs can be compared.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=split_generator,
)


In [15]:
# Binary classifier head on top of pretrained BERT-large (class 0 / class 1).
model = BertForSequenceClassification.from_pretrained(
    "bert-large-uncased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

# Fine-tuning hyperparameters.
batch_size = 32
epochs = 5
learning_rate = 2e-5

# Use PyTorch's AdamW: the transformers.AdamW class is deprecated.
# weight_decay=0.0 is set explicitly because torch's default is 0.01, whereas the
# transformers implementation defaulted to no weight decay.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-8,
    weight_decay=0.0,
)


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

In [16]:
# Training batches are reshuffled every epoch.
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True
)

# Validation only aggregates a correct-prediction count, so the order is
# irrelevant; keep it deterministic instead of shuffling.
val_dataloader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False
)


In [17]:
# Step 8: Define the device and move model and data to the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(1, epochs+1):
    model.train()

    total_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids = batch[0].to(device)
        attention_masks = batch[1].to(device)
        labels = batch[2].to(device)
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print("Training loss: {0:.2f}".format(avg_train_loss))

    model.eval()
    val_accuracy = 0
    for batch in val_dataloader:
        input_ids = batch[0].to(device)
        attention_masks = batch[1].to(device)
        labels = batch[2].to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks)
        logits = outputs[0]
        preds = torch.argmax(logits, dim=1).flatten()
        accuracy = torch.sum(preds == labels).item()
        val_accuracy += accuracy
    avg_val_accuracy = val_accuracy / len(val_dataset)
    print("Validation Accuracy: {0:.2f}".format(avg_val_accuracy))

    torch.save(model.state_dict(), 'model.pth')


Training loss: 0.62
Validation Accuracy: 0.74
Training loss: 0.59
Validation Accuracy: 0.74
Training loss: 0.56
Validation Accuracy: 0.74
Training loss: 0.48
Validation Accuracy: 0.69
Training loss: 0.37
Validation Accuracy: 0.61


In [18]:
# model

In [19]:
def random_baseline_model(question):
    """Return a purely random prediction for one question.

    Args:
        question: mapping with a 'qtype' key; 'mc' questions also need 'choices'.

    Returns:
        't/f': array of two independent uniform draws (not normalised).
        'mc':  array of uniform draws, one per choice, normalised to sum to 1.
        'num': a single uniform float in [0, 1).
        Any other qtype: None.
    """
    kind = question['qtype']
    if kind == 'num':
        return np.random.random()
    if kind == 'mc':
        weights = np.random.random(size=len(question['choices']))
        return weights / weights.sum()
    if kind == 't/f':
        return np.random.random(size=2)
    return None

def caliberated_tf_pred(question):
    """Predict class probabilities for a true/false question with the fine-tuned model.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        question: mapping with 'question' (text) and 'choices' (at least two entries).

    Returns:
        1-D numpy array of softmax probabilities over the model's two classes.
        With the label encoding used for training ('no' -> 0, 'yes' -> 1),
        index 0 is P(no) and index 1 is P(yes).
    """
    # Define the question and choices
    actual_question = question['question']
    choice1 = question['choices'][0]
    choice2 = question['choices'][1]

    # Combine the question and choices into a single input string
    # NOTE(review): the training cell tokenizes the bare question text with
    # max_length=256, while this prompt appends the choices and allows 512
    # tokens - confirm the mismatch is intended.
    input_text = [f"{actual_question} {choice1} {choice2}"]

    # Tokenize the input string and convert to tensors
    inputs = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True, max_length= 512)

    # The model may have been moved to the GPU by the training cell; the inputs
    # have to live on the same device or the forward pass raises.
    model_device = next(model.parameters()).device
    inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}

    # Make a prediction with the model: eval mode disables dropout, and no_grad
    # avoids building an autograd graph for every question.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=1)

    # Bring the result back to the CPU before converting to numpy.
    return probs.detach().flatten().cpu().numpy()

# def caliberated_mcq_pred(question):

#     # Define the question and choices
#     actual_question = question['question']
#     choices = question['choices']

#     # Combine the question and choices into a single input string
#     input_text = [f"{actual_question} {choice}" for choice in choices]
#     # print(input_text)
    
#     # Tokenize the input string and convert to tensors
#     inputs = mcq_tokenizer(input_text, return_tensors='pt', padding=True, truncation=True)

#     # Make a prediction with the model
#     outputs = mcq_model(**inputs)
#     probs = torch.softmax(outputs.logits, dim=1)
    
#     choice_probs = probs[:, 1].detach().flatten().numpy()

#     return choice_probs

# def caliberated_mcq_pred(question):

#     # Define the question and choices
#     actual_question = question['question']
#     choices = question['choices']

#     # Combine the question and choices into a single input string
#     input_text = [actual_question] + choices
#     # print(input_text)
    
#     # Tokenize the input string and convert to tensors
#     inputs = mcq_tokenizer(input_text, return_tensors='pt', padding=True, truncation=True)

#     # Make a prediction with the model
#     outputs = mcq_model(**inputs)
#     probs = torch.softmax(outputs.logits, dim=1)

#     return probs.detach().flatten().numpy()[:len(choices)]


def calibrated_random_baseline_model(question):
    """Return the submission prediction for one question, dispatched on its qtype.

    Args:
        question: mapping with a 'qtype' key; 'mc' questions also need 'choices'.

    Returns:
        't/f': whatever caliberated_tf_pred(question) returns.
        'mc':  near-uniform distribution over the choices, with one randomly
               picked choice bumped by 1e-3 before normalising.
        'num': a uniform draw between 0.38 and 0.43.
        Any other qtype: None.
    """
    qtype = question['qtype']

    if qtype == 't/f':
        return caliberated_tf_pred(question)

    if qtype == 'mc':
        n_choices = len(question['choices'])
        # Pick one choice at random and give it a slightly larger weight.
        favoured = np.argmax(np.random.random(size=n_choices))
        weights = np.ones(n_choices)
        weights[favoured] += 1e-3
        return weights / weights.sum()

    if qtype == 'num':
        return np.random.uniform(0.38, 0.43)

    return None

In [20]:
def brier_score(probabilities, answer_probabilities):
    """Halved sum of squared differences between a forecast and the true outcome.

    Args:
        probabilities: predicted probabilities; any array-like (list, tuple, ndarray).
        answer_probabilities: target probabilities with a matching (broadcastable)
            shape, e.g. a one-hot vector for the resolved answer.

    Returns:
        sum((p - a) ** 2) / 2 as a numpy float.
    """
    # np.asarray lets plain Python sequences be scored as well as ndarrays.
    predicted = np.asarray(probabilities, dtype=float)
    target = np.asarray(answer_probabilities, dtype=float)
    return ((predicted - target) ** 2).sum() / 2

In [21]:
# Collect predictions, ground-truth targets and question types for every
# answered question that is not part of the competition test set.
preds = []
answers = []
qtypes = []

# Set membership avoids rescanning the id list for every question.
held_out_ids = set(test_ids)

for question in autocast_questions:
    if question['id'] in held_out_ids: # skipping questions in the competition test set
        continue
    if question['answer'] is None: # skipping questions without answer
        continue

    qtype = question['qtype']
    if qtype == 't/f':
        # One-hot target: index 0 = 'no', index 1 = anything else ('yes').
        ans_idx = 0 if question['answer'] == 'no' else 1
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
    elif qtype == 'mc':
        # Answers are letters: 'A' -> 0, 'B' -> 1, ...
        ans_idx = ord(question['answer']) - ord('A')
        ans = np.zeros(len(question['choices']))
        ans[ans_idx] = 1
    elif qtype == 'num':
        ans = float(question['answer'])
    else:
        # Without this guard the previous question's `ans` would be appended
        # and the three lists would silently fall out of step.
        raise ValueError(f"Unsupported qtype {qtype!r} for question {question['id']!r}")

    # Append to all three lists together so they always stay aligned.
    preds.append(calibrated_random_baseline_model(question))
    answers.append(ans)
    qtypes.append(qtype)

In [22]:
# Per-question scores, bucketed by question type.
tf_results, mc_results, num_results = [], [], []

for p, a, qtype in zip(preds, answers, qtypes):
    if qtype in ('t/f', 'mc'):
        # Categorical questions are scored with the Brier score.
        bucket = tf_results if qtype == 't/f' else mc_results
        bucket.append(brier_score(p, a))
    else:
        # Everything else is scored by absolute error.
        num_results.append(np.abs(p - a))

# Average each bucket once and reuse the means for both report lines.
tf_mean = np.mean(tf_results)
mc_mean = np.mean(mc_results)
num_mean = np.mean(num_results)

print(f"T/F: {tf_mean*100:.2f}, MCQ: {mc_mean*100:.2f}, NUM: {num_mean*100:.2f}")
print(f"Combined Metric: {(tf_mean + mc_mean + num_mean)*100:.2f}")

T/F: 19.36, MCQ: 38.05, NUM: 21.18
Combined Metric: 78.58


In [23]:
# One prediction per competition test question, in the order of test_questions.
preds = [calibrated_random_baseline_model(question) for question in test_questions]

In [24]:
if not os.path.exists('submission'):
    os.makedirs('submission')

with open(os.path.join('submission', 'predictions.pkl'), 'wb') as f:
    pickle.dump(preds, f, protocol=2)

!cd submission && zip ../submission.zip ./* && cd ..

updating: predictions.pkl (deflated 64%)
