In [1]:
import tensorflow as tf
from transformers import AutoTokenizer ,TFAutoModelForMultipleChoice
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [5]:
model_name = 'bert-base-uncased'

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForMultipleChoice.from_pretrained(model_name)

All PyTorch model weights were used when initializing TFBertForMultipleChoice.

Some weights or buffers of the TF 2.0 model TFBertForMultipleChoice were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
df = pd.read_csv('train.csv')

In [8]:
df.columns

Index(['id', 'prompt', 'A', 'B', 'C', 'D', 'E', 'answer'], dtype='object')

In [9]:
questions = df.prompt.tolist()

In [10]:
answer_options = df[['A','B','C','D','E']].values.tolist()

In [11]:
answer = encoder.fit_transform(df.answer)

In [12]:
train_q,val_q,train_ans_opt, val_ans_opt, train_labels,val_labels = train_test_split(
    questions,
    answer_options,
    answer,
    test_size=0.2,
    random_state=10
)

In [13]:
train_opt = train_ans_opt[0]
train_opt

['The Heisenberg uncertainty principle states that the axis of rotation of a quantum particle is undefined, and that quantum particles possess a type of non-orbital angular momentum called "spin". This is because angular momentum, like other quantities in quantum mechanics, is expressed as a tensorial operator in relativistic quantum mechanics.',
 'The Heisenberg uncertainty principle states that the total angular momentum of a system of particles is equal to the sum of the individual particle angular momenta, and that the centre of mass is for the system. This is because angular momentum, like other quantities in quantum mechanics, is expressed as an operator with quantized eigenvalues.',
 'The Heisenberg uncertainty principle states that the total angular momentum of a system of particles is subject to quantization, and that the individual particle angular momenta are expressed as operators. This is because angular momentum, like other quantities in quantum mechanics, is subject to t

In [14]:
num_epochs = 5
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss_func = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [15]:
for epoch in range(num_epochs):
    train_loss = 0.0
    val_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    for i in tqdm(range(len(train_q)), desc = f'Epoch {epoch+1}'):
        question = train_q[i]
        answer_options_batch = train_ans_opt[i]
        label = train_labels[i]

        questions_encoded = tokenizer(question,padding='max_length',truncation=True,max_length=150,return_tensors='tf')
        answer_option_encoded = tokenizer(answer_options_batch, padding='max_length', truncation=True, max_length=150, return_tensors='tf')
        
        answer_input_ids = []
        answer_attention_masks = []
        for answer_option in answer_options_batch:
            answer_encoded = tokenizer(answer_option, padding='max_length', truncation=True, max_length=150, return_tensors='tf')
            answer_input_ids.append(answer_encoded['input_ids'])
            answer_attention_masks.append(answer_encoded['attention_mask'])

        answer_input_ids = tf.stack(answer_input_ids)
        answer_attention_masks = tf.stack(answer_attention_masks)

        input_ids = tf.concat([tf.expand_dims(questions_encoded['input_ids'], axis=0), answer_input_ids], axis=0)
        attention_mask = tf.concat([tf.expand_dims(questions_encoded['attention_mask'], axis=0), answer_attention_masks], axis=0)
        
        with tf.GradientTape() as tape:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask,labels = tf.convert_to_tensor([label]))
            logits = outputs.logits
            loss = loss_func(tf.convert_to_tensor([label]), logits)

        gradients = tape.gradient(loss,model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        train_loss += loss.numpy()


    for i in range(len(val_q)):
        question = val_q[i]
        answer_option_batch = val_ans_opt[i]
        label = val_labels[i]

        questions_encoded = tokenizer(question,padding='max_length',truncation=True,max_length=150,return_tensors='tf')
        answer_option_encoded = tokenizer(answer_option_batch,padding='max_length',truncation=True,max_length=150,return_tensors='tf')
        
        answer_input_ids = []
        answer_attention_mask = []
        for answer_option in answer_option_batch:
            answer_encoded = tokenizer(answer_option,padding='max_length',truncation=True ,max_length = 150, return_tensors='tf')
            answer_input_ids.append(answer_encoded['input_ids'])
            answer_attention_mask.append(answer_encoded['attention_mask'])
            
        answer_input_ids = tf.stack(answer_input_ids)
        answer_attention_mask = tf.stack(answer_attention_mask)

        input_ids = tf.concat([tf.expand_dims(questions_encoded['input_ids'], axis=0), answer_input_ids], axis=0)
        attention_mask = tf.concat([tf.expand_dims(questions_encoded['attention_mask'], axis=0), answer_attention_masks], axis=0)
        
        
        outputs = model(input_ids=input_ids,attention_mask=attention_mask,labels = tf.convert_to_tensor([label]))
        logits = outputs.logits
        loss = loss_func(tf.convert_to_tensor([label]), logits)
        val_loss += loss.numpy()

        predicted_label = tf.argmax(logits,axis = -1)[0]
        if predicted_label == label:
            correct_predictions += 1
        total_predictions += 1

    avg_train_loss = train_loss / len(train_q)
    avg_val_loss = val_loss / len(val_q)
    val_accuracy = correct_predictions / total_predictions
    
    print(f"Epoch {epoch + 1}:")
    print(f"  Training loss: {avg_train_loss:.4f}")
    print(f"  Validation loss: {avg_val_loss:.4f}")
    print(f"  Validation accuracy: {val_accuracy:.4f}")

Epoch 1: 100%|███████████████████████████████████████████████████████████████████████| 160/160 [10:18<00:00,  3.86s/it]


Epoch 1:
  Training loss: 1.7975
  Validation loss: 1.7928
  Validation accuracy: 0.1750


Epoch 2: 100%|███████████████████████████████████████████████████████████████████████| 160/160 [10:30<00:00,  3.94s/it]


Epoch 2:
  Training loss: 1.6841
  Validation loss: 1.8093
  Validation accuracy: 0.1750


Epoch 3: 100%|███████████████████████████████████████████████████████████████████████| 160/160 [10:28<00:00,  3.93s/it]


Epoch 3:
  Training loss: 1.2459
  Validation loss: 2.3157
  Validation accuracy: 0.1750


Epoch 4: 100%|███████████████████████████████████████████████████████████████████████| 160/160 [10:28<00:00,  3.93s/it]


Epoch 4:
  Training loss: 0.7535
  Validation loss: 2.0368
  Validation accuracy: 0.1750


Epoch 5: 100%|███████████████████████████████████████████████████████████████████████| 160/160 [10:39<00:00,  4.00s/it]


Epoch 5:
  Training loss: 0.4072
  Validation loss: 1.9945
  Validation accuracy: 0.1750
