In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
from datasets import load_dataset
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

# Load the multilingual BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Load the Atypical Animacy dataset
dataset = load_dataset("biglam/atypical_animacy")

# Ensure dataset is in a format that can be split into train and test
if "train" not in dataset or "test" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.2)

def tokenize_function(examples):
    return tokenizer(examples["sentence"], examples["context"], padding="max_length", truncation=True, max_length=128)

# Tokenize the dataset
dataset = dataset.map(tokenize_function, batched=True)

# Convert target labels into numeric format (fixing column name typo)
unique_labels = list(set(dataset["train"]["target"]))
label_map = {label: i for i, label in enumerate(unique_labels)}
dataset = dataset.map(lambda x: {"label": label_map[x["target"]]})

# Load the pre-trained model
num_labels = len(unique_labels)
print("unique_labels: ",unique_labels)
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=num_labels)
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

def compute_metrics(eval_pred: EvalPrediction):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="macro")
    return {"precision": precision, "recall": recall, "f1_score": f1}

# Function to extract embeddings for a single example
def get_embedding(sentence, context):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    inputs = tokenizer(sentence, context, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
    with torch.no_grad():
        outputs = model.bert(**inputs)  # Get BERT embeddings
    return outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()  # Extract [CLS] token embedding

# Example: Get embeddings for the first row of the dataset
example = dataset["train"][0]
embedding_vector = get_embedding(example["sentence"], example["context"])
print("Embedding vector for the first sample:", embedding_vector)

# Training arguments for quick training
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=12,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=5,
    logging_strategy="epoch",
    report_to=["none"]  # Disable logging to external services
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

# Get predictions on the test set
def get_predictions():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    test_texts = dataset["test"]["sentence"]
    test_contexts = dataset["test"]["context"]
    test_labels = dataset["test"]["label"]
    inputs = tokenizer(test_texts, test_contexts, padding=True, truncation=True, max_length=128, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = np.argmax(outputs.logits.cpu().numpy(), axis=-1)
    
    for text, context, actual, pred in zip(test_texts, test_contexts, test_labels, predictions):
        print(f"Sentence: {text}\nContext: {context}\nActual Label: {actual}\nPredicted Label: {pred}\n")

get_predictions()

# Save the fine-tuned model
model.save_pretrained("./finetuned_bert_multilingual")
tokenizer.save_pretrained("./finetuned_bert_multilingual")

Map:   0%|          | 0/475 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/475 [00:00<?, ? examples/s]

Map:   0%|          | 0/119 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


unique_labels:  ['engine', 'boilers', 'motors', 'locomotive', 'dynamo', 'machine', 'boiler', 'dynamos', 'machinery', 'engines', 'apparatus', 'locomotives', 'machines']


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Embedding vector for the first sample: [-3.43241930e-01  3.42328012e-01 -1.35869116e-01  1.15232840e-01
  4.36432004e-01 -1.78112373e-01  8.56565088e-02 -6.33621365e-02
 -3.82106423e-01  2.78311968e-01 -2.59767333e-03 -1.80122480e-01
 -1.20940083e-03  6.11479543e-02 -4.84012157e-01 -2.35006765e-01
 -2.45808870e-01 -2.33843829e-02  3.17298770e-01 -2.22196043e-01
  8.09933525e-03  7.18193799e-02  1.49243653e-01 -2.73624718e-01
 -1.22972824e-01 -8.78355578e-02  2.08005756e-01  2.94408262e-01
  5.83765566e-01  9.66886580e-02  4.72047590e-02  2.00836375e-01
 -1.41943634e-01  3.43674779e-01 -8.09847936e-02  1.23787997e-02
 -2.20250988e+00 -1.58351481e-01 -1.20233074e-01 -8.90154541e-02
 -1.22283675e-01  8.42054263e-02 -1.60227776e-01  1.96810141e-01
  3.11215043e-01  1.54504347e+00 -2.59789318e-01 -4.88483831e-02
  1.79606736e+00  4.57616113e-02 -5.45289889e-02 -7.89383650e-01
 -1.34861007e-01 -1.86781383e+00  1.68611780e-01  1.28855094e-01
  1.39333129e-01  9.67940837e-02  1.19415395e-01 -3



Epoch,Training Loss,Validation Loss,Precision,Recall,F1 Score
1,2.1125,1.811145,0.106312,0.20769,0.138776
2,1.3392,0.882666,0.511777,0.518703,0.484354
3,0.6011,0.595152,0.52501,0.605801,0.548906
4,0.3224,0.417495,0.808537,0.797931,0.793862
5,0.1983,0.359926,0.810306,0.797931,0.793232
6,0.1384,0.377198,0.81671,0.797931,0.797333
7,0.1021,0.359748,0.795591,0.797931,0.788062
8,0.0803,0.395434,0.812333,0.797931,0.79725
9,0.0615,0.384931,0.810537,0.797931,0.796638
10,0.0517,0.396226,0.810537,0.797931,0.796638


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Evaluation Results: {'eval_loss': 0.4016988277435303, 'eval_precision': 0.812333174742595, 'eval_recall': 0.7979306958473624, 'eval_f1_score': 0.797249829144224, 'eval_runtime': 1.415, 'eval_samples_per_second': 84.098, 'eval_steps_per_second': 10.601, 'epoch': 12.0}


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Sentence: During the whole of these experiments the engine was doing < its work as usual, occasionally going and occasionally standing; j but no difference was observed in the electricity given off by t the steam.
Context: j From this it would appear that the steam of both boilers was in the same electrical condition.  During the whole of these experiments the engine was doing < its work as usual, occasionally going and occasionally standing; j but no difference was observed in the electricity given off by t the steam.  1 have been most careful to supply an exact account of the facts ofthis extraordinary, and, as far as I know, unprecedented ease, but I do not offer any theory to account for the pheno- j inena.
Actual Label: 0
Predicted Label: 0

Sentence: Without waiting for a word of objection or reply, he turned away, He went up-stairs to talk to Madame Lascours ; he left her alone in his study to perform the office of a calculating machine .
Context: This is not an open battle, but

('./finetuned_bert_multilingual/tokenizer_config.json',
 './finetuned_bert_multilingual/special_tokens_map.json',
 './finetuned_bert_multilingual/vocab.txt',
 './finetuned_bert_multilingual/added_tokens.json')