In [None]:
!pip install datasets
!pip install transformers

In [None]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import json

# Load the English and French configurations of MultiEURLEX (level-2 labels).
# NOTE(review): trust_remote_code=True executes the dataset's loading script on this
# machine — only keep it for dataset repositories you trust.
dataset = load_dataset('multi_eurlex', language='en', label_level='level_2', trust_remote_code=True)
dataset3 = load_dataset('multi_eurlex', language='fr', label_level='level_2', trust_remote_code=True)

# Convert the training splits to pandas (df = English, df3 = French).
# Dataset.to_pandas() converts the Arrow table directly instead of iterating row by row.
df = dataset['train'].to_pandas()
df3 = dataset3['train'].to_pandas()

# The following cells pair df3.text[i] (French) with df.text[i] (English) by position,
# so both frames must describe the same documents in the same order.
if len(df) != len(df3) or not (df['celex_id'].values == df3['celex_id'].values).all():
    raise ValueError(
        "English and French training splits are not aligned by celex_id; "
        "positional pairing of source/target texts would be wrong."
    )

In [None]:

from datasets import Dataset
from transformers import AutoTokenizer

# Tokenizer of the French -> English Marian checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en")

# Source side: French documents (df3), padded/truncated to 512 tokens.
encoded_inputs = tokenizer(list(df3.text), max_length=512, padding='max_length', truncation=True)

# Target side: English documents (df). `text_target=` makes the tokenizer encode them
# as targets; encoding them as ordinary (source) text would use the wrong side of the
# tokenizer for models that keep separate source/target sentencepiece models.
encoded_inputs_val = tokenizer(text_target=list(df.text), max_length=512, padding='max_length', truncation=True)

# Replace padding ids in the labels with -100 so the cross-entropy loss ignores padded
# positions instead of training the model to emit <pad>.
pad_id = tokenizer.pad_token_id
encoded_inputs["labels"] = [
    [token_id if token_id != pad_id else -100 for token_id in label_ids]
    for label_ids in encoded_inputs_val["input_ids"]
]

# Columns: input_ids, attention_mask (French) and labels (English).
tokenized_dataset = Dataset.from_dict(encoded_inputs)

In [None]:
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer

# The training pairs are French inputs with English labels, tokenized with the
# "Helsinki-NLP/opus-mt-fr-en" tokenizer, so the model must be the matching fr->en
# checkpoint (the en->fr checkpoint translates in the opposite direction).
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-fr-en")

# NOTE(review): output/save paths still say "en_fr"/"en-fr"; they are kept unchanged so
# anything that reads these locations keeps working, but the model is fr->en.
# NOTE(review): eval_steps only takes effect when a step-based evaluation strategy is
# enabled; none is set here, so no evaluation runs during training.
training_args = TrainingArguments(
    output_dir='./finetuned_en_fr',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=3,
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    save_total_limit=2,
    report_to=[]  # disable external experiment loggers
)

# NOTE(review): eval_dataset is the training set itself, so any evaluation metric
# computed from it does not measure generalization — consider a held-out split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
)

trainer.train()

# Persist the fine-tuned weights together with the tokenizer they were trained with.
model.save_pretrained("drive/MyDrive/finetuned-en-fr")
tokenizer.save_pretrained("drive/MyDrive/finetuned-en-fr")



In [None]:
from transformers import pipeline

# Translate the texts in df2 with the fine-tuned translation model.
# NOTE(review): df2 is not defined in any visible cell; it must already exist in the
# kernel with a `text` column (and, judging by its later use, label columns) — define
# it explicitly so the notebook survives Restart & Run All.
translator = pipeline("translation", model=model, tokenizer=tokenizer)
raw_translations = translator(list(df2.text), max_length=512, truncation=True)

# The translation pipeline returns one {'translation_text': ...} dict per input;
# downstream tokenization needs plain strings.
translations = [item["translation_text"] for item in raw_translations]

In [None]:
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset,random_split

# Load the EuroVoc classifier: an XLM-RoBERTa sequence-classification model with 127
# output labels, whose weights are restored from a checkpoint on Drive.
# NOTE(review): this cell rebinds `tokenizer` and `model`, which previously referred to
# the translation tokenizer/model — re-running the translation cell after this one
# would silently use the classifier.
model_name = "FacebookAI/xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=127)

# Fall back to CPU when no GPU is available instead of failing on a hard-coded 'cuda'.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Multi-label objective: one independent sigmoid/BCE term per label.
loss_fn = nn.BCEWithLogitsLoss()

# map_location lets a checkpoint saved on GPU be restored on whatever device is in use.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
state_dict = torch.load('drive/MyDrive/best_model_nonfrozen.pt', map_location=device)
model.load_state_dict(state_dict)

In [None]:

# The translation pipeline yields {'translation_text': ...} dicts, but the tokenizer
# needs plain strings; accept either form so this cell works with both.
translated_texts = [
    item["translation_text"] if isinstance(item, dict) else item
    for item in translations
]

# Tokenize the translated texts for the classifier (fixed length of 512 tokens).
encoded_inputs_val = tokenizer(translated_texts, max_length=512, padding='max_length', truncation=True)

# Every column of df2 other than `text` is used as a float label column.
# NOTE(review): presumably multi-hot vectors matching the classifier's 127 labels — confirm.
labels_val = torch.tensor(df2.drop(columns=['text']).values, dtype=torch.float)

full_test_set = TensorDataset(
    torch.tensor(encoded_inputs_val['input_ids']),
    torch.tensor(encoded_inputs_val['attention_mask']),
    labels_val,
)

# Keep the original order (no shuffling) for evaluation.
test_loader = DataLoader(full_test_set, batch_size=8, shuffle=False)

In [None]:
# Evaluate the classifier on the test loader, collecting one value per batch for the
# loss, the element-wise label accuracy and the R-precision score.
# NOTE(review): r_precision is not defined in the visible cells; it is called with
# (true labels, logits), both on CPU.
val_losses, val_accs, r_scores = [], [], []

model.eval()
with torch.no_grad():
  for batch_ids, batch_mask, batch_labels in test_loader:
    outputs = model(
        input_ids=batch_ids.to(device),
        attention_mask=batch_mask.to(device),
    )
    logits = outputs.logits

    batch_loss = loss_fn(logits, batch_labels.to(device))
    val_losses.append(batch_loss.item())

    # A logit above 0 corresponds to a sigmoid probability above 0.5.
    predicted = (logits > 0).float().cpu()
    val_accs.append((predicted == batch_labels.cpu()).float().mean())

    r_scores.append(r_precision(batch_labels.cpu(), logits.cpu()))

# Unweighted means over batches.
vloss = np.array(val_losses).mean()
r_score = np.array(r_scores).mean()
print(f'Test on french: Val Loss={vloss:0.4f}; Val accuracy={np.array(val_accs).mean():0.4f}; R={r_score:0.4f}')