In [1]:
# !pip install pytorch-transformers
!pip install -U datasets
!pip install transformers==4.45.2 sentence-transformers==3.1.1

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
from transformers import RobertaTokenizer, RobertaTokenizerFast, RobertaForSequenceClassification, RobertaConfig, RobertaForMaskedLM, Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorForLanguageModeling
import torch
from scipy.spatial.distance import cosine
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer, InputExample, losses, models, util
from torch.utils.data import DataLoader
import pandas as pd

In [3]:
# Încarcă dataset-ul
splits = {'train': 'final/train-00000-of-00001.parquet', 'test': 'final/test-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/readerbench/ro_fake_news/" + splits["train"])

# Extrage titlurile și corpurile de știri
headlines = df['headline']
bodies = df['body']

print(headlines[:5])
print(bodies[:5])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


0    Situația GLOBALĂ a deceselor: Dacă de Covid-19...
1    Româncă, mărturie cutremurătoare: Ne obligă să...
2    Testele COVID-19 din România, lipsite de acura...
3    Profesorul Avram Fițiu: Hrana de la țărani est...
4    VITAMINA C tratează coronavirusul COVID-19. An...
Name: headline, dtype: object
0    Pentru a putea evalua corect efectele mortale ...
1    O romanca face marturii cutremuratoare dupa ce...
2    Avem tot mai multe exemple clare care arată că...
3    Hrana țărănească ecologică cumpărată de la țăr...
4    Guvernul din Shanghai, China, a anunţat oficia...
Name: body, dtype: object


In [4]:
# Creează un tokenizer nou antrenat pe datele din headline și body
training_corpus = [text for text in headlines] + [text for text in bodies]
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
tokenizer.train_new_from_iterator(training_corpus, vocab_size=30000)

# Salvează tokenizer-ul pentru utilizări ulterioare
tokenizer.save_pretrained('./tokenizer')

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/vocab.json',
 './tokenizer/merges.txt',
 './tokenizer/added_tokens.json',
 './tokenizer/tokenizer.json')

In [5]:
# Creează configurația modelului RoBERTa
config = RobertaConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=12,
    type_vocab_size=1
)

# Creează modelul RoBERTa pentru Masked Language Modeling
model = RobertaForMaskedLM(config)

# Convertește corpul de știri într-un format de dataset pentru MLM
dataset = Dataset.from_pandas(df)

# Tokenizare
def tokenize_function(examples):
    return tokenizer(examples["body"], padding="longest", truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

train_size = int(0.8 * len(tokenized_datasets))
train_dataset = tokenized_datasets.select(range(train_size))
eval_dataset = tokenized_datasets.select(range(train_size, len(tokenized_datasets)))

# Configurare pentru Masked Language Modeling


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

# Setează parametrii de antrenare
training_args = TrainingArguments(
    output_dir="./roberta-from-scratch",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
)


# Antrenare
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()

Map:   0%|          | 0/296 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,No log,8.488914
2,No log,7.218965
3,No log,6.536112
4,No log,6.310741
5,No log,6.177022
6,No log,6.081075
7,No log,5.948695
8,No log,5.895613
9,No log,5.92344
10,No log,5.918226


There were missing keys in the checkpoint model loaded: ['lm_head.decoder.weight', 'lm_head.decoder.bias'].


TrainOutput(global_step=300, training_loss=6.679736735026042, metrics={'train_runtime': 513.1152, 'train_samples_per_second': 4.599, 'train_steps_per_second': 0.585, 'total_flos': 621306493378560.0, 'train_loss': 6.679736735026042, 'epoch': 10.0})

In [6]:
eval_results = trainer.evaluate()
print("Validation Loss:", eval_results["eval_loss"])
perplexity = torch.exp(torch.tensor(eval_results["eval_loss"]))
print("Perplexity:", perplexity.item())

Validation Loss: 5.957333087921143
Perplexity: 386.5777893066406


In [7]:
# Funcție pentru a genera propoziții din corpul fiecărei știri
def generate_balanced_sentence_pairs(df, max_similar_pairs=5):
    pairs = []

    for idx, row in df.iterrows():
        headline = row['headline']
        body_sentences = row['body'].split('.')  # împarte corpul în propoziții

        # Perechi similare - alege primele max_similar_pairs propoziții
        similar_pairs = [(headline, sentence.strip(), 1)
                         for sentence in body_sentences[:max_similar_pairs] if sentence.strip()]

        pairs.extend(similar_pairs)

        # Perechi diferite - generează același număr de perechi diferite
        different_pairs = 0
        for _, row_other in df.sample(len(similar_pairs)).iterrows():
            if row_other['headline'] != headline:
                other_sentence = row_other['body'].split('.')[0].strip()
                if other_sentence:
                    pairs.append((headline, other_sentence, 0))  # etichetă 0
                    different_pairs += 1
                # Oprește după max_similar_pairs perechi diferite
                if different_pairs >= len(similar_pairs):
                    break

    return pairs

# Creează perechi de propoziții
pairs = generate_balanced_sentence_pairs(df)

# Conversie la DataFrame și separare în antrenare/validare
similarity_df = pd.DataFrame(pairs, columns=["headline", "sentence", "label"])
train_df, val_df = train_test_split(similarity_df, test_size=0.2, random_state=42)

In [8]:
print("Total samples:", len(similarity_df))
print("Similar (label=1):", len(similarity_df[similarity_df["label"] == 1]))
print("Different (label=0):", len(similarity_df[similarity_df["label"] == 0]))

Total samples: 2753
Similar (label=1): 1379
Different (label=0): 1374


In [9]:
model.save_pretrained("./roberta-from-scratch")
tokenizer.save_pretrained("./roberta-from-scratch")

('./roberta-from-scratch/tokenizer_config.json',
 './roberta-from-scratch/special_tokens_map.json',
 './roberta-from-scratch/vocab.json',
 './roberta-from-scratch/merges.txt',
 './roberta-from-scratch/added_tokens.json',
 './roberta-from-scratch/tokenizer.json')

In [10]:
# Încarcă modelul RoBERTa antrenat și creează un model de tip SentenceTransformer
word_embedding_model = models.Transformer('./roberta-from-scratch', max_seq_length=512)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode_max_tokens=True)
similarity_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Crează setul de antrenament pentru similaritate semantică
train_examples = [InputExample(texts=[row['headline'], row['sentence']], label=float(row['label']))
                  for _, row in train_df.iterrows()]
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
train_loss = losses.ContrastiveLoss(model=similarity_model)

# Fine-tuning pe task-ul de similaritate
similarity_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=10,
    output_path='./roberta-similarity-model'
)

Some weights of RobertaModel were not initialized from the model checkpoint at ./roberta-from-scratch and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.0475
1000,0.0389
1500,0.0323
2000,0.0281
2500,0.0247


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

In [11]:
# Perechi de test
headline = ""
sentence = "COVID-19 este periculos."

# Obține embedding-urile
embedding_headline = similarity_model.encode(headline, convert_to_tensor=True)
embedding_sentence = similarity_model.encode(sentence, convert_to_tensor=True)

# Calculează similaritatea cosine
cosine_score = util.cos_sim(embedding_headline, embedding_sentence)
print(headline)
print(sentence)
print("Cosine Similarity Score:", cosine_score.item())


COVID-19 este periculos.
Cosine Similarity Score: 0.5856054425239563


In [20]:
from torch.utils.data import DataLoader
import torch
from sentence_transformers import InputExample
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from sentence_transformers.readers import InputExample

# Create a custom collate function to handle InputExample objects
def collate_fn(batch):
    texts = [example.texts for example in batch]
    labels = torch.tensor([example.label for example in batch], dtype=torch.float)
    return {'texts': texts, 'label': labels}

# Generate validation data
val_examples = [InputExample(texts=[row['headline'], row['sentence']], label=float(row['label']))
                for _, row in val_df.iterrows()]

# Create a DataLoader with the custom collate_fn
val_dataloader = DataLoader(val_examples, shuffle=False, batch_size=8, collate_fn=collate_fn)

# Initialize lists for true labels and predictions
true_labels = []
predictions = []

# Iterate through the validation DataLoader
for batch in val_dataloader:
    texts = batch['texts']
    labels = batch['label'].numpy()

    # Get embeddings for the headlines and sentences
    embeddings_1 = similarity_model.encode([t[0] for t in texts], convert_to_tensor=True)
    embeddings_2 = similarity_model.encode([t[1] for t in texts], convert_to_tensor=True)

    # Compute cosine similarity
    cosine_scores = util.cos_sim(embeddings_1, embeddings_2)

    # Extract diagonal values for binary predictions
    preds = (cosine_scores.diagonal() > 0.5).cpu().numpy()  # Use diagonal for predictions

    # Debugging step: print true_labels and predictions to ensure they are binary
    print(f"True labels: {labels[:10]}")  # print first 10 true labels
    print(f"Predictions: {preds[:10]}")  # print first 10 predictions

    # Append true labels and predictions to their respective lists
    true_labels.extend(labels)
    predictions.extend(preds)

# Compute metrics
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)
precision = precision_score(true_labels, predictions)
recall = recall_score(true_labels, predictions)
auc = roc_auc_score(true_labels, predictions)

# Print the metrics
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)


True labels: [0. 0. 0. 1. 1. 1. 0. 0.]
Predictions: [ True  True  True  True  True  True  True  True]
True labels: [0. 0. 0. 0. 0. 0. 1. 1.]
Predictions: [ True False  True  True  True  True  True  True]
True labels: [0. 1. 0. 1. 0. 0. 0. 0.]
Predictions: [False  True  True  True  True False  True  True]
True labels: [0. 1. 1. 0. 0. 0. 0. 1.]
Predictions: [False  True  True  True  True  True  True  True]
True labels: [1. 0. 1. 0. 0. 1. 0. 0.]
Predictions: [ True  True  True  True  True  True  True  True]
True labels: [0. 0. 0. 1. 1. 0. 1. 0.]
Predictions: [False  True False  True  True  True  True  True]
True labels: [0. 0. 1. 0. 0. 1. 0. 0.]
Predictions: [ True  True  True  True  True  True  True  True]
True labels: [0. 1. 1. 1. 1. 1. 0. 0.]
Predictions: [ True  True  True  True  True  True  True  True]
True labels: [1. 0. 1. 1. 1. 0. 1. 0.]
Predictions: [ True  True  True  True  True  True  True  True]
True labels: [0. 1. 1. 1. 0. 1. 1. 0.]
Predictions: [ True  True  True  True  True