In [None]:
!pip install -q transformers datasets torch scikit-learn spacy pandas accelerate
!python -m spacy download uk_core_news_sm
!python -m spacy download ru_core_news_sm
!python -m spacy download en_core_web_sm

import pandas as pd
import numpy as np
import torch
import spacy
from torch import nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    XLMRobertaTokenizer,
    XLMRobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from torch.utils.data import Dataset

Collecting uk-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/uk_core_news_sm-3.8.0/uk_core_news_sm-3.8.0-py3-none-any.whl (14.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.9/14.9 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymorphy3>=1.0.0 (from uk-core-news-sm==3.8.0)
  Downloading pymorphy3-2.0.6-py3-none-any.whl.metadata (2.4 kB)
Collecting pymorphy3-dicts-uk (from uk-core-news-sm==3.8.0)
  Downloading pymorphy3_dicts_uk-2.4.1.1.1663094765-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting dawg2-python>=0.8.0 (from pymorphy3>=1.0.0->uk-core-news-sm==3.8.0)
  Downloading dawg2_python-0.9.0-py3-none-any.whl.metadata (7.5 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->uk-core-news-sm==3.8.0)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading pymorphy3-2.0.6-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Prefer the GPU whenever PyTorch can see one.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the corpus, drop rows missing either the text or the target, and
# normalise dtypes: text as str, target as an integer `label` column.
df = pd.read_parquet("final_data.parquet").dropna(subset=['content', 'manipulative'])
df = df.assign(
    content=df['content'].astype(str),
    label=df['manipulative'].astype(int),
)

print("Class distribution:\n", df['label'].value_counts())

# Stratified 85/15 train/validation split with a fixed seed.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['content'].tolist(),
    df['label'].tolist(),
    test_size=0.15,
    random_state=42,
    shuffle=True,
    stratify=df['label'],
)

model_name = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)

class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable collection of texts; each item is converted with ``str()``.
        labels: Indexable collection of integer class ids aligned with ``texts``.
        tokenizer: Callable tokenizer (Hugging Face style) returning a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Fail at construction time instead of with an IndexError mid-training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for one example."""
        text = str(self.texts[item])
        label = self.labels[item]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour of `__call__`.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch dimension; flatten() removes it.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# "Balanced" (inverse-frequency) class weights computed on the training labels,
# then softened with a square root before being handed to the loss.
class_ids = np.unique(train_labels)
raw_weights = compute_class_weight(class_weight='balanced', classes=class_ids, y=train_labels)
smooth_weights = np.sqrt(raw_weights)
weights_tensor = torch.tensor(smooth_weights, dtype=torch.float32, device=device)
print(f"Calculated Class Weights: {smooth_weights}")

# Tokenizing datasets for the two splits (default max_len of NewsDataset).
train_dataset = NewsDataset(train_texts, train_labels, tokenizer)
val_dataset = NewsDataset(val_texts, val_labels, tokenizer)

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``weights_tensor``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the class-weighted cross-entropy loss.

        Returns ``(loss, outputs)`` when ``return_outputs`` is true, otherwise just
        the loss. ``num_items_in_batch`` is accepted but not used.
        """
        targets = inputs.get("labels")
        model_out = model(**inputs)
        n_classes = self.model.config.num_labels
        scores = model_out.get("logits").view(-1, n_classes)
        weighted_loss = nn.functional.cross_entropy(
            scores, targets.view(-1), weight=weights_tensor
        )

        if return_outputs:
            return weighted_loss, model_out
        return weighted_loss

def compute_metrics(pred):
    """Compute accuracy and binary precision/recall/F1 from a prediction object.

    ``pred`` must expose ``label_ids`` (true labels) and ``predictions`` (per-class
    scores); the predicted class is the argmax over the last axis.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    prec, rec, f_score, _support = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f_score,
        'precision': prec,
        'recall': rec,
    }


# Pretrained encoder with a freshly initialised 2-class classification head.
model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)
model = model.to(device)

training_args = TrainingArguments(
    output_dir="./results",
    report_to="none",

    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # Evaluate and checkpoint on the same 500-step cadence; log every 100 steps
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,

    # Reload the checkpoint with the best validation F1 when training ends
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Stop once the tracked metric has failed to improve for 3 consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

print("Starting training...")
trainer.train()

# Persist the fine-tuned weights together with the tokenizer files.
output_dir = "./models/en"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Using device: cuda
Class distribution:
 label
1    28267
0     9486
Name: count, dtype: int64


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Calculated Class Weights: [1.41065733 0.81718444]


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.565,0.469037,0.830479,0.890336,0.863314,0.919104
1000,0.4423,0.441924,0.853081,0.904125,0.883957,0.925236
1500,0.4192,0.378902,0.866502,0.912113,0.899358,0.925236
2000,0.4148,0.393766,0.830832,0.881377,0.927789,0.839387
2500,0.3409,0.455193,0.868974,0.916648,0.875161,0.962264
3000,0.3458,0.377589,0.857143,0.902729,0.920775,0.885377
3500,0.3251,0.3897,0.878333,0.920319,0.902882,0.938443
4000,0.3411,0.360058,0.865796,0.909394,0.919479,0.899528
4500,0.2512,0.451363,0.86403,0.907873,0.921321,0.894811
5000,0.2955,0.411832,0.865442,0.909178,0.919036,0.899528


('./models/en/tokenizer_config.json',
 './models/en/special_tokens_map.json',
 './models/en/sentencepiece.bpe.model',
 './models/en/added_tokens.json')

In [None]:
import shutil
from google.colab import files

# 1. Create a zip archive from the models/en folder
shutil.make_archive('my_model_archive', 'zip', './models/en')

# 2. Download the archive to the local machine
files.download('my_model_archive.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Load every spaCy pipeline exactly once. The English pipeline also serves as the
# fallback for unknown language codes, so both keys share a single instance
# instead of loading the same model into memory twice.
_en_pipeline = spacy.load("en_core_web_sm")
nlp_models = {
    'uk': spacy.load("uk_core_news_sm"),
    'ru': spacy.load("ru_core_news_sm"),
    'en': _en_pipeline,
    'default': _en_pipeline,
}

def analyze_news(text, lang='uk'):
    """Classify a news text with the fine-tuned model and extract its named entities.

    Uses the module-level ``model``, ``tokenizer``, ``device`` and ``nlp_models``.

    Args:
        text: The news text to analyse (truncated to 128 tokens for the classifier).
        lang: Key into ``nlp_models`` selecting the spaCy pipeline; unknown keys
            fall back to ``nlp_models['default']``.

    Returns:
        dict with:
            ``text``: the first 50 characters of the input, followed by ``"..."``
                only when the input was actually longer;
            ``status``: ``"FAKE/MANIPULATION"`` for predicted class 1, else ``"TRUTH"``;
            ``confidence``: softmax probability of the predicted class, 4 decimals;
            ``entities``: unique ``(text, label)`` pairs in order of first appearance.
    """
    # Trainer can leave the model in training mode; switch dropout off so that
    # inference is deterministic.
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    confidence, predicted_class = torch.max(probs, dim=1)

    is_manipulative = bool(predicted_class.item())

    nlp = nlp_models.get(lang, nlp_models['default'])
    doc = nlp(text)
    # dict.fromkeys removes duplicates while keeping a stable order (set() would not).
    entities = list(dict.fromkeys((ent.text, ent.label_) for ent in doc.ents))

    # Only mark the preview as truncated when something was actually cut off.
    preview = text if len(text) <= 50 else text[:50] + "..."

    return {
        "text": preview,
        "status": "FAKE/MANIPULATION" if is_manipulative else "TRUTH",
        "confidence": f"{confidence.item():.4f}",
        "entities": entities
    }
print("\n--- DEMO ---")
# (text, language code) pairs covering the three supported spaCy pipelines.
examples = [
    ("The White House announced new taxes for everyone.", "en"),
    ("В Україні скасували воєнний стан, всім розійтися.", "uk"),
    ("Ученые доказали, что земля плоская.", "ru"),
    ("Росія серйозно пошкодила ТЕС у різних областях", "uk"),
]

for sample in examples:
    print(analyze_news(*sample))

In [None]:
from google.colab import files
import shutil
# Zip the "results" directory (the Trainer output_dir used above) into results.zip.
shutil.make_archive("results", 'zip', "results")
# Hand the archive to the browser for download via the Colab helper.
files.download("results.zip")

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
# Class weights for the loss; None means unweighted. Reassigned by
# train_language_model before each training run.
current_weights_tensor = None


class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the global ``current_weights_tensor``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the class-weighted cross-entropy loss.

        The weight tensor is read from the module-level ``current_weights_tensor``
        at call time. Returns ``(loss, outputs)`` when ``return_outputs`` is true,
        otherwise just the loss. ``num_items_in_batch`` is accepted but not used.
        """
        targets = inputs.get("labels")
        model_out = model(**inputs)
        n_classes = self.model.config.num_labels
        scores = model_out.get("logits").view(-1, n_classes)
        weighted_loss = nn.functional.cross_entropy(
            scores, targets.view(-1), weight=current_weights_tensor
        )

        if return_outputs:
            return weighted_loss, model_out
        return weighted_loss

class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Indexable collection of texts; each item is converted with ``str()``.
        labels: Indexable collection of integer class ids aligned with ``texts``.
        tokenizer: Callable tokenizer (Hugging Face style) returning a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length every sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Fail at construction time instead of with an IndexError mid-training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for one example."""
        text = str(self.texts[item])
        label = self.labels[item]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour of `__call__`.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # return_tensors='pt' adds a leading batch dimension; flatten() removes it.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def compute_metrics(pred):
    """Compute accuracy and binary precision/recall/F1 from a prediction object.

    ``pred`` must expose ``label_ids`` (true labels) and ``predictions`` (per-class
    scores); the predicted class is the argmax over the last axis.
    """
    true_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)
    scores = precision_recall_fscore_support(true_ids, predicted_ids, average='binary')
    precision, recall, f1 = scores[0], scores[1], scores[2]
    acc = accuracy_score(true_ids, predicted_ids)
    metrics = {'accuracy': acc, 'f1': f1}
    metrics['precision'] = precision
    metrics['recall'] = recall
    return metrics

# Reload the corpus, this time also requiring the `lang` column used to
# select the per-language training subsets.
df = pd.read_parquet("final_data.parquet").dropna(
    subset=['content', 'manipulative', 'lang']
)

def train_language_model(language_code, model_name, output_dir, patience=3):
    """Fine-tune a binary classifier on the rows of ``df`` written in one language.

    Rows whose ``lang`` equals ``language_code`` are split 85/15 (stratified,
    seed 42) and wrapped in ``NewsDataset``. ``model_name`` is then fine-tuned for
    up to 3 epochs with a class-weighted cross-entropy loss. Evaluation and
    checkpointing run every 200 steps; training stops early once validation loss
    has not improved for ``patience`` evaluations, and the best checkpoint is
    reloaded at the end.

    Side effect: overwrites the module-level ``current_weights_tensor``, which
    ``WeightedTrainer.compute_loss`` reads while training.

    Args:
        language_code: Value of the ``lang`` column selecting the training rows.
        model_name: Hugging Face model id or path of the pretrained checkpoint.
        output_dir: Directory where the Trainer writes its checkpoints.
        patience: Early-stopping patience, counted in evaluations.

    Returns:
        ``(model, tokenizer)``, with the model switched to eval mode.

    Raises:
        ValueError: If the language has no rows, or its labels are not exactly
            the two classes 0 and 1.
    """
    global current_weights_tensor

    print(f"\n{'='*40}")
    print(f"TRAINING: {language_code.upper()} | Patience: {patience}")
    print(f"{'='*40}")

    subset = df[df['lang'] == language_code].copy()
    if subset.empty:
        raise ValueError(f"No rows with lang == {language_code!r} in the dataset")
    subset['label'] = subset['manipulative'].astype(int)

    # The weight vector gets one entry per class present, while the model head has
    # two outputs; anything other than {0, 1} would fail later with an opaque error.
    present_classes = set(subset['label'].unique().tolist())
    if present_classes != {0, 1}:
        raise ValueError(
            f"Language {language_code!r} must contain both classes 0 and 1, "
            f"found {sorted(present_classes)}"
        )

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        subset['content'].tolist(),
        subset['label'].tolist(),
        test_size=0.15,
        random_state=42,
        shuffle=True,
        stratify=subset['label']
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train_dataset = NewsDataset(train_texts, train_labels, tokenizer)
    val_dataset = NewsDataset(val_texts, val_labels, tokenizer)

    # Square-root-softened "balanced" class weights, published through the global
    # that WeightedTrainer reads.
    raw_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
    current_weights_tensor = torch.tensor(np.sqrt(raw_weights), dtype=torch.float).to(device)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_steps=200,

        # Early stopping needs periodic evaluation with checkpoints on the same cadence.
        eval_strategy="steps",
        eval_steps=200,
        save_strategy="steps",
        save_steps=200,

        # Lower validation loss is better; reload the best checkpoint when done.
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        save_total_limit=2,
        report_to="none"
    )

    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[
            EarlyStoppingCallback(early_stopping_patience=patience)
        ]
    )

    trainer.train()

    # Training may end with the model in train mode; callers use it for inference.
    model.eval()
    return model, tokenizer


# language code -> (model, tokenizer) as returned by train_language_model
trained_models = {}

trained_models['en'] = train_language_model(
    'en', 'roberta-base', './results_en', patience=3
)

trained_models['uk'] = train_language_model(
    'uk', 'youscan/ukr-roberta-base', './results_uk', patience=3
)

# Load each spaCy pipeline once; the English pipeline doubles as the fallback, so
# both keys share one instance instead of loading the same model twice.
_en_nlp = spacy.load("en_core_web_sm")
nlp_tools = {
    'uk': spacy.load("uk_core_news_sm"),
    'en': _en_nlp,
    'default': _en_nlp,
}

def analyze_news_smart(text, lang):
    """Classify a news text with the model trained for its language.

    Looks up ``(model, tokenizer)`` in the module-level ``trained_models``; a
    language without its own model falls back to the ``'en'`` entry.

    Args:
        text: The news text to classify (truncated to 128 tokens).
        lang: Language code used to select the model.

    Returns:
        dict with:
            ``lang``: the language code that was passed in;
            ``text``: the first 50 characters of the input, followed by ``"..."``
                only when the input was actually longer;
            ``status``: ``"FAKE"`` for predicted class 1, else ``"TRUTH"``;
            ``confidence``: softmax probability of the predicted class, 4 decimals.
    """
    if lang in trained_models:
        model, tokenizer = trained_models[lang]
    else:
        # No dedicated model for this language: use the English one.
        model, tokenizer = trained_models['en']

    # Make sure dropout is disabled so repeated calls give the same answer.
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding=True).to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    confidence, predicted_class = torch.max(probs, dim=1)
    is_manipulative = bool(predicted_class.item())

    # Only mark the preview as truncated when something was actually cut off.
    preview = text if len(text) <= 50 else text[:50] + "..."

    return {
        "lang": lang,
        "text": preview,
        "status": "FAKE" if is_manipulative else "TRUTH",
        "confidence": f"{confidence.item():.4f}"
    }

print("\n--- FINAL DEMO ---")
# One sample per trained language model.
for demo_text, demo_lang in (
    ("Breaking: Taxes will remain unchanged.", "en"),
    ("Терміново! Всім видадуть по 10 тисяч!", "uk"),
):
    print(analyze_news_smart(demo_text, demo_lang))