In [1]:
import pandas as pd
import re
import torch
import warnings
warnings.filterwarnings('ignore', category=UserWarning, module='huggingface_hub')
from datasets import load_dataset
import numpy as np
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
%env WANDB_DISABLED=true

env: WANDB_DISABLED=true


In [2]:
train_url = "https://raw.githubusercontent.com/sismetanin/rusentitweet/refs/heads/main/rusentitweet_train.csv"
test_url  = "https://raw.githubusercontent.com/sismetanin/rusentitweet/refs/heads/main/rusentitweet_test.csv"

train_df = pd.read_csv(train_url)
test_df  = pd.read_csv(test_url)

In [3]:
skip_labels = ['speech', 'skip']
train_df = train_df[~train_df['label'].isin(skip_labels)]
test_df  = test_df[~test_df['label'].isin(skip_labels)]

In [4]:
mapping = {
    "negative": 0,
    "neutral": 1,
    "positive": 2
}
train_df["labels"] = train_df["label"].map(mapping)
test_df["labels"]  = test_df["label"].map(mapping)

In [5]:
train_df.head()

Unnamed: 0,text,label,id,labels
0,Помойму я вкрашилась в Чимина🤧 https://t.co/t2...,positive,1282311169534038016,2
1,@namaskaramsaroo Мотоцикль,neutral,1272864221202530309,1
2,Михаил Мишустин: меры по борьбе с коронавирусо...,neutral,1296860899739947008,1
4,ну что пойду чекну фоточки,neutral,1287678712612364288,1
5,@buybread_ я не с порядке!!!!,negative,1335130757044563971,0


In [6]:
train_df.drop(['id', 'label'], axis=1, inplace=True)
test_df.drop(['id', 'label'], axis=1, inplace=True)

In [7]:
train_df.head()

Unnamed: 0,text,labels
0,Помойму я вкрашилась в Чимина🤧 https://t.co/t2...,2
1,@namaskaramsaroo Мотоцикль,1
2,Михаил Мишустин: меры по борьбе с коронавирусо...,1
4,ну что пойду чекну фоточки,1
5,@buybread_ я не с порядке!!!!,0


In [8]:
test_df.head()

Unnamed: 0,text,labels
1,я считаю это мем года https://t.co/xoVKj5y8Mj,2
2,ян русский на сотку все запятые где надо🤙🏻👍🏻👍🏻...,2
4,@daria_karapet * терияки бойз начинает играть*,1
5,(пушка на Караульной горе больше не стреляет Б...,0
6,@Iori_loves_U Как мило /смутилась/ спасибо 🥰🌸,2


In [9]:
def preprocess_text(text):
    text = text.lower()

    text = re.sub(r'[^а-яё\s]', '', text)

    text = re.sub(r'\s+', ' ', text).strip()

    words = text.split()

    return ' '.join(words)

In [10]:
train_df['text'] = train_df['text'].apply(preprocess_text)
test_df['text'] = test_df['text'].apply(preprocess_text)

In [11]:
train_df.head()

Unnamed: 0,text,labels
0,помойму я вкрашилась в чимина,2
1,мотоцикль,1
2,михаил мишустин меры по борьбе с коронавирусом...,1
4,ну что пойду чекну фоточки,1
5,я не с порядке,0


In [12]:
model_checkpoint = "blanchefort/rubert-base-cased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)
if torch.cuda.is_available():
    model.cuda()

tokenizer_config.json:   0%|          | 0.00/499 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/943 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

In [13]:
def estimate_sentiment(messages):
    scores = []

    for text in tqdm(messages):
        with torch.no_grad():
            inputs = tokenizer(
                text,
                return_tensors='pt',
                truncation=True,
                padding=True,
                max_length=512).to(model.device)

            outputs = model(**inputs)
            proba = torch.softmax(outputs.logits, dim=-1)[0].cpu().numpy()
            p_neg = proba[2]
            p_neu = proba[0]
            p_pos = proba[1]

            score = p_pos - p_neg


            # convert to 1..10
            score_10 = round((score + 1) * 4.5 + 1)
            score_10 = max(1, min(10, score_10))

            scores.append(score_10)

    return scores

In [14]:
def convert_10_to_3_v2(scores_10, method='balanced'):
    """
    Разные методы преобразования
    """
    scores_3 = []
    for score in scores_10:
        if method == 'balanced':
            # Текущие границы
            if score <= 3: scores_3.append(0)
            elif score <= 6: scores_3.append(1)
            else: scores_3.append(2)
        elif method == 'sensitive':
            # Более чувствительный к негативу
            if score <= 4: scores_3.append(0)
            elif score <= 7: scores_3.append(1)
            else: scores_3.append(2)
        elif method == 'strict':
            # Строгие границы
            if score <= 2: scores_3.append(0)
            elif score <= 5: scores_3.append(1)
            else: scores_3.append(2)
    return scores_3


In [15]:
N = len(test_df)
sentiments = estimate_sentiment(test_df['text'].tolist()[:N])
true_labels = test_df['labels'].tolist()[:N]

for method in ['balanced', 'sensitive', 'strict']:
    test_labels = convert_10_to_3_v2(sentiments, method)
    acc = accuracy_score(true_labels, test_labels)
    print(f"Method {method}: Accuracy = {acc:.3f}")

  0%|          | 0/2211 [00:00<?, ?it/s]

Method balanced: Accuracy = 0.626
Method sensitive: Accuracy = 0.626
Method strict: Accuracy = 0.375


In [None]:
train_df.shape

(8842, 2)

In [None]:
train_ds = Dataset.from_pandas(train_df)
test_ds = Dataset.from_pandas(test_df)

In [None]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

In [None]:
train_ds = train_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)


Map:   0%|          | 0/8842 [00:00<?, ? examples/s]

Map:   0%|          | 0/2211 [00:00<?, ? examples/s]

In [None]:
train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


In [None]:
args = TrainingArguments(
    output_dir="./sentiment_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    report_to="none",
    logging_steps=50,
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.from_numpy(logits), dim=-1)
    return {"accuracy": accuracy_score(labels, preds)}


In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.8165,0.818935,0.67526
2,0.7504,0.787306,0.69109
3,0.6901,0.789513,0.692899
4,0.6465,0.812791,0.69109


TrainOutput(global_step=2212, training_loss=0.7258769582020654, metrics={'train_runtime': 1167.3093, 'train_samples_per_second': 30.299, 'train_steps_per_second': 1.895, 'total_flos': 2326448839550976.0, 'train_loss': 0.7258769582020654, 'epoch': 4.0})

In [18]:
N = len(test_df)
sentiments = estimate_sentiment(test_df['text'].tolist()[:N])
true_labels = test_df['labels'].tolist()[:N]

for method in ['balanced', 'sensitive', 'strict']:
    test_labels = convert_10_to_3_v2(sentiments, method)
    acc = accuracy_score(true_labels, test_labels)
    print(f"Method {method}: Accuracy = {acc:.3f}")

  0%|          | 0/2211 [00:00<?, ?it/s]

Method balanced: Accuracy = 0.697
Method sensitive: Accuracy = 0.697
Method strict: Accuracy = 0.446


In [19]:
true_labels = convert_10_to_3_v2(sentiments, 'sensitive')

In [20]:
dataset_id2label = {
    0: "NEGATIVE",
    1: "NEUTRAL",
    2: "POSITIVE"
}

for i in range(4, 8):
    text = test_df.loc[i, "text"]
    true_id = true_labels[i]
    pred_id = test_labels[i]

    print(f"Example #{i+1}")
    print(f"Text: {text}")
    print(f"True label: {true_id} → {dataset_id2label[true_id]}")
    print(f"Pred label: {pred_id} → {dataset_id2label[pred_id]}")
    print("-" * 60)


Example #5
Text: терияки бойз начинает играть
True label: 2 → POSITIVE
Pred label: 2 → POSITIVE
------------------------------------------------------------
Example #6
Text: пушка на караульной горе больше не стреляет бабах красноярск
True label: 1 → NEUTRAL
Pred label: 2 → POSITIVE
------------------------------------------------------------
Example #7
Text: как мило смутилась спасибо
True label: 0 → NEGATIVE
Pred label: 0 → NEGATIVE
------------------------------------------------------------
Example #8
Text: по джакурамам это паблик нарочан а моё это просто по джакураю
True label: 2 → POSITIVE
Pred label: 2 → POSITIVE
------------------------------------------------------------
