In [None]:
import pandas as pd
import numpy as np
import random
import torch
import transformers
import torch.nn as nn
from transformers import AutoModel, BertTokenizer, BertForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import load_metric, Dataset
from sklearn.metrics import classification_report, f1_score
from transformers import pipeline

In [None]:
from sklearn.model_selection import train_test_split

# Read the labelled corpus and keep 20% of the rows aside as the test split.
# random_state is fixed so the split is the same on every run.
DATA_PATH = '/kaggle/input/datasettt/category_numbers.csv'
dataset = pd.read_csv(DATA_PATH)

train_df, test_df = train_test_split(
    dataset,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Display the training split as returned by train_test_split (index not reset yet).
train_df

In [None]:
# Replace the index of the training split with a fresh 0..n-1 range,
# discarding the old index values instead of keeping them as a column.
train_df = train_df.reset_index(drop=True)

In [None]:
# Display the training split again, now that its index has been reset.
train_df

In [None]:
# Display the test split as returned by train_test_split (index not reset yet).
test_df

In [None]:
# Replace the index of the test split with a fresh 0..n-1 range,
# discarding the old index values instead of keeping them as a column.
test_df = test_df.reset_index(drop=True)

In [None]:
# Display the test split again, now that its index has been reset.
test_df

In [None]:
# For each split, take the 'Text' column as model input and the
# 'Category' column as the label to predict.
train_text, train_labels = train_df['Text'], train_df['Category']
test_text, test_labels = test_df['Text'], test_df['Category']

In [None]:
# The classifier and its tokenizer are loaded from the same checkpoint.
checkpoint = 'DeepPavlov/rubert-base-cased-sentence'

# Use the GPU when one is visible, otherwise fall back to CPU; a hard-coded
# .to("cuda") raises on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# num_labels=13 sizes the classification head to 13 output classes.
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=13).to(device)
tokenizer = BertTokenizer.from_pretrained(checkpoint)

In [None]:
def _word_count(text):
    """Return the number of whitespace-separated pieces in ``str(text)``."""
    return len(str(text).split())


# Whitespace word counts per text (not tokenizer token counts); the largest
# value over both splits is shown as the cell output.
seq_len_train = list(map(_word_count, train_df['Text']))
seq_len_test = list(map(_word_count, test_df['Text']))
longest_test, longest_train = max(seq_len_test), max(seq_len_train)
max_seq_len = max(longest_test, longest_train)
max_seq_len

In [None]:
# Both splits are encoded with identical settings: every text is truncated
# and padded to a fixed length of 512.
encode_options = {
    'max_length': 512,
    'padding': 'max_length',
    'truncation': True,
}
tokens_train = tokenizer.batch_encode_plus(train_text.values, **encode_options)
tokens_test = tokenizer.batch_encode_plus(test_text.values, **encode_options)

In [None]:
class Data(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with class labels.

    Args:
        encodings: Mapping (anything with ``.items()``) from feature name to a
            per-example sequence, e.g. the result of ``batch_encode_plus``.
        labels: Per-example labels; a pandas Series or any positionally
            indexable sequence.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors with a ``"labels"`` entry."""
        item = {name: torch.tensor(values[idx]) for name, values in self.encodings.items()}
        # Look the label up by position. Indexing a pandas Series with []
        # is label-based, so it would raise (or pick the wrong row) whenever
        # the Series index is not exactly 0..n-1.
        if hasattr(self.labels, "iloc"):
            label = self.labels.iloc[idx]
        else:
            label = self.labels[idx]
        # The label is wrapped in a one-element list, so each example's
        # "labels" tensor has shape (1,).
        item["labels"] = torch.tensor([label])
        return item

    def __len__(self):
        """Number of examples, taken from the labels container."""
        return len(self.labels)
    
# Wrap the encoded texts and their labels of each split in a Data instance.
train_dataset, test_dataset = (
    Data(tokens_train, train_labels),
    Data(tokens_test, test_labels),
)

In [None]:
from sklearn.metrics import f1_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        dict with a single key ``'F1'`` holding the weighted F1 score.
    """
    predicted_classes = pred.predictions.argmax(-1)
    weighted_f1 = f1_score(pred.label_ids, predicted_classes, average='weighted')
    return {'F1': weighted_f1}

In [None]:
# Fine-tuning configuration. Evaluation, logging and checkpoint saving are
# all set to the same 'epoch' strategy.
# NOTE(review): newer transformers releases may expect `eval_strategy`
# instead of `evaluation_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    # Output locations
    output_dir='/kaggle/working/results',
    logging_dir='/kaggle/working/logs',
    # Optimisation
    num_train_epochs=10,
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Schedule of evaluation / logging / checkpointing
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    # Reproducibility
    seed=21,
)

In [None]:
# Hold out part of the training data for the per-epoch evaluation.
# Evaluating on the very data the model is trained on makes the reported F1
# and the load_best_model_at_end checkpoint choice reflect memorisation, not
# generalisation. The test set is left untouched for the final report.
VAL_FRACTION = 0.1
val_size = max(1, int(VAL_FRACTION * len(train_dataset)))
train_subset, val_subset = torch.utils.data.random_split(
    train_dataset,
    [len(train_dataset) - val_size, val_size],
    # Seeded generator so the held-out examples are the same on every run.
    generator=torch.Generator().manual_seed(21),
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=val_subset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Store the fine-tuned model and its tokenizer together in one directory
# (relative to the current working directory).
model_path = "fine-tune-bert3"
model.save_pretrained(save_directory=model_path)
tokenizer.save_pretrained(save_directory=model_path)

In [None]:
def get_prediction():
    """Return the arg-max class index for every example in ``test_dataset``.

    Uses the notebook-level ``trainer`` and ``test_dataset`` objects.
    """
    output = trainer.predict(test_dataset)
    return np.argmax(output.predictions, axis=-1)


pred = get_prediction()

In [None]:
# Per-class report on the test split, followed by the weighted F1 score.
average_type = 'weighted'
report_text = classification_report(test_labels, pred)
print(report_text)
weighted_f1 = f1_score(test_labels, pred, average=average_type)
print(weighted_f1)

In [None]:
def predict_text(text, model, tokenizer):
    """Classify a single text and return the name of the predicted category.

    Args:
        text: The text to classify. The prediction is reduced with
            ``.item()``, so exactly one sequence is expected.
        model: A ``torch.nn.Module`` whose forward pass accepts the tokenizer
            output as keyword arguments and returns an object with ``.logits``.
        tokenizer: Callable that turns ``text`` into a mapping of tensors.

    Returns:
        The category name for the arg-max class index (one of 13 names).
    """
    class_to_label = {
        0: 'Healthcare',
        1: 'Housing_and_Public_Utilities',
        2: 'Education',
        3: 'Infrastructure',
        4: 'Culture',
        5: 'Environmental_Conditions',
        6: 'Social_Security',
        7: 'Politics',
        8: 'Safety',
        9: 'Availability_of_Goods_and_Services',
        10: 'Official_Statements',
        11: 'Tourism',
        12: 'Facts',
    }

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Send the inputs to wherever the model's weights live instead of assuming
    # "cuda"; this keeps the function usable on CPU-only machines.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Run the forward pass in eval mode (train-mode layers such as dropout
    # would make the output non-deterministic), then restore the previous mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    predicted_class_idx = torch.argmax(outputs.logits, dim=1).item()
    return class_to_label[predicted_class_idx]

# Classify one sample sentence with the fine-tuned model and print the result.
text = "Ваня получил пятерку по алгебре в школе"
predicted_label = predict_text(text=text, model=model, tokenizer=tokenizer)
print(f"Предсказанная метка: {predicted_label}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = test_dataset.labels

# Display names for class indices 0..12, in index order.
class_names = ['Healthcare', 'Housing_and_Public_Utilities', 'Education', 'Infrastructure', 'Culture', 'Environmental_Conditions', 'Social_Security', 'Politics', 'Safety', 'Availability_of_Goods_and_Services', 'Official_Statements', 'Tourism', 'Facts']

# Confusion matrix plot
# labels= pins the matrix to one row/column per entry of class_names. Without
# it, a class that never occurs in true_labels or pred_labels is dropped and
# the tick labels no longer line up with the cells.
cm = confusion_matrix(true_labels, pred_labels, labels=list(range(len(class_names))))
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()

# Classification report plot
# Build the per-class table here: report_df was plotted without ever being
# defined, which raises NameError on a fresh kernel.
report = classification_report(
    true_labels,
    pred_labels,
    labels=list(range(len(class_names))),  # keep one row per class even if a class is absent
    target_names=class_names,
    output_dict=True,
    zero_division=0,
)
report_df = pd.DataFrame.from_dict(
    {name: report[name] for name in class_names},
    orient='index',
)[['precision', 'recall', 'f1-score']]

# Draw on an explicit axes so pandas does not open a second figure and leave
# an empty one behind.
fig, ax = plt.subplots(figsize=(12, 6))
report_df.plot(kind='bar', ax=ax)
ax.set_title('Classification Report')
ax.set_xlabel('Classes')
ax.set_ylabel('Scores')
plt.xticks(rotation=45, ha='right')
ax.legend(loc='lower right')
ax.grid(axis='y')
fig.tight_layout()
plt.show()