<a href="https://colab.research.google.com/github/path-0f-misantrope/most_uselles_musor_final/blob/main/modellll.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets
!pip install scikit-learn


In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset


In [None]:
# Load the CSV into a DataFrame and print its column/dtype summary.
csv_path = '/content/разметка_готовая.csv'
df = pd.read_csv(csv_path)
df.info()

In [None]:

# Fold the "Понравилось выполнение заявки" flag into
# "Нравится качество выполнения заявки": wherever the former is 1 and the
# latter is 0, the latter is set to 1. The source column is then removed.
merged_col = "Понравилось выполнение заявки"
target_col = "Нравится качество выполнения заявки"

# Guard on the column's presence so the cell can be re-run safely: after the
# first run the source column no longer exists and an unguarded lookup would
# raise KeyError.
if merged_col in df.columns:
    needs_merge = (df[merged_col] == 1) & (df[target_col] == 0)
    df.loc[needs_merge, target_col] = 1

    # Rebind instead of dropping in place, so the frame is not mutated
    # behind the back of anything else holding a reference to it.
    df = df.drop(columns=[merged_col])


In [None]:
# Show the column summary again, now that the merged column has been dropped.
df.info()

In [None]:

# Name of the column with the comment text, and the target label columns
# (one model output per entry, in this order).
text_col = 'comment'
label_cols = [
    'Нравится скорость отработки заявок',
    'Нравится качество выполнения заявки',
    'Нравится качество работы сотрудников',
    'Вопрос решен'
]

# Keep only the text and label columns, discarding rows with any missing value.
needed_cols = [text_col, *label_cols]
df = df[needed_cols].dropna()

# Keep only rows whose labels sum to more than zero, i.e. at least one label set.
has_any_label = df[label_cols].sum(axis=1) > 0
df = df[has_any_label]

In [None]:
# Column summary after restricting to the text/label columns and filtering rows.
df.info()

In [None]:
# Texts as a plain list, labels as a 2-D array with one column per label.
all_texts = df[text_col].tolist()
all_labels = df[label_cols].values

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    all_texts, all_labels, test_size=0.2, random_state=52
)


In [None]:
# Load the pretrained tokenizer for "bert-base-multilingual-cased".
# CommentDataset reads this module-level name when encoding each item.
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

class CommentDataset(Dataset):
    """Dataset that tokenizes one comment per item and pairs it with its labels.

    Tokenization is done lazily in ``__getitem__`` using the module-level
    ``tokenizer``.

    Args:
        texts: Indexable sequence of comment strings.
        labels: Indexable sequence of per-comment label vectors, aligned
            with ``texts``.
        max_length: Length every encoded sequence is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, max_length=128):
        # Fail fast on misaligned inputs instead of silently pairing a text
        # with the wrong label vector or hitting an IndexError mid-training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        """Return the number of comments in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded comment at ``idx`` plus its float ``labels`` tensor."""
        encoding = tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer output carries a leading batch axis of size 1. Drop
        # only that axis: a bare squeeze() would also collapse any other
        # size-1 axis (e.g. a sequence axis of length 1).
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        # Labels are stored as floats, one value per label column.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# Wrap each split in a CommentDataset; items are tokenized on access.
train_dataset = CommentDataset(texts=X_train, labels=y_train)
test_dataset = CommentDataset(texts=X_test, labels=y_test)


In [None]:
import os

# Switch off the Weights & Biases integration before the Trainer is built.
os.environ["WANDB_DISABLED"] = "true"

# Pretrained multilingual BERT with a classification head sized to the label
# list and configured for multi-label classification.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    problem_type="multi_label_classification",
    num_labels=len(label_cols),
)

# Training hyperparameters. No evaluation strategy is set here, so the
# eval_dataset passed to the Trainer is only used when evaluation is
# requested explicitly.
training_kwargs = dict(
    output_dir="./results",
    num_train_epochs=8,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
)
training_args = TrainingArguments(**training_kwargs)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()


In [None]:
# Run the fine-tuned model on the held-out split.
preds = trainer.predict(test_dataset)

# preds.predictions are raw logits, not probabilities. Thresholding logits
# at 0.5 would be a probability cutoff of ~0.62, so map them through a
# sigmoid first and cut at probability 0.5 (the same rule compute_metrics
# applies to these predictions).
test_probs = torch.sigmoid(torch.tensor(preds.predictions)).numpy()
y_pred = (test_probs >= 0.5).astype(int)

# Per-label precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred, target_names=label_cols))



```
                precision    recall  f1-score   support

  Нравится скорость отработки заявок       0.99      0.96      0.97       162
 Нравится качество выполнения заявки       0.81      0.78      0.80        55
Нравится качество работы сотрудников       0.73      0.51      0.60        43
                        Вопрос решен       0.96      0.97      0.97       250

                           micro avg       0.94      0.91      0.92       510
                           macro avg       0.87      0.81      0.83       510
                        weighted avg       0.94      0.91      0.92       510
                         samples avg       0.94      0.92      0.92       510

```



In [None]:
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score

def compute_metrics(eval_pred):
    """Score multi-label predictions given raw logits and reference labels.

    Args:
        eval_pred: Pair ``(logits, labels)``. The logits are passed through
            a sigmoid; the labels are forwarded unchanged to the scorers.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"`` (macro-averaged), both
        computed on predictions binarized at a 0.5 probability cutoff, and
        ``"roc_auc"`` (macro-averaged), computed on the probabilities.
    """
    raw_scores, targets = eval_pred

    # Map logits to per-label probabilities.
    probabilities = torch.tensor(raw_scores).sigmoid().numpy()

    # A probability at or above 0.5 counts as a positive prediction.
    hard_preds = (probabilities >= 0.5).astype(int)

    metrics = {}
    metrics["accuracy"] = accuracy_score(targets, hard_preds)
    metrics["f1"] = f1_score(targets, hard_preds, average="macro")
    metrics["roc_auc"] = roc_auc_score(targets, probabilities, average="macro")
    return metrics
# Score the test-split predictions against the reference labels.
test_metrics = compute_metrics((preds.predictions, y_test))
print(test_metrics)

{'accuracy': 0.8246153846153846, 'f1': 0.8460273006101483, 'roc_auc': np.float64(0.952922541104103)}
