In [None]:
!pip install transformers==4.51.3



In [3]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.preprocessing import LabelEncoder
import torch

print("TrainingArguments from:", TrainingArguments.__module__)

import transformers
print("Transformers version:", transformers.__version__)

df = pd.read_csv("/content/combined_dataset.csv")  # text, emotion

# 2. Кодируем метки
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["Emotion"])
num_labels = len(label_encoder.classes_)

# 3. Преобразуем в HuggingFace Dataset
dataset = Dataset.from_pandas(df[["text", "label"]])

# 4. Загружаем токенизатор и модель
model_name = "xlm-roberta-base"  # Поддерживает кыргызский
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# 5. Токенизация
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=256)

dataset = dataset.map(tokenize, batched=True)
dataset = dataset.train_test_split(test_size=0.1)  # 90% train / 10% test

print(TrainingArguments.__module__)
# 6. Параметры обучения
training_args = TrainingArguments(
    output_dir="./emotion-model-ky",
    eval_strategy="epoch",  # Изменено evaluation_strategy на eval_strategy
    save_strategy="epoch",
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    save_total_limit=2,
)

# 7. Тренер
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer
)

# 8. Обучение
trainer.train()

# 9. Сохраняем модель и токенизатор
model_path = "./emotion-model-ky"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

# 10. Сохраняем лейбл энкодер для последующего использования
import pickle
with open(f"{model_path}/label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

print(f"Модель сохранена в: {model_path}")

TrainingArguments from: transformers.training_args
Transformers version: 4.51.3


FileNotFoundError: [Errno 2] No such file or directory: '/content/combined_dataset.csv'

In [4]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.preprocessing import LabelEncoder
import torch
import pickle

# 1. Загружаем сохранённую модель и токенизатор
model_path = "./emotion-model-ky"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()  # В режим инференса

# Загрузка LabelEncoder
with open(f"{model_path}/label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)

# 2. Загружаем новый датасет
df_new = pd.read_csv("/content/combined_dataset.csv")  # Новый датасет с текстами и эмоциями


# 3. Кодируем метки в новом датасете
df_new["label"] = label_encoder.transform(df_new["Emotion"])

# Преобразуем в HuggingFace Dataset
new_dataset = Dataset.from_pandas(df_new[["text", "label"]])

# 4. Токенизация
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=256)

new_dataset = new_dataset.map(tokenize, batched=True)

# 5. Разделение на train/test
new_dataset = new_dataset.train_test_split(test_size=0.1)

# 6. Параметры для дообучения
training_args = TrainingArguments(
    output_dir="./emotion-model-ky",  # Выходная директория, где сохраняется модель
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    num_train_epochs=10,  # Количество эпох (можно изменить)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    save_total_limit=2,
)

# 7. Trainer для дообучения
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=new_dataset["train"],
    eval_dataset=new_dataset["test"],
    tokenizer=tokenizer
)

# 8. Дообучение
trainer.train()

# 9. Сохраняем дообученную модель
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

# 10. Сохраняем LabelEncoder
with open(f"{model_path}/label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

print(f"Модель успешно дообучена и сохранена в: {model_path}")

HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './emotion-model-ky'.

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pickle

# Путь к сохранённой модели
model_path = "./emotion-model-ky"

# Загрузка модели и токенизатора
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()  # В режим инференса

# Загрузка LabelEncoder
with open(f"{model_path}/label_encoder.pkl", "rb") as f:
    label_encoder = pickle.load(f)

# Функция для предсказания эмоции
def predict_emotion(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = torch.argmax(logits, dim=1).item()
        emotion_label = label_encoder.inverse_transform([predicted_class_id])[0]
    return emotion_label

# 🔍 Пример проверки
example_text = "Ыйлагым келет"
predicted_emotion = predict_emotion(example_text)
print(f"Кыргызский текст: {example_text}")
print(f"Предсказанная эмоция: {predicted_emotion}")

HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './emotion-model-ky'.