In [None]:
import os
import time
from google.cloud import translate_v2 as translate
from tqdm import tqdm

BATCH_SIZE = 50
MAX_CHARS_PER_BATCH = 5000
SLEEP_BETWEEN_BATCHES = 1.5
TARGET_LANG = 'ky'

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/zinc-primer-459307-c4-e931e1449cd5.json"

translate_client = translate.Client()

def load_labeled_lines(path):
    lines = []
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            if ";" in line:
                text, emotion = line.strip().split(";", 1)
                lines.append((text.strip(), emotion.strip()))
    return lines

def translate_batch(batch_texts, target=TARGET_LANG):
    if not batch_texts:
        return []
    try:
        return translate_client.translate(batch_texts, target_language=target)
    except Exception as e:
        print("❌ Ошибка при переводе:", e)
        return [{"translatedText": ""} for _ in batch_texts]

def save_translations(translated_pairs, output_path):
    with open(output_path, 'w', encoding='utf-8') as f:
        for text, emotion in translated_pairs:
            f.write(f"{text};{emotion}\n")

def process_file(input_path, output_path):
    print(f"\n📄 Обработка файла: {os.path.basename(input_path)}")
    lines = load_labeled_lines(input_path)
    total = len(lines)
    translated_data = []
    batch = []
    batch_emotions = []
    current_chars = 0
    translated_count = 0

    for i, (text, emotion) in enumerate(lines):
        if current_chars + len(text) > MAX_CHARS_PER_BATCH or len(batch) >= BATCH_SIZE:
            results = translate_batch(batch)
            translated_data.extend([
                (result['translatedText'], emo) for result, emo in zip(results, batch_emotions)
            ])
            translated_count += len(batch)
            print(f"✅ Переведено {translated_count} из {total} строк.")
            time.sleep(SLEEP_BETWEEN_BATCHES)
            batch = []
            batch_emotions = []
            current_chars = 0

        batch.append(text)
        batch_emotions.append(emotion)
        current_chars += len(text)

    if batch:
        results = translate_batch(batch)
        translated_data.extend([
            (result['translatedText'], emo) for result, emo in zip(results, batch_emotions)
        ])
        translated_count += len(batch)
        print(f"✅ Переведено {translated_count} из {total} строк (завершено).")

    save_translations(translated_data, output_path)
    print(f"💾 Сохранено: {len(translated_data)} строк → {output_path}")

input_folder = "/content/archive"
output_folder = "translated_kyrgyz"
os.makedirs(output_folder, exist_ok=True)

for fname in ["train.txt", "val.txt", "test.txt"]:
    process_file(
        input_path=os.path.join(input_folder, fname),
        output_path=os.path.join(output_folder, fname)
    )



📄 Обработка файла: train.txt
✅ Переведено 50 из 16000 строк.
✅ Переведено 99 из 16000 строк.
✅ Переведено 140 из 16000 строк.
✅ Переведено 188 из 16000 строк.
✅ Переведено 237 из 16000 строк.
✅ Переведено 283 из 16000 строк.
✅ Переведено 333 из 16000 строк.
✅ Переведено 383 из 16000 строк.
✅ Переведено 433 из 16000 строк.
✅ Переведено 474 из 16000 строк.
✅ Переведено 524 из 16000 строк.
✅ Переведено 574 из 16000 строк.
✅ Переведено 624 из 16000 строк.
✅ Переведено 674 из 16000 строк.
✅ Переведено 724 из 16000 строк.
✅ Переведено 768 из 16000 строк.
✅ Переведено 815 из 16000 строк.
✅ Переведено 865 из 16000 строк.
✅ Переведено 915 из 16000 строк.
✅ Переведено 965 из 16000 строк.
✅ Переведено 1015 из 16000 строк.
✅ Переведено 1058 из 16000 строк.
✅ Переведено 1108 из 16000 строк.
✅ Переведено 1155 из 16000 строк.
✅ Переведено 1205 из 16000 строк.
✅ Переведено 1253 из 16000 строк.
✅ Переведено 1303 из 16000 строк.
✅ Переведено 1353 из 16000 строк.
✅ Переведено 1400 из 16000 строк.
✅ Пере

In [6]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
!pip install numpy==1.26.4 --quiet
import os
os._exit(0)


In [1]:
import os
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import evaluate
from sklearn.preprocessing import LabelEncoder
import pandas as pd

MODEL_NAME = "bert-base-multilingual-cased"
MAX_LENGTH = 128
BATCH_SIZE = 16
NUM_EPOCHS = 4
OUTPUT_DIR = "./kyrgyz-emotion-model"
VALID_LABELS = {'sadness', 'joy', 'fear', 'anger', 'love', 'surprise'}


def load_data(file_path, valid_labels):
    texts, labels = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            parts = line.strip().split(";", 1)
            if len(parts) != 2:
                print(f"⚠️ Пропущена некорректная строка {i + 1}: {line.strip()}")
                continue
            text, label = parts
            text, label = text.strip(), label.strip()
            if text:
                if label not in valid_labels:
                    label = "other"
                texts.append(text)
                labels.append(label)
    return pd.DataFrame({'text': texts, 'label': labels})

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=MAX_LENGTH)

train_df = load_data("train.txt", VALID_LABELS)
val_df = load_data("val.txt", VALID_LABELS)
test_df = load_data("test.txt", VALID_LABELS)

print("🔎 Уникальные метки в train.txt:", train_df['label'].unique())
print("🔎 Уникальные метки в val.txt:", val_df['label'].unique())
print("🔎 Уникальные метки в test.txt:", test_df['label'].unique())

label_encoder = LabelEncoder()
train_df["label"] = label_encoder.fit_transform(train_df["label"])
val_df["label"] = label_encoder.transform(val_df["label"])
test_df["label"] = label_encoder.transform(test_df["label"])

id2label = {i: label for i, label in enumerate(label_encoder.classes_)}
label2id = {label: i for i, label in id2label.items()}

train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
train_ds = train_ds.map(tokenize_function, batched=True)
val_ds = val_ds.map(tokenize_function, batched=True)
test_ds = test_ds.map(tokenize_function, batched=True)

train_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_encoder.classes_),
    id2label=id2label,
    label2id=label2id
)

accuracy = evaluate.load("accuracy")

def compute_metrics(p):
    preds = p.predictions.argmax(axis=1)
    return accuracy.compute(predictions=preds, references=p.label_ids)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

print("Обучение завершено. Сохраняем модель...")
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Тестирование:")
metrics = trainer.evaluate(test_ds)
print(metrics)


🔎 Уникальные метки в train.txt: ['sadness' 'anger' 'love' 'surprise' 'fear' 'joy' 'other']
🔎 Уникальные метки в val.txt: ['sadness' 'love' 'anger' 'joy' 'fear' 'other' 'surprise']
🔎 Уникальные метки в test.txt: ['sadness' 'joy' 'fear' 'anger' 'love' 'surprise' 'other']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mzhumataevtimurr[0m ([33mzhumataevtimurr-google[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0285,0.826791,0.7245
2,0.7309,0.714552,0.763
3,0.5937,0.681712,0.7725
4,0.4806,0.712057,0.7825


Обучение завершено. Сохраняем модель...
Тестирование:


{'eval_loss': 0.7296693921089172, 'eval_accuracy': 0.7545, 'eval_runtime': 14.9058, 'eval_samples_per_second': 134.176, 'eval_steps_per_second': 8.386, 'epoch': 4.0}


In [3]:
def predict(texts):
    if isinstance(texts, str):
        texts = [texts]

    inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        preds = torch.argmax(probs, dim=-1).cpu().numpy()
    labels = [id2label[p] for p in preds]
    return labels, probs.cpu().numpy()


In [6]:
texts = [
    "Бакыт каалайбыз,кыргыздын көп жигиттери даай албай турган арстан жүрөктүү кыз,күйөө бала азамат"
]

labels, probabilities = predict(texts)

for text, label, probs in zip(texts, labels, probabilities):
    print(f"📝 Текст: {text}")
    print(f"🔮 Предсказание: {label}")
    for i, prob in enumerate(probs):
        print(f"   {id2label[i]}: {prob:.4f}")
    print("-" * 50)

for text, label in zip(texts, labels):
    print(f"📝 Текст: {text}\n🔮 Предсказание: {label}\n")


📝 Текст: Бакыт каалайбыз,кыргыздын көп жигиттери даай албай турган арстан жүрөктүү кыз,күйөө бала азамат
🔮 Предсказание: joy
   anger: 0.0032
   fear: 0.0014
   joy: 0.9534
   love: 0.0362
   other: 0.0004
   sadness: 0.0040
   surprise: 0.0013
--------------------------------------------------
📝 Текст: Бакыт каалайбыз,кыргыздын көп жигиттери даай албай турган арстан жүрөктүү кыз,күйөө бала азамат
🔮 Предсказание: joy



In [9]:
train_df.head()


Unnamed: 0,text,label
0,өзүмдү кемсинткен сезген жокмун,5
1,Мен кам көргөн жана сергек адамдын жанында бол...,5
2,"Мен пост жазууга бир мүнөт убакыт бөлүп жатам,...",0
3,Мен каминге болгон ностальгиялык сезимде болом...,3
4,мен өзүмдү жаман сезип жатам,0
