In [None]:
import os
import time
from google.cloud import translate_v2 as translate
from tqdm import tqdm

BATCH_SIZE = 50
MAX_CHARS_PER_BATCH = 5000
SLEEP_BETWEEN_BATCHES = 1.5
TARGET_LANG = 'ky'

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/content/zinc-primer-459307-c4-e931e1449cd5.json"

translate_client = translate.Client()

def load_labeled_lines(path):
    lines = []
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            if ";" in line:
                text, emotion = line.strip().split(";", 1)
                lines.append((text.strip(), emotion.strip()))
    return lines

def translate_batch(batch_texts, target=TARGET_LANG):
    if not batch_texts:
        return []
    try:
        return translate_client.translate(batch_texts, target_language=target)
    except Exception as e:
        print("‚ùå –û—à–∏–±–∫–∞ –ø—Ä–∏ –ø–µ—Ä–µ–≤–æ–¥–µ:", e)
        return [{"translatedText": ""} for _ in batch_texts]

def save_translations(translated_pairs, output_path):
    with open(output_path, 'w', encoding='utf-8') as f:
        for text, emotion in translated_pairs:
            f.write(f"{text};{emotion}\n")

def process_file(input_path, output_path):
    print(f"\nüìÑ –û–±—Ä–∞–±–æ—Ç–∫–∞ —Ñ–∞–π–ª–∞: {os.path.basename(input_path)}")
    lines = load_labeled_lines(input_path)
    total = len(lines)
    translated_data = []
    batch = []
    batch_emotions = []
    current_chars = 0
    translated_count = 0

    for i, (text, emotion) in enumerate(lines):
        if current_chars + len(text) > MAX_CHARS_PER_BATCH or len(batch) >= BATCH_SIZE:
            results = translate_batch(batch)
            translated_data.extend([
                (result['translatedText'], emo) for result, emo in zip(results, batch_emotions)
            ])
            translated_count += len(batch)
            print(f"‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ {translated_count} –∏–∑ {total} —Å—Ç—Ä–æ–∫.")
            time.sleep(SLEEP_BETWEEN_BATCHES)
            batch = []
            batch_emotions = []
            current_chars = 0

        batch.append(text)
        batch_emotions.append(emotion)
        current_chars += len(text)

    if batch:
        results = translate_batch(batch)
        translated_data.extend([
            (result['translatedText'], emo) for result, emo in zip(results, batch_emotions)
        ])
        translated_count += len(batch)
        print(f"‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ {translated_count} –∏–∑ {total} —Å—Ç—Ä–æ–∫ (–∑–∞–≤–µ—Ä—à–µ–Ω–æ).")

    save_translations(translated_data, output_path)
    print(f"üíæ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ: {len(translated_data)} —Å—Ç—Ä–æ–∫ ‚Üí {output_path}")

input_folder = "/content/archive"
output_folder = "translated_kyrgyz"
os.makedirs(output_folder, exist_ok=True)

for fname in ["train.txt", "val.txt", "test.txt"]:
    process_file(
        input_path=os.path.join(input_folder, fname),
        output_path=os.path.join(output_folder, fname)
    )



üìÑ –û–±—Ä–∞–±–æ—Ç–∫–∞ —Ñ–∞–π–ª–∞: train.txt
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ 50 –∏–∑ 16000 —Å—Ç—Ä–æ–∫.
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ 99 –∏–∑ 16000 —Å—Ç—Ä–æ–∫.
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ 140 –∏–∑ 16000 —Å—Ç—Ä–æ–∫.
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ 188 –∏–∑ 16000 —Å—Ç—Ä–æ–∫.
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ 237 –∏–∑ 16000 —Å—Ç—Ä–æ–∫.
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ 283 –∏–∑ 16000 —Å—Ç—Ä–æ–∫.
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ 333 –∏–∑ 16000 —Å—Ç—Ä–æ–∫.
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ 383 –∏–∑ 16000 —Å—Ç—Ä–æ–∫.
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ 433 –∏–∑ 16000 —Å—Ç—Ä–æ–∫.
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ 474 –∏–∑ 16000 —Å—Ç—Ä–æ–∫.
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ 524 –∏–∑ 16000 —Å—Ç—Ä–æ–∫.
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ 574 –∏–∑ 16000 —Å—Ç—Ä–æ–∫.
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ 624 –∏–∑ 16000 —Å—Ç—Ä–æ–∫.
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ 674 –∏–∑ 16000 —Å—Ç—Ä–æ–∫.
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ 724 –∏–∑ 16000 —Å—Ç—Ä–æ–∫.
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ 768 –∏–∑ 16000 —Å—Ç—Ä–æ–∫.
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ 815 –∏–∑ 16000 —Å—Ç—Ä–æ–∫.
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–µ–Ω–æ 865 –∏–∑ 16000 —Å—Ç—Ä–æ–∫.
‚úÖ –ü–µ—Ä–µ–≤–µ–¥–

In [6]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.0/84.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
!pip install numpy==1.26.4 --quiet
import os
os._exit(0)


In [1]:
import os
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import evaluate
from sklearn.preprocessing import LabelEncoder
import pandas as pd

MODEL_NAME = "bert-base-multilingual-cased"
MAX_LENGTH = 128
BATCH_SIZE = 16
NUM_EPOCHS = 4
OUTPUT_DIR = "./kyrgyz-emotion-model"
VALID_LABELS = {'sadness', 'joy', 'fear', 'anger', 'love', 'surprise'}


def load_data(file_path, valid_labels):
    texts, labels = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            parts = line.strip().split(";", 1)
            if len(parts) != 2:
                print(f"‚ö†Ô∏è –ü—Ä–æ–ø—É—â–µ–Ω–∞ –Ω–µ–∫–æ—Ä—Ä–µ–∫—Ç–Ω–∞—è —Å—Ç—Ä–æ–∫–∞ {i + 1}: {line.strip()}")
                continue
            text, label = parts
            text, label = text.strip(), label.strip()
            if text:
                if label not in valid_labels:
                    label = "other"
                texts.append(text)
                labels.append(label)
    return pd.DataFrame({'text': texts, 'label': labels})

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=MAX_LENGTH)

train_df = load_data("train.txt", VALID_LABELS)
val_df = load_data("val.txt", VALID_LABELS)
test_df = load_data("test.txt", VALID_LABELS)

print("üîé –£–Ω–∏–∫–∞–ª—å–Ω—ã–µ –º–µ—Ç–∫–∏ –≤ train.txt:", train_df['label'].unique())
print("üîé –£–Ω–∏–∫–∞–ª—å–Ω—ã–µ –º–µ—Ç–∫–∏ –≤ val.txt:", val_df['label'].unique())
print("üîé –£–Ω–∏–∫–∞–ª—å–Ω—ã–µ –º–µ—Ç–∫–∏ –≤ test.txt:", test_df['label'].unique())

label_encoder = LabelEncoder()
train_df["label"] = label_encoder.fit_transform(train_df["label"])
val_df["label"] = label_encoder.transform(val_df["label"])
test_df["label"] = label_encoder.transform(test_df["label"])

id2label = {i: label for i, label in enumerate(label_encoder.classes_)}
label2id = {label: i for i, label in id2label.items()}

train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
train_ds = train_ds.map(tokenize_function, batched=True)
val_ds = val_ds.map(tokenize_function, batched=True)
test_ds = test_ds.map(tokenize_function, batched=True)

train_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(label_encoder.classes_),
    id2label=id2label,
    label2id=label2id
)

accuracy = evaluate.load("accuracy")

def compute_metrics(p):
    preds = p.predictions.argmax(axis=1)
    return accuracy.compute(predictions=preds, references=p.label_ids)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

print("–û–±—É—á–µ–Ω–∏–µ –∑–∞–≤–µ—Ä—à–µ–Ω–æ. –°–æ—Ö—Ä–∞–Ω—è–µ–º –º–æ–¥–µ–ª—å...")
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("–¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ:")
metrics = trainer.evaluate(test_ds)
print(metrics)


üîé –£–Ω–∏–∫–∞–ª—å–Ω—ã–µ –º–µ—Ç–∫–∏ –≤ train.txt: ['sadness' 'anger' 'love' 'surprise' 'fear' 'joy' 'other']
üîé –£–Ω–∏–∫–∞–ª—å–Ω—ã–µ –º–µ—Ç–∫–∏ –≤ val.txt: ['sadness' 'love' 'anger' 'joy' 'fear' 'other' 'surprise']
üîé –£–Ω–∏–∫–∞–ª—å–Ω—ã–µ –º–µ—Ç–∫–∏ –≤ test.txt: ['sadness' 'joy' 'fear' 'anger' 'love' 'surprise' 'other']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mzhumataevtimurr[0m ([33mzhumataevtimurr-google[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,1.0285,0.826791,0.7245
2,0.7309,0.714552,0.763
3,0.5937,0.681712,0.7725
4,0.4806,0.712057,0.7825


–û–±—É—á–µ–Ω–∏–µ –∑–∞–≤–µ—Ä—à–µ–Ω–æ. –°–æ—Ö—Ä–∞–Ω—è–µ–º –º–æ–¥–µ–ª—å...
–¢–µ—Å—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ:


{'eval_loss': 0.7296693921089172, 'eval_accuracy': 0.7545, 'eval_runtime': 14.9058, 'eval_samples_per_second': 134.176, 'eval_steps_per_second': 8.386, 'epoch': 4.0}


In [3]:
def predict(texts):
    if isinstance(texts, str):
        texts = [texts]

    inputs = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        preds = torch.argmax(probs, dim=-1).cpu().numpy()
    labels = [id2label[p] for p in preds]
    return labels, probs.cpu().numpy()


In [6]:
texts = [
    "–ë–∞–∫—ã—Ç –∫–∞–∞–ª–∞–π–±—ã–∑,–∫—ã—Ä–≥—ã–∑–¥—ã–Ω –∫”©–ø –∂–∏–≥–∏—Ç—Ç–µ—Ä–∏ –¥–∞–∞–π –∞–ª–±–∞–π —Ç—É—Ä–≥–∞–Ω –∞—Ä—Å—Ç–∞–Ω –∂“Ø—Ä”©–∫—Ç“Ø“Ø –∫—ã–∑,–∫“Ø–π”©”© –±–∞–ª–∞ –∞–∑–∞–º–∞—Ç"
]

labels, probabilities = predict(texts)

for text, label, probs in zip(texts, labels, probabilities):
    print(f"üìù –¢–µ–∫—Å—Ç: {text}")
    print(f"üîÆ –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ: {label}")
    for i, prob in enumerate(probs):
        print(f"   {id2label[i]}: {prob:.4f}")
    print("-" * 50)

for text, label in zip(texts, labels):
    print(f"üìù –¢–µ–∫—Å—Ç: {text}\nüîÆ –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ: {label}\n")


üìù –¢–µ–∫—Å—Ç: –ë–∞–∫—ã—Ç –∫–∞–∞–ª–∞–π–±—ã–∑,–∫—ã—Ä–≥—ã–∑–¥—ã–Ω –∫”©–ø –∂–∏–≥–∏—Ç—Ç–µ—Ä–∏ –¥–∞–∞–π –∞–ª–±–∞–π —Ç—É—Ä–≥–∞–Ω –∞—Ä—Å—Ç–∞–Ω –∂“Ø—Ä”©–∫—Ç“Ø“Ø –∫—ã–∑,–∫“Ø–π”©”© –±–∞–ª–∞ –∞–∑–∞–º–∞—Ç
üîÆ –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ: joy
   anger: 0.0032
   fear: 0.0014
   joy: 0.9534
   love: 0.0362
   other: 0.0004
   sadness: 0.0040
   surprise: 0.0013
--------------------------------------------------
üìù –¢–µ–∫—Å—Ç: –ë–∞–∫—ã—Ç –∫–∞–∞–ª–∞–π–±—ã–∑,–∫—ã—Ä–≥—ã–∑–¥—ã–Ω –∫”©–ø –∂–∏–≥–∏—Ç—Ç–µ—Ä–∏ –¥–∞–∞–π –∞–ª–±–∞–π —Ç—É—Ä–≥–∞–Ω –∞—Ä—Å—Ç–∞–Ω –∂“Ø—Ä”©–∫—Ç“Ø“Ø –∫—ã–∑,–∫“Ø–π”©”© –±–∞–ª–∞ –∞–∑–∞–º–∞—Ç
üîÆ –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ: joy



In [9]:
train_df.head()


Unnamed: 0,text,label
0,”©–∑“Ø–º–¥“Ø –∫–µ–º—Å–∏–Ω—Ç–∫–µ–Ω —Å–µ–∑–≥–µ–Ω –∂–æ–∫–º—É–Ω,5
1,–ú–µ–Ω –∫–∞–º –∫”©—Ä–≥”©–Ω –∂–∞–Ω–∞ —Å–µ—Ä–≥–µ–∫ –∞–¥–∞–º–¥—ã–Ω –∂–∞–Ω—ã–Ω–¥–∞ –±–æ–ª...,5
2,"–ú–µ–Ω –ø–æ—Å—Ç –∂–∞–∑—É—É–≥–∞ –±–∏—Ä –º“Ø–Ω”©—Ç —É–±–∞–∫—ã—Ç –±”©–ª“Ø–ø –∂–∞—Ç–∞–º,...",0
3,–ú–µ–Ω –∫–∞–º–∏–Ω–≥–µ –±–æ–ª–≥–æ–Ω –Ω–æ—Å—Ç–∞–ª—å–≥–∏—è–ª—ã–∫ —Å–µ–∑–∏–º–¥–µ –±–æ–ª–æ–º...,3
4,–º–µ–Ω ”©–∑“Ø–º–¥“Ø –∂–∞–º–∞–Ω —Å–µ–∑–∏–ø –∂–∞—Ç–∞–º,0
