In [None]:
import os
import gc
import pickle
import time
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    get_linear_schedule_with_warmup
)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score, accuracy_score

class Config:
    """Static settings for fine-tuning the multi-label text classifier."""

    # --- Paths ---
    DATA_PATH = "posts.parquet"
    MODEL_SAVE_PATH = "./model"
    TOKENIZER_SAVE_PATH = "./model"
    LABEL_ENCODER_PATH = "./model/mlb.pkl"

    # --- Model and optimisation hyper-parameters ---
    MODEL_NAME = "cointegrated/rubert-tiny2"
    MAX_LENGTH = 256
    BATCH_SIZE = 16
    EPOCHS = 3
    LEARNING_RATE = 2e-5
    WARMUP_STEPS = 100

    # --- Runtime ---
    SEED = 42
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to one worker instead of letting min() raise TypeError.
    NUM_WORKERS = min(4, os.cpu_count() or 1)
    # Mixed precision is enabled only when a CUDA device is available.
    FP16 = torch.cuda.is_available()

# Seed PyTorch's global RNG from the configured seed.
torch.manual_seed(Config.SEED)

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def load_and_preprocess_data():
    """Load the dataset, binarize its labels and split it into train/validation.

    Reads the parquet file at ``Config.DATA_PATH``, uses the ``Text`` column
    as model input and the ``categoryname`` column as the label source. A
    ``MultiLabelBinarizer`` is fitted on the labels and pickled to
    ``Config.LABEL_ENCODER_PATH`` (the directory ``Config.MODEL_SAVE_PATH``
    is created if missing).

    Returns:
        tuple: ``(train_texts, val_texts, train_labels, val_labels, mlb)``
        where the texts are lists of ``str``, the labels are slices of the
        binarized label matrix, and ``mlb`` is the fitted binarizer.
    """
    print("🔄 Загрузка данных...")
    df = pd.read_parquet(Config.DATA_PATH)

    df["text"] = df["Text"].fillna("").astype(str)

    def to_label_list(value):
        # A single category name becomes a one-element label list.
        if isinstance(value, str):
            return [value]
        # List-valued parquet cells may be materialized as numpy arrays (or
        # tuples) rather than Python lists; accept those too so their rows
        # are not silently left without labels.
        if isinstance(value, (list, tuple, set, frozenset, np.ndarray)):
            return list(value)
        # Missing / unsupported values (None, NaN, ...) mean "no labels".
        return []

    df["labels"] = df["categoryname"].apply(to_label_list)

    mlb = MultiLabelBinarizer()
    y = mlb.fit_transform(df["labels"])

    # Persist the fitted binarizer next to the model artifacts.
    os.makedirs(Config.MODEL_SAVE_PATH, exist_ok=True)
    with open(Config.LABEL_ENCODER_PATH, "wb") as f:
        pickle.dump(mlb, f)

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df["text"].tolist(),
        y,
        test_size=0.2,
        random_state=Config.SEED
    )

    # The DataFrame is no longer needed once the split lists exist.
    del df
    gc.collect()

    return train_texts, val_texts, train_labels, val_labels, mlb

class TextDataset(Dataset):
    """Map-style dataset that tokenizes a single text on each item access.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of per-text label vectors.
        tokenizer: Callable used to encode a text; it is invoked with
            ``max_length``, ``padding``, ``truncation`` and ``return_tensors``
            keyword arguments.
    """

    def __init__(self, texts, labels, tokenizer):
        self.texts, self.labels, self.tokenizer = texts, labels, tokenizer

    def __len__(self):
        # One sample per text.
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding="max_length",
            max_length=Config.MAX_LENGTH,
            return_tensors="pt",
        )

        # The tokenizer output carries a leading dimension of size 1 for the
        # single text; drop it so each item is a flat sequence.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.FloatTensor(self.labels[idx])
        return sample

def initialize_model(num_labels):
    """Create the sequence classifier and place it on the selected device.

    Args:
        num_labels: Number of output labels for the classification head.

    Returns:
        The model loaded from ``Config.MODEL_NAME`` and moved to ``device``;
        it is wrapped in ``torch.nn.DataParallel`` when more than one CUDA
        device is visible.
    """
    print("🔄 Инициализация модели...")
    classifier = BertForSequenceClassification.from_pretrained(
        Config.MODEL_NAME,
        problem_type="multi_label_classification",
        num_labels=num_labels,
    )

    gpu_count = torch.cuda.device_count()
    if gpu_count > 1:
        print(f"🚀 Используем {gpu_count} GPU!")
        classifier = torch.nn.DataParallel(classifier)

    classifier.to(device)
    return classifier

def _make_loader(dataset, shuffle):
    """Build a DataLoader using the batch size and worker count from Config."""
    return DataLoader(
        dataset,
        batch_size=Config.BATCH_SIZE,
        shuffle=shuffle,
        num_workers=Config.NUM_WORKERS,
        # Pinned host memory only benefits host-to-GPU copies; skip it on CPU.
        pin_memory=device.type == "cuda",
    )


def _forward(model, batch):
    """Run the model on one batch and return ``(scalar_loss, logits)``."""
    outputs = model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
        labels=batch["labels"].to(device),
    )
    # When the model is wrapped in DataParallel the gathered loss holds one
    # value per replica; backward() and .item() need a scalar, so average it.
    # For an unwrapped model the loss is already scalar and mean() is a no-op.
    return outputs.loss.mean(), outputs.logits


def _train_one_epoch(model, loader, optimizer, scheduler, scaler, epoch):
    """Train for one pass over ``loader`` and return the mean batch loss."""
    model.train()
    running_loss = 0.0
    progress_bar = tqdm(loader, desc=f"Epoch {epoch+1}/{Config.EPOCHS}")

    for batch in progress_bar:
        optimizer.zero_grad()

        with torch.cuda.amp.autocast(enabled=Config.FP16):
            loss, _ = _forward(model, batch)

        # The scaler is a pass-through when FP16 is disabled.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        progress_bar.set_postfix({"loss": batch_loss})

    # Guard against an empty loader so reporting never divides by zero.
    return running_loss / max(len(loader), 1)


def _evaluate(model, loader):
    """Evaluate on ``loader`` and return ``(mean_loss, micro_f1, accuracy)``."""
    model.eval()
    total_loss = 0.0
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validation"):
            loss, logits = _forward(model, batch)
            total_loss += loss.item()

            # Independent per-label decision at a 0.5 probability threshold.
            preds = torch.sigmoid(logits) > 0.5
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(batch["labels"].cpu().numpy())

    val_f1 = f1_score(true_labels, predictions, average="micro")
    val_acc = accuracy_score(true_labels, predictions)
    return total_loss / max(len(loader), 1), val_f1, val_acc


def train_model():
    """Fine-tune the classifier end to end and save the resulting artifacts.

    Loads and splits the data, builds the tokenizer, datasets and loaders,
    trains for ``Config.EPOCHS`` epochs with AdamW and a linear warmup
    schedule, prints train/validation metrics after every epoch, and finally
    saves the model to ``Config.MODEL_SAVE_PATH`` and the tokenizer to
    ``Config.TOKENIZER_SAVE_PATH``.
    """
    train_texts, val_texts, train_labels, val_labels, mlb = load_and_preprocess_data()

    tokenizer = BertTokenizer.from_pretrained(Config.MODEL_NAME)

    train_loader = _make_loader(
        TextDataset(train_texts, train_labels, tokenizer), shuffle=True
    )
    val_loader = _make_loader(
        TextDataset(val_texts, val_labels, tokenizer), shuffle=False
    )

    model = initialize_model(len(mlb.classes_))
    optimizer = AdamW(model.parameters(), lr=Config.LEARNING_RATE)

    # The scheduler is stepped once per batch, so it spans every batch of
    # every epoch.
    total_steps = len(train_loader) * Config.EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=Config.WARMUP_STEPS,
        num_training_steps=total_steps
    )

    scaler = torch.cuda.amp.GradScaler(enabled=Config.FP16)

    print("🚀 Начало обучения...")
    for epoch in range(Config.EPOCHS):
        train_loss = _train_one_epoch(
            model, train_loader, optimizer, scheduler, scaler, epoch
        )
        val_loss, val_f1, val_acc = _evaluate(model, val_loader)

        print(f"\nEpoch {epoch+1} | "
              f"Train Loss: {train_loss:.4f} | "
              f"Val Loss: {val_loss:.4f} | "
              f"Val F1: {val_f1:.4f} | Val Acc: {val_acc:.4f}")

    print("💾 Сохранение модели...")
    # A DataParallel wrapper keeps the real model in `.module`; only the
    # underlying model provides save_pretrained().
    getattr(model, "module", model).save_pretrained(Config.MODEL_SAVE_PATH)
    tokenizer.save_pretrained(Config.TOKENIZER_SAVE_PATH)

    print("✅ Обучение завершено!")

# Start training only when this code is executed directly, not on import.
if __name__ == "__main__":
    train_model()

🔄 Загрузка данных...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.74M [00:00<?, ?B/s]

🔄 Инициализация модели...


config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🚀 Начало обучения...


  scaler = torch.cuda.amp.GradScaler(enabled=Config.FP16)


Epoch 1/3:   0%|          | 0/3319 [00:00<?, ?it/s]

  with torch.cuda.amp.autocast(enabled=Config.FP16):


In [None]:
from fastapi import FastAPI
from pydantic import BaseModel
from typing import List
from transformers import BertTokenizer, BertForSequenceClassification
from ultralytics import YOLO
import torch
import pickle
import requests
from PIL import Image
from io import BytesIO
import uvicorn

# ASGI application object; the /predict route below is registered on it.
app = FastAPI(title="Strikball ML API")

# --- Text classifier: weights, tokenizer and fitted label binarizer ---
print("load...")
text_model = BertForSequenceClassification.from_pretrained(
    "rubert_category_classifier"
)
# Inference only: switch the classifier to evaluation mode.
text_model.eval()
text_tokenizer = BertTokenizer.from_pretrained("rubert_category_classifier")

# mlb.classes_ is used by predict() to turn class indices into label names.
with open("mlb.pkl", "rb") as label_file:
    mlb = pickle.load(label_file)

# --- Image detector ---
print("load...")
image_model = YOLO("best.pt")


# Request sub-model describing one photo attached to a post.
class Photo(BaseModel):
    photo_id: str  # copied into "photo_ids" of predictions made from this photo
    url: str  # location the image is downloaded from inside predict()


# Request body accepted by the /predict endpoint.
class PredictionRequest(BaseModel):
    post_id: str  # echoed back unchanged in the response
    text: str  # post text passed to the text classifier
    photos: List[Photo]  # photos passed one by one to the image model


@app.post("/predict")
async def predict(request: PredictionRequest):
    """Classify a post from its text and its photos.

    The text is scored by the text classifier and its (up to) two most
    probable labels are reported. Every photo is then downloaded and run
    through the image model; each detected box becomes one more prediction.
    A photo that cannot be downloaded or processed is logged and skipped.

    Args:
        request: Post id, post text and the list of attached photos.

    Returns:
        dict: ``{"post_id": ..., "predictions": [...]}`` where each
        prediction has ``object_id``, ``categoryname``, ``subcategory``,
        ``confidence`` and ``photo_ids``.
    """
    # NOTE(review): the model inference and requests.get() below are blocking
    # calls inside an async handler - confirm this is acceptable for the
    # expected load.
    predictions = []

    # NOTE(review): the training cell in this notebook tokenizes with
    # max_length=256; confirm that 128 is intended here.
    inputs = text_tokenizer(
        request.text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=128
    )
    with torch.no_grad():
        outputs = text_model(**inputs)
        # Drop only the batch dimension. A bare squeeze() would also collapse
        # the class dimension for a single-label model, leaving a 0-d array
        # that cannot be indexed below.
        probs = torch.sigmoid(outputs.logits).squeeze(0).numpy()

    # Indices of the two highest probabilities, most probable first.
    top_indices = probs.argsort()[-2:][::-1]
    text_preds = [(mlb.classes_[i], probs[i]) for i in top_indices]

    for i, (label, conf) in enumerate(text_preds, 1):
        predictions.append({
            "object_id": str(i),
            "categoryname": "Снаряжение и защита",
            "subcategory": label,
            "confidence": round(float(conf), 4),
            "photo_ids": []
        })

    # Seconds to wait for a photo host before giving up on that photo.
    download_timeout = 10

    # Image object ids continue the numbering after the text predictions.
    image_id = len(predictions) + 1
    for photo in request.photos:
        try:
            # NOTE(review): photo.url comes straight from the request body;
            # consider restricting which hosts may be fetched.
            # Without a timeout an unresponsive host would block this
            # request indefinitely.
            response = requests.get(photo.url, timeout=download_timeout)
            # Surface HTTP errors (404, 500, ...) as such instead of trying
            # to decode an error page as an image.
            response.raise_for_status()
            img = Image.open(BytesIO(response.content)).convert("RGB")
            results = image_model.predict(img, conf=0.4)

            for res in results:
                for box in res.boxes:
                    cls_id = int(box.cls.item())
                    conf = float(box.conf.item())
                    label = image_model.names[cls_id]

                    predictions.append({
                        "object_id": str(image_id),
                        "categoryname": "Аксессуары и запчасти",
                        "subcategory": label,
                        "confidence": round(conf, 4),
                        "photo_ids": [photo.photo_id]
                    })
                    image_id += 1
        except Exception as e:
            # Best effort: one bad photo must not fail the whole request.
            print(f"Error {photo.photo_id}: {e}")

    return {
        "post_id": request.post_id,
        "predictions": predictions
    }

if __name__ == "__main__":
    # uvicorn.run() takes the ASGI application (or a "module:attribute"
    # import string), not a URL. Serve the app locally; the interactive
    # docs are then available at http://127.0.0.1:8000/docs.
    uvicorn.run(app, host="127.0.0.1", port=8000)
