In [None]:
!unzip /content/subcategory_images.zip

Archive:  /content/subcategory_images.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of /content/subcategory_images.zip or
        /content/subcategory_images.zip.zip, and cannot find /content/subcategory_images.zip.ZIP, period.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

# Загрузка данных
posts = pd.read_parquet("/content/posts.parquet")
photos = pd.read_parquet("/content/photos.parquet")

# Переименование столбцов
posts = posts.rename(columns={'Id': 'post_id', 'Text': 'text', 'categoryname': 'category'})
photos = photos.rename(columns={'PostId': 'post_id'})

# Объединение данных
df = posts.merge(photos, on='post_id', how='left')

# Подготовка данных
df = df[['text', 'category']].dropna().drop_duplicates()

# Кодирование меток
le = LabelEncoder()
df['label'] = le.fit_transform(df['category'])

# Разделение на train/test
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

# TF-IDF векторизация
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Обучение модели
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_tfidf, y_train)

# Предсказания и оценка
y_pred_lr = lr.predict(X_test_tfidf)
print("🔹 TF-IDF + Logistic Regression:")
print(classification_report(y_test, y_pred_lr, target_names=le.classes_))

# Сохранение всех компонентов модели
joblib.dump(lr, 'logistic_regression_model.joblib')
joblib.dump(tfidf, 'tfidf_vectorizer.joblib')
joblib.dump(le, 'label_encoder.joblib')

print("Модель, векторизатор и кодировщик успешно сохранены!")

🔹 TF-IDF + Logistic Regression:
                       precision    recall  f1-score   support

Аксессуары и Запчасти       0.86      0.84      0.85      3031
  Снаряжение и защита       0.92      0.93      0.93      4057
 Страйкбольное оружие       0.94      0.94      0.94      3456

             accuracy                           0.91     10544
            macro avg       0.90      0.90      0.90     10544
         weighted avg       0.91      0.91      0.91     10544

Модель, векторизатор и кодировщик успешно сохранены!


In [None]:
import pandas as pd

posts = pd.read_parquet("posts.parquet")
photos = pd.read_parquet("photos.parquet")

print(posts.head())
print(photos.head())


    Id  CategoryId         categoryname  \
0  354           3  Снаряжение и защита   
1  356           3  Снаряжение и защита   
2  358           3  Снаряжение и защита   
3  360           3  Снаряжение и защита   
4  362           3  Снаряжение и защита   

                                                Text  
0  Шлем оп скор , ковер и 2 пары очков \n6000\nМо...  
1  продам кавер в мультике на 6б47 от стич профи\...  
2  продам пакеты по бр2 от Балистики на Панцирь 2...  
3  1. Крепления для активных наушников Walker's R...  
4  Чехол от бронижилета форт шерп.\nВ самом чехле...  
     Id                                                Url  DataSource  PostId
0  1230  https://sun9-30.userapi.com/impg/pJt9CEqVTt8k9...           0    1021
1  1231  https://sun9-30.userapi.com/impg/QZmLedKNvDMLR...           0    1022
2  1232  https://sun9-30.userapi.com/impg/Pf_Rh2rMYEnhP...           0    1022
3  1362  https://sun9-30.userapi.com/impg/MbdvAXG8hkyYV...           0    1117
4  1471  https:/

In [None]:
# Переименуем столбцы для удобства
posts = posts.rename(columns={'Id': 'post_id', 'Text': 'text', 'categoryname': 'category'})
photos = photos.rename(columns={'PostId': 'post_id'})

# Объединяем фото с постами
df = posts.merge(photos, on='post_id', how='left')

# Проверим результат
df[['post_id', 'text', 'category', 'Url']].head()


Unnamed: 0,post_id,text,category,Url
0,354,"Шлем оп скор , ковер и 2 пары очков \n6000\nМо...",Снаряжение и защита,https://sun9-30.userapi.com/impg/si126vdk6okug...
1,356,продам кавер в мультике на 6б47 от стич профи\...,Снаряжение и защита,https://sun9-30.userapi.com/impg/qMWfK8KdfwZAT...
2,358,продам пакеты по бр2 от Балистики на Панцирь 2...,Снаряжение и защита,https://sun9-30.userapi.com/impg/bHwIinw3QyuZz...
3,360,1. Крепления для активных наушников Walker's R...,Снаряжение и защита,https://sun9-30.userapi.com/impg/Qr8bFo3r9FpXs...
4,362,Чехол от бронижилета форт шерп.\nВ самом чехле...,Снаряжение и защита,https://sun9-30.userapi.com/impg/qwPFQe_8AurbQ...


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Убираем строки без текста или категории
df = df[['text', 'category']].dropna().drop_duplicates()

# Кодируем метки
le = LabelEncoder()
df['label'] = le.fit_transform(df['category'])

# Трен/тест
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# TF-IDF векторизация
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Логистическая регрессия
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_tfidf, y_train)

# Предсказания
y_pred_lr = lr.predict(X_test_tfidf)

# Оценка
print("🔹 TF-IDF + Logistic Regression:")
print(classification_report(y_test, y_pred_lr, target_names=le.classes_))


🔹 TF-IDF + Logistic Regression:
                       precision    recall  f1-score   support

Аксессуары и Запчасти       0.86      0.84      0.85      3031
  Снаряжение и защита       0.92      0.93      0.93      4057
 Страйкбольное оружие       0.94      0.94      0.94      3456

             accuracy                           0.91     10544
            macro avg       0.90      0.90      0.90     10544
         weighted avg       0.91      0.91      0.91     10544



In [None]:
from sklearn.preprocessing import LabelEncoder
import joblib



# Сохраняем модель
joblib.dump(lr, 'logistic_regression_model.joblib')

# Сохраняем TF-IDF векторизатор
joblib.dump(tfidf, 'tfidf_vectorizer.joblib')

# Сохраняем LabelEncoder
joblib.dump(label_encoder, 'label_encoder.joblib')

FileNotFoundError: [Errno 2] No such file or directory: 'app/models/tfidf_vectorizer.joblib'

In [None]:
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# TF-IDF векторизация
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Логистическая регрессия
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_tfidf, y_train_enc)

# Предсказания
y_pred_enc = lr.predict(X_test_tfidf)

# Оценка
print("🔹 TF-IDF + Logistic Regression:")


# Сохраняем модели и encoder
joblib.dump(tfidf, 'app/models/tfidf_vectorizer.joblib')
joblib.dump(lr, 'app/models/logistic_regression_model.joblib')
joblib.dump(le, 'app/models/label_encoder.joblib')


🔹 TF-IDF + Logistic Regression:


FileNotFoundError: [Errno 2] No such file or directory: 'app/models/tfidf_vectorizer.joblib'

In [None]:
!pip install transformers torch sentencepiece pandas scikit-learn nltk



In [None]:
import re
import pandas as pd
import torch
import numpy as np
from nltk.corpus import stopwords
import nltk
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW  # Импорт из torch.optim вместо transformers
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# 1. Установка зависимостей (выполнить в терминале перед запуском)
# pip install transformers torch sentencepiece pandas scikit-learn nltk

# 2. Загрузка стоп-слов
nltk.download('stopwords')

# 3. Функция предобработки текста
def preprocess_text(text):
    """Очистка текста от мусора и стоп-слов"""
    if not isinstance(text, str):
        return ""

    # Приводим к нижнему регистру
    text = text.lower()
    # Удаляем знаки пунктуации и спецсимволы
    text = re.sub(r'[^\w\s]', ' ', text)
    # Удаляем стоп-слова
    stop_words = set(stopwords.words('russian'))
    words = text.split()
    words = [word for word in words if word not in stop_words and len(word) > 2]
    return ' '.join(words)

# 4. Загрузка и подготовка данных
posts = pd.read_parquet("posts.parquet")
photos = pd.read_parquet("photos.parquet")
print("Загрузка данных...")
print(f"Загружено {len(posts)} записей")

# Применяем очистку текста
print("Предобработка текстов...")
posts['cleaned_text'] = posts['Text'].apply(preprocess_text)

# Кодируем категории
label_encoder = LabelEncoder()
posts['label'] = label_encoder.fit_transform(posts['categoryname'])
num_classes = len(label_encoder.classes_)

# 5. Создание датасета для PyTorch
class StrikeBallDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# 6. Разделение данных
X_train, X_test, y_train, y_test = train_test_split(
    posts['cleaned_text'],
    posts['label'],
    test_size=0.2,
    random_state=42,
    stratify=posts['label']
)

# 7. Инициализация BERT
model_name = 'cointegrated/rubert-tiny'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes
)

# 8. Параметры обучения
BATCH_SIZE = 16
MAX_LEN = 128
EPOCHS = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 9. Создание DataLoader
train_dataset = StrikeBallDataset(
    X_train.tolist(),
    y_train.tolist(),
    tokenizer,
    MAX_LEN
)

test_dataset = StrikeBallDataset(
    X_test.tolist(),
    y_test.tolist(),
    tokenizer,
    MAX_LEN
)

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)

test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE
)

# 10. Обучение модели
optimizer = AdamW(model.parameters(), lr=2e-5)
model = model.to(device)

print("Начало обучения...")
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Train Loss: {avg_train_loss:.4f}")

# 11. Оценка модели
model.eval()
y_pred = []
y_true = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Тестирование"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        _, preds = torch.max(outputs.logits, dim=1)
        y_pred.extend(preds.cpu().numpy())
        y_true.extend(labels.cpu().numpy())

print("\nОтчет о классификации:")
print(classification_report(
    y_true,
    y_pred,
    target_names=label_encoder.classes_
))

# 12. Сохранение модели
print("Сохранение модели...")
torch.save(model.state_dict(), 'bert_strikeball_classifier.pt')
tokenizer.save_pretrained('./tokenizer/')

# Сохранение label_encoder
import joblib
joblib.dump(label_encoder, 'label_encoder.joblib')

print("Обучение завершено!")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Загрузка данных...
Загружено 66370 записей
Предобработка текстов...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/341 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/241k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/468k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Начало обучения...


Epoch 1: 100%|██████████| 3319/3319 [02:08<00:00, 25.77it/s]


Epoch 1, Train Loss: 0.4321


Epoch 2: 100%|██████████| 3319/3319 [01:42<00:00, 32.35it/s]


Epoch 2, Train Loss: 0.2635


Epoch 3: 100%|██████████| 3319/3319 [01:36<00:00, 34.24it/s]


Epoch 3, Train Loss: 0.2200


Тестирование: 100%|██████████| 830/830 [00:21<00:00, 38.97it/s]



Отчет о классификации:
                       precision    recall  f1-score   support

Аксессуары и Запчасти       0.89      0.88      0.88      3669
  Снаряжение и защита       0.95      0.89      0.92      5087
 Страйкбольное оружие       0.89      0.96      0.93      4518

             accuracy                           0.91     13274
            macro avg       0.91      0.91      0.91     13274
         weighted avg       0.91      0.91      0.91     13274

Сохранение модели...
Обучение завершено!
