In [None]:
import pandas as pd
import re
from tqdm import tqdm
import ast

print("–ó–∞–≥—Ä—É–∂–∞–µ–º –¥–∞–Ω–Ω—ã–µ...")
df = pd.read_csv('dataset/habr.csv', encoding='utf-8')
print("–†–∞–∑–º–µ—Ä –∏—Å—Ö–æ–¥–Ω–æ–≥–æ –¥–∞—Ç–∞—Å–µ—Ç–∞: {len(df)}")

def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'[^–∞-—è–∞-—è—ë—ëa-za-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def parse_hubs(hub_str):
    if pd.isna(hub_str):
        return []
    try:
        return ast.literal_eval(hub_str)
    except:
        if isinstance(hub_str, str):
            return [hub.strip() for hub in hub_str.split(',')]
        return []

print("–û—á–∏—â–∞–µ–º —Ç–µ–∫—Å—Ç—ã...")
df['hubs'] = df['hubs'].apply(parse_hubs)

text_columns = ['title', 'keywords', 'text']
for col in text_columns:
    df[f'cleaned_{col}'] = pd.Series(tqdm(
        (preprocess_text(text) for text in df[col]),
        total=len(df),
        desc=f"–û–±—Ä–∞–±–æ—Ç–∫–∞ {col}"
    ))

df['full_text'] = (
    df['cleaned_title'] + " " +
    df['cleaned_keywords'].fillna("") + " " +
    df['cleaned_text']
)

df = df[df['full_text'].str.len() > 50]
hub_counts = df['hubs'].explode().value_counts()
valid_hubs = hub_counts[hub_counts >= 10].index
df['hubs'] = df['hubs'].apply(lambda hubs: [h for h in hubs if h in valid_hubs])
df = df[df['hubs'].map(len) >= 3]

print(f"–ò—Ç–æ–≥–æ–≤—ã–π —Ä–∞–∑–º–µ—Ä: {len(df)}")

df.to_pickle("processed_habr.pkl")
print("–î–∞–Ω–Ω—ã–µ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤ processed_habr.pkl")


In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import jaccard_score, hamming_loss
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import vstack
import time
import numpy as np

df = pd.read_pickle("data/processed_habr.pkl")

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['hubs'])
print(f"–•–∞–±—ã: {len(mlb.classes_)}, –°—Ç–∞—Ç—å–∏: {df.shape[0]}")

min_hub_count = 2
valid_hub_indices = y.sum(axis=0) >= min_hub_count
y_filtered = y[:, valid_hub_indices]

print(f"–ò—Å—Ö–æ–¥–Ω–æ–µ —á–∏—Å–ª–æ —Ö–∞–±–æ–≤: {y.shape[1]}")
print(f"–ü–æ—Å–ª–µ —Ñ–∏–ª—å—Ç—Ä–∞—Ü–∏–∏: {y_filtered.shape[1]} —Ö–∞–±–æ–≤")

mlb_filtered = MultiLabelBinarizer()
filtered_hubs = []
for hubs_list in df['hubs']:
    filtered_hub = [hub for hub in hubs_list if hub in mlb.classes_[valid_hub_indices]]
    filtered_hubs.append(filtered_hub)

y = mlb_filtered.fit_transform(filtered_hubs)

class_counts = y.sum(axis=0)
print(f"–ú–∏–Ω–∏–º–∞–ª—å–Ω–æ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø—Ä–∏–º–µ—Ä–æ–≤ –≤ –∫–ª–∞—Å—Å–µ: {class_counts.min()}")
print(f"–ú–∞–∫—Å–∏–º–∞–ª—å–Ω–æ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø—Ä–∏–º–µ—Ä–æ–≤ –≤ –∫–ª–∞—Å—Å–µ: {class_counts.max()}")

non_empty_indices = y.sum(axis=1) > 0
df = df[non_empty_indices]
y = y[non_empty_indices]

print(f"–°—Ç–∞—Ç–µ–π –ø–æ—Å–ª–µ —É–¥–∞–ª–µ–Ω–∏—è –ø—É—Å—Ç—ã—Ö: {len(df)}")

print("–í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è...")
start_time = time.time()

russian_stopwords = [
    '–∏', '–≤', '–≤–æ', '–Ω–µ', '—á—Ç–æ', '–æ–Ω', '–Ω–∞', '—è', '—Å', '—Å–æ',
    '–∫–∞–∫', '–∞', '—Ç–æ', '–≤—Å–µ', '–æ–Ω–∞', '—Ç–∞–∫', '–µ–≥–æ', '–Ω–æ', '–¥–∞'
]

vectorizer = TfidfVectorizer(
    max_features=3000,
    ngram_range=(1, 1),
    min_df=1,
    max_df=0.95,
    stop_words=russian_stopwords,
    sublinear_tf=True,
    lowercase=True,
    smooth_idf=True
)

texts = df['full_text'].tolist()
X = vectorizer.fit_transform(texts)  

print(f"\n–í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è –∑–∞–≤–µ—Ä—à–µ–Ω–∞ –∑–∞ {time.time() - start_time:.2f} —Å–µ–∫")
print(f"–†–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å: {X.shape}")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42  
)

print(f"–û–±—É—á–∞—é—â–∞—è –≤—ã–±–æ—Ä–∫–∞: {X_train.shape[0]}")
print(f"–¢–µ—Å—Ç–æ–≤–∞—è –≤—ã–±–æ—Ä–∫–∞: {X_test.shape[0]}")

print("–û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏...")
model = MultiOutputClassifier(
    LogisticRegression(
        max_iter=1000, 
        class_weight='balanced', 
        C=0.5, 
        solver='liblinear', 
        random_state=42
    ),
    n_jobs=-1
)
model.fit(X_train, y_train)

# --- –û–¶–ï–ù–ö–ê ---
y_pred = model.predict(X_test)
jaccard = jaccard_score(y_test, y_pred, average='samples')
hamming = hamming_loss(y_test, y_pred)

print(f"\n‚úÖ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã:")
print(f"Jaccard Score: {jaccard:.4f}")
print(f"Hamming Loss: {hamming:.4f}")

# –î–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–∞—è –∏–Ω—Ñ–æ—Ä–º–∞—Ü–∏—è –æ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è—Ö
print(f"\nüìä –î–µ—Ç–∞–ª–∏:")
print(f"–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –∫–ª–∞—Å—Å–æ–≤: {y.shape[1]}")
print(f"–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–æ –Ω–µ-–Ω—É–ª–µ–π: {(y_pred.sum(axis=1) > 0).sum()} / {y_pred.shape[0]}")
print(f"–°—Ä–µ–¥–Ω–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Ö–∞–±–æ–≤ –Ω–∞ —Å—Ç–∞—Ç—å—é: {y.sum(axis=1).mean():.2f}")

–•–∞–±—ã: 955, –°—Ç–∞—Ç—å–∏: 98064
–ò—Å—Ö–æ–¥–Ω–æ–µ —á–∏—Å–ª–æ —Ö–∞–±–æ–≤: 955
–ü–æ—Å–ª–µ —Ñ–∏–ª—å—Ç—Ä–∞—Ü–∏–∏: 953 —Ö–∞–±–æ–≤
–ú–∏–Ω–∏–º–∞–ª—å–Ω–æ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø—Ä–∏–º–µ—Ä–æ–≤ –≤ –∫–ª–∞—Å—Å–µ: 2
–ú–∞–∫—Å–∏–º–∞–ª—å–Ω–æ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø—Ä–∏–º–µ—Ä–æ–≤ –≤ –∫–ª–∞—Å—Å–µ: 13906
–°—Ç–∞—Ç–µ–π –ø–æ—Å–ª–µ —É–¥–∞–ª–µ–Ω–∏—è –ø—É—Å—Ç—ã—Ö: 98064
–í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è...

–í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è –∑–∞–≤–µ—Ä—à–µ–Ω–∞ –∑–∞ 149.50 —Å–µ–∫
–†–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å: (98064, 3000)
–û–±—É—á–∞—é—â–∞—è –≤—ã–±–æ—Ä–∫–∞: 78451
–¢–µ—Å—Ç–æ–≤–∞—è –≤—ã–±–æ—Ä–∫–∞: 19613
–û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏...

‚úÖ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã:
Jaccard Score: 0.2451
Hamming Loss: 0.0114

üìä –î–µ—Ç–∞–ª–∏:
–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ –∫–ª–∞—Å—Å–æ–≤: 953
–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–æ –Ω–µ-–Ω—É–ª–µ–π: 19613 / 19613
–°—Ä–µ–¥–Ω–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Ö–∞–±–æ–≤ –Ω–∞ —Å—Ç–∞—Ç—å—é: 3.90


In [2]:
import pandas as pd
import yake
import re
import os
from tqdm import tqdm

print("–ó–∞–≥—Ä—É–∂–∞–µ–º processed_habr.pkl...")
if not os.path.exists("data/processed_habr.pkl"):
    raise FileNotFoundError("–§–∞–π–ª processed_habr.pkl –Ω–µ –Ω–∞–π–¥–µ–Ω!")

df = pd.read_pickle("data/processed_habr.pkl")
print(f"–ò—Å—Ö–æ–¥–Ω—ã–π —Ä–∞–∑–º–µ—Ä –¥–∞—Ç–∞—Å–µ—Ç–∞: {len(df)}")

N_SAMPLES = 20000
if len(df) > N_SAMPLES:
    df_sample = df.head(N_SAMPLES).copy()
    print(f"–û–±—Ä–∞–±–∞—Ç—ã–≤–∞–µ–º –≤—ã–±–æ—Ä–∫—É: {len(df_sample)} –∑–∞–ø–∏—Å–µ–π (–ø–µ—Ä–≤—ã–µ {N_SAMPLES})")
else:
    df_sample = df.copy()
    print(f"–î–∞—Ç–∞—Å–µ—Ç –º–µ–Ω—å—à–µ {N_SAMPLES}, –æ–±—Ä–∞–±–∞—Ç—ã–≤–∞–µ–º –≤—Å–µ {len(df_sample)} –∑–∞–ø–∏—Å–µ–π")


kw_extractor = yake.KeywordExtractor(
    lan="ru",
    n=3,
    top=15,
    dedupLim=0.7,
    features=None
)

russian_stopwords = {
    '–∏', '–≤', '–≤–æ', '–Ω–µ', '—á—Ç–æ', '–æ–Ω', '–Ω–∞', '—è', '—Å', '—Å–æ',
    '–∫–∞–∫', '–∞', '—Ç–æ', '–≤—Å–µ', '–æ–Ω–∞', '—Ç–∞–∫', '–µ–≥–æ', '–Ω–æ', '–¥–∞', '—É', '–∂–µ',
    '–±—ã', '–¥–ª—è', '–ø–æ', '–æ', '–æ—Ç', '–∏–∑', '–∫', '–æ–±', '–ø—Ä–∏', '–Ω–∞–¥', '–ø–æ–¥'
}

def extractKeys(text):
    """–ò–∑–≤–ª–µ–∫–∞–µ—Ç –∫–ª—é—á–µ–≤—ã–µ —Ñ—Ä–∞–∑—ã —Å –ø–æ–º–æ—â—å—é YAKE."""
    if not isinstance(text, str) or len(text.strip()) < 10:
        return []
    try:
        keywords = kw_extractor.extract_keywords(text)
        result = []
        for phrase, score in keywords:
            words = phrase.lower().split()
            if any(word not in russian_stopwords for word in words):
                result.append(phrase.strip())
        return result
    except Exception as e:
        print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –æ–±—Ä–∞–±–æ—Ç–∫–µ —Ç–µ–∫—Å—Ç–∞: {e}")
        return []

print("–ò–∑–≤–ª–µ–∫–∞–µ–º –∫–ª—é—á–µ–≤—ã–µ —Å–ª–æ–≤–∞ –∏–∑ cleaned_text —Å –ø–æ–º–æ—â—å—é YAKE...")
df_sample['text_main'] = pd.Series(tqdm(
    (extractKeys(text) for text in df_sample['cleaned_text']),
    total=len(df_sample),
    desc="YAKE: –∏–∑–≤–ª–µ—á–µ–Ω–∏–µ –∫–ª—é—á–µ–≤—ã—Ö —Ñ—Ä–∞–∑",
    unit="—Ç–µ–∫—Å—Ç"
))

print("–°–æ—Ö—Ä–∞–Ω—è–µ–º –ø—Ä–æ–º–µ–∂—É—Ç–æ—á–Ω—ã–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç –ø–æ—Å–ª–µ YAKE...")
df_sample.to_pickle("data/sample_20k_yake_extracted.pkl")
print("–°–æ—Ö—Ä–∞–Ω–µ–Ω–æ: data/sample_20k_yake_extracted.pkl")

print("–û—á–∏—â–∞–µ–º –∏–∑–≤–ª–µ—á—ë–Ω–Ω—ã–µ –∫–ª—é—á–µ–≤—ã–µ —Ñ—Ä–∞–∑—ã...")

def clean_keywords(keywords):
    if not isinstance(keywords, list):
        return []
    
    if not keywords:
        return []
    
    cleaned = []
    for phrase in keywords:
        if not isinstance(phrase, str):
            phrase = str(phrase)
        
        phrase = re.sub(r'[^–∞-—è–ê-–Ø—ë—ëa-zA-Z0-9\s]', ' ', phrase)
        phrase = re.sub(r'\s+', ' ', phrase).strip()
        if phrase:
            cleaned.append(phrase)
    return cleaned

df_sample['text_main'] = df_sample['text_main'].apply(clean_keywords)

print("–î–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–∞—è –æ—á–∏—Å—Ç–∫–∞: —É–¥–∞–ª–µ–Ω–∏–µ –¥—É–±–ª–∏–∫–∞—Ç–æ–≤ –∏ –Ω–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏—è...")
def final_clean(keywords):
    if not keywords:
        return []
    seen = set()
    result = []
    for k in keywords:
        if k not in seen:
            seen.add(k)
            result.append(k)
    return result

df_sample['text_main'] = df_sample['text_main'].apply(final_clean)

print("–°–æ—Ö—Ä–∞–Ω—è–µ–º –∏—Ç–æ–≥–æ–≤—ã–µ –¥–∞–Ω–Ω—ã–µ...")
df_sample.to_pickle("data/sample_20k_with_keywords.pkl")
df_sample.to_csv("data/sample_20k_keywords.csv", index=False, encoding='utf-8')


print("\n–ì–æ—Ç–æ–≤–æ!")
print(f"–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ: {len(df_sample)} —Ç–µ–∫—Å—Ç–æ–≤")
print(f"–ò—Ç–æ–≥–æ–≤—ã–π —Ñ–∞–π–ª: sample_20k_with_keywords.pkl")
print(f"CSV-–≤–µ—Ä—Å–∏—è: sample_20k_keywords.csv")

print(f"\n–ü—Ä–∏–º–µ—Ä –ø–µ—Ä–≤—ã—Ö 15 –∑–Ω–∞—á–µ–Ω–∏–π 'text_main':")
for i in range(15):
    if i < len(df_sample):
        print(f"{i}: {df_sample['text_main'].iloc[i]}")


–ó–∞–≥—Ä—É–∂–∞–µ–º processed_habr.pkl...
–ò—Å—Ö–æ–¥–Ω—ã–π —Ä–∞–∑–º–µ—Ä –¥–∞—Ç–∞—Å–µ—Ç–∞: 98064
–û–±—Ä–∞–±–∞—Ç—ã–≤–∞–µ–º –≤—ã–±–æ—Ä–∫—É: 20000 –∑–∞–ø–∏—Å–µ–π (–ø–µ—Ä–≤—ã–µ 20000)
–ò–∑–≤–ª–µ–∫–∞–µ–º –∫–ª—é—á–µ–≤—ã–µ —Å–ª–æ–≤–∞ –∏–∑ cleaned_text —Å –ø–æ–º–æ—â—å—é YAKE...


YAKE: –∏–∑–≤–ª–µ—á–µ–Ω–∏–µ –∫–ª—é—á–µ–≤—ã—Ö —Ñ—Ä–∞–∑: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20000/20000 [56:15<00:00,  5.92—Ç–µ–∫—Å—Ç/s]  


–°–æ—Ö—Ä–∞–Ω—è–µ–º –ø—Ä–æ–º–µ–∂—É—Ç–æ—á–Ω—ã–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç –ø–æ—Å–ª–µ YAKE...
–°–æ—Ö—Ä–∞–Ω–µ–Ω–æ: sample_20k_yake_extracted.pkl
–û—á–∏—â–∞–µ–º –∏–∑–≤–ª–µ—á—ë–Ω–Ω—ã–µ –∫–ª—é—á–µ–≤—ã–µ —Ñ—Ä–∞–∑—ã...
–î–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–∞—è –æ—á–∏—Å—Ç–∫–∞: —É–¥–∞–ª–µ–Ω–∏–µ –¥—É–±–ª–∏–∫–∞—Ç–æ–≤ –∏ –Ω–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏—è...
–°–æ—Ö—Ä–∞–Ω—è–µ–º –∏—Ç–æ–≥–æ–≤—ã–µ –¥–∞–Ω–Ω—ã–µ...

–ì–æ—Ç–æ–≤–æ!
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ: 20000 —Ç–µ–∫—Å—Ç–æ–≤
–ò—Ç–æ–≥–æ–≤—ã–π —Ñ–∞–π–ª: sample_20k_with_keywords.pkl
CSV-–≤–µ—Ä—Å–∏—è: sample_20k_keywords.csv

–ü—Ä–∏–º–µ—Ä –ø–µ—Ä–≤—ã—Ö 15 –∑–Ω–∞—á–µ–Ω–∏–π 'text_main':
0: ['–∫–æ–º–ø–∞–Ω–∏—é –ª–∏–±–æ —Å–∞–º—ã–π', '–ª–∏–±–æ —Å–∞–º—ã–π —É–º–Ω—ã–π', '—Å–∞–º—ã–π —É–º–Ω—ã–π –ª–∏–±–æ', '—É–º–Ω—ã–π –ª–∏–±–æ —Å–∞–º—ã–π', '–ª–∏–±–æ —Å–∞–º—ã–π –≥–ª—É–ø—ã–π', '–≤–∏–∫–∏–ø–µ–¥–∏—è –Ω–µ–±–æ–ª—å—à–∞—è –∫–æ–º–ø–∞–Ω–∏—è', '–≤–∏–∫–∏–ø–µ–¥–∏—è –Ω–µ–±–æ–≥–∞—Ç–∞—è –∫–æ–º–ø–∞–Ω–∏—è', '—Å—Ç–∞—Ç–µ–π –≤–∫–ª—é—á–∞—è —Ä–∞–∑–¥–µ–ª—ã', '–Ω–∞–ø—Ä–∏–º–µ—Ä —à–≤–µ–¥—Å–∫–∞—è –≤–∏–∫–∏–ø–µ–¥–∏—è', '—è–∑—ã–∫–æ–≤ –

In [49]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import jaccard_score, hamming_loss, f1_score, precision_score, recall_score
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import hstack  
import time

df = pd.read_pickle("data/sample_20k_with_keywords.pkl")

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['hubs'])
valid_hub_mask = y.sum(axis=0) >= 2 
y_filtered = y[:, valid_hub_mask]

filtered_hubs = [
    [hub for hub in hubs if hub in mlb.classes_[valid_hub_mask]]
    for hubs in df['hubs']
]
mlb_filtered = MultiLabelBinarizer()
y = mlb_filtered.fit_transform(filtered_hubs)

non_empty = y.sum(axis=1) > 0
df = df[non_empty].copy()
y = y[non_empty]

print(f"–°—Ç–∞—Ç–µ–π –ø–æ—Å–ª–µ —Ñ–∏–ª—å—Ç—Ä–∞—Ü–∏–∏: {len(df)}, —Ö–∞–±–æ–≤: {y.shape[1]}")

print("–í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è...")
start_time = time.time()

df['text_main_str'] = df['text_main'].apply(lambda x: ' '.join(x) if x else '')
df['cleaned_keywords_str'] = df['cleaned_keywords'].fillna('').astype(str)
df['full_text_with_yake'] = (
    df['cleaned_title'].fillna('') + ' ' +
    df['cleaned_keywords_str'] + ' ' +
    df['text_main_str']
)

mask = df['full_text_with_yake'].str.len() > 50
df = df[mask]
y = y[mask.values] 

vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 4),
    min_df=3,
    max_df=0.75,
    stop_words=[
        '–∏', '–≤', '–≤–æ', '–Ω–µ', '—á—Ç–æ', '–æ–Ω', '–Ω–∞', '—è', '—Å', '—Å–æ',
        '–∫–∞–∫', '–∞', '—Ç–æ', '–≤—Å–µ', '–æ–Ω–∞', '—Ç–∞–∫', '–µ–≥–æ', '–Ω–æ', '–¥–∞'
    ],
    sublinear_tf=True,
    lowercase=True,
    smooth_idf=True,
    norm='l2'
)
X_text = vectorizer.fit_transform(df['full_text_with_yake'])

username_vectorizer = TfidfVectorizer(
    analyzer='char_wb',      
    ngram_range=(2, 4),    
    min_df=1,             
    max_features=500,     
    sublinear_tf=True,
    lowercase=True
)
X_username = username_vectorizer.fit_transform(df['username'].fillna(''))

X = hstack([X_text, X_username])  

print(f"!–í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è –∑–∞ {time.time() - start_time:.2f} —Å–µ–∫, X: {X.shape}, y: {y.shape}")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("–û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏...")
modelNew = MultiOutputClassifier(
    LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        C=0.5,
        solver='liblinear',
        random_state=42
    ),
    n_jobs=-1
)
modelNew.fit(X_train, y_train)

y_pred = modelNew.predict(X_test)
jaccard = jaccard_score(y_test, y_pred, average='samples')
hamming = hamming_loss(y_test, y_pred)

print(f"\n‚úÖ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã:")
print(f"Jaccard Score: {jaccard:.4f}")
print(f"Hamming Loss:  {hamming:.4f}")
print(f"–ö–ª–∞—Å—Å–æ–≤: {y.shape[1]}, —Å—Ç–∞—Ç–µ–π: {len(df)}")


–°—Ç–∞—Ç–µ–π –ø–æ—Å–ª–µ —Ñ–∏–ª—å—Ç—Ä–∞—Ü–∏–∏: 20000, —Ö–∞–±–æ–≤: 698
–í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è...
!–í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è –∑–∞ 9.94 —Å–µ–∫, X: (19764, 20500), y: (19764, 698)
–û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏...

‚úÖ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã:
Jaccard Score: 0.3788
Hamming Loss:  0.0073
–ö–ª–∞—Å—Å–æ–≤: 698, —Å—Ç–∞—Ç–µ–π: 19764


In [1]:
import pandas as pd
import yake
import re
import os
from tqdm import tqdm

print("–ó–∞–≥—Ä—É–∂–∞–µ–º processed_habr.pkl...")
if not os.path.exists("data/processed_habr.pkl"):
    raise FileNotFoundError("–§–∞–π–ª processed_habr.pkl –Ω–µ –Ω–∞–π–¥–µ–Ω!")

df = pd.read_pickle("data/processed_habr.pkl")
print(f"–ò—Å—Ö–æ–¥–Ω—ã–π —Ä–∞–∑–º–µ—Ä –¥–∞—Ç–∞—Å–µ—Ç–∞: {len(df)}")

df_sample = df.copy()

kw_extractor = yake.KeywordExtractor(
    lan="ru",
    n=3,
    top=15,
    dedupLim=0.7,
    features=None
)

russian_stopwords = {
    '–∏', '–≤', '–≤–æ', '–Ω–µ', '—á—Ç–æ', '–æ–Ω', '–Ω–∞', '—è', '—Å', '—Å–æ',
    '–∫–∞–∫', '–∞', '—Ç–æ', '–≤—Å–µ', '–æ–Ω–∞', '—Ç–∞–∫', '–µ–≥–æ', '–Ω–æ', '–¥–∞', '—É', '–∂–µ',
    '–±—ã', '–¥–ª—è', '–ø–æ', '–æ', '–æ—Ç', '–∏–∑', '–∫', '–æ–±', '–ø—Ä–∏', '–Ω–∞–¥', '–ø–æ–¥'
}

def extractKeys(text):
    if not isinstance(text, str) or len(text.strip()) < 10:
        return []
    try:
        keywords = kw_extractor.extract_keywords(text)
        result = []
        for phrase, score in keywords:
            words = phrase.lower().split()
            if any(word not in russian_stopwords for word in words):
                result.append(phrase.strip())
        return result
    except Exception as e:
        print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –æ–±—Ä–∞–±–æ—Ç–∫–µ —Ç–µ–∫—Å—Ç–∞: {e}")
        return []

print("–ò–∑–≤–ª–µ–∫–∞–µ–º –∫–ª—é—á–µ–≤—ã–µ —Å–ª–æ–≤–∞ –∏–∑ cleaned_text —Å –ø–æ–º–æ—â—å—é YAKE...")
df_sample['text_main'] = pd.Series(tqdm(
    (extractKeys(text) for text in df_sample['cleaned_text']),
    total=len(df_sample),
    desc="YAKE: –∏–∑–≤–ª–µ—á–µ–Ω–∏–µ –∫–ª—é—á–µ–≤—ã—Ö —Ñ—Ä–∞–∑",
    unit="—Ç–µ–∫—Å—Ç"
))

print("–°–æ—Ö—Ä–∞–Ω—è–µ–º –ø—Ä–æ–º–µ–∂—É—Ç–æ—á–Ω—ã–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç –ø–æ—Å–ª–µ YAKE...")
df_sample.to_pickle("data_with_main_info_extract.pkl")
print("–°–æ—Ö—Ä–∞–Ω–µ–Ω–æ: data_with_main_info_extract.pkl")

print("–û—á–∏—â–∞–µ–º –∏–∑–≤–ª–µ—á—ë–Ω–Ω—ã–µ –∫–ª—é—á–µ–≤—ã–µ —Ñ—Ä–∞–∑—ã...")

def clean_keywords(keywords):
    if not isinstance(keywords, list):
        return []
    
    if not keywords:
        return []
    
    cleaned = []
    for phrase in keywords:
        if not isinstance(phrase, str):
            phrase = str(phrase)
        
        phrase = re.sub(r'[^–∞-—è–ê-–Ø—ë—ëa-zA-Z0-9\s]', ' ', phrase)
        phrase = re.sub(r'\s+', ' ', phrase).strip()
        if phrase:
            cleaned.append(phrase)
    return cleaned

df_sample['text_main'] = df_sample['text_main'].apply(clean_keywords)

print("–î–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–∞—è –æ—á–∏—Å—Ç–∫–∞: —É–¥–∞–ª–µ–Ω–∏–µ –¥—É–±–ª–∏–∫–∞—Ç–æ–≤ –∏ –Ω–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏—è...")
def final_clean(keywords):
    if not keywords:
        return []
    seen = set()
    result = []
    for k in keywords:
        if k not in seen:
            seen.add(k)
            result.append(k)
    return result

df_sample['text_main'] = df_sample['text_main'].apply(final_clean)

print("–°–æ—Ö—Ä–∞–Ω—è–µ–º –∏—Ç–æ–≥–æ–≤—ã–µ –¥–∞–Ω–Ω—ã–µ...")
df_sample.to_pickle("data_with_main_info.pkl")
df_sample.to_csv("data_with_main_info.csv", index=False, encoding='utf-8')


print("\n–ì–æ—Ç–æ–≤–æ!")
print(f"–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ: {len(df_sample)} —Ç–µ–∫—Å—Ç–æ–≤")
print(f"–ò—Ç–æ–≥–æ–≤—ã–π —Ñ–∞–π–ª: data_with_main_info.pkl")
print(f"CSV-–≤–µ—Ä—Å–∏—è: data_with_main_info.csv")

print(f"\n–ü—Ä–∏–º–µ—Ä –ø–µ—Ä–≤—ã—Ö 15 –∑–Ω–∞—á–µ–Ω–∏–π 'text_main':")
for i in range(15):
    if i < len(df_sample):
        print(f"{i}: {df_sample['text_main'].iloc[i]}")


–ó–∞–≥—Ä—É–∂–∞–µ–º processed_habr.pkl...
–ò—Å—Ö–æ–¥–Ω—ã–π —Ä–∞–∑–º–µ—Ä –¥–∞—Ç–∞—Å–µ—Ç–∞: 98064
–ò–∑–≤–ª–µ–∫–∞–µ–º –∫–ª—é—á–µ–≤—ã–µ —Å–ª–æ–≤–∞ –∏–∑ cleaned_text —Å –ø–æ–º–æ—â—å—é YAKE...


YAKE: –∏–∑–≤–ª–µ—á–µ–Ω–∏–µ –∫–ª—é—á–µ–≤—ã—Ö —Ñ—Ä–∞–∑: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 98064/98064 [4:36:28<00:00,  5.91—Ç–µ–∫—Å—Ç/s]   


–°–æ—Ö—Ä–∞–Ω—è–µ–º –ø—Ä–æ–º–µ–∂—É—Ç–æ—á–Ω—ã–π —Ä–µ–∑—É–ª—å—Ç–∞—Ç –ø–æ—Å–ª–µ YAKE...
–°–æ—Ö—Ä–∞–Ω–µ–Ω–æ: data_with_main_info_extract.pkl
–û—á–∏—â–∞–µ–º –∏–∑–≤–ª–µ—á—ë–Ω–Ω—ã–µ –∫–ª—é—á–µ–≤—ã–µ —Ñ—Ä–∞–∑—ã...
–î–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–∞—è –æ—á–∏—Å—Ç–∫–∞: —É–¥–∞–ª–µ–Ω–∏–µ –¥—É–±–ª–∏–∫–∞—Ç–æ–≤ –∏ –Ω–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏—è...
–°–æ—Ö—Ä–∞–Ω—è–µ–º –∏—Ç–æ–≥–æ–≤—ã–µ –¥–∞–Ω–Ω—ã–µ...

–ì–æ—Ç–æ–≤–æ!
–û–±—Ä–∞–±–æ—Ç–∞–Ω–æ: 98064 —Ç–µ–∫—Å—Ç–æ–≤
–ò—Ç–æ–≥–æ–≤—ã–π —Ñ–∞–π–ª: data_with_main_info.pkl
CSV-–≤–µ—Ä—Å–∏—è: data_with_main_info.csv

–ü—Ä–∏–º–µ—Ä –ø–µ—Ä–≤—ã—Ö 15 –∑–Ω–∞—á–µ–Ω–∏–π 'text_main':
0: ['–∫–æ–º–ø–∞–Ω–∏—é –ª–∏–±–æ —Å–∞–º—ã–π', '–ª–∏–±–æ —Å–∞–º—ã–π —É–º–Ω—ã–π', '—Å–∞–º—ã–π —É–º–Ω—ã–π –ª–∏–±–æ', '—É–º–Ω—ã–π –ª–∏–±–æ —Å–∞–º—ã–π', '–ª–∏–±–æ —Å–∞–º—ã–π –≥–ª—É–ø—ã–π', '–≤–∏–∫–∏–ø–µ–¥–∏—è –Ω–µ–±–æ–ª—å—à–∞—è –∫–æ–º–ø–∞–Ω–∏—è', '–≤–∏–∫–∏–ø–µ–¥–∏—è –Ω–µ–±–æ–≥–∞—Ç–∞—è –∫–æ–º–ø–∞–Ω–∏—è', '—Å—Ç–∞—Ç–µ–π –≤–∫–ª—é—á–∞—è —Ä–∞–∑–¥–µ–ª—ã', '–Ω–∞–ø—Ä–∏–º–µ—Ä —à–≤–µ–¥—Å–∫–∞—è –≤–∏–∫–∏–ø–µ–¥–∏—è', '—è–∑—ã–∫–æ–≤ –±–ª

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import jaccard_score, hamming_loss, f1_score, precision_score, recall_score
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from scipy.sparse import hstack 
import time
import numpy as np
import joblib  

df = pd.read_pickle("data/data_with_main_info.pkl")

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['hubs'])
valid_hub_mask = y.sum(axis=0) >= 2  
y_filtered = y[:, valid_hub_mask]

filtered_hubs = [
    [hub for hub in hubs if hub in mlb.classes_[valid_hub_mask]]
    for hubs in df['hubs']
]
mlb_filtered = MultiLabelBinarizer()
y = mlb_filtered.fit_transform(filtered_hubs)

non_empty = y.sum(axis=1) > 0
df = df[non_empty].copy()
y = y[non_empty]

print(f"–°—Ç–∞—Ç–µ–π –ø–æ—Å–ª–µ —Ñ–∏–ª—å—Ç—Ä–∞—Ü–∏–∏: {len(df)}, —Ö–∞–±–æ–≤: {y.shape[1]}")

print("–í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è...")
start_time = time.time()

df['text_main_str'] = df['text_main'].apply(lambda x: ' '.join(x) if x else '')
df['cleaned_keywords_str'] = df['cleaned_keywords'].fillna('').astype(str)
df['full_text_with_yake'] = (
    df['cleaned_title'].fillna('') + ' ' +
    df['cleaned_keywords_str'] + ' ' +
    df['text_main_str']
)

mask = df['full_text_with_yake'].str.len() > 50
df = df[mask]
y = y[mask.values] 

df['datetime'] = pd.to_datetime(df['time'])
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['dayofweek'] = df['datetime'].dt.dayofweek
df['hour'] = df['datetime'].dt.hour

time_features = ['year', 'month', 'dayofweek', 'hour']
time_encoder = OneHotEncoder(sparse_output=True, drop='first')
X_time = time_encoder.fit_transform(df[time_features])

vectorizer = TfidfVectorizer(
    max_features=40000,           
    ngram_range=(1, 3),           
    min_df=2,                     
    max_df=0.8,                
    stop_words=[
        '–∏', '–≤', '–≤–æ', '–Ω–µ', '—á—Ç–æ', '–æ–Ω', '–Ω–∞', '—è', '—Å', '—Å–æ',
        '–∫–∞–∫', '–∞', '—Ç–æ', '–≤—Å–µ', '–æ–Ω–∞', '—Ç–∞–∫', '–µ–≥–æ', '–Ω–æ', '–¥–∞'
    ],
    sublinear_tf=True,
    lowercase=True,
    smooth_idf=True,
    norm='l2',
    use_idf=True,
    binary=False
)
X_text = vectorizer.fit_transform(df['full_text_with_yake'])

username_vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(2, 4),
    min_df=1,
    max_features=1500,
    sublinear_tf=True,
    lowercase=True
)
X_username = username_vectorizer.fit_transform(df['username'].fillna(''))

X = hstack([X_text, X_username, X_time])

print(f"!–í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è –∑–∞ {time.time() - start_time:.2f} —Å–µ–∫, X: {X.shape}, y: {y.shape}")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("–û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏...")
modelFull = MultiOutputClassifier(
    LogisticRegression(
        max_iter=2000,
        class_weight='balanced',
        C=0.5,
        solver='liblinear',
        random_state=42,
        tol=1e-4,
        fit_intercept=True
    ),
    n_jobs=-1
)
modelFull.fit(X_train, y_train)

y_pred = modelFull.predict(X_test)
jaccard = jaccard_score(y_test, y_pred, average='samples')
hamming = hamming_loss(y_test, y_pred)

print(f"\n‚úÖ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã:")
print(f"Jaccard Score: {jaccard:.4f}")
print(f"Hamming Loss:  {hamming:.4f}")
print(f"–ö–ª–∞—Å—Å–æ–≤: {y.shape[1]}, —Å—Ç–∞—Ç–µ–π: {len(df)}")

joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(username_vectorizer, 'username_vectorizer.pkl')
joblib.dump(time_encoder, 'time_encoder.pkl')
joblib.dump(mlb_filtered, 'mlb_filtered.pkl')
joblib.dump(modelFull, 'helpers/modelFull.pkl')  


–°—Ç–∞—Ç–µ–π –ø–æ—Å–ª–µ —Ñ–∏–ª—å—Ç—Ä–∞—Ü–∏–∏: 98064, —Ö–∞–±–æ–≤: 953
–í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è...
!–í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è –∑–∞ 43.58 —Å–µ–∫, X: (97731, 41549), y: (97731, 953)
–û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏...

‚úÖ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã:
Jaccard Score: 0.3700
Hamming Loss:  0.0060
–ö–ª–∞—Å—Å–æ–≤: 953, —Å—Ç–∞—Ç–µ–π: 97731


['modelFull.pkl']

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import jaccard_score, hamming_loss, f1_score, precision_score, recall_score
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder, StandardScaler
from scipy.sparse import hstack, csr_matrix
import time
import numpy as np
import joblib
from collections import Counter

df = pd.read_pickle("data/data_with_main_info.pkl")

mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df['hubs'])

min_samples_per_class = max(3, int(len(df) * 0.001))  
valid_hub_mask = y.sum(axis=0) >= min_samples_per_class
y_filtered = y[:, valid_hub_mask]

filtered_hubs = [
    [hub for hub in hubs if hub in mlb.classes_[valid_hub_mask]]
    for hubs in df['hubs']
]
mlb_filtered = MultiLabelBinarizer()
y = mlb_filtered.fit_transform(filtered_hubs)

non_empty = y.sum(axis=1) > 0
df = df[non_empty].copy()
y = y[non_empty]

print(f"–°—Ç–∞—Ç–µ–π –ø–æ—Å–ª–µ —Ñ–∏–ª—å—Ç—Ä–∞—Ü–∏–∏: {len(df)}, —Ö–∞–±–æ–≤: {y.shape[1]}")
print(f"–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ —Ö–∞–±–æ–≤: {dict(Counter(y.sum(axis=1)).most_common(10))}")

print("\n–í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è...")
start_time = time.time()

df['text_main_str'] = df['text_main'].apply(lambda x: ' '.join(x) if x else '')
df['cleaned_keywords_str'] = df['cleaned_keywords'].fillna('').astype(str)

df['full_text_weighted'] = (
    df['cleaned_title'].fillna('') + ' ' +
    df['cleaned_title'].fillna('') + ' ' +
    df['cleaned_keywords_str'] + ' ' +
    df['text_main_str']
)

min_text_length = 50
mask = df['full_text_weighted'].str.len() > min_text_length
df = df[mask]
y = y[mask.values]

df['datetime'] = pd.to_datetime(df['time'])
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['dayofweek'] = df['datetime'].dt.dayofweek
df['hour'] = df['datetime'].dt.hour
df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
df['quarter'] = df['datetime'].dt.quarter

time_features = ['year', 'month', 'dayofweek', 'hour', 'is_weekend', 'quarter']
time_encoder = OneHotEncoder(sparse_output=True, drop='first')
X_time = time_encoder.fit_transform(df[time_features])

vectorizer = TfidfVectorizer(
    max_features=30000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8,
    stop_words=[
        '–∏', '–≤', '–≤–æ', '–Ω–µ', '—á—Ç–æ', '–æ–Ω', '–Ω–∞', '—è', '—Å', '—Å–æ',
        '–∫–∞–∫', '–∞', '—Ç–æ', '–≤—Å–µ', '–æ–Ω–∞', '—Ç–∞–∫', '–µ–≥–æ', '–Ω–æ', '–¥–∞',
        '—Ç—ã', '–ø–æ', '–Ω–æ', '–∑–∞', '–∏–∑', '—ç—Ç–æ', '–∏–ª–∏', '—É', '–∂–µ', '–±—ã',
        '–≤–æ—Ç', '–æ—Ç', '–º–µ–Ω—è', '–µ–º—É', '–Ω–µ—Ç', '–æ', '–µ—â–µ', '–∫–æ–≥–¥–∞',
        '–¥–∞–∂–µ', '–Ω—É', '–ª–∏', '–µ—Å–ª–∏', '–±—ã–ª', '–¥–æ', '–Ω–∏', '–±—ã—Ç—å',
        '–ø—Ä–∏', '—Ç–∞–∫–∂–µ', '–∫', '–ø–æ', '–Ω–∞', '—ç—Ç–æ—Ç', '—á—Ç–æ', '–∫–æ—Ç–æ—Ä—ã–π'
    ],
    sublinear_tf=True,
    lowercase=True,
    smooth_idf=True,
    norm='l2',
    use_idf=True,
    binary=False,
    analyzer='word'
)

X_text = vectorizer.fit_transform(df['full_text_weighted'])

username_vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 4),
    min_df=2,
    max_features=1500,
    sublinear_tf=True,
    lowercase=True,
    binary=True
)
X_username = username_vectorizer.fit_transform(df['username'].fillna(''))

df['text_length'] = df['full_text_weighted'].str.len()
df['word_count'] = df['full_text_weighted'].str.split().str.len()

scaler = StandardScaler(with_mean=False)
X_numeric = csr_matrix(scaler.fit_transform(df[['text_length', 'word_count']]))

X = hstack([X_text, X_username, X_time, X_numeric])
print(f"–í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è –∑–∞ {time.time() - start_time:.2f} —Å–µ–∫, X: {X.shape}, y: {y.shape}")

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"\n–†–∞–∑–º–µ—Ä train: {X_train.shape}, test: {X_test.shape}")

print("\n–û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏...")
model = MultiOutputClassifier(
    LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        C=0.8,
        solver='liblinear',
        penalty='l2',
        random_state=42,
        tol=1e-4,
        fit_intercept=True
    ),
    n_jobs=-1
)

print("–ö–†–û–°–°-–í–ê–õ–ò–î–ê–¶–ò–Ø (5 —Ñ–æ–ª–¥–æ–≤)")

kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold_jaccard_scores = []
fold_f1_micro_scores = []
fold_f1_macro_scores = []
fold_hamming_losses = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), 1):
    print(f"\n–§–æ–ª–¥ {fold}/5:")
    
    X_train_fold, X_val_fold = X_train[train_idx], X_train[val_idx]
    y_train_fold, y_val_fold = y_train[train_idx], y_train[val_idx]
    
    model_fold = MultiOutputClassifier(
        LogisticRegression(
            max_iter=1000,
            class_weight='balanced',
            C=0.8,
            solver='liblinear',
            random_state=42 + fold
        ),
        n_jobs=-1
    )
    
    model_fold.fit(X_train_fold, y_train_fold)
    
    y_pred_fold = model_fold.predict(X_val_fold)
    
    jaccard_fold = jaccard_score(y_val_fold, y_pred_fold, average='samples')
    f1_micro_fold = f1_score(y_val_fold, y_pred_fold, average='micro')
    f1_macro_fold = f1_score(y_val_fold, y_pred_fold, average='macro')
    hamming_fold = hamming_loss(y_val_fold, y_pred_fold)
    
    fold_jaccard_scores.append(jaccard_fold)
    fold_f1_micro_scores.append(f1_micro_fold)
    fold_f1_macro_scores.append(f1_macro_fold)
    fold_hamming_losses.append(hamming_fold)
    
    print(f"  Jaccard: {jaccard_fold:.4f}, F1 Micro: {f1_micro_fold:.4f}, Hamming Loss: {hamming_fold:.4f}")

print("–†–ï–ó–£–õ–¨–¢–ê–¢–´ –ö–†–û–°–°-–í–ê–õ–ò–î–ê–¶–ò–ò")

print(f"Jaccard Score (samples):")
print(f"  –§–æ–ª–¥—ã: {[f'{v:.4f}' for v in fold_jaccard_scores]}")
print(f"  –°—Ä–µ–¥–Ω–µ–µ: {np.mean(fold_jaccard_scores):.4f} (¬±{np.std(fold_jaccard_scores):.4f})")

print(f"\nF1 Micro:")
print(f"  –§–æ–ª–¥—ã: {[f'{v:.4f}' for v in fold_f1_micro_scores]}")
print(f"  –°—Ä–µ–¥–Ω–µ–µ: {np.mean(fold_f1_micro_scores):.4f} (¬±{np.std(fold_f1_micro_scores):.4f})")

print(f"\nF1 Macro:")
print(f"  –§–æ–ª–¥—ã: {[f'{v:.4f}' for v in fold_f1_macro_scores]}")
print(f"  –°—Ä–µ–¥–Ω–µ–µ: {np.mean(fold_f1_macro_scores):.4f} (¬±{np.std(fold_f1_macro_scores):.4f})")

print(f"\nHamming Loss:")
print(f"  –§–æ–ª–¥—ã: {[f'{v:.4f}' for v in fold_hamming_losses]}")
print(f"  –°—Ä–µ–¥–Ω–µ–µ: {np.mean(fold_hamming_losses):.4f} (¬±{np.std(fold_hamming_losses):.4f})")

print("–û–ë–£–ß–ï–ù–ò–ï –§–ò–ù–ê–õ–¨–ù–û–ô –ú–û–î–ï–õ–ò")

model.fit(X_train, y_train)

print("–û–¶–ï–ù–ö–ê –ù–ê –¢–ï–°–¢–û–í–´–• –î–ê–ù–ù–´–•")

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

jaccard = jaccard_score(y_test, y_pred, average='samples')
hamming = hamming_loss(y_test, y_pred)
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_macro = f1_score(y_test, y_pred, average='macro')
precision_micro = precision_score(y_test, y_pred, average='micro')
recall_micro = recall_score(y_test, y_pred, average='micro')

print(f"\n‚úÖ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã –Ω–∞ —Ç–µ—Å—Ç–æ–≤—ã—Ö –¥–∞–Ω–Ω—ã—Ö:")
print(f"Jaccard Score (samples): {jaccard:.4f}")
print(f"Hamming Loss: {hamming:.4f}")
print(f"F1 Micro: {f1_micro:.4f}")
print(f"F1 Macro: {f1_macro:.4f}")
print(f"Precision Micro: {precision_micro:.4f}")
print(f"Recall Micro: {recall_micro:.4f}")
print(f"–ö–ª–∞—Å—Å–æ–≤: {y.shape[1]}, —Å—Ç–∞—Ç–µ–π: {len(df)}")

print("–≠–ö–°–ü–ï–†–ò–ú–ï–ù–¢ –° –†–ê–ó–ù–´–ú–ò –ü–û–†–û–ì–ê–ú–ò")

thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
for threshold in thresholds:
    y_pred_thresholded = np.array([(proba[:, 1] > threshold).astype(int) for proba in y_pred_proba]).T
    jaccard_th = jaccard_score(y_test, y_pred_thresholded, average='samples')
    print(f"–ü–æ—Ä–æ–≥ {threshold}: Jaccard = {jaccard_th:.4f}")

print("–°–û–•–†–ê–ù–ï–ù–ò–ï –ú–û–î–ï–õ–ò")

joblib.dump(vectorizer, 'actual_helpers/vectorizer_last.pkl')
joblib.dump(username_vectorizer, 'actual_helpers/username_vectorizer_last.pkl')
joblib.dump(time_encoder, 'actual_helpers/time_encoder_last.pkl')
joblib.dump(mlb_filtered, 'actual_helpers/mlb_filtered_last.pkl')
joblib.dump(model, 'model_last.pkl')
joblib.dump(scaler, 'actual_helpers/scaler_last.pkl')

print(f"–†–µ–∑—É–ª—å—Ç–∞—Ç—ã –∫—Ä–æ—Å—Å-–≤–∞–ª–∏–¥–∞—Ü–∏–∏: Jaccard = {np.mean(fold_jaccard_scores):.4f} (¬±{np.std(fold_jaccard_scores):.4f})")
print(f"–†–µ–∑—É–ª—å—Ç–∞—Ç—ã –Ω–∞ —Ç–µ—Å—Ç–µ: Jaccard = {jaccard:.4f}")

–°—Ç–∞—Ç–µ–π –ø–æ—Å–ª–µ —Ñ–∏–ª—å—Ç—Ä–∞—Ü–∏–∏: 98061, —Ö–∞–±–æ–≤: 393
–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ —Ö–∞–±–æ–≤: {np.int64(3): 39006, np.int64(4): 29976, np.int64(5): 23474, np.int64(2): 5465, np.int64(1): 133, np.int64(6): 7}

–í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è...
–í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è –∑–∞ 33.22 —Å–µ–∫, X: (98035, 31555), y: (98035, 393)

–†–∞–∑–º–µ—Ä train: (78428, 31555), test: (19607, 31555)

–û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏...
–ö–†–û–°–°-–í–ê–õ–ò–î–ê–¶–ò–Ø (5 —Ñ–æ–ª–¥–æ–≤)

–§–æ–ª–¥ 1/5:
  Jaccard: 0.3858, F1 Micro: 0.5206, Hamming Loss: 0.0125

–§–æ–ª–¥ 2/5:
  Jaccard: 0.3843, F1 Micro: 0.5196, Hamming Loss: 0.0125

–§–æ–ª–¥ 3/5:
  Jaccard: 0.3816, F1 Micro: 0.5154, Hamming Loss: 0.0126

–§–æ–ª–¥ 4/5:
  Jaccard: 0.3841, F1 Micro: 0.5191, Hamming Loss: 0.0126

–§–æ–ª–¥ 5/5:
  Jaccard: 0.3818, F1 Micro: 0.5170, Hamming Loss: 0.0126
–†–ï–ó–£–õ–¨–¢–ê–¢–´ –ö–†–û–°–°-–í–ê–õ–ò–î–ê–¶–ò–ò
Jaccard Score (samples):
  –§–æ–ª–¥—ã: ['0.3858', '0.3843', '0.3816', '0.3841', '0.3818']
  –°—Ä–µ–¥–Ω–µ–µ: 0.3835 