In [None]:

import wget

# Download the dataset
url = 'https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz'
wget.download(url)

In [43]:
import pandas as pd
import gzip

path = 'lenta-ru-news.csv.gz'

df = pd.read_csv(
    gzip.open(path, 'rt', encoding='utf-8'), 
    delimiter=',',
    quotechar='"'
)

df = df[['title', 'text', 'topic']].head(50000)
df.head(5)

class_counts = df['topic'].value_counts()
valid_classes = class_counts[class_counts > 500].index.to_list()
df = df[df['topic'].isin(valid_classes)]

In [44]:
import re
from natasha import Doc, Segmenter, MorphVocab
morph_vocab = MorphVocab()
segmenter = Segmenter()  

import re

def normalize(text):
    if not isinstance(text, str) or pd.isna(text) or text == "":
        return ""
        
    try:
        # Приведение к нижнему регистру
        text_lower = text.lower()
        
        # Удаление URL
        clean_text = re.sub(r'https?://\S+|www\.\S+', '', text_lower)
        
        # Удаление email
        clean_text = re.sub(r'\S+@\S+', '', clean_text)
        
        # Удаление всех чисел
        clean_text = re.sub(r'\d+', '', clean_text)
        
        # Удаление пунктуации
        clean_text = re.sub(r'[^\w\s]', '', clean_text)
        
        # Удаление множественных пробелов
        clean_text = re.sub(r'\s+', ' ', clean_text).strip()

        # Создание документа
        doc = Doc(clean_text)
        doc.segment(segmenter)
        
        lemmatized_tokens = []
        for token in doc.tokens:
            try:
                token.lemmatize(morph_vocab)
                if hasattr(token, 'lemma') and token.lemma:
                    lemmatized_tokens.append(token.lemma)
                else:
                    lemmatized_tokens.append(token.text)
            except AttributeError:
                lemmatized_tokens.append(token.text)
        
        # Собираем результат
        lemmatized_text = ' '.join(lemmatized_tokens)
        return lemmatized_text
        
    except Exception as e:
        print(f"Ошибка при обработке текста: {str(e)}")
        return clean_text

df['normalized_text'] = df['text'].apply(normalize)
df['normalized_title'] = df['title'].apply(normalize)
df['normalized_topic'] = df['topic'].apply(normalize)

Очистка текста:
Преобразование в нижний регистр
Удаление URL-адресов
Удаление email-адресов
Удаление чисел
Удаление знаков пунктуации
Нормализация пробелов


Лемматизация с использованием библиотеки Natasha:
Сегментация текста
Лемматизация каждого токена
Обработка ошибок для неизвестных слов (сохранение исходной формы)

Выбор Natasha вместо других инструментов:
Специализированная библиотека для русского языка

Почему регулярки вместо библиотек для отчистки? Не нашел нормальных clean-text удалял текст, а не чистил его.

In [45]:
from sklearn.calibration import LabelEncoder
from sklearn.model_selection import train_test_split

dataset = df[['normalized_text', 'normalized_title', 'normalized_topic']]

label_encoder = LabelEncoder()
encoded_topics = label_encoder.fit_transform(dataset['normalized_topic'])

y = encoded_topics
X = dataset['normalized_title'] + ' ' + dataset['normalized_text']

X_train, X_prep, y_train, y_prep  = train_test_split(X, y, test_size=0.4, random_state=1)

X_test, X_val, y_test, y_val = train_test_split(X_prep, y_prep, test_size=0.5, random_state=1) 

X

0        названы регионы россии с самой высокой смертно...
1        австрия не представила доказательств вины росс...
2        обнаружено самое счастливое место на планете с...
3        в сша раскрыли сумму расходов на расследование...
4        хакеры рассказали о планах великобритании зами...
                               ...                        
49995    в великобритании арестовали мужчину за секс с ...
49996    создан эффективный способ лечения смертоносных...
49997    защитник сборной россии спас ворота кельна в п...
49998    летний хипстерполицейский покорил instagram и ...
49999    кубинские власти назвали источник акустических...
Length: 49337, dtype: object

In [46]:
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report

vectorizer = CountVectorizer(max_df=0.7, min_df=0.003)
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)
X_test_vectorized = vectorizer.transform(X_test)

# Теперь размерности будут согласованы
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train_vectorized, y_train)
preds = dummy_clf.predict(X_val_vectorized)

print(classification_report(y_val, preds, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       610
           1       0.00      0.00      0.00       408
           2       0.00      0.00      0.00       682
           3       0.00      0.00      0.00       772
           4       0.00      0.00      0.00       667
           5       0.14      1.00      0.24      1342
           6       0.00      0.00      0.00       701
           7       0.00      0.00      0.00       238
           8       0.00      0.00      0.00      1381
           9       0.00      0.00      0.00       536
          10       0.00      0.00      0.00      1129
          11       0.00      0.00      0.00       427
          12       0.00      0.00      0.00       975

    accuracy                           0.14      9868
   macro avg       0.01      0.08      0.02      9868
weighted avg       0.02      0.14      0.03      9868



In [47]:
from sklearn.linear_model import LogisticRegression


logreg = LogisticRegression(max_iter=2000).fit(X_train_vectorized, y_train)
preds = logreg.predict(X_val_vectorized)
print(classification_report(y_val, preds, zero_division=0))

              precision    recall  f1-score   support

           0       0.82      0.79      0.81       610
           1       0.88      0.83      0.85       408
           2       0.82      0.82      0.82       682
           3       0.80      0.79      0.79       772
           4       0.89      0.88      0.89       667
           5       0.82      0.85      0.83      1342
           6       0.86      0.86      0.86       701
           7       0.79      0.74      0.76       238
           8       0.75      0.78      0.77      1381
           9       0.78      0.76      0.77       536
          10       0.96      0.97      0.97      1129
          11       0.91      0.85      0.88       427
          12       0.84      0.83      0.84       975

    accuracy                           0.84      9868
   macro avg       0.84      0.83      0.83      9868
weighted avg       0.84      0.84      0.84      9868



In [52]:
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report   
from sklearn.pipeline import Pipeline

pipe = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression(max_iter=2000))
    ]
).fit(X_train, y_train)

preds = pipe.predict(X_val)

print(classification_report(y_val, preds, zero_division=0))

              precision    recall  f1-score   support

           0       0.86      0.82      0.84       610
           1       0.92      0.78      0.85       408
           2       0.81      0.85      0.83       682
           3       0.83      0.81      0.82       772
           4       0.90      0.90      0.90       667
           5       0.82      0.90      0.86      1342
           6       0.91      0.87      0.89       701
           7       0.89      0.58      0.70       238
           8       0.77      0.83      0.80      1381
           9       0.83      0.75      0.79       536
          10       0.96      0.97      0.96      1129
          11       0.95      0.82      0.88       427
          12       0.85      0.88      0.87       975

    accuracy                           0.85      9868
   macro avg       0.87      0.83      0.84      9868
weighted avg       0.86      0.85      0.85      9868



In [None]:
from sklearn import multiclass
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, make_scorer, f1_score
import numpy as np
tfidf_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.9, min_df=0.001, norm='l1', )),
    ('classifier', LogisticRegression(random_state=1, max_iter=2000, multi_class='ovr'))
])

f1_macro_scorer = make_scorer(f1_score, average='macro')


tfidf_params = {
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__max_features': [10000, 20000, None],
}

# Параметры для логистической регрессии (используем лучшие найденные)
tfidf_logistic_params = {
     'classifier__C': [0.1, 1.0, 10.0],
     'classifier__class_weight': [None, 'balanced'],

}

# Объединяем параметры
tfidf_param_grid = {}
tfidf_param_grid.update(tfidf_params)
tfidf_param_grid.update(tfidf_logistic_params)

# Создаем GridSearchCV для TfidfVectorizer
tfidf_grid_search = GridSearchCV(
    tfidf_pipeline,
    param_grid=tfidf_param_grid,
    scoring=f1_macro_scorer,
    n_jobs=-1,
    verbose=2,
    cv=3
)

# Запускаем поиск
tfidf_grid_search.fit(X_train, y_train)

# Выводим лучшие параметры
print("\nЛучшие параметры для TfidfVectorizer:")
print(tfidf_grid_search.best_params_)
print(f"\nЛучший f1-macro на валидации: {tfidf_grid_search.best_score_:.4f}")

# Оцениваем на тестовой выборке
best_tfidf_model = tfidf_grid_search.best_estimator_
y_tfidf_pred = best_tfidf_model.predict(X_test)

# Выводим метрики на тестовой выборке
print("\nРезультаты TfidfVectorizer на тестовой выборке:")
print(f"Accuracy: {accuracy_score(y_test, y_tfidf_pred):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_tfidf_pred, zero_division=0))


Fitting 3 folds for each of 36 candidates, totalling 108 fits




[CV] END classifier__C=0.1, classifier__class_weight=None, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=   6.1s
[CV] END classifier__C=0.1, classifier__class_weight=None, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=   6.1s
[CV] END classifier__C=0.1, classifier__class_weight=None, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=   6.3s
[CV] END classifier__C=0.1, classifier__class_weight=None, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 1); total time=   6.4s
[CV] END classifier__C=0.1, classifier__class_weight=None, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 1); total time=   6.1s




[CV] END classifier__C=0.1, classifier__class_weight=None, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 1); total time=   5.2s




[CV] END classifier__C=0.1, classifier__class_weight=None, vectorizer__max_features=None, vectorizer__ngram_range=(1, 1); total time=   5.9s




[CV] END classifier__C=0.1, classifier__class_weight=None, vectorizer__max_features=None, vectorizer__ngram_range=(1, 1); total time=   5.1s




[CV] END classifier__C=0.1, classifier__class_weight=None, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  19.1s
[CV] END classifier__C=0.1, classifier__class_weight=None, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  20.0s
[CV] END classifier__C=0.1, classifier__class_weight=None, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  19.7s
[CV] END classifier__C=0.1, classifier__class_weight=None, vectorizer__max_features=None, vectorizer__ngram_range=(1, 1); total time=   4.7s




[CV] END classifier__C=0.1, classifier__class_weight=None, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 2); total time=  18.2s
[CV] END classifier__C=0.1, classifier__class_weight=None, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 2); total time=  19.2s
[CV] END classifier__C=0.1, classifier__class_weight=balanced, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=   5.6s




[CV] END classifier__C=0.1, classifier__class_weight=None, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 2); total time=  19.6s




[CV] END classifier__C=0.1, classifier__class_weight=balanced, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=   7.4s




[CV] END classifier__C=0.1, classifier__class_weight=None, vectorizer__max_features=None, vectorizer__ngram_range=(1, 2); total time=  11.9s
[CV] END classifier__C=0.1, classifier__class_weight=None, vectorizer__max_features=None, vectorizer__ngram_range=(1, 2); total time=  11.8s
[CV] END classifier__C=0.1, classifier__class_weight=None, vectorizer__max_features=None, vectorizer__ngram_range=(1, 2); total time=  11.8s
[CV] END classifier__C=0.1, classifier__class_weight=balanced, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=   5.8s




[CV] END classifier__C=0.1, classifier__class_weight=balanced, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 1); total time=   5.3s
[CV] END classifier__C=0.1, classifier__class_weight=balanced, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 1); total time=   4.4s
[CV] END classifier__C=0.1, classifier__class_weight=balanced, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 1); total time=   4.7s




[CV] END classifier__C=0.1, classifier__class_weight=balanced, vectorizer__max_features=None, vectorizer__ngram_range=(1, 1); total time=   4.3s
[CV] END classifier__C=0.1, classifier__class_weight=balanced, vectorizer__max_features=None, vectorizer__ngram_range=(1, 1); total time=   4.3s
[CV] END classifier__C=0.1, classifier__class_weight=balanced, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  16.0s
[CV] END classifier__C=0.1, classifier__class_weight=balanced, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  15.6s
[CV] END classifier__C=0.1, classifier__class_weight=balanced, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  16.4s




[CV] END classifier__C=0.1, classifier__class_weight=balanced, vectorizer__max_features=None, vectorizer__ngram_range=(1, 1); total time=   3.9s




[CV] END classifier__C=0.1, classifier__class_weight=balanced, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 2); total time=  15.9s




[CV] END classifier__C=0.1, classifier__class_weight=balanced, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 2); total time=  17.5s
[CV] END classifier__C=1.0, classifier__class_weight=None, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=   6.0s




[CV] END classifier__C=0.1, classifier__class_weight=balanced, vectorizer__max_features=None, vectorizer__ngram_range=(1, 2); total time=   9.4s
[CV] END classifier__C=1.0, classifier__class_weight=None, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=   5.8s




[CV] END classifier__C=0.1, classifier__class_weight=balanced, vectorizer__max_features=None, vectorizer__ngram_range=(1, 2); total time=   9.4s




[CV] END classifier__C=0.1, classifier__class_weight=balanced, vectorizer__max_features=None, vectorizer__ngram_range=(1, 2); total time=   9.7s
[CV] END classifier__C=1.0, classifier__class_weight=None, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=   5.6s
[CV] END classifier__C=0.1, classifier__class_weight=balanced, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 2); total time=  19.2s




[CV] END classifier__C=1.0, classifier__class_weight=None, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 1); total time=   5.4s
[CV] END classifier__C=1.0, classifier__class_weight=None, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 1); total time=   5.4s
[CV] END classifier__C=1.0, classifier__class_weight=None, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 1); total time=   6.3s




[CV] END classifier__C=1.0, classifier__class_weight=None, vectorizer__max_features=None, vectorizer__ngram_range=(1, 1); total time=   5.8s
[CV] END classifier__C=1.0, classifier__class_weight=None, vectorizer__max_features=None, vectorizer__ngram_range=(1, 1); total time=   5.7s
[CV] END classifier__C=1.0, classifier__class_weight=None, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  16.5s




[CV] END classifier__C=1.0, classifier__class_weight=None, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  16.7s




[CV] END classifier__C=1.0, classifier__class_weight=None, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  19.5s
[CV] END classifier__C=1.0, classifier__class_weight=None, vectorizer__max_features=None, vectorizer__ngram_range=(1, 1); total time=   5.2s
[CV] END classifier__C=1.0, classifier__class_weight=None, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 2); total time=  17.7s




[CV] END classifier__C=1.0, classifier__class_weight=None, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 2); total time=  18.6s




[CV] END classifier__C=1.0, classifier__class_weight=balanced, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=   5.6s




[CV] END classifier__C=1.0, classifier__class_weight=balanced, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=   6.0s
[CV] END classifier__C=1.0, classifier__class_weight=None, vectorizer__max_features=None, vectorizer__ngram_range=(1, 2); total time=  11.1s
[CV] END classifier__C=1.0, classifier__class_weight=None, vectorizer__max_features=None, vectorizer__ngram_range=(1, 2); total time=  11.0s
[CV] END classifier__C=1.0, classifier__class_weight=None, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 2); total time=  19.8s
[CV] END classifier__C=1.0, classifier__class_weight=balanced, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=   5.4s
[CV] END classifier__C=1.0, classifier__class_weight=None, vectorizer__max_features=None, vectorizer__ngram_range=(1, 2); total time=  10.7s




[CV] END classifier__C=1.0, classifier__class_weight=balanced, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 1); total time=   5.2s
[CV] END classifier__C=1.0, classifier__class_weight=balanced, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 1); total time=   5.6s




[CV] END classifier__C=1.0, classifier__class_weight=balanced, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 1); total time=   6.8s




[CV] END classifier__C=1.0, classifier__class_weight=balanced, vectorizer__max_features=None, vectorizer__ngram_range=(1, 1); total time=   7.8s




[CV] END classifier__C=1.0, classifier__class_weight=balanced, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  20.1s
[CV] END classifier__C=1.0, classifier__class_weight=balanced, vectorizer__max_features=None, vectorizer__ngram_range=(1, 1); total time=   7.3s




[CV] END classifier__C=1.0, classifier__class_weight=balanced, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  18.7s
[CV] END classifier__C=1.0, classifier__class_weight=balanced, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  20.5s
[CV] END classifier__C=1.0, classifier__class_weight=balanced, vectorizer__max_features=None, vectorizer__ngram_range=(1, 1); total time=   4.8s
[CV] END classifier__C=1.0, classifier__class_weight=balanced, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 2); total time=  19.2s
[CV] END classifier__C=1.0, classifier__class_weight=balanced, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 2); total time=  19.1s




[CV] END classifier__C=10.0, classifier__class_weight=None, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=   6.6s




[CV] END classifier__C=1.0, classifier__class_weight=balanced, vectorizer__max_features=None, vectorizer__ngram_range=(1, 2); total time=  10.7s
[CV] END classifier__C=1.0, classifier__class_weight=balanced, vectorizer__max_features=None, vectorizer__ngram_range=(1, 2); total time=  10.7s
[CV] END classifier__C=10.0, classifier__class_weight=None, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=   6.4s
[CV] END classifier__C=10.0, classifier__class_weight=None, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=   6.6s
[CV] END classifier__C=1.0, classifier__class_weight=balanced, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 2); total time=  22.6s
[CV] END classifier__C=1.0, classifier__class_weight=balanced, vectorizer__max_features=None, vectorizer__ngram_range=(1, 2); total time=  10.2s




[CV] END classifier__C=10.0, classifier__class_weight=None, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 1); total time=   6.1s
[CV] END classifier__C=10.0, classifier__class_weight=None, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 1); total time=   7.0s
[CV] END classifier__C=10.0, classifier__class_weight=None, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 1); total time=   7.2s




[CV] END classifier__C=10.0, classifier__class_weight=None, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  18.8s




[CV] END classifier__C=10.0, classifier__class_weight=None, vectorizer__max_features=None, vectorizer__ngram_range=(1, 1); total time=   6.1s
[CV] END classifier__C=10.0, classifier__class_weight=None, vectorizer__max_features=None, vectorizer__ngram_range=(1, 1); total time=   5.8s
[CV] END classifier__C=10.0, classifier__class_weight=None, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  16.7s




[CV] END classifier__C=10.0, classifier__class_weight=None, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  18.4s
[CV] END classifier__C=10.0, classifier__class_weight=None, vectorizer__max_features=None, vectorizer__ngram_range=(1, 1); total time=   6.3s
[CV] END classifier__C=10.0, classifier__class_weight=None, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 2); total time=  18.6s




[CV] END classifier__C=10.0, classifier__class_weight=None, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 2); total time=  20.2s




[CV] END classifier__C=10.0, classifier__class_weight=balanced, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=   6.9s
[CV] END classifier__C=10.0, classifier__class_weight=None, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 2); total time=  18.6s
[CV] END classifier__C=10.0, classifier__class_weight=balanced, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=   6.1s
[CV] END classifier__C=10.0, classifier__class_weight=None, vectorizer__max_features=None, vectorizer__ngram_range=(1, 2); total time=  12.1s
[CV] END classifier__C=10.0, classifier__class_weight=None, vectorizer__max_features=None, vectorizer__ngram_range=(1, 2); total time=  11.9s
[CV] END classifier__C=10.0, classifier__class_weight=None, vectorizer__max_features=None, vectorizer__ngram_range=(1, 2); total time=  11.5s
[CV] END classifier__C=10.0, classifier__class_weight=balanced, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total tim



[CV] END classifier__C=10.0, classifier__class_weight=balanced, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 1); total time=   5.2s
[CV] END classifier__C=10.0, classifier__class_weight=balanced, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 1); total time=   5.4s




[CV] END classifier__C=10.0, classifier__class_weight=balanced, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 1); total time=   6.6s




[CV] END classifier__C=10.0, classifier__class_weight=balanced, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  15.6s
[CV] END classifier__C=10.0, classifier__class_weight=balanced, vectorizer__max_features=None, vectorizer__ngram_range=(1, 1); total time=   6.1s




[CV] END classifier__C=10.0, classifier__class_weight=balanced, vectorizer__max_features=None, vectorizer__ngram_range=(1, 1); total time=   6.2s




[CV] END classifier__C=10.0, classifier__class_weight=balanced, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  18.3s
[CV] END classifier__C=10.0, classifier__class_weight=balanced, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  18.4s
[CV] END classifier__C=10.0, classifier__class_weight=balanced, vectorizer__max_features=None, vectorizer__ngram_range=(1, 1); total time=   5.8s
[CV] END classifier__C=10.0, classifier__class_weight=balanced, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 2); total time=  17.1s




[CV] END classifier__C=10.0, classifier__class_weight=balanced, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 2); total time=  19.8s
[CV] END classifier__C=10.0, classifier__class_weight=balanced, vectorizer__max_features=20000, vectorizer__ngram_range=(1, 2); total time=  16.0s




[CV] END classifier__C=10.0, classifier__class_weight=balanced, vectorizer__max_features=None, vectorizer__ngram_range=(1, 2); total time=   9.3s
[CV] END classifier__C=10.0, classifier__class_weight=balanced, vectorizer__max_features=None, vectorizer__ngram_range=(1, 2); total time=   9.0s
[CV] END classifier__C=10.0, classifier__class_weight=balanced, vectorizer__max_features=None, vectorizer__ngram_range=(1, 2); total time=   8.3s





Лучшие параметры для TfidfVectorizer:
{'classifier__C': 10.0, 'classifier__class_weight': 'balanced', 'vectorizer__max_features': 10000, 'vectorizer__ngram_range': (1, 1)}

Лучший f1-macro на валидации: 0.8117

Результаты TfidfVectorizer на тестовой выборке:
Accuracy: 0.8250

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.85      0.83       655
           1       0.83      0.83      0.83       397
           2       0.71      0.86      0.78       614
           3       0.78      0.75      0.77       767
           4       0.90      0.88      0.89       650
           5       0.84      0.83      0.83      1380
           6       0.90      0.89      0.89       738
           7       0.70      0.75      0.73       243
           8       0.82      0.68      0.74      1428
           9       0.64      0.85      0.73       539
          10       0.94      0.95      0.94      1029
          11       0.90      0.83      0.87       

In [53]:
tfidf_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(max_df=0.9, min_df=0.001, norm='l1', ngram_range=(1, 1), max_features=10000)),
    ('classifier', LogisticRegression(random_state=1, max_iter=2000, multi_class='ovr', C=10.0, class_weight='balanced'))
]).fit(X_train, y_train)


preds = tfidf_pipeline.predict(X_test)
print(classification_report(y_test, preds, zero_division=0))



              precision    recall  f1-score   support

           0       0.82      0.85      0.83       655
           1       0.83      0.83      0.83       397
           2       0.71      0.86      0.78       614
           3       0.78      0.75      0.77       767
           4       0.90      0.88      0.89       650
           5       0.84      0.83      0.83      1380
           6       0.90      0.89      0.89       738
           7       0.70      0.75      0.73       243
           8       0.82      0.68      0.74      1428
           9       0.64      0.85      0.73       539
          10       0.94      0.95      0.94      1029
          11       0.90      0.83      0.87       406
          12       0.84      0.85      0.84      1021

    accuracy                           0.82      9867
   macro avg       0.82      0.83      0.82      9867
weighted avg       0.83      0.82      0.83      9867

