In [1]:
!wget https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz

--2025-03-13 20:13:46--  https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/download/v1.0/lenta-ru-news.csv.gz
Resolving github.com (github.com)... 140.82.121.3
connected. to github.com (github.com)|140.82.121.3|:443... 
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/87156914/0b363e00-0126-11e9-9e3c-e8c235463bd6?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250313%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250313T171346Z&X-Amz-Expires=300&X-Amz-Signature=72257e24c0e147392a9532f69513090b73963a964f572c40fc3f1917cc6e2b51&X-Amz-SignedHeaders=host&response-content-disposition=attachment%3B%20filename%3Dlenta-ru-news.csv.gz&response-content-type=application%2Foctet-stream [following]
--2025-03-13 20:13:46--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/87156914/0b363e00-0126-11e9-9e3c-e8c235463bd6?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-A

In [2]:
!pip install pandas nltk scikit-learn corus legacy-cgi pymorphy3



In [3]:
import re

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split

from corus import load_lenta

import warnings
warnings.filterwarnings('ignore')

DATA_PATH = 'lenta-ru-news.csv.gz'

SIZE_LIMIT = 150_000
TOPIC_SIZE_THRESHOLD = 10

CORPUS = [
    {'text': row.title + '. ' + row.text, 'topic': row.topic}
    for row in load_lenta(DATA_PATH)
]

df = pd.DataFrame(CORPUS)
df = df.sample(n=SIZE_LIMIT, random_state=123).reset_index(drop=True)

assert len(df) == SIZE_LIMIT

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

STOP_WORDS = set(stopwords.words('russian'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /Users/lulchak-
[nltk_data]     pavel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/lulchak-
[nltk_data]     pavel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/lulchak-
[nltk_data]     pavel/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
import pymorphy3
from tqdm import tqdm

morph = pymorphy3.MorphAnalyzer()

pattern = re.compile(r'\W')

lemma_cache = {}

def preprocess_text(text: str):
    text = text.lower()
    text = pattern.sub(' ', text)
    
    words = text.split()
    lemmatized_words = []
    for word in words:
        if word not in STOP_WORDS:
            if word not in lemma_cache:
                lemma_cache[word] = morph.parse(word)[0].normal_form
            lemmatized_words.append(lemma_cache[word])
    
    return ' '.join(lemmatized_words)

tqdm.pandas(
    desc='Preprocessing',
    bar_format='{l_bar}{bar:15}{r_bar}',
    ncols=80,
)

df['text'] = df['text'].progress_apply(preprocess_text)

topic_counts = df['topic'].value_counts()
other_topics = topic_counts[topic_counts < TOPIC_SIZE_THRESHOLD].index

df['topic'] = df['topic'].apply(lambda x: 'Other' if x in other_topics else x)
df['topic'] = df['topic'].astype('category').cat.codes

df.head(3)

Preprocessing: 100%|███████████████| 150000/150000 [00:31<00:00, 4745.59it/s]


Unnamed: 0,text,topic
0,туляк дать полтора год тюрьма экстремизм вконт...,7
1,microsoft google готовый уладить дело кайф аме...,7
2,киев митинговать ворваться здание минюст понед...,4


In [5]:
topic_counts[topic_counts < TOPIC_SIZE_THRESHOLD]

topic
Библиотека    9
ЧМ-2014       1
Оружие        1
Name: count, dtype: int64

**Пайплайн предобработки**:

1. Приводим текст к единому регистру + удаляем небуквенные символы
2. Лемматизируем исходный текст, так как хотим учитывать контекст при обработке
3. Топики привел целочисленным значениям и объеденил редкие топики в единый топик Other (всего 11 записей, кажется ок)

In [6]:
X = df['text']
y = df['topic']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

In [7]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report, f1_score

dummy_clf = DummyClassifier(strategy='stratified', random_state=42)
dummy_clf.fit(X_train, y_train)

y_pred = dummy_clf.predict(X_test)

print(classification_report(y_test, y_pred))
print(f'F1-score: {f1_score(y_test, y_pred, average="weighted"):.4f}')

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       0.00      0.00      0.00        46
           2       0.00      0.00      0.00         2
           3       0.01      0.01      0.01       298
           4       0.07      0.07      0.07      2153
           5       0.02      0.02      0.02       901
           6       0.03      0.03      0.03      1109
           7       0.06      0.06      0.06      1803
           8       0.00      0.00      0.00        27
           9       0.00      0.00      0.00        15
          10       0.08      0.08      0.08      2180
          11       0.00      0.00      0.00         6
          12       0.18      0.18      0.18      5587
          13       0.08      0.08      0.08      2147
          14       0.00      0.00      0.00       259
          15       0.21      0.21      0.21      6531
          16       0.02      0.02      0.02       784
          17       0.08    

Далеко не то качество, которое хочется получить, попробуем обучить лог. регрессию на векторизованных текстах

In [8]:
from typing import Union, List

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def fit_predict_logreg(
    vectorizer: Union[CountVectorizer, TfidfVectorizer],
    X_train: pd.DataFrame,
    y_train: pd.DataFrame,
    X_test: pd.DataFrame,
) -> List[int]:
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    model = LogisticRegression(max_iter=5_000, random_state=42)
    model.fit(X_train, y_train)

    return model.predict(X_test)
   
f1_count = f1_score(y_test, fit_predict_logreg(CountVectorizer(), X_train.copy(), y_train.copy(), X_test.copy()), average='weighted')
print(f'F1-score with CountVectorizer: {f1_count:.4f}')

f1_tfidf = f1_score(y_test, fit_predict_logreg(TfidfVectorizer(), X_train.copy(), y_train.copy(), X_test.copy()), average='weighted')
print(f'F1-score with TfidfVectorizer: {f1_tfidf:.4f}')

F1-score with CountVectorizer: 0.8059
F1-score with TfidfVectorizer: 0.8128


Взвешенный F1 скор выглядит приемлемо, попробуем подобрать параметры на кросс-валидации для TF-IDF эмбеддера. Попробуем потюнить TF-IDF: скор выше + для задачи важно понимать редко встречающиеся аббривеатуры и другие специфичные для новостей тексты

In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer

pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=10_000)),
    ('classifier', LogisticRegression(max_iter=5_000, random_state=42))
])

parameters = {
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'vectorizer__min_df': [1, 5],
    'classifier__C': [0.1, 0.5, 1],
    'classifier__solver': ['saga']
}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=3, verbose=10, scoring=make_scorer(f1_score, average='weighted'))
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best parameters found: {best_params}")

y_pred_best = grid_search.predict(X_test)
f1_best = f1_score(y_test, y_pred_best, average='weighted')
print(f"Optimized F1-score: {f1_best:.4f}")

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV 1/3; 1/12] START classifier__C=0.1, classifier__solver=saga, vectorizer__min_df=1, vectorizer__ngram_range=(1, 1)
[CV 1/3; 1/12] END classifier__C=0.1, classifier__solver=saga, vectorizer__min_df=1, vectorizer__ngram_range=(1, 1);, score=0.730 total time=  14.6s
[CV 1/3; 5/12] START classifier__C=0.5, classifier__solver=saga, vectorizer__min_df=1, vectorizer__ngram_range=(1, 1)
[CV 1/3; 5/12] END classifier__C=0.5, classifier__solver=saga, vectorizer__min_df=1, vectorizer__ngram_range=(1, 1);, score=0.789 total time=  16.9s
[CV 1/3; 7/12] START classifier__C=0.5, classifier__solver=saga, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1)
[CV 1/3; 7/12] END classifier__C=0.5, classifier__solver=saga, vectorizer__min_df=5, vectorizer__ngram_range=(1, 1);, score=0.789 total time=  16.6s
[CV 1/3; 10/12] START classifier__C=1, classifier__solver=saga, vectorizer__min_df=1, vectorizer__ngram_range=(1, 2)
[CV 1/3; 10/12] END c

In [10]:
from sklearn.metrics import classification_report, f1_score

y_pred_val = grid_search.predict(X_val)

report = classification_report(y_val, y_pred_val)

f1_test = f1_score(y_val, y_pred_val, average='weighted')

print(report)
print(f"Optimized Weighted F1-score: {f1_test:.4f}")

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       1.00      0.06      0.12        47
           2       0.00      0.00      0.00         2
           3       0.71      0.26      0.38       298
           4       0.83      0.82      0.82      2153
           5       0.83      0.79      0.81       901
           6       0.68      0.57      0.62      1110
           7       0.77      0.71      0.74      1803
           8       0.67      0.07      0.13        27
           9       0.00      0.00      0.00        14
          10       0.87      0.88      0.88      2179
          11       0.00      0.00      0.00         5
          12       0.79      0.86      0.82      5587
          13       0.83      0.86      0.85      2147
          14       0.85      0.56      0.67       260
          15       0.79      0.84      0.81      6531
          16       0.77      0.46      0.58       783
          17       0.96    

Видно, что алгоритм хорошо выучил содержательные категории, но на малых данных видно проблемы в качестве, кажется, обучение на больших данных даст лучший результат