In [3]:
!pip install pandas scikit-learn

Collecting pandas
  Downloading pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl.metadata (89 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (13 kB)
Collecting numpy>=1.23.2 (from pandas)
  Downloading numpy-2.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (62 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading pandas-2.2.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl (15.6 MB)
[2K   [90m━━━━━━━━

In [4]:
import pandas as pd

In [5]:
# Show the summarized, human-readable on-disk size of every entry under data/.
!du -hs data/*

77M	data/test_news.csv
177M	data/train_news.csv
252M	data/train_raw


In [28]:
# Load the labelled training set from CSV and display its (rows, columns) shape.
train = pd.read_csv(filepath_or_buffer='data/train_news.csv')
train.shape

((77500, 3), (26275, 1))

# Exploratory data analysis (EDA)

In [14]:
# Number of articles for every (numeric label, topic name) combination.
label_columns = ['target', 'news_topic']
train.loc[:, label_columns].value_counts()

target  news_topic        
1       Общество/Россия       10000
3       Бывший СССР           10000
5       Наука и техника       10000
8       Спорт                 10000
37      Силовые структуры     10000
48      Туризм/Путешествия    10000
87      Забота о себе         10000
4       Экономика              7500
Name: count, dtype: int64

In [34]:
# Count articles whose lower-cased text contains the substring 'фото:' anywhere.
# Vectorised string methods replace the row-wise apply; regex=False keeps the
# match a plain substring test, exactly like the `in` operator.
train.text.str.lower().str.contains('фото:', regex=False).value_counts()

True     72314
False     5186
Name: count, dtype: int64

In [35]:
# Count articles whose lower-cased text begins with 'фото:'.
# Vectorised string methods replace the row-wise apply over the whole frame.
train.text.str.lower().str.startswith('фото:').value_counts()

True     67049
False    10451
Name: count, dtype: int64

In [53]:
# Text length (in characters) per (topic, label) group: mean, max and min,
# rounded and ordered from the longest average article to the shortest.
(
    train.text.str.len()
    .groupby([train.news_topic, train.target])
    .agg(['mean', 'max', 'min'])
    .round()
    .sort_values(by='mean', ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,max,min
news_topic,target,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Забота о себе,87,1557.0,8916,621
Экономика,4,1446.0,8943,267
Наука и техника,5,1329.0,7814,361
Общество/Россия,1,1247.0,11641,292
Туризм/Путешествия,48,1133.0,8639,329
Силовые структуры,37,1025.0,6438,289
Бывший СССР,3,988.0,7922,267
Спорт,8,964.0,5550,447


In [54]:
# Missing-value count for each column of the training frame.
train.isna().sum(axis=0)

target        0
news_topic    0
text          0
dtype: int64

# Classification

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
# Random seed for reproducibility; passed as random_state to train_test_split.
rstate = 0

In [18]:
# Features are the raw article texts; the target is the numeric topic label.
X = train['text']
y = train['target']
(X.shape, y.shape)

((77500,), (77500,))

In [19]:
# Hold out 20% of the articles for evaluation; stratify on y so both parts keep
# the same class proportions, and seed the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=rstate,
)
(X_train.shape, X_test.shape)

((62000,), (15500,))

# Bag of words

In [20]:
%%time
bow_proc_pipe = Pipeline(
    steps=[
        ('bow', CountVectorizer()),
        ('scaler', MaxAbsScaler()),
    ]
)
bow_proc_pipe.fit(X_train)
bow_x_train, bow_x_test = bow_proc_pipe.transform(X_train), bow_proc_pipe.transform(X_test)

CPU times: user 14.2 s, sys: 33.2 ms, total: 14.3 s
Wall time: 14.3 s


In [25]:
%%time
log_reg_clf = LogisticRegression(max_iter=200, random_state=42)
log_reg_clf.fit(bow_x_train, y_train)
log_reg_pred_test = log_reg_clf.predict(bow_x_test)
log_reg_pred_train = log_reg_clf.predict(bow_x_train)
print('Test\n', classification_report(y_test, log_reg_pred_test))
print('Train\n', classification_report(y_train, log_reg_pred_train))

Test
               precision    recall  f1-score   support

           1       0.90      0.90      0.90      2000
           3       0.94      0.95      0.95      2000
           4       0.95      0.95      0.95      1500
           5       0.98      0.97      0.98      2000
           8       1.00      0.99      0.99      2000
          37       0.97      0.97      0.97      2000
          48       0.98      0.98      0.98      2000
          87       0.99      0.99      0.99      2000

    accuracy                           0.96     15500
   macro avg       0.96      0.96      0.96     15500
weighted avg       0.96      0.96      0.96     15500

Train
               precision    recall  f1-score   support

           1       1.00      1.00      1.00      8000
           3       1.00      1.00      1.00      8000
           4       1.00      1.00      1.00      6000
           5       1.00      1.00      1.00      8000
           8       1.00      1.00      1.00      8000
          3

## Score the Kaggle test dataset

In [40]:
%%time
log_reg_bow_test = pd.read_csv('data/test_news.csv')
log_reg_bow_test_proc = bow_proc_pipe.transform(log_reg_bow_test.content)
test_pred = log_reg_clf.predict(log_reg_bow_test_proc)
log_reg_bow_test['pred'] = test_pred
log_reg_bow_test

CPU times: user 4.04 s, sys: 64.9 ms, total: 4.11 s
Wall time: 4.14 s


Unnamed: 0,content,pred
0,Фото: «Фонтанка.ру»ПоделитьсяЭкс-министру обор...,48
1,В начале февраля 2023 года в Пушкинском районе...,4
2,Фото: Andy Bao / Getty Images Анастасия Борисо...,8
3,"Если вы хотели, но так и не съездили на море л...",48
4,Сергей Пиняев Фото: Алексей Филиппов / РИА Нов...,8
...,...,...
26270,Фото: РИА Новости Алевтина Запольская Главное ...,3
26271,Вадим Гутцайт Фото: Sergei CHUZAVKOV / Europea...,3
26272,Фото: Олег Харсеев / Коммерсантъ Александр Кур...,37
26273,Владимир Зеленский Фото: Yves Herman / Reuters...,3


In [44]:
# Translate the label ids predicted by the model into the topic ids that are
# written to the results file (presumably the competition's numbering — confirm).
tags_map = {1 : 0, 3 : 3, 4 : 1, 5 : 8, 8 : 4, 37 : 2, 48 : 7, 87 : 5}
# Map from the raw model output (test_pred) rather than from the current `pred`
# column: remapping `pred` in place made this cell non-idempotent, because a
# second run would feed already-translated ids through the map again.
log_reg_bow_test['pred'] = pd.Series(test_pred, index=log_reg_bow_test.index).map(tags_map)
# Series.map yields NaN for keys missing from the dict; fail loudly if that happens.
assert log_reg_bow_test['pred'].notna().all(), 'prediction contains a label missing from tags_map'
log_reg_bow_test.pred.value_counts().sort_index()

pred
0    7424
1    5176
2    2715
3    3602
4    2736
5     906
7    1777
8    1939
Name: count, dtype: int64

In [48]:
log_reg_bow_sub = log_reg_bow_test.reset_index().rename(columns={'pred': 'topic'})[['topic', 'index']]
log_reg_bow_sub.to_csv('data/results/log_reg_bow.csv')
!du -hs 'data/results/log_reg_bow.csv'

340K	data/results/log_reg_bow.csv


In [49]:
# Sanity check: (rows, columns) of the submission frame.
log_reg_bow_sub.shape

(26275, 2)

In [52]:
t = pd.read_csv('data/base_submission_news.csv')
t['topic'] = log_reg_bow_sub.topic
t.to_csv('data/results/log_reg_bow.csv')
!du -hs 'data/results/log_reg_bow.csv'

388K	data/results/log_reg_bow.csv
