# Описание задачи

Сообщества ВКонтакте могут принадлежать одной из нескольких заранее заданных категорий. Среди спортивных сообществ есть достаточно сильное разделение по тематикам. Одни и те же авторы могут писать только об одном виде спорта или сразу о большом количестве видов спорта. По заданному набору постов необходимо определить тематику — какой именно вид спорта обсуждается в выбранном сообществе.

Список доступных категорий:
1. athletics
2. autosport
3. basketball
4. boardgames
5. esport
6. extreme
7. football
8. hockey
9. martial_arts
10. motosport
11. tennis
12. volleyball
13. winter_sport

Нестандартная метрика:
* За каждый правильный ответ вы получите +1.
* За каждый неправильный ответ вы получите −1.

Можно отправлять решения, в которых категория отмечена не для каждого сообщества.

https://cups.online/ru/tasks/1417

 # Содержание:
 * [Импорт библиотек](#2-title)
 * [Загрузка данных](#3-title)
 * [Обработка данных](#4-title)
 * [Разделение на train и test](#5-title)
 * [Обучение CatBoost](#6-title) 
 * [Предсказание](#7-title)

In [1]:
# Set to True when the notebook runs in Google Colab: this enables the
# Google Drive mount and switches the data path to the Drive folder.
COLAB = False

# Импорт библиотек
<a id='2-title'></a>

In [2]:
import pandas as pd
import numpy as np
# models
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import log_loss
# timing
from datetime import datetime
# text processing
import re
import nltk
from pymystem3 import Mystem
# NOTE(review): `m` is not referenced by any later cell shown in this notebook.
m = Mystem()
from nltk.corpus import stopwords
# Rebinds the name `stopwords` from the imported corpus object to the list of
# Russian stop words; the corpus object is no longer reachable under this name.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand — confirm for a fresh environment.
stopwords = stopwords.words("russian")
from pymorphy2 import MorphAnalyzer
# Morphological analyzer used by lemmatize() to get the normal form of a word.
morph = MorphAnalyzer()
# display settings
pd.options.display.max_rows = 40
import warnings
# Suppresses every warning for the rest of the session.
warnings.filterwarnings('ignore')
# when running in Colab, mount Google Drive
if COLAB:
    from google.colab import drive
    drive.mount('/content/drive')

In [3]:
# Record the notebook start time so the total run time can be reported at the end.
time_start_all = datetime.now()
print(f'Время запуска {time_start_all}')
# Data directory. File names are appended with plain `+`, so the value must
# end with '/': without it the Colab branch resolved to ".../inputtrain.csv".
if COLAB:
    path = '/content/drive/MyDrive/01_competions/2022_12_vk_cup_news_classification/input/'
else:
    path = 'input/'

Время запуска 2023-02-20 01:54:24.145616


In [4]:
# Target categories, in the order used for the one-hot label columns.
sports = [
    'athletics',
    'autosport',
    'basketball',
    'boardgames',
    'esport',
    'extreme',
    'football',
    'hockey',
    'martial_arts',
    'motosport',
    'tennis',
    'volleyball',
    'winter_sport',
]

# Загрузка данных
<a id='3-title'></a>

In [5]:
%%time
# Read the three competition CSV files from the data directory.
train, test, sample_submission = (
    pd.read_csv(path + file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

CPU times: user 448 ms, sys: 29.5 ms, total: 478 ms
Wall time: 487 ms


In [6]:
# Print the column summary of train (dtypes, non-null counts, memory usage).
train.info()
# Show the first rows of train as the cell output.
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38740 entries, 0 to 38739
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   oid       38740 non-null  int64 
 1   category  38740 non-null  object
 2   text      38740 non-null  object
dtypes: int64(1), object(2)
memory usage: 908.1+ KB


Unnamed: 0,oid,category,text
0,365271984,winter_sport,Волшебные фото Виктория Поплавская ЕвгенияМедв...
1,503385563,extreme,Возвращение в подземелье Треша 33 Эйфория тупо...
2,146016084,football,Лучшие чешские вратари – Доминик Доминатор Гаш...
3,933865449,boardgames,Rtokenoid Warhammer40k валрак решил нас подкор...
4,713550145,hockey,Шестеркин затаскивает Рейнджерс в финал Восточ...


In [7]:
# Count how many training posts fall into each category.
train['category'].value_counts()

autosport       3160
extreme         3110
martial_arts    3050
motosport       3030
boardgames      3020
tennis          3000
esport          2990
athletics       2970
hockey          2950
volleyball      2950
football        2860
basketball      2850
winter_sport    2800
Name: category, dtype: int64

In [8]:
# Print the column summary of test (dtypes, non-null counts, memory usage).
test.info()
# Show the first rows of test as the cell output.
test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26260 entries, 0 to 26259
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   oid     26260 non-null  int64 
 1   text    26260 non-null  object
dtypes: int64(1), object(1)
memory usage: 410.4+ KB


Unnamed: 0,oid,text
0,749208109,СПОЧНО СООБЩЕСТВО ПРОДАЕТСЯ ЗА 1300Р ЗА ПОКУПК...
1,452466036,Естественное восстановление после тяжелой трен...
2,161038103,Тема нарядов продолжается Одна из британских ж...
3,663621910,Привет Избранный. Ты спрашиваешь себя ЧТО здес...
4,566255305,КОРОЛЬ ПЯТИСОТНИКОВ В ДЕЛЕ Андрей Рублев успеш...


In [9]:
# Remember the (rows, columns) shape of both frames as loaded and report it.
SIZE_TRAIN, SIZE_TEST = train.shape, test.shape
print(f'Строк в train: {SIZE_TRAIN[0]}, столбцов: {SIZE_TRAIN[1]}. Строк в test: {SIZE_TEST[0]}, столбцов: {SIZE_TEST[1]}')

Строк в train: 38740, столбцов: 3. Строк в test: 26260, столбцов: 2


# Обработка данных
<a id='4-title'></a>

In [10]:
# Lowercase the post texts and collapse every run of whitespace (newlines,
# tabs, repeated spaces) into a single space.
# A single pass of replacing two spaces with one leaves runs of three or more
# spaces only partly collapsed, so a regex over whitespace runs is used.
# `regex=True` is passed explicitly because the default of `regex` in
# `Series.str.replace` differs between pandas versions.
for frame in (train, test):
    frame['text'] = (
        frame['text']
        .str.lower()
        .str.replace(r'\s+', ' ', regex=True)
    )

In [11]:
# Remove punctuation characters from the post texts.
# Inside a character class `|` is a literal pipe, not a separator, so the
# characters are listed directly. The previous pattern wrote `\|`, which
# escapes the pipe and therefore never matched the backslash it was meant to
# remove; the backslash is now listed explicitly as `\\`. The pipe stays in
# the class because the previous pattern removed it as well.
punct_re = re.compile(r'[.,?!\'"#)(\\/–|]')
for frame in (train, test):
    frame['text'] = frame['text'].apply(lambda x: punct_re.sub('', str(x)))

In [12]:
# token -> normal form. The same words recur across posts, so each distinct
# token is sent through the morphological analyzer only once.
_lemma_cache = {}


def lemmatize(doc):
    '''
    Return `doc` with stop words removed and every remaining word lemmatized.

    `doc` is converted to `str` and split on whitespace. Tokens found in
    `stopwords` are dropped; every other token is replaced by the first
    normal form reported by `morph`. The lemmas are joined back with single
    spaces, so an empty or all-stop-word input yields an empty string.
    '''
    lemmas = []
    # split() with no arguments already strips surrounding whitespace from
    # each token, so no extra strip() is needed.
    for token in str(doc).split():
        if token in stopwords:
            continue
        lemma = _lemma_cache.get(token)
        if lemma is None:
            lemma = _lemma_cache[token] = morph.normal_forms(token)[0]
        lemmas.append(lemma)
    return ' '.join(lemmas)

In [13]:
%%time
# Lemmatize the post texts of both frames (train first, then test).
for frame in (train, test):
    frame['text'] = frame['text'].apply(lemmatize)

CPU times: user 2min 53s, sys: 313 ms, total: 2min 54s
Wall time: 2min 54s


In [14]:
# Words that mark a post for removal: any post whose text contains one of
# them is dropped from both train and test in the next cell.
del_list = ['реклама']

In [15]:
# Drop every post whose text contains any of the words in del_list.
# The words are regex-escaped so they always match literally. An empty list
# is skipped: joining nothing gives the pattern '', which matches every row
# and would silently drop all of the data.
if del_list:
    del_pattern = '|'.join(re.escape(word) for word in del_list)
    test = test[~test['text'].str.contains(del_pattern, regex=True)].reset_index(drop=True)
    train = train[~train['text'].str.contains(del_pattern, regex=True)].reset_index(drop=True)

In [16]:
# Add one indicator column per sport to both frames.
for sport in sports:
    # train: 1 where the post's category is this sport, 0 otherwise
    is_this_sport = train['category'] == sport
    train[sport] = np.where(is_this_sport, 1, 0)
    # test: labels are unknown, so the column is filled with 0
    test[sport] = 0

# Разделение на train и test
<a id='5-title'></a>

In [17]:
# Hold out 20% of the posts for validation. Features are the single 'text'
# column; targets are the indicator columns that follow the first three
# columns of train. The split is seeded with the same value as the model so
# that reruns produce the same partition.
X_train, X_val, y_train, y_val = train_test_split(
    train[['text']],
    train[train.columns[3:]],
    test_size=0.2,
    random_state=42,
)

# Обучение модели CatBoost
<a id='6-title'></a>

In [18]:
# Remember when training starts so its duration can be reported.
start_time = datetime.now()
print(f'Время запуска обучения {start_time}')

# One CatBoost text classifier per indicator column, wrapped in one-vs-rest.
base_estimator = CatBoostClassifier(
    iterations=100,
    text_features=['text'],
    depth=8,
    random_state=42,
    verbose=False,
    allow_writing_files=False,
)
model = OneVsRestClassifier(estimator=base_estimator)

# X_train / y_train are rebound here to the complete training frame, so the
# model is fitted on all rows of train, not only on the earlier split.
X_train = train[['text']]
y_train = train[train.columns[3:]]
model.fit(X_train, y_train)

# Report how long fitting took.
time_fit_CBR = datetime.now() - start_time
print(f'Время обучения {time_fit_CBR}')

Время запуска обучения 2023-02-20 01:57:19.860054
Время обучения 0:04:53.809151


# Предсказание
<a id='7-title'></a>

In [19]:
# Column layout of the prediction table: the id followed by one column per
# indicator column of train.
result_columns = ['oid', *train.columns[3:]]
# Per-post class probabilities for the test texts.
test_proba = pd.DataFrame(model.predict_proba(test[['text']]))
# Put the oid next to the probabilities (rows are matched by index).
result = pd.concat([test['oid'], test_proba], axis=1)
result.columns = result_columns

In [20]:
# Preview the per-post class probabilities.
result.head()

Unnamed: 0,oid,athletics,autosport,basketball,boardgames,esport,extreme,football,hockey,martial_arts,motosport,tennis,volleyball,winter_sport
0,749208109,0.0086,0.242814,0.005969,0.005552,0.790736,0.204166,0.034019,0.01529,0.057105,0.009482,0.001583,0.031137,0.003776
1,452466036,0.422075,0.000825,0.002184,0.00044,0.003239,0.005186,0.003268,0.003405,0.116597,0.001037,0.001328,0.003021,0.002504
2,161038103,0.001535,0.002371,0.003055,0.000541,0.005797,0.008057,0.003839,0.00183,0.003743,0.001787,0.968961,0.002967,0.00276
3,663621910,0.003773,0.008893,0.002611,0.477045,0.636929,0.078541,0.004733,0.002228,0.002096,0.003355,0.002223,0.009809,0.005544
4,566255305,0.001709,0.001292,0.002727,0.000349,0.002287,0.00547,0.005013,0.003215,0.002721,0.001334,0.908595,0.006954,0.002158


In [21]:
# группируем по oid выбирая максмальную вероятность в группировке
submission = result.groupby('oid')[result_columns[1:]].max().rename_axis('oid').reset_index()
submission.head()

Unnamed: 0,oid,athletics,autosport,basketball,boardgames,esport,extreme,football,hockey,martial_arts,motosport,tennis,volleyball,winter_sport
0,1622114,0.965117,0.089316,0.042452,0.036038,0.118628,0.15336,0.072242,0.064647,0.123255,0.168502,0.011452,0.043777,0.050593
1,1663817,0.123397,0.996154,0.036676,0.070922,0.084052,0.146934,0.054854,0.218151,0.170665,0.190265,0.011452,0.01955,0.403692
2,3174332,0.00445,0.025565,0.984025,0.022653,0.044477,0.057994,0.070334,0.345951,0.05816,0.021503,0.010311,0.059267,0.015027
3,3469228,0.049705,0.056619,0.054402,0.833325,0.585422,0.820507,0.277808,0.052437,0.29579,0.02452,0.011452,0.020346,0.265751
4,3905302,0.117803,0.038636,0.034088,0.987753,0.059264,0.095679,0.157404,0.03479,0.167206,0.065435,0.011286,0.041205,0.503817


In [22]:
# Print the column summary of the per-oid table.
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2626 entries, 0 to 2625
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   oid           2626 non-null   int64  
 1   athletics     2626 non-null   float64
 2   autosport     2626 non-null   float64
 3   basketball    2626 non-null   float64
 4   boardgames    2626 non-null   float64
 5   esport        2626 non-null   float64
 6   extreme       2626 non-null   float64
 7   football      2626 non-null   float64
 8   hockey        2626 non-null   float64
 9   martial_arts  2626 non-null   float64
 10  motosport     2626 non-null   float64
 11  tennis        2626 non-null   float64
 12  volleyball    2626 non-null   float64
 13  winter_sport  2626 non-null   float64
dtypes: float64(13), int64(1)
memory usage: 287.3 KB


In [23]:
# The predicted category of an oid is the class column holding its largest
# probability.
proba_columns = result_columns[1:]
submission['category'] = submission[proba_columns].idxmax(axis=1)

In [24]:
# Preview the table with the added `category` column.
submission.head()

Unnamed: 0,oid,athletics,autosport,basketball,boardgames,esport,extreme,football,hockey,martial_arts,motosport,tennis,volleyball,winter_sport,category
0,1622114,0.965117,0.089316,0.042452,0.036038,0.118628,0.15336,0.072242,0.064647,0.123255,0.168502,0.011452,0.043777,0.050593,athletics
1,1663817,0.123397,0.996154,0.036676,0.070922,0.084052,0.146934,0.054854,0.218151,0.170665,0.190265,0.011452,0.01955,0.403692,autosport
2,3174332,0.00445,0.025565,0.984025,0.022653,0.044477,0.057994,0.070334,0.345951,0.05816,0.021503,0.010311,0.059267,0.015027,basketball
3,3469228,0.049705,0.056619,0.054402,0.833325,0.585422,0.820507,0.277808,0.052437,0.29579,0.02452,0.011452,0.020346,0.265751,boardgames
4,3905302,0.117803,0.038636,0.034088,0.987753,0.059264,0.095679,0.157404,0.03479,0.167206,0.065435,0.011286,0.041205,0.503817,boardgames


In [25]:
# Count how many oids were assigned to each predicted category.
submission['category'].value_counts()

winter_sport    217
esport          215
basketball      211
volleyball      209
boardgames      204
motosport       204
hockey          203
tennis          203
athletics       202
football        201
martial_arts    200
autosport       182
extreme         175
Name: category, dtype: int64

In [26]:
from pathlib import Path

# Save the result: keep only `oid` and the predicted `category`.
submission = submission[['oid', 'category']]
output_path = Path('sub/output.csv')
# to_csv does not create missing directories, so make sure 'sub/' exists.
output_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(output_path, index=False)

# Время выполнения

In [27]:
# Time elapsed since the start time recorded in time_start_all.
time_finish_all = datetime.now() - time_start_all
# Report the total run time of the notebook (not just the training time).
print(f'Время выполнения {time_finish_all}')

Время выполнения 0:07:50.783163
