In [1]:
import pandas as pd
import torch
import numpy as np
from tqdm.notebook import tqdm
from torch import nn
from transformers import AutoModel, AutoTokenizer

# If there's a GPU available...
if torch.cuda.is_available():

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

  from pandas.core import (


There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1650 Ti


In [2]:
df_train = pd.read_csv('data/train_row.csv')
df_test = pd.read_csv('data/test_row.csv')

In [3]:
class TextEmbeddings:
    def __init__(self, add_cls_embeddings=True, add_mean_embeddings=False):
        self.add_mean_embeddings = add_mean_embeddings
        self.add_cls_embeddings = add_cls_embeddings
        if add_cls_embeddings is False and add_mean_embeddings is False:
            raise 'Error: you should select at least one type of embeddings to be computed'

    def mean_pooling(self, hidden_state, attention_mask):
        """
        Возвращает усредненный с учетом attention_mask hidden_state.
        """
        token_embeddings = hidden_state.detach().cpu()
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        return sum_embeddings / attention_mask.sum()

    def extract_embeddings(self, texts, model_name, max_len):
        """
        Возвращает значения посчитанные данной моделью эмбеддинги для всех текстов из texts.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name).cuda()
        text_features = []
        for sentence in tqdm(texts):
            encoded_input = tokenizer([sentence],
                                      padding='max_length',
                                      truncation=True,
                                      max_length=max_len,
                                      return_tensors='pt')
            with torch.no_grad():
                hidden_state, cls_head = model(input_ids=encoded_input['input_ids'].cuda(), return_dict=False)
                sentence_embeddings = self.mean_pooling(hidden_state, encoded_input['attention_mask'])

            now_emb = []
            if self.add_cls_embeddings:
                now_emb.append(cls_head.detach().cpu().numpy().flatten())

            if self.add_mean_embeddings:
                now_emb.append(sentence_embeddings.detach().cpu().numpy().flatten())

            text_features.append(np.concatenate(now_emb, axis=0))
        return text_features

    def add_many_embeddings(self, df, text_col, models):
        """"
        Добавляет в качестве признаков эмбеддинги для колонки text_col.
        В качестве моделей и максимальных длинн используются models.
        """
        for model_name, max_len in models:
            print(model_name)
            text_features = self.extract_embeddings(df[text_col], model_name, max_len)
            text_features_df = pd.DataFrame(text_features, columns = [f'{model_name}_{text_col}_feature_{i}' for i in range(len(text_features[0]))])
            df = df.join(text_features_df)
            df.to_csv('data/transformers_text_features.csv', index=False)
        return df

In [4]:
models = [
          ('cointegrated/LaBSE-en-ru', 512),
          ('DeepPavlov/rubert-base-cased-conversational', 512)
]

In [5]:
text_embeddings = TextEmbeddings(True, True)
data = text_embeddings.add_many_embeddings(df_train, 'text', models)

cointegrated/LaBSE-en-ru


Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/806 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/521k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/516M [00:00<?, ?B/s]

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/7500 [00:00<?, ?it/s]

DeepPavlov/rubert-base-cased-conversational


Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/7500 [00:00<?, ?it/s]

In [7]:
data.head()

Unnamed: 0,category,text,cointegrated/LaBSE-en-ru_text_feature_0,cointegrated/LaBSE-en-ru_text_feature_1,cointegrated/LaBSE-en-ru_text_feature_2,cointegrated/LaBSE-en-ru_text_feature_3,cointegrated/LaBSE-en-ru_text_feature_4,cointegrated/LaBSE-en-ru_text_feature_5,cointegrated/LaBSE-en-ru_text_feature_6,cointegrated/LaBSE-en-ru_text_feature_7,...,DeepPavlov/rubert-base-cased-conversational_text_feature_1526,DeepPavlov/rubert-base-cased-conversational_text_feature_1527,DeepPavlov/rubert-base-cased-conversational_text_feature_1528,DeepPavlov/rubert-base-cased-conversational_text_feature_1529,DeepPavlov/rubert-base-cased-conversational_text_feature_1530,DeepPavlov/rubert-base-cased-conversational_text_feature_1531,DeepPavlov/rubert-base-cased-conversational_text_feature_1532,DeepPavlov/rubert-base-cased-conversational_text_feature_1533,DeepPavlov/rubert-base-cased-conversational_text_feature_1534,DeepPavlov/rubert-base-cased-conversational_text_feature_1535
0,extreme,Ледник Пасторури это цирковой ледник расположе...,-0.076881,-0.412762,-0.056001,0.046433,-0.449512,-0.253624,-0.301378,-0.103132,...,-0.783512,0.589172,-0.022016,-0.231998,0.875344,-0.112135,-0.048335,0.161959,0.981492,-0.298607
1,martial_arts,Главные участники предстоящего Betokenoid 274 ...,-0.046052,-0.103519,-0.111545,-0.114912,0.039759,-0.388762,-0.350189,-0.019205,...,-0.662196,0.968559,-0.271811,0.215214,1.008042,-0.207437,0.750243,0.188331,0.905672,-0.198813
2,extreme,Ttokenoid Btokenoid – карта с которой можно не...,0.070633,-0.161732,0.134272,0.207949,0.076964,0.0452,-0.030984,-0.070647,...,-0.759897,0.173297,-0.612716,-0.515749,0.493288,-0.486974,0.029368,1.051441,0.681879,-1.202563
3,autosport,В Сильверстоуне произошли крупные обновления а...,-0.181577,-0.094993,0.290002,0.184838,0.076357,-0.125468,0.171144,-0.018427,...,-0.697004,0.126595,-0.83832,-0.295631,1.008092,0.097092,-0.005237,0.303586,0.78849,-0.606932
4,extreme,На протяжении более чем 30 лет Вестсайд являет...,-0.077018,-0.28056,0.1474,0.345588,-0.185279,-0.234,-0.023668,-0.418108,...,-0.526097,0.874269,-0.634377,0.069256,0.950486,-0.208337,0.473491,0.558341,1.220464,-1.187832


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import catboost as cb
from catboost import CatBoostClassifier, Pool

X = data.drop(['category', 'text'], axis=1)
y = data['category'].astype('category')
num_classes = y.nunique()
cat_features = ['category']

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)


X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42)

train_pool = cb.Pool(X_train, y_train)
test_pool = cb.Pool(X_test, y_test)

model = cb.CatBoostClassifier(
    iterations=1000,
    depth=6,
    loss_function='MultiClass',
    classes_count=num_classes,
    verbose=100,
    task_type="GPU",
    devices='0'
)
model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=100)




Learning rate set to 0.126016
0:	learn: 2.4603198	test: 2.4687300	best: 2.4687300 (0)	total: 424ms	remaining: 7m 3s
100:	learn: 0.7353239	test: 1.2503710	best: 1.2503710 (100)	total: 32.5s	remaining: 4m 49s
200:	learn: 0.4652716	test: 1.1264506	best: 1.1264506 (200)	total: 1m 1s	remaining: 4m 2s
300:	learn: 0.3300544	test: 1.0686409	best: 1.0686409 (300)	total: 1m 28s	remaining: 3m 25s
400:	learn: 0.2534932	test: 1.0330588	best: 1.0330588 (400)	total: 1m 54s	remaining: 2m 50s
500:	learn: 0.2031064	test: 1.0082665	best: 1.0082665 (500)	total: 2m 20s	remaining: 2m 19s
600:	learn: 0.1652963	test: 0.9875378	best: 0.9875378 (600)	total: 2m 45s	remaining: 1m 50s
700:	learn: 0.1376393	test: 0.9718162	best: 0.9718162 (700)	total: 3m 11s	remaining: 1m 21s
800:	learn: 0.1151603	test: 0.9596920	best: 0.9593813 (799)	total: 3m 37s	remaining: 54s
900:	learn: 0.0987487	test: 0.9519076	best: 0.9519076 (900)	total: 4m 3s	remaining: 26.7s
999:	learn: 0.0850986	test: 0.9453007	best: 0.9453007 (999)	tota

<catboost.core.CatBoostClassifier at 0x17de21e6560>

In [9]:
model.best_score_

{'learn': {'MultiClass': 0.08509856160481771},
 'validation': {'MultiClass': 0.9453006998697917}}

In [10]:
feature_importances = model.get_feature_importance(data=train_pool, type=cb.EFstrType.PredictionValuesChange)

feature_importances_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)

feature_importances_df

Unnamed: 0,Feature,Importance
1483,cointegrated/LaBSE-en-ru_text_feature_1483,1.107499
1446,cointegrated/LaBSE-en-ru_text_feature_1446,1.080637
862,cointegrated/LaBSE-en-ru_text_feature_862,0.955099
958,cointegrated/LaBSE-en-ru_text_feature_958,0.943007
1284,cointegrated/LaBSE-en-ru_text_feature_1284,0.939991
...,...,...
2080,DeepPavlov/rubert-base-cased-conversational_te...,0.000000
2079,DeepPavlov/rubert-base-cased-conversational_te...,0.000000
2078,DeepPavlov/rubert-base-cased-conversational_te...,0.000000
2076,DeepPavlov/rubert-base-cased-conversational_te...,0.000000


In [20]:
drop_feature_importances_df = feature_importances_df[feature_importances_df['Importance'] < 0.02]
drop_feature_importances_df.shape

(2165, 2)

In [21]:
train_cleaned = data.drop(drop_feature_importances_df['Feature'].to_list(), axis=1)

train_cleaned.to_csv('data/transformers_text_features.csv')