In [1]:
!pip install transformers -q
!pip install catboost -q 
!pip install pymorphy2 -q

In [12]:
import pandas as pd
from transformers import AutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tqdm import tqdm
import torch 
from catboost import CatBoostClassifier, CatBoostRegressor
import datetime
from sklearn.model_selection import KFold, StratifiedGroupKFold
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_absolute_percentage_error
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
import torch.nn as nn
import pymorphy2
from collections import defaultdict
import nltk
tqdm.pandas()
# nltk.download('punkt')
from string import punctuation
from torch.utils.data import Dataset, DataLoader
from google.colab import files


In [32]:
# Input table with the training data.
main_path = 'train.csv'

# Hugging Face checkpoints used as text-embedding extractors.
extractor_SBERT = 'sberbank-ai/sbert_large_mt_nlu_ru'
extractor_LaBSE = 'cointegrated/LaBSE-en-ru'

# Targets handed to np.save for each (model, column) embedding matrix;
# np.save appends the ".npy" extension itself.
output_name_SBERT_content = "embeddings/train_SBERT_content"
output_name_SBERT_solution = "embeddings/train_SBERT_solution"
output_name_LABSE_content = "embeddings/train_LaBSE_content"
output_name_LABSE_solution = "embeddings/train_LaBSE_solution"

# Read the table and give it a fresh 0..n-1 index.
train = pd.read_csv(main_path).reset_index(drop=True)

In [4]:
class EmbeddingDataset(Dataset):
    """Minimal ``Dataset`` that serves the items of ``data`` by position.

    Args:
        data: Sized, indexable collection; items are returned unchanged.
            (The notebook passes a DataFrame column converted with
            ``to_list()``.)
    """

    def __init__(self, data):
        super().__init__()
        self.data = data

    def __len__(self):
        """Return how many items the dataset holds."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the item stored at position ``idx`` as-is."""
        return self.data[idx]

def collate_fn(batch):
    """Tokenize one batch of texts into padded PyTorch tensors.

    Uses the module-level ``tokenizer`` global, which
    ``make_features_transformers`` assigns before the loader is iterated.

    Args:
        batch: Iterable of texts produced by the dataset.

    Returns:
        The tokenizer's output for the batch, padded (``padding=True``),
        truncated to at most 128 tokens, and returned as ``'pt'`` tensors.
    """
    texts = list(batch)
    return tokenizer(
        texts,
        return_tensors='pt',
        truncation=True,
        max_length=128,
        padding=True,
    )

In [5]:
def get_loader(dataset, shuffle, batch_size):
    """Wrap ``dataset`` in a ``DataLoader`` that tokenizes via ``collate_fn``.

    Args:
        dataset: Dataset to iterate over.
        shuffle: Whether the loader shuffles the samples.
        batch_size: Number of samples per batch.

    Returns:
        A ``DataLoader`` using 8 worker processes, no pinned memory, and the
        module-level ``collate_fn`` to build each batch.
    """
    return DataLoader(
        dataset,
        collate_fn=collate_fn,
        num_workers=8,
        pin_memory=False,
        shuffle=shuffle,
        batch_size=batch_size,
    )

In [6]:
def make_features_transformers(df, model_name, df_model, col, max_len, output_name):
    """Embed a text column with a pretrained transformer and cache the result.

    Every value of ``df[col]`` is tokenized, passed through the model on the
    GPU, and represented by the model's second output (``model_output[1]``;
    for BERT-style models this is the pooled output — confirm for other
    architectures). The embedding matrix is written with ``np.save`` and
    also returned as a DataFrame.

    Args:
        df: DataFrame that contains the texts.
        model_name: Hugging Face checkpoint name for the tokenizer and model.
        df_model: Prefix used in the generated feature column names.
        col: Name of the text column in ``df``.
        max_len: Maximum number of tokens kept per text. Capped at the
            model's ``max_position_embeddings`` when the config exposes it.
        output_name: Path handed to ``np.save`` (which appends ``.npy`` when
            the extension is missing). The parent directory is created if
            needed.

    Returns:
        DataFrame with a default RangeIndex, one row per text, and columns
        named ``{df_model}_{col}_feature_{i}``.
    """
    import functools
    import os

    # The module-level collate_fn reads this global, so keep it in sync with
    # the checkpoint loaded here.
    global tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).cuda()
    model.eval()

    # Honour the caller's max_len, but never ask for more positions than the
    # model config declares: longer inputs would fail inside the model.
    position_limit = getattr(model.config, "max_position_embeddings", None)
    token_limit = min(max_len, position_limit) if position_limit else max_len

    # The DataLoader calls collate_fn with a list of samples (texts here), so
    # a partial over the tokenizer is all the collation that is required.
    tokenize_batch = functools.partial(
        tokenizer,
        padding=True,
        max_length=token_limit,
        truncation=True,
        return_tensors='pt',
    )

    batch_size = 64
    text_emb_loader = DataLoader(
        EmbeddingDataset(df[col].to_list()),
        batch_size=batch_size,
        shuffle=False,
        pin_memory=False,
        num_workers=8,
        collate_fn=tokenize_batch,
    )

    batch_outputs = []
    with torch.no_grad():
        for batch in tqdm(text_emb_loader):
            model_output = model(**batch.to("cuda"))
            batch_outputs.append(model_output[1].cpu().numpy())

    if batch_outputs:
        text_features = np.concatenate(batch_outputs, axis=0)
    else:
        # No rows to embed: keep a well-formed (0, hidden_size) matrix so the
        # save and the DataFrame construction below still work.
        text_features = np.empty((0, model.config.hidden_size), dtype=np.float32)

    # np.save does not create missing directories.
    target_dir = os.path.dirname(output_name)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    np.save(output_name, text_features)

    feature_columns = [
        f'{df_model}_{col}_feature_{i}' for i in range(text_features.shape[1])
    ]
    return pd.DataFrame(text_features, columns=feature_columns)

In [7]:
# Replace every missing value with the placeholder string so each text cell
# holds something the tokenizer can consume.
train = train.fillna("Ничего")

In [10]:
# SBERT embeddings of the 'Содержание' column, joined onto the rows of train.
# DataFrame.join returns a new frame; train itself is left unchanged.
train_SBERT_content = train.join(
    make_features_transformers(
        df=train,
        col='Содержание',
        model_name=extractor_SBERT,
        df_model=extractor_SBERT,
        max_len=128,
        output_name=output_name_SBERT_content,
    )
)

100%|██████████| 232/232 [01:13<00:00,  3.15it/s]


In [13]:
# Hand the saved SBERT content embeddings to the Colab download helper.
# NOTE(review): this absolute path matches the relative "embeddings/..." save
# target only when the working directory is /content — confirm.
files.download("/content/embeddings/train_SBERT_content.npy")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
# SBERT embeddings of the solution column, joined onto the rows of train.
# Note that the column name literal carries trailing spaces.
train_SBERT_solution = train.join(
    make_features_transformers(
        df=train,
        col='Решение             ',
        model_name=extractor_SBERT,
        df_model=extractor_SBERT,
        max_len=2048,
        output_name=output_name_SBERT_solution,
    )
)

100%|██████████| 232/232 [04:02<00:00,  1.04s/it]


In [19]:
# Hand the saved SBERT solution embeddings to the Colab download helper.
files.download("/content/embeddings/train_SBERT_solution.npy")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [33]:
# Fill missing values with the placeholder string again; this only has an
# effect if train was reloaded since the earlier fill (presumably the case
# when the config cell is re-run).
train = train.fillna("Ничего")

In [29]:
# LaBSE embeddings of the 'Содержание' column, joined onto the rows of train.
train_LABSE_content = train.join(
    make_features_transformers(
        df=train,
        col='Содержание',
        model_name=extractor_LaBSE,
        df_model=extractor_LaBSE,
        max_len=128,
        output_name=output_name_LABSE_content,
    )
)

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 232/232 [00:24<00:00,  9.65it/s]


In [30]:
# Hand the saved LaBSE content embeddings to the Colab download helper.
files.download("/content/embeddings/train_LaBSE_content.npy")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [34]:
# LaBSE embeddings of the solution column, joined onto the rows of train.
# Note that the column name literal carries trailing spaces.
train_LABSE_solution = train.join(
    make_features_transformers(
        df=train,
        col='Решение             ',
        model_name=extractor_LaBSE,
        df_model=extractor_LaBSE,
        max_len=2048,
        output_name=output_name_LABSE_solution,
    )
)

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 232/232 [01:20<00:00,  2.88it/s]


In [35]:
# Hand the saved LaBSE solution embeddings to the Colab download helper.
files.download("/content/embeddings/train_LaBSE_solution.npy")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Preview the first rows of train.
# NOTE(review): the saved output lists SBERT feature columns, but the cells
# above join those into train_SBERT_content rather than train — the output
# looks like it comes from an earlier version of the notebook; confirm.
train.head()

Unnamed: 0,Содержание,Сервис,Приоритет,Статус,Функциональная группа,Крайний срок,Дата обращения,Тип обращения на момент подачи,Тип обращения итоговый,Решение,...,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1014,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1015,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1016,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1017,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1018,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1019,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1020,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1021,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1022,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1023
0,Тест,АРМ,3-Низкий,Отменен,ФГ1,2018-01-24 07:42,2018-01-19 09:27,Запрос,Запрос,Тест,...,0.912514,-0.152208,0.130843,-0.09115,0.037932,-0.173681,0.097592,-0.399394,0.931276,-0.029832
1,Тестовое обращение,АРМ,3-Низкий,Отменен,ФГ1,2018-01-24 10:51,2018-01-19 09:36,Запрос,Запрос,Тест,...,0.871061,-0.177108,-0.046755,-0.148361,-0.555105,-0.178409,0.029617,-0.117339,0.968851,-0.136776
2,Выполнена разблокировка у/з REGION\*****,Сервис7,2-Средний,Закрыт,ФГ8,2018-01-31 07:51,2018-01-25 22:47,Запрос,Запрос,Выполнена разблокировка у/з Ne_bosS\*********\...,...,0.783239,-0.083413,-0.193294,-0.238923,0.424082,0.224345,0.60393,-0.027942,0.896323,-0.100545
3,.Не включается ПК,АРМ,3-Низкий,Закрыт,ФГ390,2018-01-28 05:50,2018-01-25 23:13,Инцидент,Инцидент,Выход из стоя БП. Заменён.,...,0.944462,0.012523,-0.020856,0.129501,-0.209041,-0.119825,0.692286,0.036641,0.928598,0.062176
4,"Заявка № ***********, *******************",АРМ,3-Низкий,Закрыт,ФГ303,2018-02-03 02:51,2018-01-25 23:39,Запрос,Запрос,Работы по обращению выполнены,...,0.668621,-0.042006,-0.053406,0.040616,-0.059162,0.172143,0.786471,-0.156235,-0.235574,-0.31236


In [None]:
# Second preview of the first rows of train.
# NOTE(review): the saved output shows embedding columns that the current
# cells store in train_SBERT_content, not train — likely stale; confirm
# whether this cell is still needed.
train.head()

Unnamed: 0,Содержание,Сервис,Приоритет,Статус,Функциональная группа,Крайний срок,Дата обращения,Тип обращения на момент подачи,Тип обращения итоговый,Решение,...,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1014,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1015,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1016,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1017,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1018,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1019,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1020,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1021,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1022,sberbank-ai/sbert_large_mt_nlu_ru_Содержание_feature_1023
0,Тест,АРМ,3-Низкий,Отменен,ФГ1,2018-01-24 07:42,2018-01-19 09:27,Запрос,Запрос,Тест,...,0.912514,-0.152208,0.130843,-0.09115,0.037932,-0.173681,0.097592,-0.399394,0.931276,-0.029832
1,Тестовое обращение,АРМ,3-Низкий,Отменен,ФГ1,2018-01-24 10:51,2018-01-19 09:36,Запрос,Запрос,Тест,...,0.871061,-0.177108,-0.046755,-0.148361,-0.555105,-0.178409,0.029617,-0.117339,0.968851,-0.136776
2,Выполнена разблокировка у/з REGION\*****,Сервис7,2-Средний,Закрыт,ФГ8,2018-01-31 07:51,2018-01-25 22:47,Запрос,Запрос,Выполнена разблокировка у/з Ne_bosS\*********\...,...,0.783239,-0.083413,-0.193294,-0.238923,0.424082,0.224345,0.60393,-0.027942,0.896323,-0.100545
3,.Не включается ПК,АРМ,3-Низкий,Закрыт,ФГ390,2018-01-28 05:50,2018-01-25 23:13,Инцидент,Инцидент,Выход из стоя БП. Заменён.,...,0.944462,0.012523,-0.020856,0.129501,-0.209041,-0.119825,0.692286,0.036641,0.928598,0.062176
4,"Заявка № ***********, *******************",АРМ,3-Низкий,Закрыт,ФГ303,2018-02-03 02:51,2018-01-25 23:39,Запрос,Запрос,Работы по обращению выполнены,...,0.668621,-0.042006,-0.053406,0.040616,-0.059162,0.172143,0.786471,-0.156235,-0.235574,-0.31236


In [None]:
# Persist the table that actually carries the SBERT content features.
# `train` itself is never joined with the embeddings in this notebook, so
# writing it under this file name would produce a CSV without them.
train_SBERT_content.to_csv("train_SBERT_content.csv")

In [None]:
# NOTE(review): `files` is already imported in the notebook's import cell, so
# this import is redundant here.
from google.colab import files
# NOTE(review): no cell visible in this notebook writes a file with this
# name — presumably it is produced elsewhere; confirm it exists before running.
files.download("/content/embeddings/sbert_large_mt_nlu_ru_embeddings_test_final.npy")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [53]:
# Reload the four cached embedding matrices (SBERT / LaBSE x content / solution).
sbert_con, sbert_sol, labse_con, labse_sol = (
    np.load(embedding_path)
    for embedding_path in (
        '/content/embeddings/train_SBERT_content.npy',
        '/content/embeddings/train_SBERT_solution.npy',
        '/content/embeddings/train_LaBSE_content.npy',
        '/content/embeddings/train_LaBSE_solution.npy',
    )
)