# Pipeline

## Чтение данных

In [1]:
import pandas as pd
import numpy as np
from utils.common import *

from catboost import CatBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
def display_features_targets(data, target):
    display(data)
    for key, item in target.items():
        display(item)

In [81]:
data['по дополнительному признаку'].value_counts()

по дополнительному признаку
[без опыта]                         461
[для женщин]                        124
[для студентов]                      98
[для пенсионеров]                    53
[для школьников]                     41
[для мужчин]                         34
[для инвалидов]                      16
[для пенсионеров, для женщин]         4
[для женщин, для пенсионеров]         3
[без опыта, для студентов]            3
[для женщин, без опыта]               3
[для пенсионеров, для мужчин]         2
[для мужчин, для женщин]              2
[для школьников, для студентов]       1
[для пенсионеров, для инвалидов]      1
[без опыта, для женщин]               1
[для студентов, для школьников]       1
[для студентов, без опыта]            1
Name: count, dtype: int64

In [None]:
data = pd.read_csv("data/answers.csv", index_col=0)

data['query'] = data['query'].apply(lemmatize_sentence)


train_data, test_data = train_test_split(data, test_size=0.2, random_state=43)

data = clear_dataset(data)

train_data = clear_dataset(train_data)
test_data = clear_dataset(test_data)

train_data, train_targets = prepare_dataset(train_data)
test_data, test_targets = prepare_dataset(test_data)

display_features_targets(train_data, train_targets)

In [5]:
train_targets.keys()

dict_keys(['занятость', 'по должности-лемме', 'по дополнительному признаку', 'общие фразы', 'по условиям'])

In [7]:
count_vectorizer = CountVectorizer(decode_error="ignore")
count_vectorizer.fit(train_data)

X_train = count_vectorizer.transform(train_data)
X_test = count_vectorizer.transform(test_data)
X_train.shape, X_test.shape

((11396, 3496), (2849, 3496))

# Обучение модели

### Столбец "занятость"

In [47]:
col = "занятость"

#display(data[col].explode().value_counts())

model = CatBoostClassifier()
mbc = MultiLabelBinarizer()
mbc.fit(train_targets[col])

train_add_mbc = mbc.transform(train_targets[col])
test_add_mbc = mbc.transform(test_targets[col])

#display(train_add_mbc.sum(axis=0), test_add_mbc.sum(axis=0))
#print(train_add_mbc.shape, test_add_mbc.shape)

In [48]:
pd.DataFrame({"classes": mbc.classes_, "test_size": test_add_mbc.sum(axis=0)})

Unnamed: 0,classes,test_size
0,вахта,82
1,вечерняя,12
2,временная,2
3,дневная,0
4,на дому,35
5,на неполный день,31
6,ночная,10
7,по выходным,11
8,подработка,86
9,посменная,1


In [None]:
catboost_model = CatBoostClassifier(
    random_state=43, task_type="GPU", devices="0",
    iterations=20, auto_class_weights="Balanced"

)
ovr_busy = OneVsRestClassifier(
    estimator=catboost_model
)
ovr_busy.fit(X_train, train_add_mbc)
pred_labels = ovr_busy.predict(X_test)

Learning rate set to 0.5
0:	learn: 0.0105865	total: 40.2ms	remaining: 763ms
1:	learn: 0.0008921	total: 49.2ms	remaining: 443ms
2:	learn: 0.0002564	total: 56.1ms	remaining: 318ms
3:	learn: 0.0002427	total: 63.7ms	remaining: 255ms
4:	learn: 0.0002291	total: 70.1ms	remaining: 210ms
5:	learn: 0.0002225	total: 78.3ms	remaining: 183ms
6:	learn: 0.0002161	total: 86.9ms	remaining: 161ms
7:	learn: 0.0002109	total: 94ms	remaining: 141ms
8:	learn: 0.0002105	total: 101ms	remaining: 124ms
9:	learn: 0.0002052	total: 108ms	remaining: 108ms
10:	learn: 0.0001012	total: 115ms	remaining: 93.9ms
11:	learn: 0.0001006	total: 121ms	remaining: 80.8ms
12:	learn: 0.0001006	total: 128ms	remaining: 69.1ms
13:	learn: 0.0000997	total: 135ms	remaining: 57.7ms
14:	learn: 0.0000997	total: 142ms	remaining: 47.3ms
15:	learn: 0.0000997	total: 149ms	remaining: 37.2ms
16:	learn: 0.0001000	total: 156ms	remaining: 27.5ms
17:	learn: 0.0000996	total: 163ms	remaining: 18.1ms
18:	learn: 0.0000999	total: 170ms	remaining: 8.96ms
1

In [11]:
print(classification_report(test_add_mbc, pred_labels))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        82
           1       1.00      1.00      1.00        12
           2       1.00      1.00      1.00         2
           3       0.00      0.00      0.00         0
           4       1.00      1.00      1.00        35
           5       1.00      1.00      1.00        31
           6       1.00      1.00      1.00        10
           7       1.00      1.00      1.00        11
           8       1.00      1.00      1.00        86
           9       1.00      1.00      1.00         1
          10       1.00      1.00      1.00         1
          11       1.00      1.00      1.00        57

   micro avg       1.00      1.00      1.00       328
   macro avg       0.92      0.92      0.92       328
weighted avg       1.00      1.00      1.00       328
 samples avg       0.11      0.11      0.11       328



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Столбец "Условия"

In [None]:
# Условия
catboost_model_condition = CatBoostClassifier(
    random_state=43, task_type="GPU", devices="0",
    iterations=1000, auto_class_weights="Balanced"

)
col = "по условиям"
catboost_model_condition.fit(X_train, train_targets[col])
pred_labels = catboost_model_condition.predict(X_test)

Learning rate set to 0.106626
0:	learn: 0.8973568	total: 9.73ms	remaining: 9.72s
1:	learn: 0.7509627	total: 16.3ms	remaining: 8.15s
2:	learn: 0.6381484	total: 23.6ms	remaining: 7.85s
3:	learn: 0.5480868	total: 30.7ms	remaining: 7.64s
4:	learn: 0.4744472	total: 38.1ms	remaining: 7.58s
5:	learn: 0.4131864	total: 44.6ms	remaining: 7.39s
6:	learn: 0.3615560	total: 52.1ms	remaining: 7.4s
7:	learn: 0.3175987	total: 60.6ms	remaining: 7.51s
8:	learn: 0.2798713	total: 68.1ms	remaining: 7.5s
9:	learn: 0.2472776	total: 75.9ms	remaining: 7.51s
10:	learn: 0.2189669	total: 82.7ms	remaining: 7.44s
11:	learn: 0.1942653	total: 89.8ms	remaining: 7.4s
12:	learn: 0.1726306	total: 96.6ms	remaining: 7.33s
13:	learn: 0.1536209	total: 103ms	remaining: 7.28s
14:	learn: 0.1368715	total: 110ms	remaining: 7.21s
15:	learn: 0.1220786	total: 120ms	remaining: 7.37s
16:	learn: 0.1089866	total: 131ms	remaining: 7.57s
17:	learn: 0.0973793	total: 142ms	remaining: 7.72s
18:	learn: 0.0870722	total: 153ms	remaining: 7.92s
1

In [68]:
print(classification_report(test_targets[col], pred_labels))


                      precision    recall  f1-score   support

                None       1.00      1.00      1.00      2730
с ежедневной оплатой       1.00      1.00      1.00        81
       с проживанием       1.00      1.00      1.00        38

            accuracy                           1.00      2849
           macro avg       1.00      1.00      1.00      2849
        weighted avg       1.00      1.00      1.00      2849



In [75]:
# Условия
catboost_model_general_question = CatBoostClassifier(
    random_state=43, task_type="GPU", devices="0",
    iterations=1000, auto_class_weights="Balanced"

)
col = "общие фразы"
catboost_model_general_question.fit(X_train, train_targets[col])
pred_labels = catboost_model_general_question.predict(X_test)

Learning rate set to 0.029488
0:	learn: 0.6789051	total: 50.3ms	remaining: 50.2s
1:	learn: 0.6663189	total: 91.8ms	remaining: 45.8s
2:	learn: 0.6544527	total: 136ms	remaining: 45.1s
3:	learn: 0.6432331	total: 179ms	remaining: 44.7s
4:	learn: 0.6333361	total: 219ms	remaining: 43.5s
5:	learn: 0.6253846	total: 254ms	remaining: 42.1s
6:	learn: 0.6181828	total: 294ms	remaining: 41.8s
7:	learn: 0.6117333	total: 334ms	remaining: 41.4s
8:	learn: 0.6047716	total: 370ms	remaining: 40.7s
9:	learn: 0.5987752	total: 405ms	remaining: 40.1s
10:	learn: 0.5933526	total: 441ms	remaining: 39.6s
11:	learn: 0.5880674	total: 477ms	remaining: 39.3s
12:	learn: 0.5829370	total: 513ms	remaining: 38.9s
13:	learn: 0.5791248	total: 545ms	remaining: 38.4s
14:	learn: 0.5748143	total: 586ms	remaining: 38.5s
15:	learn: 0.5704168	total: 620ms	remaining: 38.1s
16:	learn: 0.5667422	total: 653ms	remaining: 37.8s
17:	learn: 0.5630258	total: 689ms	remaining: 37.6s
18:	learn: 0.5600354	total: 726ms	remaining: 37.5s
19:	learn

In [76]:
print(classification_report(test_targets[col], pred_labels))


              precision    recall  f1-score   support

           0       1.00      0.88      0.94      1812
           1       0.82      1.00      0.90      1037

    accuracy                           0.92      2849
   macro avg       0.91      0.94      0.92      2849
weighted avg       0.94      0.92      0.92      2849



### Столбец "дополнительный признак"

In [71]:
col = "по дополнительному признаку"

#display(data[col].explode().value_counts())

model = CatBoostClassifier()
mbc = MultiLabelBinarizer()
mbc.fit(train_targets[col])

train_add_mbc = mbc.transform(train_targets[col])
test_add_mbc = mbc.transform(test_targets[col])

pd.DataFrame({"classes": mbc.classes_, "test_size": test_add_mbc.sum(axis=0)})

catboost_model = CatBoostClassifier(
    random_state=43, task_type="GPU", devices="0",
    iterations=20, auto_class_weights="Balanced"

)
ovr_additional_feature = OneVsRestClassifier(
    estimator=catboost_model
)
ovr_additional_feature.fit(X_train, train_add_mbc)
pred_labels = ovr_additional_feature.predict(X_test)

Learning rate set to 0.5
0:	learn: 0.0105302	total: 7.15ms	remaining: 136ms
1:	learn: 0.0008817	total: 14.4ms	remaining: 130ms
2:	learn: 0.0002539	total: 21ms	remaining: 119ms
3:	learn: 0.0002401	total: 28.1ms	remaining: 112ms
4:	learn: 0.0001228	total: 34.6ms	remaining: 104ms
5:	learn: 0.0001166	total: 41.7ms	remaining: 97.3ms
6:	learn: 0.0001122	total: 48.3ms	remaining: 89.7ms
7:	learn: 0.0000737	total: 55.2ms	remaining: 82.8ms
8:	learn: 0.0000723	total: 63.1ms	remaining: 77.1ms
9:	learn: 0.0000686	total: 70.6ms	remaining: 70.6ms
10:	learn: 0.0000667	total: 79.2ms	remaining: 64.8ms
11:	learn: 0.0000661	total: 86.2ms	remaining: 57.4ms
12:	learn: 0.0000656	total: 97.7ms	remaining: 52.6ms
13:	learn: 0.0000651	total: 106ms	remaining: 45.3ms
14:	learn: 0.0000647	total: 112ms	remaining: 37.4ms
15:	learn: 0.0000646	total: 119ms	remaining: 29.8ms
16:	learn: 0.0000634	total: 126ms	remaining: 22.2ms
17:	learn: 0.0000632	total: 134ms	remaining: 14.9ms
18:	learn: 0.0000630	total: 141ms	remaining

array([0, 0, 0, 0, 0, 0, 0])

## Мультиклассификация

In [12]:
mc_col = "по должности-лемме"

catboost_model = CatBoostClassifier(
    random_state=43, task_type="GPU", devices="0",
    iterations=1000, loss_function="MultiClass"
)
catboost_model.fit(X_train, train_targets[mc_col])
pred_labels = catboost_model.predict(X_test)

Learning rate set to 0.106626
0:	learn: 5.9290472	total: 382ms	remaining: 6m 21s
1:	learn: 29.8489986	total: 680ms	remaining: 5m 39s
2:	learn: 51.2880068	total: 971ms	remaining: 5m 22s
3:	learn: 46.4590097	total: 1.34s	remaining: 5m 33s
4:	learn: 17.1913829	total: 1.72s	remaining: 5m 41s
5:	learn: 64.0140181	total: 2.07s	remaining: 5m 43s
6:	learn: 7.1619661	total: 2.43s	remaining: 5m 45s
7:	learn: 5.1443863	total: 2.82s	remaining: 5m 49s
8:	learn: 4.3935944	total: 3.14s	remaining: 5m 45s
9:	learn: 3.7053918	total: 3.44s	remaining: 5m 40s
10:	learn: 2.9892434	total: 3.73s	remaining: 5m 35s
11:	learn: 2.4510589	total: 4.04s	remaining: 5m 32s
12:	learn: 2.3498860	total: 4.35s	remaining: 5m 30s
13:	learn: 2.2901266	total: 4.68s	remaining: 5m 29s
14:	learn: 2.2395994	total: 4.99s	remaining: 5m 27s
15:	learn: 2.2057371	total: 5.27s	remaining: 5m 23s
16:	learn: 2.1663899	total: 5.56s	remaining: 5m 21s
17:	learn: 2.1301570	total: 5.83s	remaining: 5m 18s
18:	learn: 2.0907953	total: 6.12s	remai

In [13]:
print(classification_report(test_targets[mc_col], pred_labels))

                                                   precision    recall  f1-score   support

                                             None       0.87      1.00      0.93      1274
                                    авторазборщик       0.00      0.00      0.00         1
                                      автослесарь       0.00      0.00      0.00         3
                                     автоэлектрик       1.00      1.00      1.00         3
                                          адвокат       0.00      0.00      0.00         2
                                    администратор       1.00      1.00      1.00        14
                                           акушер       0.00      0.00      0.00         1
                                         аналитик       1.00      1.00      1.00         4
                                         аниматор       1.00      0.50      0.67         2
                                           артист       1.00      1.00      1.00         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Сохранение моделей

In [None]:
catboost_model
import joblib

# Сохранение модели
#joblib.dump(catboost_model, 'models/job_classifier_model.pkl')
#joblib.dump(ovr_busy, 'models/busy_classifier_model.pkl')
#joblib.dump(ovr_additional_feature, 'models/additional_feature_classifier_model.pkl')
#joblib.dump(catboost_model_condition, 'models/condition_classifier_model.pkl')
#joblib.dump(catboost_model_general_question, 'models/general_question_classifier_model.pkl')
#joblib.dump(count_vectorizer, 'models/count_vectorizer.pkl')


['models/count_vectorizer.pkl']