# Черновой pipeline

## Чтение данных

In [2]:
import pandas as pd
import numpy as np
from utils.common import *

from catboost import CatBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
def display_features_targets(data, target):
    display(data)
    for key, item in target.items():
        display(item)

In [4]:
data = pd.read_csv("data/answers.csv", index_col=0)

train_data, test_data = train_test_split(data, test_size=0.2, random_state=43)

data = clear_dataset(data)

train_data = clear_dataset(train_data)
test_data = clear_dataset(test_data)

train_data, train_targets = prepare_dataset(train_data)
test_data, test_targets = prepare_dataset(test_data)

display_features_targets(train_data, train_targets)

0                    сколько зарабатывает яндекс курьер
1                                                 вахта
2                          работа по контракту вакансии
3                                     маркшейдер работа
4                техник по эксплуатации зданий вакансии
                              ...                      
11391                       копирайтер без опыта работы
11392             работа технолог общественного питания
11393                  работа электрика свежие вакансии
11394    штукатур маляр вакансии без посредников свежие
11395                                  шикотан вакансии
Name: query, Length: 11396, dtype: object

0             []
1        [вахта]
2             []
3             []
4             []
          ...   
11391         []
11392         []
11393         []
11394         []
11395         []
Name: занятость, Length: 11396, dtype: object

0              курьер
1              вахтер
2                None
3          маркшейдер
4           зоотехник
             ...     
11391      копирайтер
11392        технолог
11393    автоэлектрик
11394        штукатур
11395            None
Name: по должности-лемме, Length: 11396, dtype: object

0                 []
1                 []
2                 []
3                 []
4                 []
            ...     
11391    [без опыта]
11392             []
11393             []
11394             []
11395             []
Name: по дополнительному признаку, Length: 11396, dtype: object

0        0
1        0
2        1
3        0
4        0
        ..
11391    0
11392    0
11393    0
11394    0
11395    1
Name: общие фразы, Length: 11396, dtype: int64

0        None
1        None
2        None
3        None
4        None
         ... 
11391    None
11392    None
11393    None
11394    None
11395    None
Name: по условиям, Length: 11396, dtype: object

In [5]:
tf_idf = TfidfVectorizer(decode_error="ignore")
tf_idf.fit(train_data)

X_train = tf_idf.transform(train_data)
X_test = tf_idf.transform(test_data)
X_train.shape, X_test.shape

((11396, 5195), (2849, 5195))

# Черновик предсказания

In [6]:
col = "занятость"

display(data[col].explode().value_counts())

model = CatBoostClassifier()
mbc = MultiLabelBinarizer()
mbc.fit(train_targets[col])

train_add_mbc = mbc.transform(train_targets[col])
test_add_mbc = mbc.transform(test_targets[col])

display(train_add_mbc.sum(axis=0), test_add_mbc.sum(axis=0))
print(train_add_mbc.shape, test_add_mbc.shape)

занятость
подработка          435
вахта               407
удаленная           302
на неполный день    212
на дому             181
по выходным          69
ночная               65
вечерняя             52
временная            14
посменная             5
дневная               4
посуточная            3
Name: count, dtype: int64

array([325,  40,  12,   4, 146, 144,  55,  58, 349,   4,   2, 245])

array([82, 12,  2,  0, 35, 31, 10, 11, 86,  1,  1, 57])

(11396, 12) (2849, 12)


In [7]:
pd.DataFrame({"classes": mbc.classes_, "test_size": test_add_mbc.sum(axis=0)})

Unnamed: 0,classes,test_size
0,вахта,82
1,вечерняя,12
2,временная,2
3,дневная,0
4,на дому,35
5,на неполный день,31
6,ночная,10
7,по выходным,11
8,подработка,86
9,посменная,1


In [8]:
catboost_model = CatBoostClassifier(
    random_state=43, task_type="GPU", devices="0",
    iterations=20, auto_class_weights="Balanced"

)
ovr = OneVsRestClassifier(
    estimator=catboost_model
)
ovr.fit(X_train, train_add_mbc)
pred_labels = ovr.predict(X_test)

Learning rate set to 0.5
0:	learn: 0.0765121	total: 10.2ms	remaining: 194ms
1:	learn: 0.0145643	total: 15.4ms	remaining: 138ms
2:	learn: 0.0047786	total: 20.2ms	remaining: 115ms
3:	learn: 0.0005695	total: 25.2ms	remaining: 101ms
4:	learn: 0.0004617	total: 29.9ms	remaining: 89.8ms
5:	learn: 0.0001637	total: 35ms	remaining: 81.8ms
6:	learn: 0.0001224	total: 39.7ms	remaining: 73.8ms
7:	learn: 0.0001149	total: 44.5ms	remaining: 66.8ms
8:	learn: 0.0000697	total: 49.4ms	remaining: 60.4ms
9:	learn: 0.0000501	total: 54.2ms	remaining: 54.2ms
10:	learn: 0.0000482	total: 59.1ms	remaining: 48.3ms
11:	learn: 0.0000462	total: 63.8ms	remaining: 42.5ms
12:	learn: 0.0000355	total: 68.6ms	remaining: 36.9ms
13:	learn: 0.0000334	total: 73.8ms	remaining: 31.6ms
14:	learn: 0.0000329	total: 78.9ms	remaining: 26.3ms
15:	learn: 0.0000301	total: 84.1ms	remaining: 21ms
16:	learn: 0.0000282	total: 89.3ms	remaining: 15.8ms
17:	learn: 0.0000239	total: 94.5ms	remaining: 10.5ms
18:	learn: 0.0000231	total: 99.6ms	rema

In [9]:
print(classification_report(test_add_mbc, pred_labels))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        82
           1       0.92      0.92      0.92        12
           2       1.00      1.00      1.00         2
           3       0.00      0.00      0.00         0
           4       1.00      1.00      1.00        35
           5       1.00      0.94      0.97        31
           6       0.90      0.90      0.90        10
           7       1.00      1.00      1.00        11
           8       1.00      1.00      1.00        86
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       1.00      1.00      1.00        57

   micro avg       0.99      0.98      0.99       328
   macro avg       0.73      0.73      0.73       328
weighted avg       0.99      0.98      0.98       328
 samples avg       0.10      0.10      0.10       328



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Мультиклассификация

In [21]:
mc_col = "по должности-лемме"

catboost_model = CatBoostClassifier(
    random_state=43, task_type="GPU", devices="0",
    iterations=2000, loss_function="MultiClass"
)
catboost_model.fit(X_train, train_targets[mc_col])
pred_labels = catboost_model.predict(X_test)

Learning rate set to 0.059855
0:	learn: 3.7960112	total: 599ms	remaining: 19m 57s
1:	learn: 3.4535545	total: 1.17s	remaining: 19m 24s
2:	learn: 3.2783419	total: 1.73s	remaining: 19m 9s
3:	learn: 3.1583721	total: 2.28s	remaining: 18m 59s
4:	learn: 3.0432714	total: 2.82s	remaining: 18m 44s
5:	learn: 2.9493144	total: 3.32s	remaining: 18m 24s
6:	learn: 2.8744766	total: 3.81s	remaining: 18m 3s
7:	learn: 2.8165171	total: 4.3s	remaining: 17m 51s
8:	learn: 2.7657885	total: 4.79s	remaining: 17m 39s
9:	learn: 2.7103234	total: 5.26s	remaining: 17m 26s
10:	learn: 2.6617219	total: 5.72s	remaining: 17m 14s
11:	learn: 2.6177221	total: 6.2s	remaining: 17m 7s
12:	learn: 2.5800127	total: 6.67s	remaining: 16m 59s
13:	learn: 2.5501492	total: 7.16s	remaining: 16m 55s
14:	learn: 2.5202440	total: 7.66s	remaining: 16m 53s
15:	learn: 2.4875402	total: 8.13s	remaining: 16m 48s
16:	learn: 2.4624342	total: 8.62s	remaining: 16m 45s
17:	learn: 2.4455771	total: 9.13s	remaining: 16m 45s
18:	learn: 2.4194473	total: 9.6

In [24]:
print(classification_report(test_targets[mc_col], pred_labels))

                                                   precision    recall  f1-score   support

                                             None       0.76      1.00      0.87      1274
                                    авторазборщик       0.00      0.00      0.00         1
                                      автослесарь       0.00      0.00      0.00         3
                                     автоэлектрик       1.00      0.33      0.50         3
                                          адвокат       0.00      0.00      0.00         2
                                    администратор       1.00      0.93      0.96        14
                                           акушер       0.00      0.00      0.00         1
                                         аналитик       1.00      0.75      0.86         4
                                         аниматор       1.00      0.50      0.67         2
                                           артист       1.00      1.00      1.00         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [26]:
tf_idf.transform(test_targets["занятость"])

AttributeError: 'list' object has no attribute 'lower'

In [33]:
from sklearn.metrics.pairwise import cosine_similarity
X_train.shape, tf_idf.transform(test_targets["по должности-лемме"]).shape

((11396, 5195), (2849, 5195))

In [47]:
ind_pred = np.argmax(cosine_similarity(X_test, tf_idf.transform(test_targets["по должности-лемме"])), axis=1)
pred = test_targets["по должности-лемме"][ind_pred]

In [49]:
print(accuracy_score(test_targets["по должности-лемме"], pred))

0.32186732186732187


In [52]:
pd.concat([test_data, pred], axis=1, ignore_index=True)

ValueError: cannot reindex on an axis with duplicate labels

In [56]:
test_targets["по должности-лемме"].value_counts()

по должности-лемме
None                       1274
водитель                    189
курьер                       87
модератор                    48
помощник                     46
                           ... 
парковщик                     1
методист                      1
математик                     1
наладчик                      1
системный администратор       1
Name: count, Length: 219, dtype: int64