### Установка

In [5]:
!pip install setfit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting setfit
  Using cached setfit-0.7.0-py3-none-any.whl (45 kB)
Collecting sentence-transformers>=2.2.1
  Using cached sentence_transformers-2.2.2-py3-none-any.whl
Collecting datasets>=2.3.0
  Using cached datasets-2.11.0-py3-none-any.whl (468 kB)
Collecting evaluate>=0.3.0
  Using cached evaluate-0.4.0-py3-none-any.whl (81 kB)
Collecting xxhash
  Using cached xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
Collecting aiohttp
  Using cached aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
Collecting responses<0.19
  Using cached responses-0.18.0-py3-none-any.whl (38 kB)
Collecting dill<0.3.7,>=0.3.0
  Using cached dill-0.3.6-py3-none-any.whl (110 kB)
Collecting multiprocess
  Using cached multiprocess-0.70.14-py39-none-any.whl (132 kB)
Collecting huggingface-hub<1.0.0,>=0.11.0
  Using cached huggingface_hub-0.13.4-py3-no

In [6]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Описание задачи

В этом модуле я обучил модель SetFit отличать целевые предложения от нецелевых для метки (label) «обеспечение гарантийных обязательств».

Особенность модели SetFit в том, что ей не нужен большой объём данных: для хорошей сходимости достаточно 30–50 записей.

### Импорты и агрегация данных

In [7]:
from sentence_transformers.losses import CosineSimilarityLoss
from sklearn.model_selection import train_test_split
from setfit import SetFitModel, SetFitTrainer
from datasets import Dataset
import pandas as pd
import numpy as np
import joblib
import json
import pickle
import gc

In [8]:
# Load the pickled "contract enforcement" sentences, print the raw count,
# then keep only the non-missing entries and show how many remain.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open('/content/contract_enforcement.pkl', 'rb') as pkl_file:
  raw_contract_sentences = pickle.load(pkl_file)
print(len(raw_contract_sentences))

contract_series = pd.Series(raw_contract_sentences)
contract_enforcement = contract_series.dropna().values
len(contract_enforcement)

988


984

In [9]:
# Load the pickled "guarantee enforcement" sentences, print the raw count,
# then keep only the non-missing entries and show how many remain.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open('/content/garantee_enforcement.pkl', 'rb') as pkl_file:
  raw_garantee_sentences = pickle.load(pkl_file)
print(len(raw_garantee_sentences))

garantee_series = pd.Series(raw_garantee_sentences)
garantee_enforcement = garantee_series.dropna().values
len(garantee_enforcement)

811


508

In [10]:
# Load the pool of sentences that later cells sample as label-0 examples.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open('/content/nonetype.pkl', 'rb') as pkl_file:
  nonetype_sentences = pickle.load(pkl_file)

In [11]:
# Build the positive / negative frames for the contract-enforcement classifier.
#
# Negatives are drawn WITHOUT replacement, from a de-duplicated pool, with a fixed
# seed. The default np.random.choice(...) samples with replacement, which silently
# duplicates sentences and yields a different dataset on every run.
rng_contract = np.random.default_rng(42)
negative_pool = list(dict.fromkeys(nonetype_sentences))  # unique sentences, first-seen order
n_negatives = min(1500, len(negative_pool))  # never request more than the pool holds
nonetype_1500 = rng_contract.choice(negative_pool, size=n_negatives, replace=False)

# dict.fromkeys de-duplicates like set() but preserves first-seen order, so the
# frame is identical across kernel restarts (iteration order of a set of str is not).
df_for_contract_enforcement = pd.DataFrame(list(dict.fromkeys(contract_enforcement)), columns=['sentence'])
df_for_nonetype = pd.DataFrame(nonetype_1500, columns=['sentence'])

In [12]:
# Attach binary targets in place: 1 for contract-enforcement sentences, 0 for sampled negatives.
for frame, target in ((df_for_contract_enforcement, 1), (df_for_nonetype, 0)):
  frame['label'] = target

In [13]:
# Preview the first rows of the positive (label = 1) frame.
df_for_contract_enforcement.head()

Unnamed: 0,sentence,label
0,Размер обеспечения исполнения контракта 1239.4...,1
1,Размер обеспечения исполнения договора составл...,1
2,Размер обеспечения исполнения договора составл...,1
3,Обеспечение исполнения гражданско-правового до...,1
4,Размер обеспечения исполнения контракта устана...,1


In [14]:
# Preview the first rows of the negative (label = 0) frame.
# NOTE(review): in the saved output, some label-0 rows read like enforcement /
# guarantee sentences - confirm the negative pool does not contain positives.
df_for_nonetype.head()

Unnamed: 0,sentence,label
0,Извещение о проведении запроса котировок в эле...,0
1,Номер извещения 0162300003622000143,0
2,Гарантийные обязательства могут обеспечиваться...,0
3,Обеспечение исполнения Договора распространяет...,0
4,Операционный департамент,0


In [15]:
# Stack the positive frame on top of the negative one (row-wise; original indices are kept).
contract_frames = [df_for_contract_enforcement, df_for_nonetype]
df_contract_enforcement_model = pd.concat(contract_frames)

In [16]:
# Shuffle all rows and renumber the index from 0.
# random_state pins the permutation so Restart & Run All reproduces the same order.
df_contract_enforcement_model = df_contract_enforcement_model.sample(frac=1, random_state=42).reset_index(drop=True)

In [17]:
# Assemble the labelled dataset for the guarantee-enforcement classifier.
#
# Negatives are drawn WITHOUT replacement, from a de-duplicated pool, with a fixed
# seed. The default np.random.choice(...) samples with replacement; duplicated
# negatives could then land on both sides of the train/test split and inflate the
# evaluation score, and every run would see a different dataset.
rng_garantee = np.random.default_rng(42)
negative_pool = list(dict.fromkeys(nonetype_sentences))  # unique sentences, first-seen order
n_negatives = min(1000, len(negative_pool))  # never request more than the pool holds
nonetype_1000 = rng_garantee.choice(negative_pool, size=n_negatives, replace=False)

# dict.fromkeys de-duplicates like set() but preserves first-seen order, so the
# frame is identical across kernel restarts.
df_for_garantee_enforcement = pd.DataFrame(list(dict.fromkeys(garantee_enforcement)), columns=['sentence'])
df_for_nonetype = pd.DataFrame(nonetype_1000, columns=['sentence'])

# Binary targets: 1 = guarantee-enforcement sentence, 0 = sampled negative.
df_for_garantee_enforcement['label'] = 1
df_for_nonetype['label'] = 0

df_for_garantee_enforcement_model = pd.concat([df_for_garantee_enforcement, df_for_nonetype], axis=0)



In [18]:
# Shuffle all rows and renumber the index from 0.
# random_state pins the permutation so Restart & Run All reproduces the same order.
df_for_garantee_enforcement_model = df_for_garantee_enforcement_model.sample(frac=1, random_state=42).reset_index(drop=True)


### Guarantee enforcement: SetFit model, batch size = 3


In [19]:
# Few-shot setup: 10 % of the rows go to training, the remaining 90 % to evaluation.
# stratify keeps the positive/negative ratio the same in both parts (important with
# such a small training set) and random_state fixes the split so the reported
# metric is reproducible.
train, test = train_test_split(
    df_for_garantee_enforcement_model,
    test_size=0.9,
    random_state=42,
    stratify=df_for_garantee_enforcement_model['label'],
)

In [20]:
# Sanity check: number of rows in the training and evaluation splits.
len(train), len(test)

(144, 1302)

In [21]:
# Convert the pandas splits to Hugging Face datasets. preserve_index=False stops the
# shuffled pandas index from being carried along as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train, preserve_index=False)
test_dataset = Dataset.from_pandas(test, preserve_index=False)

# Start from a pre-trained sentence-transformer checkpoint.
# NOTE(review): this looks like the English paraphrase checkpoint while the sentences
# are Russian - consider a multilingual variant; confirm before switching, since the
# reported accuracy was obtained with this checkpoint.
model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss_class=CosineSimilarityLoss,
    num_iterations=20,  # pair-generation rounds (training log: 144 * 20 * 2 = 5760 examples)
    batch_size=3,
    # rename the frame's columns to the "text" / "label" names given here
    column_mapping={"sentence": "text", "label": "label"},
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

Downloading (…)f39ef/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)0182ff39ef/README.md:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

Downloading (…)82ff39ef/config.json:   0%|          | 0.00/594 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)f39ef/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading (…)0182ff39ef/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)2ff39ef/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [22]:
# Train the model, then score it on the eval_dataset passed to the trainer.
trainer.train()
metrics = trainer.evaluate()

Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 5760
  Num epochs = 1
  Total optimization steps = 1920
  Total train batch size = 3


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1920 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [23]:
# Show the evaluation result.
# NOTE(review): only accuracy is reported and the classes are imbalanced
# (1000 sampled negatives out of 1446 rows) - consider precision / recall / F1 too.
metrics

{'accuracy': 0.9646697388632872}

### Сохранение результатов

In [24]:
# Serialise the trained model into the runtime's working directory.
local_model_path = 'garantee_enforcement_setfit_model.joblib'
joblib.dump(model, local_model_path)

['garantee_enforcement_setfit_model.joblib']

In [25]:
# Colab-only: mount Google Drive so the model can be copied to persistent storage.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# Write a second copy of the trained model to the mounted Google Drive folder.
drive_model_path = '/content/drive/MyDrive/забег/garantee_enforcement_setfit_model.joblib'
joblib.dump(model, drive_model_path)

['/content/drive/MyDrive/забег/garantee_enforcement_setfit_model.joblib']

In [27]:
# Colab-only: trigger a browser download of the locally saved model file.
from google.colab import files
files.download('garantee_enforcement_setfit_model.joblib')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
# Reload the artifact written to the working directory by the joblib.dump call above.
# NOTE(review): despite the `_trainer` suffix, the object that was dumped is `model`
# (the SetFitModel), not the SetFitTrainer - confirm downstream usage expects a model.
garantee_enforcement_trainer = joblib.load('./garantee_enforcement_setfit_model.joblib')