### Установка

In [None]:
!pip install setfit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Импорты и агрегация данных

In [None]:
from sentence_transformers.losses import CosineSimilarityLoss
from sklearn.model_selection import train_test_split
from setfit import SetFitModel, SetFitTrainer
from datasets import Dataset
import pandas as pd
import numpy as np
import joblib
import json
import pickle
import gc

In [None]:
# Read the pickled contract-enforcement sentences and print the raw count.
# NOTE(review): pickle.load can execute arbitrary code - only open files from a trusted source.
with open('/content/contract_enforcement.pkl', 'rb') as pkl_file:
    contract_enforcement = pickle.load(pkl_file)
print(len(contract_enforcement))

# Drop missing entries and keep the remaining values as an array; show the cleaned count.
contract_enforcement = pd.Series(contract_enforcement).dropna().values
len(contract_enforcement)

988


984

In [None]:
# Read the pickled guarantee-enforcement sentences and print the raw count.
# NOTE(review): pickle.load can execute arbitrary code - only open files from a trusted source.
with open('/content/garantee_enforcement.pkl', 'rb') as pkl_file:
    garantee_enforcement = pickle.load(pkl_file)
print(len(garantee_enforcement))

# Drop missing entries and keep the remaining values as an array; show the cleaned count.
garantee_enforcement = pd.Series(garantee_enforcement).dropna().values
len(garantee_enforcement)

811


508

In [None]:
# Load the pool of sentences that are later labelled 0 (negatives) for both classifiers.
# NOTE(review): pickle.load can execute arbitrary code - only open files from a trusted source.
with open('/content/nonetype.pkl', 'rb') as pkl_file:
    nonetype_sentences = pickle.load(pkl_file)

In [None]:
# Draw the negative examples for the contract-enforcement classifier.
# The pool is de-duplicated and sampled WITHOUT replacement: repeated negatives would
# otherwise land on both sides of the later train/test split and inflate the accuracy.
# A fixed seed keeps the draw reproducible across kernel restarts.
rng_contract = np.random.default_rng(42)
nonetype_pool = pd.Series(nonetype_sentences).unique()
nonetype_1500 = rng_contract.choice(
    nonetype_pool,
    size=min(1500, len(nonetype_pool)),  # never ask for more than the pool holds
    replace=False,
)

# De-duplicate the positives while keeping first-seen order; set() ordering of strings
# changes between interpreter runs, which made the frame non-reproducible.
df_for_contract_enforcement = pd.DataFrame(pd.unique(contract_enforcement), columns=['sentence'])
df_for_nonetype = pd.DataFrame(nonetype_1500, columns=['sentence'])

In [None]:
# Attach the binary target: 1 for contract-enforcement sentences, 0 for the sampled negatives.
df_for_contract_enforcement = df_for_contract_enforcement.assign(label=1)
df_for_nonetype = df_for_nonetype.assign(label=0)

In [None]:
# Preview the first rows of the positive (label 1) frame.
df_for_contract_enforcement.head()

Unnamed: 0,sentence,label
0,Размер обеспечения исполнения контракта 28 025...,1
1,Размер обеспечения исполнения контракта 169932...,1
2,Размер обеспечения исполнения договора устанав...,1
3,Обеспечение исполнения настоящего Контракта ус...,1
4,Обеспечение исполнения настоящего Контракта пр...,1


In [None]:
# Preview the first rows of the negative (label 0) frame.
df_for_nonetype.head()

Unnamed: 0,sentence,label
0,Александр,0
1,Участникам,0
2,Приобретение квартиры в с.,0
3,Если при проведении электронного аукциона учас...,0
4,"Подрядчику денежных средств, внесенных в качес...",0


In [None]:
# Stack positives and negatives row-wise into one labelled frame.
contract_frames = [df_for_contract_enforcement, df_for_nonetype]
df_contract_enforcement_model = pd.concat(contract_frames, axis=0)

In [None]:
# Shuffle all rows and rebuild a clean 0..n-1 index.
# random_state pins the permutation so a fresh kernel reproduces the same order.
df_contract_enforcement_model = (
    df_contract_enforcement_model
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Build the labelled dataset for the guarantee-enforcement classifier.
# Negatives come from a de-duplicated pool and are sampled WITHOUT replacement, so the same
# sentence cannot land on both sides of the later train/test split; the seed makes the
# draw reproducible.
rng_garantee = np.random.default_rng(42)
nonetype_pool = pd.Series(nonetype_sentences).unique()
nonetype_1000 = rng_garantee.choice(
    nonetype_pool,
    size=min(1000, len(nonetype_pool)),  # never ask for more than the pool holds
    replace=False,
)

# De-duplicate the positives while keeping first-seen order (set() ordering is not stable across runs).
df_for_garantee_enforcement = pd.DataFrame(pd.unique(garantee_enforcement), columns=['sentence'])
# NOTE: this rebinds df_for_nonetype, which previously held the contract-enforcement negatives.
df_for_nonetype = pd.DataFrame(nonetype_1000, columns=['sentence'])

df_for_garantee_enforcement['label'] = 1
df_for_nonetype['label'] = 0

df_for_garantee_enforcement_model = pd.concat([df_for_garantee_enforcement, df_for_nonetype], axis=0)



In [None]:
# Shuffle all rows and rebuild a clean 0..n-1 index.
# random_state pins the permutation so a fresh kernel reproduces the same order.
df_for_garantee_enforcement_model = (
    df_for_garantee_enforcement_model
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


### SetFit model

In [None]:
# Keep only 2% of the rows for few-shot training; the rest is held out for evaluation.
# stratify preserves the label ratio in the tiny training split (so positives cannot be
# missed by chance), and random_state makes the split reproducible.
train, test = train_test_split(
    df_contract_enforcement_model,
    test_size=0.98,
    stratify=df_contract_enforcement_model['label'],
    random_state=42,
)

In [None]:
# Row counts of the training and held-out splits.
train.shape[0], test.shape[0]

(31, 1567)

In [None]:
# Convert both splits to Hugging Face datasets. preserve_index=False keeps the shuffled
# pandas index from being carried along as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train, preserve_index=False)
test_dataset = Dataset.from_pandas(test, preserve_index=False)

# NOTE(review): the sentences are Russian while this checkpoint is not the multilingual
# variant - consider evaluating a multilingual sentence-transformers model as well.
model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

# Few-shot trainer: contrastive fine-tuning with a cosine-similarity loss.
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss_class=CosineSimilarityLoss,
    num_iterations=20,
    # Map the frame's columns onto the names the trainer expects.
    column_mapping={"sentence": "text", "label": "label"},
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [None]:
# Train on the training split, then score the model on the held-out split.
trainer.train()
metrics = trainer.evaluate()

Applying column mapping to training dataset
***** Running training *****
  Num examples = 1240
  Num epochs = 1
  Total optimization steps = 78
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/78 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Так как модель SetFit отлично работает на малом количестве данных, можем оставить основную часть датасета в тестовой (валидационной) выборке.

In [None]:
# Show the evaluation metrics returned by trainer.evaluate().
metrics

{'accuracy': 0.982769623484365}

In [None]:
# Persist the trained contract-enforcement model before releasing the trainer: `model` is
# rebound to a fresh checkpoint when the guarantee-enforcement model is created later,
# which would otherwise discard this trained model without a saved copy.
joblib.dump(model, 'contract_enforcement_setfit_model.joblib')

# Drop the trainer reference so its resources can be reclaimed.
del trainer

In [None]:
# Force a garbage-collection pass after deleting the trainer;
# the displayed value is the number of unreachable objects found.
gc.collect()


217

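Модель, обученная всего на 31 строке данных, показывает accuracy 0.98 на тестовой выборке из 1567 строк. Выборка сильно несбалансирована (1500 отрицательных примеров), поэтому accuracy стоит дополнить precision/recall по положительному классу.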


### garantee enforcement SetFit model batch size = 3


In [None]:
# Keep 10% of the rows for few-shot training; the rest is held out for evaluation.
# stratify preserves the label ratio in both splits, and random_state makes the split
# reproducible.
train, test = train_test_split(
    df_for_garantee_enforcement_model,
    test_size=0.9,
    stratify=df_for_garantee_enforcement_model['label'],
    random_state=42,
)

In [None]:
# Row counts of the training and held-out splits.
train.shape[0], test.shape[0]

(144, 1302)

In [None]:
# Convert both splits to Hugging Face datasets. preserve_index=False keeps the shuffled
# pandas index from being carried along as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train, preserve_index=False)
test_dataset = Dataset.from_pandas(test, preserve_index=False)

# NOTE: this rebinds `model`; the previously trained model is no longer reachable by that name.
# NOTE(review): the sentences are Russian while this checkpoint is not the multilingual
# variant - consider evaluating a multilingual sentence-transformers model as well.
model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

# Few-shot trainer: contrastive fine-tuning with a cosine-similarity loss and a small batch.
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss_class=CosineSimilarityLoss,
    num_iterations=20,
    batch_size=3,
    # Map the frame's columns onto the names the trainer expects.
    column_mapping={"sentence": "text", "label": "label"},
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [None]:
# Train on the training split, then score the model on the held-out split.
trainer.train()
metrics = trainer.evaluate()

Applying column mapping to training dataset


Generating Training Pairs:   0%|          | 0/20 [00:00<?, ?it/s]

***** Running training *****
  Num examples = 5760
  Num epochs = 1
  Total optimization steps = 1920
  Total train batch size = 3


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1920 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
# Show the evaluation metrics returned by trainer.evaluate().
metrics

{'accuracy': 0.9738863287250384}

In [None]:
# Serialize the trained guarantee-enforcement model to a joblib file in the working directory.
# NOTE(review): a pickled model is tied to the library versions that produced it - confirm
# whether the library's own save/load mechanism would be a better fit here.
joblib.dump(model, 'garantee_enforcement_setfit_model.joblib')

['garantee_enforcement_setfit_model.joblib']

In [None]:
# Colab-only: trigger a browser download of the saved model file.
from google.colab import files
files.download('garantee_enforcement_setfit_model.joblib')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Загрузка garantee_enforcement модели из дампа

In [None]:
# Restore the guarantee-enforcement model that was written with joblib.dump.
# NOTE(review): despite the `_trainer` suffix, the dumped object was `model`
# (created via SetFitModel.from_pretrained), not a SetFitTrainer.
# NOTE(review): joblib.load unpickles the file - only load dumps from a trusted source.
garantee_enforcement_trainer = joblib.load('./garantee_enforcement_setfit_model.joblib')