### Установка

In [2]:
!pip install setfit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Импорты и агрегация данных

In [4]:
from sentence_transformers.losses import CosineSimilarityLoss
from sklearn.model_selection import train_test_split
from setfit import SetFitModel, SetFitTrainer
from datasets import Dataset
import pandas as pd
import numpy as np
import json
import pickle


In [5]:
# Load the pickled contract-enforcement sentences.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open('/content/contract_enforcement.pkl', 'rb') as fp:
  contract_enforcement = pickle.load(fp)
print(len(contract_enforcement))

# Discard missing entries and keep the underlying values of what remains.
contract_enforcement = pd.Series(contract_enforcement).dropna().values
len(contract_enforcement)

988


984

In [6]:
# Load the pickled guarantee-enforcement sentences.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open('/content/garantee_enforcement.pkl', 'rb') as fp:
  garantee_enforcement = pickle.load(fp)
print(len(garantee_enforcement))

# Keep only the non-missing entries, as the underlying values.
garantee_enforcement = pd.Series(garantee_enforcement)
garantee_enforcement = garantee_enforcement[garantee_enforcement.notna()].values
len(garantee_enforcement)

811


508

In [7]:
# Load the nested collection of "other" fragments.
# NOTE(review): pickle.load can execute arbitrary code - only open trusted files.
with open('/content/nonetype.pkl', 'rb') as fp:
  nonetype_sentences = pickle.load(fp)

# Flatten the two-level structure, keeping only items whose length exceeds 5.
nonetype = np.array([
    nonetype_sentences[outer_idx][inner_idx]
    for outer_idx in range(len(nonetype_sentences))
    for inner_idx in range(len(nonetype_sentences[outer_idx]))
    if len(nonetype_sentences[outer_idx][inner_idx]) > 5
])
len(nonetype)

11232

In [8]:
# Peek at the first five retained fragments.
nonetype[:5]

array(['Версия с 04.07.2022 года',
       'Ю «Государственное автономное учреждение',
       "Саха (Якутия) ''Республиканская больница №1 -",
       "Национальный центр медицины''»",
       'Подписано усиленной квалифицированной электронной'], dtype='<U867')

In [9]:
# Negative class: up to 1000 distinct "other" fragments.
# - np.unique removes repeated fragments, and replace=False stops the same
#   fragment being drawn twice; repeats could otherwise land on both sides of
#   the later train/test split and inflate the score.
# - A seeded Generator makes the sample reproducible on Restart & Run All.
nonetype_unique = np.unique(nonetype)
nonetype_1000 = np.random.default_rng(42).choice(
    nonetype_unique, size=min(1000, len(nonetype_unique)), replace=False
)

# Positive class: de-duplicated sentences. pd.unique keeps first-seen order,
# whereas iterating a set() of strings can yield a different order on every run.
df_for_contract_enforcement = pd.DataFrame(pd.unique(contract_enforcement), columns=['sentence'])
df_for_nonetype = pd.DataFrame(nonetype_1000, columns=['sentence'])

In [10]:
# Binary target: 1 for contract-enforcement sentences, 0 for the sampled other fragments.
df_for_contract_enforcement = df_for_contract_enforcement.assign(label=1)
df_for_nonetype = df_for_nonetype.assign(label=0)

In [11]:
# Preview the first rows of the positive-class frame.
df_for_contract_enforcement.head()

Unnamed: 0,sentence,label
0,Размер обеспечения исполнения контракта 1635.0...,1
1,Размер обеспечения исполнения договора составл...,1
2,Размер обеспечения исполнения контракта 5 % от...,1
3,Размер обеспечения исполнения договора 10 % от...,1
4,Размер обеспечения исполнения Договора устанав...,1


In [12]:
# Preview the first rows of the negative-class frame.
df_for_nonetype.head()

Unnamed: 0,sentence,label
0,Кировского муниципального района,0
1,АО «АэроМар,0
2,В соответствии с Договором Поставщик обязуется,0
3,Наименование электронной площадки в информацио...,0
4,ИНН/КПП 3650003290/366401001,0


In [13]:
# Stack the positive rows on top of the negative rows (row-wise concatenation).
df_contract_enforcement_model = pd.concat(
    [df_for_contract_enforcement, df_for_nonetype],
)

In [14]:
# Shuffle all rows and renumber the index from 0.
# random_state fixes the permutation so a re-run reproduces the same order.
df_contract_enforcement_model = (
    df_contract_enforcement_model
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [15]:
# Negative class for the guarantee model: up to 500 distinct "other" fragments.
# np.unique + replace=False prevent repeated fragments; the seeded Generator
# makes the sample reproducible on Restart & Run All.
nonetype_unique = np.unique(nonetype)
nonetype_500 = np.random.default_rng(42).choice(
    nonetype_unique, size=min(500, len(nonetype_unique)), replace=False
)

# Positive class: de-duplicated sentences. pd.unique keeps first-seen order,
# whereas iterating a set() of strings can yield a different order on every run.
df_for_garantee_enforcement = pd.DataFrame(pd.unique(garantee_enforcement), columns=['sentence'])
# NOTE: this rebinds df_for_nonetype, replacing the frame built for the contract model.
df_for_nonetype = pd.DataFrame(nonetype_500, columns=['sentence'])

# Binary target: 1 for guarantee-enforcement sentences, 0 for other fragments.
df_for_garantee_enforcement['label'] = 1
df_for_nonetype['label'] = 0

# Stack positives and negatives row-wise.
df_for_garantee_enforcement_model = pd.concat([df_for_garantee_enforcement, df_for_nonetype], axis=0)



In [16]:
# Shuffle all rows and renumber the index from 0.
# random_state fixes the permutation so a re-run reproduces the same order.
df_for_garantee_enforcement_model = (
    df_for_garantee_enforcement_model
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


### SetFit model

In [17]:
# Few-shot setup: only 2 % of the rows are used for training, the rest for evaluation.
# - random_state makes the split (and therefore the reported metric) reproducible.
# - stratify keeps the label proportions in the tiny training split, so it cannot
#   end up with too few examples of one class by chance.
train, test = train_test_split(
    df_contract_enforcement_model,
    test_size=0.98,
    random_state=42,
    stratify=df_contract_enforcement_model['label'],
)

In [20]:
# Row counts of the training and evaluation splits.
len(train), len(test)

(31, 1567)

In [18]:
# Convert the pandas splits to Hugging Face datasets.
# preserve_index=False stops the shuffled pandas index from being exported as an
# extra '__index_level_0__' column that the model never uses.
train_dataset = Dataset.from_pandas(train, preserve_index=False)
test_dataset = Dataset.from_pandas(test, preserve_index=False)

# Sentence-embedding backbone; the classification head starts untrained.
# NOTE(review): the sentences are Russian while this checkpoint looks English-centric;
# a multilingual checkpoint may be a better fit - confirm before changing.
model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")

trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss_class=CosineSimilarityLoss,
    num_iterations=20,
    # Map the frame's column names onto the names the trainer expects.
    column_mapping={"sentence": "text", "label": "label"},
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [19]:
# Fit on the trainer's train_dataset, then score on its eval_dataset.
trainer.train()
metrics = trainer.evaluate()

Applying column mapping to training dataset
***** Running training *****
  Num examples = 1240
  Num epochs = 1
  Total optimization steps = 78
  Total train batch size = 16


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/78 [00:00<?, ?it/s]

Applying column mapping to evaluation dataset
***** Running evaluation *****


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Так как SetFit хорошо работает на малом количестве размеченных данных, для обучения берём лишь 2 % датасета (31 пример), а основную часть (1567 примеров) оставляем в тестовой выборке.

In [21]:
# Show the evaluation metrics returned by trainer.evaluate().
metrics

{'accuracy': 0.982769623484365}

Модель, обученная всего на 31 примере, показывает accuracy ≈ 0.98 на тестовой выборке из 1567 примеров.
