In [None]:
import transformers

from transformers import TFBertModel, BertTokenizer
from transformers import Trainer

import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import datasets
import pyarrow as pa
import pyarrow.dataset as ds
import numpy as np
from sklearn.model_selection import train_test_split

# Создание датасета

In [None]:
import csv

import psycopg2
from dotenv import dotenv_values

# Connection settings are read from a local .env file so that no credentials
# are hard-coded in the notebook.
config = dotenv_values(".env")
LOGIN = config["login"]
PASSWORD = config["password"]
DATABASE = config["database"]

# Every table listed here is dumped to "<table>.csv" in the working directory.
tables = ['rf_subjects', 'rf_regions', 'rf_cities', 'addresses', 'jobs', 'departments', 'directions', 'sections', 'specialites', 'highscools', 'diplomas', 'mil_ranks', 'mil_specs', 'militaries', 'employee', 'paytypes', 'pays']

connection = psycopg2.connect(database=DATABASE, user=LOGIN, password=PASSWORD)
try:
    # `with connection` only scopes a transaction (commit/rollback); it does not
    # close the connection, hence the explicit close() in the `finally` below.
    with connection, connection.cursor() as cur:
        for table in tables:
            # Table names come from the hard-coded list above, never from user
            # input, so interpolating them into the query is safe here.
            cur.execute(f'SELECT * FROM {table}')
            # NOTE: `data` is rebound on every iteration, so after the loop it
            # holds only the rows of the last table; the next cell reads it.
            data = cur.fetchall()

            # newline='' is required by the csv module (otherwise extra blank
            # lines appear on Windows); an explicit encoding keeps the files
            # identical regardless of the machine's locale.
            with open(f'{table}.csv', 'w', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                # Header row: column names reported by the cursor.
                writer.writerow([desc[0] for desc in cur.description])
                writer.writerows(data)
finally:
    connection.close()

In [None]:
# Seed for the train/test split so that Restart & Run All reproduces the same
# partition every time.
SPLIT_SEED = 42

# NOTE(review): `data` is whatever the export cell left behind, i.e. the rows of
# the LAST table in `tables` only, and the resulting columns are unnamed
# (0, 1, 2, ...). Confirm that this is the intended training data.
df = pd.DataFrame(data)

# From here on `data` is a `datasets.Dataset`, no longer the raw row list.
data = datasets.Dataset(pa.Table.from_pandas(df))

# Hold out 10% of the rows for evaluation.
train_test_dataset = data.train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Создание модели
Модель будет принимать входящий текст и определять принадлежность каждого токена к группе чувствительных данных.

In [None]:

class TransformerClassificationModel(nn.Module):
    """Transformer backbone followed by a small MLP classification head.

    The head is applied to every hidden vector the backbone produces, so a
    ``(batch, seq, hidden)`` backbone output yields ``(batch, seq, num_classes)``
    logits (one prediction per token).

    Args:
        model_name: Tag that selects how the backbone is introspected.
            ``"model1"`` reads the hidden width from ``backbone.pooler.dense``;
            any other value prefers ``backbone.config.hidden_size``.
        base_transformer_model: The encoder to wrap.
        num_classes: Number of target classes, or a sized collection of class
            labels (its length is used).
    """

    def __init__(self, model_name, base_transformer_model, num_classes):
        super().__init__()
        self.backbone = base_transformer_model
        self.name = model_name
        # The notebook passes the list of class names; nn.Linear needs a count.
        if not isinstance(num_classes, int):
            num_classes = len(num_classes)
        self.num_classes = num_classes
        self.in_dim = self._backbone_width()
        self.linlayers = nn.Sequential(
            nn.Linear(self.in_dim, 128),
            nn.BatchNorm1d(128),
            nn.Tanh(),
            nn.Linear(128, 32),
            nn.BatchNorm1d(32),
            nn.Tanh(),
            nn.Linear(32, num_classes),
        )

    def _backbone_width(self):
        """Return the size of the backbone's hidden vectors as an int."""
        if self.name == "model1":
            return self.backbone.pooler.dense.out_features
        hidden_size = getattr(getattr(self.backbone, "config", None), "hidden_size", None)
        if isinstance(hidden_size, int):
            return hidden_size
        # Last resort: the same pooler lookup that "model1" uses.
        dense = getattr(getattr(self.backbone, "pooler", None), "dense", None)
        width = getattr(dense, "out_features", None)
        if isinstance(width, int):
            return width
        raise ValueError(
            "Cannot determine the backbone hidden size: expected "
            "`config.hidden_size` or `pooler.dense.out_features`."
        )

    @staticmethod
    def _hidden_states(backbone_output):
        """Extract the hidden-state tensor from whatever the backbone returned."""
        if torch.is_tensor(backbone_output):
            return backbone_output
        hidden = getattr(backbone_output, "last_hidden_state", None)
        if hidden is not None:
            return hidden
        # Tuple-like outputs: the hidden states are taken from the first slot.
        return backbone_output[0]

    def forward(self, inputs=None, x=None, input_ids=None, attention_mask=None,
                token_type_ids=None, labels=None):
        """Compute class logits (and optionally the loss).

        Args:
            inputs: Unused when ``x`` or ``input_ids`` is given; kept so the
                original positional call ``model(inputs, x)`` still works.
            x: Token ids fed to the backbone.
            input_ids: Keyword alias for ``x`` (the key produced by the
                tokenizer).
            attention_mask: Forwarded to the backbone when provided.
            token_type_ids: Forwarded to the backbone when provided.
            labels: Optional class indices with the same leading shape as the
                logits; positions equal to -100 are ignored by the loss.

        Returns:
            The logits tensor when ``labels`` is None; otherwise a dict with
            ``"loss"`` (cross-entropy) and ``"logits"``.
        """
        token_ids = x
        if token_ids is None:
            token_ids = input_ids if input_ids is not None else inputs
        if token_ids is None:
            raise ValueError("forward() needs token ids via `x` or `input_ids`.")

        backbone_kwargs = {}
        if attention_mask is not None:
            backbone_kwargs["attention_mask"] = attention_mask
        if token_type_ids is not None:
            backbone_kwargs["token_type_ids"] = token_type_ids

        hidden = self._hidden_states(self.backbone(token_ids, **backbone_kwargs))

        # BatchNorm1d treats dim 1 as the feature axis, so collapse all leading
        # dims into one before the head and restore them afterwards.
        flat = hidden.reshape(-1, hidden.shape[-1])
        outputs = self.linlayers(flat).reshape(*hidden.shape[:-1], -1)

        if labels is None:
            return outputs

        loss = nn.functional.cross_entropy(
            outputs.reshape(-1, outputs.shape[-1]),
            labels.reshape(-1),
            ignore_index=-100,
        )
        return {"loss": loss, "logits": outputs}

In [None]:
def freeze_backbone_function(model: "TransformerClassificationModel"):
    """Freeze the backbone of ``model`` in place so only the head is trained.

    For ``model.name == "model1"`` the whole ``model.backbone`` is frozen.
    For any other name the encoder stored under ``model.backbone.bert`` is
    frozen when that attribute exists, otherwise the whole backbone.

    Args:
        model: Wrapper exposing ``name`` and ``backbone`` attributes.
    """
    target = model.backbone
    if model.name != "model1":
        target = getattr(target, "bert", target)

    if hasattr(target, "parameters"):
        # PyTorch module: stop gradients on every parameter.
        for param in target.parameters():
            param.requires_grad = False
    else:
        # Keras-style layers are frozen through their `trainable` flag; setting
        # `requires_grad` on their variables would have no effect.
        target.trainable = False

In [None]:
def preprocess_function(examples):
    """Tokenize ``examples`` and attach tokenized targets as ``labels``.

    Uses the notebook-level ``tokenizer``. The inputs are truncated to 1024
    tokens and the targets (the same ``examples`` passed as ``text_target``)
    to 128 tokens; the target ``input_ids`` are stored under ``"labels"``.

    NOTE(review): the labels produced here are token ids of the text itself,
    not class indices — confirm this matches what the classifier head expects.
    NOTE(review): 1024 may exceed the position limit of the checkpoint loaded
    elsewhere in the notebook — confirm.
    """
    encoded = tokenizer(examples, truncation=True, max_length=1024)
    target_ids = tokenizer(text_target=examples, truncation=True, max_length=128)["input_ids"]
    encoded["labels"] = target_ids
    return encoded

# Тренировка модели

In [None]:
import copy

def train_transformer(transformer_model, train_dataset, eval_dataset, freeze_backbone=True, tokenizer=None, data_collator=None):
    """Fine-tune a copy of ``transformer_model`` and return the trained copy.

    Args:
        transformer_model: Model to train; it is deep-copied, so the caller's
            instance is left untouched.
        train_dataset: Dataset used for training (mapped through
            ``preprocess_function``).
        eval_dataset: Dataset evaluated at the end of every epoch (mapped
            through ``preprocess_function``).
        freeze_backbone: When truthy, the copy's backbone is frozen via
            ``freeze_backbone_function`` before training.
        tokenizer: Tokenizer handed to the collator and the ``Trainer``.
        data_collator: Optional collator; when omitted a
            ``DataCollatorForTokenClassification`` is built from ``tokenizer``.

    Returns:
        The trained model held by the ``Trainer``.
    """
    # A shallow copy would share parameters and sub-modules with the original,
    # so freezing or training the "copy" would silently modify the caller's
    # model (and every later run started from it).
    model = copy.deepcopy(transformer_model)
    if freeze_backbone:
        freeze_backbone_function(model)

    # NOTE(review): preprocess_function reads the notebook-level `tokenizer`,
    # not the `tokenizer` argument of this function — keep the two in sync.
    tokenized_train = train_dataset.map(preprocess_function, batched=True)
    tokenized_eval = eval_dataset.map(preprocess_function, batched=True)

    # Respect a collator supplied by the caller; only build the default one
    # when none was given.
    if data_collator is None:
        data_collator = transformers.DataCollatorForTokenClassification(tokenizer=tokenizer)

    training_args = transformers.TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=2,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_eval,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    trainer.train()

    return trainer.model

In [None]:
# Label set: one entry per category of sensitive data the model distinguishes.
classes = [
    'name',
    'capital',
    'region',
    'city',
    'street',
    'house',
    'flat',
    'boss',
    'dept',
    'seria',
    'passport',
    'birthdate date',
    'age',
    'surname',
    'salary',
]

In [None]:
tokenizer = BertTokenizer.from_pretrained("tbs17/MathBERT")

# The classifier is a torch nn.Module trained with the PyTorch Trainer, so the
# backbone must be the PyTorch BertModel rather than the TensorFlow TFBertModel.
backbone = transformers.BertModel.from_pretrained("tbs17/MathBERT")

# "model1" selects the PyTorch-style attribute access (`pooler.dense`,
# `parameters()`) in the classifier and in the freeze helper. The head needs the
# NUMBER of classes, not the list of names.
math_bert = TransformerClassificationModel("model1", backbone, len(classes))
math_bert_freeze_finetuned = train_transformer(transformer_model=math_bert, train_dataset=train_test_dataset['train'], eval_dataset=train_test_dataset['test'], freeze_backbone=True, tokenizer=tokenizer)

# The full fine-tuning run gets its own freshly loaded backbone and head so it
# cannot inherit frozen or already-trained weights from the run above.
full_finetune_backbone = transformers.BertModel.from_pretrained("tbs17/MathBERT")
rubert_tiny_transformer_model = TransformerClassificationModel("model1", full_finetune_backbone, len(classes))
math_bert_full_finetuned = train_transformer(rubert_tiny_transformer_model, train_dataset=train_test_dataset['train'], eval_dataset=train_test_dataset['test'], freeze_backbone=False, tokenizer=tokenizer)