# Новый раздел

In [None]:
# Environment check: report what the GPU driver, PyTorch and TensorFlow each
# see in this runtime before any model code runs.
import tensorflow as tf
import torch
# IPython shell escape (not plain Python): runs the nvidia-smi command-line tool.
!nvidia-smi

# Whether PyTorch reports CUDA as available.
print(torch.cuda.is_available())
# GPU device name as reported by TensorFlow.
print(tf.test.gpu_device_name())

Thu Oct  5 20:41:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   75C    P0    33W /  70W |   1855MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# %pip install transformers
from transformers import BertTokenizer
import torch
from torch.utils.data import DataLoader


# Module-level tokenizer shared by STSBDataset.get_batch_texts below; it is
# loaded from the same 'cointegrated/rubert-tiny' checkpoint name that the
# model cell uses.
tokenizer = BertTokenizer.from_pretrained('cointegrated/rubert-tiny')


class STSBDataset(torch.utils.data.Dataset):
    """Sentence-pair dataset with one similarity target per pair.

    ``dataset`` must support ``dataset[column]`` lookups for the columns
    "ИндексПохожести" (target score), "НазваниеОписание" (first text) and
    "Категория" (second text).  Scores are cast to ``float`` as-is; no
    rescaling is applied.  Both texts are passed through ``str``.
    """

    def __init__(self, dataset):
        self.similarity_scores = list(map(float, dataset["ИндексПохожести"]))
        self.first_sentences = list(dataset["НазваниеОписание"])
        self.second_sentences = list(dataset["Категория"])

        # One two-element list per row: [first text, second text].
        pairs = []
        for left, right in zip(self.first_sentences, self.second_sentences):
            pairs.append([str(left), str(right)])
        self.concatenated_sentences = pairs

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.concatenated_sentences)

    def get_batch_labels(self, idx):
        """Return the similarity score at ``idx`` wrapped in a tensor."""
        score = self.similarity_scores[idx]
        return torch.tensor(score)

    def get_batch_texts(self, idx):
        """Tokenize the pair at ``idx`` with the module-level ``tokenizer``.

        Every sequence is padded/truncated to 128 tokens and returned as
        PyTorch tensors.
        """
        pair = self.concatenated_sentences[idx]
        return tokenizer(
            pair,
            padding='max_length',
            max_length=128,
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return ``(tokenized pair, score tensor)`` for the row at ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)


def collate_fn(texts):
    """Split a batched encoding into one feature dict per sample.

    :param texts: mapping with 'input_ids' and 'attention_mask' entries whose
        first axis enumerates the samples of the batch; other keys are ignored
    :return: list of ``{'input_ids': ..., 'attention_mask': ...}`` dicts, one
        per sample, in batch order
    """
    per_sample = []
    for ids, mask in zip(texts['input_ids'], texts['attention_mask']):
        per_sample.append({'input_ids': ids, 'attention_mask': mask})
    return per_sample

In [None]:
import torch


class CosineSimilarityLoss(torch.nn.Module):
    """Loss between the cosine similarity of two embeddings and a target score.

    :param loss_fn: callable comparing predicted similarities with the labels
        (mean squared error by default)
    :param transform_fn: callable applied to the raw cosine similarities before
        they are handed to ``loss_fn`` (identity by default)
    """

    def __init__(self,  loss_fn=torch.nn.MSELoss(), transform_fn=torch.nn.Identity()):
        super(CosineSimilarityLoss, self).__init__()
        self.loss_fn = loss_fn
        self.transform_fn = transform_fn
        self.cos_similarity = torch.nn.CosineSimilarity(dim=1)

    def forward(self, inputs, labels):
        """Compute the loss for a batch of embedding pairs.

        :param inputs: sequence with one entry per sample; ``entry[0]`` and
            ``entry[1]`` are the two embeddings to compare
        :param labels: target similarity for every sample
        :return: the value produced by ``loss_fn``
        """
        # The embeddings already live on the model's device, so no global
        # `device` variable is needed here.
        emb_1 = torch.stack([inp[0] for inp in inputs])
        emb_2 = torch.stack([inp[1] for inp in inputs])
        outputs = self.transform_fn(self.cos_similarity(emb_1, emb_2))

        # Flatten instead of squeeze: squeeze() turns a batch of one into a
        # 0-d tensor, which no longer matches the (1,) prediction shape.
        targets = labels.reshape(-1).to(outputs.device)
        return self.loss_fn(outputs, targets)

In [None]:
import torch
from sentence_transformers import SentenceTransformer, models

'bert-base-uncased'
class BertForSTS(torch.nn.Module):
    """Sentence-embedding model built from the 'cointegrated/rubert-tiny' checkpoint.

    A ``models.Transformer`` encoder (sequences capped at 128 tokens) is
    followed by a ``models.Pooling`` layer sized to the encoder's word
    embedding dimension; both are wrapped in a ``SentenceTransformer``.
    """

    def __init__(self):
        super(BertForSTS, self).__init__()
        self.bert = models.Transformer('cointegrated/rubert-tiny', max_seq_length=128)
        self.pooling_layer = models.Pooling(self.bert.get_word_embedding_dimension())
        self.sts_bert = SentenceTransformer(modules=[self.bert, self.pooling_layer])

    def forward(self, input_data):
        """
        :param input_data: dict('input_ids': tensor([2, 128]), 'attention_mask': tensor([2, 128]))
        :return: the 'sentence_embedding' tensor produced by the wrapped model,
            one row per input sequence; its width is the encoder's embedding
            dimension (depends on the checkpoint, not necessarily 768)
        """
        # The embedding is returned on whatever device the wrapped model
        # produced it on; no module-level `device` variable is required.
        return self.sts_bert(input_data)['sentence_embedding']


In [None]:
import tensorflow as tf
# Print the GPU devices TensorFlow reports for this runtime.
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [None]:
import torch
from tqdm import tqdm
import random


def _batch_loss(model, criterion, batch, labels, device):
    """Return the loss for one batch: move tensors to `device`, embed each pair, score them."""
    features = collate_fn({
        'input_ids': batch['input_ids'].to(device),
        'attention_mask': batch['attention_mask'].to(device),
    })
    embeddings = [model(feature) for feature in features]
    return criterion(embeddings, labels.to(device))


def _mean_loss(total_loss, num_batches):
    """Average a summed loss over `num_batches`; NaN when there were no batches."""
    return total_loss / num_batches if num_batches else float('nan')


def train(model, device, optimizer, scheduler, train_dataloader, validation_dataloader, epochs=10):
    """Fine-tune `model` with a cosine-similarity loss and validate after every epoch.

    :param model: module mapping one feature dict (as produced by `collate_fn`)
        to the embeddings of its sentences
    :param device: device the batches and the loss module are moved to
    :param optimizer: optimizer stepped once per training batch
    :param scheduler: learning-rate scheduler stepped once per training batch
    :param train_dataloader: yields `(encoded batch, labels)` for training
    :param validation_dataloader: yields `(encoded batch, labels)` for validation
    :param epochs: number of passes over the training data
    :return: `(model, training_stats)`, where `training_stats` holds one dict
        per epoch with the keys 'epoch', 'Training Loss' and 'Valid. Loss'
    """
    seed_val = 42

    criterion = CosineSimilarityLoss()
    criterion = criterion.to(device)

    # Fixed seeds so repeated runs start from the same random state.
    random.seed(seed_val)
    torch.manual_seed(seed_val)

    training_stats = []

    for epoch in range(0, epochs):

        # Training

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
        print('Training...')

        total_train_loss = 0.0

        model.train()

        # For each batch of training data...
        for train_data, train_label in tqdm(train_dataloader):
            model.zero_grad()

            loss = _batch_loss(model, criterion, train_data, train_label, device)
            total_train_loss += loss.item()

            loss.backward()
            # Clip the gradient norm to 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        # Average over all batches; an empty loader yields NaN instead of a
        # ZeroDivisionError.
        avg_train_loss = _mean_loss(total_train_loss, len(train_dataloader))

        print("")
        print("  Average training loss: {0:.5f}".format(avg_train_loss))

        #  Validation

        print("")
        print("Running Validation...")

        total_eval_loss = 0.0

        model.eval()

        # Evaluate data for one epoch
        for val_data, val_label in tqdm(validation_dataloader):
            # No gradients are needed while validating.
            with torch.no_grad():
                loss = _batch_loss(model, criterion, val_data, val_label, device)
            total_eval_loss += loss.item()

        avg_val_loss = _mean_loss(total_eval_loss, len(validation_dataloader))

        print("  Validation Loss: {0:.5f}".format(avg_val_loss))

        # Record all statistics from this epoch.
        training_stats.append(
            {
                'epoch': epoch + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
            }
        )

    print("")
    print("Training complete!")

    return model, training_stats


In [None]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import torch
from torch.utils.data import DataLoader
import time
import pandas as pd

#from testing import predict_similarity
from sklearn.model_selection import train_test_split

# CSV with the sentence pairs and their similarity scores.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
path_info_with_sim = "/home/mikhail/Projects/bert_ozon/data/csv_with_similarity_score/info_with_sim_neg.csv"


if __name__ == '__main__':

    # Use the GPU when one is available, otherwise fall back to the CPU so the
    # cell also runs on CPU-only runtimes.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    # Read the CSV named by the constant defined above (the previous code
    # referenced an undefined `path_info_csv`).
    df = pd.read_csv(path_info_with_sim, delimiter=',', index_col=False)
    train_ds, test_ds = train_test_split(df, test_size=0.2, random_state=0)
    train_ds = STSBDataset(train_ds)
    test_ds = STSBDataset(test_ds)

    model = BertForSTS()

    batch_size = 8

    train_dataloader = DataLoader(
        train_ds,
        num_workers=4,
        batch_size=batch_size,
        shuffle=True
    )

    validation_dataloader = DataLoader(
        test_ds,
        num_workers=4,
        batch_size=batch_size
    )

    # Choose optimizer
    optimizer = AdamW(model.parameters(),
                      lr=1e-6)

    # Create scheduler
    epochs = 1
    # Total number of training steps is [number of batches] x [number of epochs].
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)

    # Train model
    model, training_stats = train(model=model.to(device),
                                  device=device,
                                  optimizer=optimizer,
                                  scheduler=scheduler,
                                  train_dataloader=train_dataloader,
                                  validation_dataloader=validation_dataloader,
                                  epochs=epochs)

    # Persist the fine-tuned weights.
    model_path_save = "model.pth"
    torch.save(model.state_dict(), model_path_save)

    # Persist the per-epoch statistics, one dict per line.
    training_stats_path_save = "training_stats.txt"
    with open(training_stats_path_save, 'w') as f:
        for item in training_stats:
            f.write("%s\n" % item)







Training...


  1%|          | 19/2276 [00:03<07:30,  5.01it/s]


KeyboardInterrupt: ignored