In [None]:
! pip install torch torchvision torchaudio
! pip install transformers
! pip install pandas openpyxl

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import pandas as pd

# Location of the Excel workbook; the code below reads its 'text2' and 'class' columns.
file_path = 'path_to_your_file.xlsx'
df_ = pd.read_excel(file_path)

# Text fed to the model: 'text2' with missing values replaced by empty strings.
df_['combined_text'] = df_['text2'].fillna("").astype('str')

# Stratified 30% subsample: draw 30% of the rows of every class.
# `DataFrameGroupBy.sample` replaces the former `groupby(...).apply(lambda x: x.sample(...))`,
# which operated on the grouping column and triggers a pandas deprecation warning.
# Per-class sample sizes are unchanged; the particular rows drawn may differ from
# the apply-based version because the seed is no longer re-applied for each group.
df = df_.groupby('class').sample(frac=0.3, random_state=42)

# Discard the original row labels so the sample is indexed 0..len(df)-1.
df = df.reset_index(drop=True)

print("Размер исходного датасета:", len(df_))
print("Размер выборки 30%:", len(df))
print("Примеры из выборки:")
print(df.head())

Размер исходного датасета: 226282
Размер выборки 30%: 67885
Примеры из выборки:
                                               text1  \
0  \n\nФорма грудной клетки: обычная\nЛегочные по...   
1  \n\nФорма грудной клетки: обычная\nЛегочные по...   
2  \nВыполнена флюорография органов грудной клетк...   
3  \nКлиническая информация: в 2х проекциях\nФорм...   
4  \n Клиническая информация:\n Выполнена рентген...   

                                               text2  class  \
0             \nПатологических изменений не выявлено      0   
1  \nОчаговые и инфильтративные изменения не выяв...      0   
2            \nПатологических изменений не выявлено.      0   
3  \nРентгенологических признаков патологических ...      0   
4  \n Патологических изменений органов грудной кл...      0   

                                       combined_text  
0             \nПатологических изменений не выявлено  
1  \nОчаговые и инфильтративные изменения не выяв...  
2            \nПатологических изменени

  df = df_.groupby('class', group_keys=False).apply(lambda x: x.sample(frac=0.3, random_state=42))


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sampled rows for evaluation.
# The split is stratified on the label so that the train and test parts keep the
# class proportions of the (already class-stratified) sample; without `stratify`
# the proportions in each part are left to chance.
# Note: stratification requires at least two rows per class.
all_texts = df['combined_text'].tolist()
all_labels = df['class'].tolist()

train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
    stratify=all_labels,
)



In [None]:
from transformers import BertTokenizer

# Tokenizer belonging to the multilingual BERT checkpoint that is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Both splits are encoded with the same options: truncation and padding enabled,
# sequence length capped at 512.
encode_options = {'truncation': True, 'padding': True, 'max_length': 512}

train_encodings = tokenizer(train_texts, **encode_options)
test_encodings = tokenizer(test_texts, **encode_options)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
import torch

class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per example.
        labels: Indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus a ``'labels'`` tensor.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a dataset object for the trainer.
train_dataset = TextDataset(encodings=train_encodings, labels=train_labels)
test_dataset = TextDataset(encodings=test_encodings, labels=test_labels)


In [None]:
from transformers import BertForSequenceClassification
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the multilingual BERT checkpoint with a two-label classification head
# and move it onto the selected device.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=2,
).to(device)

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments, Trainer
import wandb
from transformers.integrations import WandbCallback
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# Use the GPU when PyTorch can see one, otherwise fall back to the CPU, and report the choice.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: Unpackable pair ``(logits, labels)``; the predicted class is
            the argmax of ``logits`` along the last axis.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    # Support-weighted averages over the classes; the fourth element of the
    # returned tuple is not used.
    weighted_scores = precision_recall_fscore_support(
        labels, predicted_classes, average='weighted'
    )
    weighted_precision = weighted_scores[0]
    weighted_recall = weighted_scores[1]
    weighted_f1 = weighted_scores[2]

    metrics = {}
    metrics['accuracy'] = accuracy_score(labels, predicted_classes)
    metrics['f1'] = weighted_f1
    metrics['precision'] = weighted_precision
    metrics['recall'] = weighted_recall
    return metrics


# Start a Weights & Biases run in offline mode for this training session.
wandb.init(mode='offline', project='my_project')

# Mixed-precision (fp16) training needs a CUDA device; enable it only when one is
# available so the CPU fallback chosen for `device` does not fail at start-up.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir='./results',
    run_name='my_unique_run_name',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    fp16=use_fp16,
    dataloader_num_workers=4,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=1000,
    # Together with the per-device batch size of 8 this gives 16 examples per
    # optimizer step on a single device.
    gradient_accumulation_steps=2,
    # Registers the W&B logging callback; no explicit WandbCallback is needed.
    report_to="wandb",
    load_best_model_at_end=True,
    metric_for_best_model='f1'
)

# The model is deliberately NOT wrapped in torch.nn.DataParallel here: Trainer
# sets up multi-GPU execution itself, and handing it a wrapped module hides the
# underlying pretrained model (its config and `save_pretrained`) from Trainer.
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")

# No `callbacks=[WandbCallback()]`: `report_to="wandb"` already adds that callback,
# and passing it again registers a duplicate (Trainer warns about exactly this).
trainer = Trainer(
    model=model.to(device),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# NOTE(review): `test_dataset` is also the `eval_dataset` used to pick the best
# checkpoint (`load_best_model_at_end`), so the figures below are likely
# optimistic; consider a separate validation split for checkpoint selection.
final_metrics = trainer.evaluate(test_dataset)
print("Финальные метрики на тестовых данных:")
print(f"Accuracy: {final_metrics['eval_accuracy']:.4f}")
print(f"F1-score: {final_metrics['eval_f1']:.4f}")
print(f"Precision: {final_metrics['eval_precision']:.4f}")
print(f"Recall: {final_metrics['eval_recall']:.4f}")

# Persist the fine-tuned model and its tokenizer side by side in one directory.
trainer.save_model('./saved_model')
tokenizer.save_pretrained('./saved_model')

You are adding a <class 'transformers.integrations.integration_utils.WandbCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is
:DefaultFlowCallback
WandbCallback


Using device: cuda




Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.1214,0.062311,0.984975,0.984974,0.984989,0.984975
1000,0.1573,0.183639,0.952272,0.952225,0.953405,0.952272
1500,0.0948,0.230517,0.945496,0.94549,0.945541,0.945496
2000,0.1139,0.238211,0.941887,0.941885,0.941894,0.941887
2500,0.2407,0.22174,0.944539,0.944534,0.944568,0.944539
3000,0.2566,0.18085,0.95242,0.952391,0.954368,0.95242
3500,0.0681,0.109759,0.976946,0.976945,0.97696,0.976946
4000,0.1089,0.114291,0.972601,0.972585,0.973199,0.972601
4500,0.6915,0.682855,0.52228,0.374316,0.754047,0.52228
5000,0.0637,0.104093,0.978493,0.978488,0.978663,0.978493





Финальные метрики на тестовых данных:
Accuracy: 0.9859
F1-score: 0.9859
Precision: 0.9860
Recall: 0.9859


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

In [None]:
# Evaluate on the trainer's configured evaluation dataset and show the metrics
# dict as the cell output.
eval_metrics = trainer.evaluate()
eval_metrics



{'eval_loss': 0.07611311227083206,
 'eval_accuracy': 0.9858584370626795,
 'eval_f1': 0.9858556565027133,
 'eval_precision': 0.9860019558437415,
 'eval_recall': 0.9858584370626795,
 'eval_runtime': 68.4657,
 'eval_samples_per_second': 198.304,
 'eval_steps_per_second': 12.4,
 'epoch': 2.999263514508764}