In [1]:
import torch
from torch import nn
from transformers import BertTokenizer, BertModel
from torch.optim import AdamW  # Импортируем AdamW из torch.optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, Dataset

# Step 1: load the pretrained tokenizer and encoder from one shared checkpoint name
PRETRAINED_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
bert_model = BertModel.from_pretrained(PRETRAINED_CHECKPOINT)

# Шаг 2: Создание пользовательского Dataset
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Indexable collection of raw strings.
        labels: Indexable collection of labels aligned with ``texts``.
        tokenizer: Callable used to tokenize a single text. When ``None``
            (the default) the module-level ``tokenizer`` is looked up at
            item-access time, which keeps ``TextDataset(texts, labels)``
            working as before.
        max_length: Length every item is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer=None, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(inputs, label)`` for the example at ``idx``.

        ``inputs`` is whatever the tokenizer returns for the text; the
        training loops in this notebook index it by ``'input_ids'`` and
        ``'attention_mask'`` and squeeze axis 1 after batching.
        """
        text = self.texts[idx]
        label = self.labels[idx]
        tokenize = self.tokenizer if self.tokenizer is not None else tokenizer
        # padding=True only pads to the longest sequence of *this* call, which
        # is a no-op for a single text; items would then have different
        # lengths and the DataLoader's default collation could not stack them.
        # Padding to a fixed max_length gives every item the same length.
        inputs = tokenize(
            text,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
        )
        return inputs, label

# Example data
texts = ["This is a positive example.", "This is a negative example."]
labels = [1, 0]  # assume 1 = positive, 0 = negative

# Split into train and test sets; a fixed random_state makes the split
# (and therefore the reported accuracy) reproducible across re-runs.
SPLIT_SEED = 42
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=SPLIT_SEED
)

# Build the training DataLoader
train_dataset = TextDataset(train_texts, train_labels)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)

# Шаг 3: Определение модели
class BertClassifier(nn.Module):
    """Encoder followed by a single linear classification layer.

    Args:
        bert: Encoder module exposing ``config.hidden_size`` and returning an
            object with a ``pooler_output`` attribute. When ``None`` (the
            default) the module-level ``bert_model`` is used, as before.
        num_classes: Number of output logits per example (default 2).
    """

    def __init__(self, bert=None, num_classes=2):
        super().__init__()
        self.bert = bert_model if bert is None else bert
        # Size the head from the encoder actually in use, not from the global.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return classification logits computed from the encoder's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(encoded.pooler_output)

# Model, optimizer and loss setup
model = BertClassifier()
optimizer = AdamW(model.parameters(), lr=1e-5)  # AdamW from torch.optim
criterion = nn.CrossEntropyLoss()  # built once instead of on every step

# Step 4: training
model.train()
for epoch in range(3):  # train for 3 epochs
    # The batch labels get their own name so the module-level `labels` list
    # is not silently replaced by a tensor.
    for inputs, batch_labels in train_loader:
        # squeeze(1) drops the size-1 axis each item carries at position 1 after batching
        input_ids = inputs['input_ids'].squeeze(1)
        attention_mask = inputs['attention_mask'].squeeze(1)
        batch_labels = batch_labels.to(torch.long)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, batch_labels)
        loss.backward()
        optimizer.step()

# Step 5: evaluation
model.eval()
test_dataset = TextDataset(test_texts, test_labels)
test_loader = DataLoader(test_dataset, batch_size=2)

predictions, true_labels = [], []
with torch.no_grad():
    for inputs, batch_labels in test_loader:
        input_ids = inputs['input_ids'].squeeze(1)
        attention_mask = inputs['attention_mask'].squeeze(1)

        logits = model(input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1)
        # tolist() yields plain Python ints and also works for non-CPU tensors
        predictions.extend(preds.tolist())
        true_labels.extend(batch_labels.tolist())

# Report accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 0.00%


In [5]:
# %pip installs into the environment of the running kernel; `!pip` may hit a different one
%pip install transformers



In [3]:
# %pip installs into the environment of the running kernel; `!pip` may hit a different one
%pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-18.0.0-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.2-cp311-cp311-win_amd64.whl.metadata (8.0 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Downloading aiohappyeyeballs-2.4.3-py3

In [8]:
from datasets import load_dataset

# Load the IMDb dataset
dataset = load_dataset("imdb")

# NOTE(review): the IMDb splits appear to be stored grouped by label, so a
# plain [:N] slice would contain a single class — confirm. Shuffling with a
# fixed seed before taking the subsample gives both classes reproducibly.
SUBSAMPLE_SEED = 42
train_subset = dataset['train'].shuffle(seed=SUBSAMPLE_SEED).select(range(1000))  # 1000 training examples
test_subset = dataset['test'].shuffle(seed=SUBSAMPLE_SEED).select(range(200))  # 200 test examples
train_texts = list(train_subset['text'])
train_labels = list(train_subset['label'])
test_texts = list(test_subset['text'])
test_labels = list(test_subset['label'])

# Build the training loader (larger batch size than the toy example).
# NOTE(review): this cell is written against the two-argument TextDataset that
# yields (inputs, label) pairs. The recorded
# "missing 1 required positional argument: 'tokenizer'" error occurs when a
# three-argument TextDataset from another cell is live in the kernel; restart
# and run the cells top to bottom.
# NOTE(review): default collation needs every item to have the same token
# length — confirm TextDataset pads to a fixed length.
train_dataset = TextDataset(train_texts, train_labels)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

criterion = nn.CrossEntropyLoss()  # built once instead of on every step

# Training (uses the `model` and `optimizer` already defined in the kernel)
model.train()
for epoch in range(3):  # train for 3 epochs
    # The batch labels get their own name so a module-level `labels` list is
    # not silently replaced by a tensor.
    for inputs, batch_labels in train_loader:
        input_ids = inputs['input_ids'].squeeze(1)
        attention_mask = inputs['attention_mask'].squeeze(1)
        batch_labels = batch_labels.to(torch.long)

        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, batch_labels)
        loss.backward()
        optimizer.step()

# Evaluation
model.eval()
test_dataset = TextDataset(test_texts, test_labels)
test_loader = DataLoader(test_dataset, batch_size=8)

predictions, true_labels = [], []
with torch.no_grad():
    for inputs, batch_labels in test_loader:
        input_ids = inputs['input_ids'].squeeze(1)
        attention_mask = inputs['attention_mask'].squeeze(1)

        logits = model(input_ids, attention_mask)
        preds = torch.argmax(logits, dim=1)
        # tolist() yields plain Python ints and also works for non-CPU tensors
        predictions.extend(preds.tolist())
        true_labels.extend(batch_labels.tolist())

# Report accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f'Accuracy: {accuracy * 100:.2f}%')


TypeError: TextDataset.__init__() missing 1 required positional argument: 'tokenizer'

In [7]:
from transformers import BertTokenizer
from torch.utils.data import DataLoader, Dataset
import torch

# Toy corpus: each sentence is paired with its example label
sample_pairs = (
    ("This is a short sentence.", 0),
    ("This is a much longer sentence than the previous one.", 1),
)
texts = [sentence for sentence, _ in sample_pairs]
labels = [target for _, target in sample_pairs]

# Tokenizer initialisation
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Создание пользовательского набора данных
class TextDataset(Dataset):
    """Map-style dataset yielding fixed-length tokenized texts with labels.

    Args:
        texts: Sized, indexable collection of raw strings.
        labels: Sized, indexable collection of labels aligned with ``texts``.
        tokenizer: Callable tokenizer applied to each text.
        max_length: Length every item is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        if len(texts) != len(labels):
            # Fail early instead of raising IndexError (or silently ignoring
            # surplus labels) somewhere in the middle of an epoch.
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``'input_ids'``, ``'attention_mask'`` and ``'label'``."""
        text = self.texts[idx]
        label = self.labels[idx]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour
        # of `__call__`, which accepts the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,  # adds [CLS] and [SEP]
            max_length=self.max_length,
            padding='max_length',  # pad every item to max_length
            truncation=True,  # cut sequences longer than max_length
            return_tensors='pt',  # return PyTorch tensors
        )

        return {
            # drop the leading size-1 axis so items are 1-D
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.as_tensor(label, dtype=torch.long),
        }

# Wrap the toy corpus in a dataset and a shuffling loader
dataset = TextDataset(texts, labels, tokenizer)
data_loader = DataLoader(dataset=dataset, shuffle=True, batch_size=2)

# Walk the loader and show each field of every batch
for batch in data_loader:
    for field in ('input_ids', 'attention_mask', 'label'):
        print(batch[field])


tensor([[ 101, 2023, 2003,  ...,    0,    0,    0],
        [ 101, 2023, 2003,  ...,    0,    0,    0]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([0, 1])


In [9]:
# %pip installs into the environment of the running kernel; `!pip` may hit a different one
%pip install transformers torch



In [10]:
# Example sentences paired with their labels (1 = positive, 0 = negative)
labeled_examples = (
    ("I love programming.", 1),
    ("Python is great for data science.", 1),
    ("I hate bugs in my code.", 0),
    ("Debugging is fun!", 1),
    ("I dislike syntax errors.", 0),
)
texts = [sentence for sentence, _ in labeled_examples]
labels = [target for _, target in labeled_examples]


In [11]:
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    """Map-style dataset yielding fixed-length tokenized texts with labels.

    Args:
        texts: Sized, indexable collection of raw strings.
        labels: Sized, indexable collection of labels aligned with ``texts``.
        tokenizer: Callable tokenizer applied to each text.
        max_length: Length every item is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        if len(texts) != len(labels):
            # Fail early instead of raising IndexError (or silently ignoring
            # surplus labels) somewhere in the middle of an epoch.
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``'input_ids'``, ``'attention_mask'`` and ``'label'``."""
        text = self.texts[idx]
        label = self.labels[idx]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour
        # of `__call__`, which accepts the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',  # pad every item to max_length
            truncation=True,
            return_tensors='pt',
        )

        return {
            # drop the leading size-1 axis so items are 1-D
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.as_tensor(label, dtype=torch.long),
        }


In [12]:
from transformers import BertTokenizer, BertForSequenceClassification

# Load the tokenizer and the classification model from one shared checkpoint name
CLASSIFIER_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(CLASSIFIER_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(
    CLASSIFIER_CHECKPOINT,
    num_labels=2,  # 2 classes
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

# Build the dataset and the data loader
dataset = TextDataset(texts, labels, tokenizer)
data_loader = DataLoader(dataset, batch_size=2, shuffle=True)

# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training
model.train()
for epoch in range(3):  # 3 epochs
    print(f'Epoch {epoch + 1}')
    for batch in tqdm(data_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        # Use a distinct name: assigning to `labels` here would replace the
        # module-level label list with the last batch tensor, so re-running
        # this cell would build the dataset from the wrong labels.
        batch_labels = batch['label']

        # Forward pass (the model computes the loss when labels are given)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

        print(f'Loss: {loss.item()}')


Epoch 1


 33%|███▎      | 1/3 [00:03<00:07,  3.98s/it]

Loss: 0.6848486661911011


 67%|██████▋   | 2/3 [00:07<00:03,  3.91s/it]

Loss: 0.7694594860076904


100%|██████████| 3/3 [00:09<00:00,  3.11s/it]


Loss: 0.6712468862533569
Epoch 2


 33%|███▎      | 1/3 [00:02<00:04,  2.30s/it]

Loss: 0.5280494093894958


 67%|██████▋   | 2/3 [00:04<00:02,  2.29s/it]

Loss: 0.689810574054718


100%|██████████| 3/3 [00:05<00:00,  1.99s/it]


Loss: 0.565945565700531
Epoch 3


 33%|███▎      | 1/3 [00:02<00:04,  2.38s/it]

Loss: 0.5241571664810181


 67%|██████▋   | 2/3 [00:04<00:02,  2.37s/it]

Loss: 0.7262871265411377


100%|██████████| 3/3 [00:06<00:00,  2.06s/it]

Loss: 0.5807842016220093





In [14]:
def predict(text, max_length=512):
    """Return the predicted class index for a single text.

    Uses the module-level ``model`` and ``tokenizer``. The model is switched
    to evaluation mode and left in that mode.

    Args:
        text: The string to classify.
        max_length: Maximum number of tokens; longer inputs are truncated.

    Returns:
        int: Index of the highest-scoring logit.
    """
    model.eval()
    # Call the tokenizer directly (`encode_plus` is deprecated). A single
    # sequence needs no padding, so the model only processes the real tokens
    # instead of a sequence padded out to max_length.
    inputs = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        return_tensors='pt',
    )
    # Keep the inputs on the same device as the model's parameters.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    return torch.argmax(logits, dim=1).item()

# Example predictions: print the predicted class for each sample sentence
for sample_sentence in (
    "I enjoy solving problems.",  # a positive result is expected
    "I don't like this at all.",  # a negative result is expected
):
    print(predict(sample_sentence))


0
1


In [15]:
# Training: three more epochs with the existing `model`, `optimizer` and `data_loader`
model.train()
for epoch in range(3):  # 3 epochs
    print(f'Epoch {epoch + 1}')
    for batch in tqdm(data_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        # Use a distinct name: assigning to `labels` here would replace the
        # module-level label list with the last batch tensor and break any
        # cell that later rebuilds the dataset from `labels`.
        batch_labels = batch['label']

        # Forward pass (the model computes the loss when labels are given)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

        print(f'Loss: {loss.item()}')


Epoch 1


 33%|███▎      | 1/3 [00:02<00:04,  2.24s/it]

Loss: 0.7306695580482483


 67%|██████▋   | 2/3 [00:04<00:02,  2.20s/it]

Loss: 0.613429605960846


100%|██████████| 3/3 [00:05<00:00,  1.90s/it]


Loss: 0.5766921043395996
Epoch 2


 33%|███▎      | 1/3 [00:02<00:04,  2.18s/it]

Loss: 0.6477982401847839


 67%|██████▋   | 2/3 [00:04<00:02,  2.20s/it]

Loss: 0.4646878242492676


100%|██████████| 3/3 [00:05<00:00,  1.88s/it]


Loss: 0.7364324927330017
Epoch 3


 33%|███▎      | 1/3 [00:02<00:04,  2.25s/it]

Loss: 0.5602341890335083


 67%|██████▋   | 2/3 [00:04<00:02,  2.32s/it]

Loss: 0.5413945913314819


100%|██████████| 3/3 [00:05<00:00,  1.99s/it]

Loss: 0.5561627149581909



