In [1]:
import transformers

In [2]:
# Import required libraries:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import os
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Set our random seed:
SEED = 17

In [3]:
train_df = pd.read_csv('/kaggle/input/kagglecomp/train.csv')[['id', 'text', 'target']]
test_df = pd.read_csv('/kaggle/input/kagglecomp/test.csv')[['id', 'text']]

In [31]:
# Инициализация токенизатора и модели BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# Проверка доступности GPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [33]:
class TweetDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text = str(self.data.iloc[idx]['text'])
        inputs = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt'
        )
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        
        if 'target' in self.data.columns:
            label = torch.tensor(self.data.iloc[idx]['target'], dtype=torch.long)
            return input_ids, attention_mask, label
        else:
            return input_ids, attention_mask

In [34]:
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42)

# Создание DataLoader для обучения и валидации
train_dataset = TweetDataset(train_data, tokenizer)
val_dataset = TweetDataset(val_data, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [35]:
optimizer = AdamW(model.parameters(), lr=1e-6)

In [36]:
def train_model(model, train_loader, val_loader, device, epochs=3):
    model.train()
    for epoch in range(epochs):
        total_loss, total_val_loss = 0, 0
        correct_predictions = 0

        # Обучение
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()

        # Валидация
        model.eval()
        for batch in val_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                total_val_loss += outputs.loss.item()
                preds = torch.argmax(outputs.logits, dim=1)
                correct_predictions += torch.sum(preds == labels).item()

        accuracy = correct_predictions / len(val_loader.dataset)
        print(f'Epoch {epoch + 1}, Train Loss: {total_loss / len(train_loader)}, Val Loss: {total_val_loss / len(val_loader)}, Val Accuracy: {accuracy}')

In [37]:
train_model(model, train_loader, val_loader, device)

# Подготовка данных для предсказаний на тестовой выборке
test_dataset = TweetDataset(test_df, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=16)

Epoch 1, Train Loss: 0.6364290918421558, Val Loss: 0.5316555071622133, Val Accuracy: 0.7859487852921865
Epoch 2, Train Loss: 0.4783213500745027, Val Loss: 0.4353235368616879, Val Accuracy: 0.8220617202889035
Epoch 3, Train Loss: 0.39661263371389993, Val Loss: 0.40621344139799476, Val Accuracy: 0.8312541037426132


In [38]:
model.eval()
predictions = []
for batch in test_loader:
    input_ids, attention_mask = [b.to(device) for b in batch]
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        predictions.extend(preds.cpu().numpy())

In [39]:
submission = pd.DataFrame({'id': test_df['id'], 'target': predictions})
submission.to_csv('submission_bert.csv', index=False)