[대회링크](https://dacon.io/competitions/official/235670)

In [1]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from tqdm import tqdm

# Set environment variables for CUDA
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="4"

2023-05-11 01:09:12.207431: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-11 01:09:13.267398: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-05-11 01:09:13.267491: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64


In [2]:
# Load data
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test_x.csv")
submission_df = pd.read_csv("sample_submission.csv")

# Preprocess text data
train_text = train_df['text'].values
train_labels = train_df['author'].values
test_text = test_df['text'].values

# Initialize the ALBERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Tokenize the input text data using the ALBERT tokenizer
input_ids = []
attention_masks = []

for text in tqdm(train_text, desc="Preprocessing training data"):
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 512,
                        padding='max_length',
                        truncation=True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(train_labels)

# Split the preprocessed training data into training and validation sets
train_inputs, validation_inputs, train_labels, validation_labels = \
    train_test_split(input_ids, train_labels, random_state=42, test_size=0.1, stratify=train_df['author'])
train_masks, validation_masks, _, _ = \
    train_test_split(attention_masks, input_ids, random_state=42, test_size=0.1, stratify=train_df['author'])

Preprocessing training data: 100% 54879/54879 [01:37<00:00, 562.84it/s]


In [3]:
# Set hyperparameters
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
batch_size = 16
epochs = 6
learning_rate = 5e-5

# Convert inputs, masks, and labels to tensors
train_inputs = torch.tensor(train_inputs)
train_masks = torch.tensor(train_masks)
train_labels = torch.tensor(train_labels)

validation_inputs = torch.tensor(validation_inputs)
validation_masks = torch.tensor(validation_masks)
validation_labels = torch.tensor(validation_labels)

  train_inputs = torch.tensor(train_inputs)
  train_masks = torch.tensor(train_masks)
  validation_inputs = torch.tensor(validation_inputs)
  validation_masks = torch.tensor(validation_masks)


In [4]:
# Create a TensorDataset
train_data = TensorDataset(train_inputs, train_masks, train_labels)
# Create a DataLoader for training data
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create a TensorDataset for validation data
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
# Create a DataLoader for validation data
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [5]:
# Load model
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=5)
model.to(device)

# Set optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [6]:
for epoch in range(1, epochs+1):
    model.train()
    
    train_loss = 0
    for batch in tqdm(train_dataloader):
        input_ids = batch[0].to(device=device)
        attention_masks = batch[1].to(device=device)
        labels = batch[2].to(device=device)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        
        loss = outputs[0]
        logits = outputs[1]
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
    scheduler.step()

    avg_train_loss = train_loss / len(train_dataloader)
    print(f'Epoch {epoch}/{epochs}, training loss: {avg_train_loss}')

    model.eval()
    
    validation_loss = 0
    for batch in tqdm(validation_dataloader):
        input_ids = batch[0].to(device=device)
        attention_masks = batch[1].to(device=device)
        labels = batch[2].to(device=device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
            loss = outputs[0]
            logits = outputs[1]
            validation_loss += loss.item()
        
    avg_val_loss = validation_loss / len(validation_dataloader)
    val_log_loss = log_loss(labels.cpu(), logits.cpu())
    print(f'Epoch {epoch}/{epochs}, validation loss: {avg_val_loss}, validation log loss: {val_log_loss}')


100% 3087/3087 [40:33<00:00,  1.27it/s]


Epoch 1/6, training loss: 0.6045623816384451


100% 343/343 [01:34<00:00,  3.62it/s]


Epoch 1/6, validation loss: 0.45027743483032845, validation log loss: 2.6829309918653936


100% 3087/3087 [41:05<00:00,  1.25it/s]


Epoch 2/6, training loss: 0.21614108708571578


100% 343/343 [01:34<00:00,  3.63it/s]


Epoch 2/6, validation loss: 0.3460464434149748, validation log loss: 0.650592850377776


100% 3087/3087 [41:03<00:00,  1.25it/s]


Epoch 3/6, training loss: 0.1363319568375984


100% 343/343 [01:34<00:00,  3.65it/s]


Epoch 3/6, validation loss: 0.372121501202171, validation log loss: 0.47617828549968344


100% 3087/3087 [41:05<00:00,  1.25it/s]


Epoch 4/6, training loss: 0.1264295335699918


100% 343/343 [01:34<00:00,  3.63it/s]


Epoch 4/6, validation loss: 0.3727781330391428, validation log loss: 0.47824795415122456


100% 3087/3087 [41:03<00:00,  1.25it/s]


Epoch 5/6, training loss: 0.12486336524308676


100% 343/343 [01:34<00:00,  3.63it/s]


Epoch 5/6, validation loss: 0.37277173144205233, validation log loss: 0.4779817892911771


100% 3087/3087 [41:03<00:00,  1.25it/s]


Epoch 6/6, training loss: 0.1248982445962738


100% 343/343 [01:34<00:00,  3.63it/s]

Epoch 6/6, validation loss: 0.3727719138271879, validation log loss: 0.47797693681228287





In [7]:
model.save_pretrained('bert-classifier')

In [8]:
# Preprocess the test data
test_text = test_df['text'].values

# Tokenize the input text data using the BERT tokenizer
input_ids = []
attention_masks = []

for text in tqdm(test_text, desc="Preprocessing test data"):
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = 512,
                        padding='max_length',
                        truncation=True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Set up the data loader for test data
test_data = TensorDataset(input_ids, attention_masks)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

Preprocessing test data: 100% 19617/19617 [01:03<00:00, 307.57it/s]


In [9]:
# Make predictions
model.eval()
test_preds = []

for batch in tqdm(test_dataloader, desc="Making predictions"):
    input_ids = batch[0].to(device)
    attention_masks = batch[1].to(device)

    # Truncate the input sequences that are longer than 512 tokens
    if input_ids.shape[1] > 512:
        input_ids = input_ids[:, :512]
        attention_masks = attention_masks[:, :512]

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)

    logits = outputs[0]
    logits = logits.detach().cpu().numpy()

    test_preds.append(logits)

Making predictions: 100% 1227/1227 [05:34<00:00,  3.67it/s]


In [10]:
test_preds = np.concatenate(test_preds, axis=0)

# Create submission file
submission_df[['0', '1', '2', '3', '4']] = test_preds
submission_df.to_csv('BERT-b16-e6-StepLR-5e-5-gamma-0_1.csv', index=False)