In [1]:
! pip install transformers
! pip install torch --upgrade


Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/8f/e9/c2b4c823b3959d475a570c1bd2df4125478e2e37b96fb967a87933ae7134/transformers-4.18.0-py3-none-any.whl (4.0MB)
[K     |████████████████████████████████| 4.0MB 1.2MB/s eta 0:00:01
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
[?25l  Downloading https://files.pythonhosted.org/packages/36/22/26b08c841c0493908b4be6960ec2be14a21d1ec0f42ae0cedbca5599ad3d/tokenizers-0.12.1-cp36-cp36m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6MB)
[K     |████████████████████████████████| 6.6MB 34.2MB/s eta 0:00:01
Collecting packaging>=20.0
[?25l  Downloading https://files.pythonhosted.org/packages/05/8e/8de486cbd03baba4deef4142bd643a3e7bbe954a784dc1bb17142572d127/packaging-21.3-py3-none-any.whl (40kB)
[K     |████████████████████████████████| 40kB 5.5MB/s  eta 0:00:01
[?25hCollecting numpy>=1.17
[?25l  Downloading https://files.pythonhosted.org/packages/14/32/d3fa649ad7ec0b82737b92fefd3c4dd376b0bb23730715124569

In [2]:
import os
from typing import Tuple, List
from functools import partial
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup, BertPreTrainedModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

In [3]:
# File locations, BERT checkpoint name, and compute device.
# `path` is the single source of truth for the competition data directory
# (it previously held a relative "./kaggle/..." string that was never used).
path = "/kaggle/input/jigsaw-toxic-comment-classification-challenge/"
bert_model_name = 'bert-base-cased'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the labelled data and hold out 5% of the rows for validation.
# A fixed random_state keeps the split identical across kernel restarts.
train_df = pd.read_csv(os.path.join(path, 'train.csv.zip'))
train_df, val_df = train_test_split(train_df, test_size=0.05, random_state=42)

# Tokenizer that matches the chosen checkpoint.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

HBox(children=(IntProgress(value=0, description='Downloading', max=213450, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=49, style=ProgressStyle(description_width='…




HBox(children=(IntProgress(value=0, description='Downloading', max=570, style=ProgressStyle(description_width=…




In [4]:
# Dataset class definition
class ToxicDataset(Dataset):
    """Map-style dataset that yields ``(token_ids, labels)`` for one comment.

    Args:
        tokenizer: Object whose ``encode(text, add_special_tokens=...,
            max_length=..., truncation=...)`` returns a list of token ids.
        dataframe: pandas DataFrame holding a ``"comment_text"`` column and
            every column named in ``LABEL_COLUMNS``.
        max_length: Maximum number of token ids requested from the tokenizer
            (defaults to 120, the value previously hard-coded).
    """

    # Target columns, in the order they appear in the label tensor.
    LABEL_COLUMNS = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

    def __init__(self, tokenizer, dataframe, max_length=120):
        self.tokenizer = tokenizer
        self.dataframe = dataframe
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(x, y)`` for the row at positional index ``idx``.

        ``x`` is a 1-D tensor of token ids; ``y`` is a float tensor with one
        entry per label column.
        """
        row = self.dataframe.iloc[idx]
        tokens = self.tokenizer.encode(
            row["comment_text"],
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
        )
        x = torch.tensor(tokens)
        # A row taken from a frame that mixes text and numeric columns is
        # typically object-dtype, so convert the label slice to a float32
        # array explicitly instead of relying on torch to coerce a Series.
        labels = row[self.LABEL_COLUMNS].to_numpy(dtype=np.float32)
        y = torch.tensor(labels, dtype=torch.float)
        return x, y

# Build the training and validation datasets from their dataframes
train_dataset = ToxicDataset(tokenizer=tokenizer, dataframe=train_df)
val_dataset = ToxicDataset(tokenizer=tokenizer, dataframe=val_df)

In [5]:
# Data loader construction: batch collation
def collate_fn(batch):
    """Merge ``(token_ids, labels)`` samples into two batched tensors.

    Token id sequences are right-padded with ``tokenizer.pad_token_id`` to the
    longest sequence in the batch; label tensors are stacked along a new first
    dimension. Both results are moved to the module-level ``device``.
    """
    sequences, targets = zip(*batch)
    padded_ids = pad_sequence(
        sequences,
        batch_first=True,
        padding_value=tokenizer.pad_token_id,
    )
    stacked_targets = torch.stack(targets)
    return padded_ids.to(device), stacked_targets.to(device)

# Training batches are drawn in random order. Validation batches keep the
# dataset order: shuffling adds nothing to evaluation and only makes runs
# harder to compare.
train_loader = DataLoader(train_dataset, batch_size=32, sampler=RandomSampler(train_dataset), collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

In [6]:
# Model definition
class BertClassifier(nn.Module):
    """BERT encoder followed by one linear layer with sigmoid outputs.

    Args:
        bert_model_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        num_classes: Number of output units of the linear layer.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        hidden_dim = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, input_ids, attention_mask=None):
        """Return per-class sigmoid outputs computed from the encoder's pooled output."""
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        logits = self.classifier(encoded.pooler_output)
        return torch.sigmoid(logits)

# Instantiate the six-output classifier and place it on the selected device.
model = BertClassifier(bert_model_name, num_classes=6)
model = model.to(device)

HBox(children=(IntProgress(value=0, description='Downloading', max=435779157, style=ProgressStyle(description_…




Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Optimizer and learning-rate scheduler setup
optimizer = AdamW(model.parameters(), lr=2e-5)
# The linear schedule is budgeted for two passes over the training loader,
# with no warm-up steps.
steps_per_epoch = len(train_loader)
total_steps = steps_per_epoch * 2
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)



In [8]:
# Training routine
def train(model, loader, optimizer, scheduler):
    """Run one pass over ``loader`` and return the mean per-batch BCE loss.

    For every batch the optimizer's gradients are cleared, an attention mask
    is built from the positions that differ from ``tokenizer.pad_token_id``,
    and the optimizer and scheduler are each stepped once after the backward
    pass. The returned value is the sum of batch losses divided by
    ``len(loader)``.
    """
    model.train()
    criterion = nn.BCELoss()
    pad_id = tokenizer.pad_token_id
    batch_losses = []
    for input_ids, targets in tqdm(loader):
        optimizer.zero_grad()
        attention_mask = (input_ids != pad_id).float()
        probabilities = model(input_ids, attention_mask=attention_mask)
        batch_loss = criterion(probabilities, targets)
        batch_losses.append(batch_loss.item())
        batch_loss.backward()
        optimizer.step()
        scheduler.step()
    return sum(batch_losses) / len(loader)

In [9]:
# Evaluation routine
def evaluate(model, loader):
    """Score ``model`` on ``loader`` without computing gradients.

    Returns:
        A ``(mean_loss, auc_scores)`` tuple. ``mean_loss`` is the sum of
        per-batch BCE losses divided by ``len(loader)``; ``auc_scores`` holds
        one ``roc_auc_score`` per label column, in column order.
    """
    model.eval()
    criterion = nn.BCELoss()  # built once instead of once per batch
    total_loss = 0
    true_labels, predictions = [], []
    with torch.no_grad():
        for x, y in tqdm(loader):
            # Positions that differ from the pad id are marked 1.0, padding 0.0.
            mask = (x != tokenizer.pad_token_id).float()
            outputs = model(x, attention_mask=mask)
            total_loss += criterion(outputs, y).item()
            true_labels.append(y.cpu().numpy())
            predictions.append(outputs.cpu().numpy())

    true_labels = np.concatenate(true_labels)
    predictions = np.concatenate(predictions)
    # Take the class count from the labels rather than assuming six columns.
    num_classes = true_labels.shape[1]
    auc_scores = [
        roc_auc_score(true_labels[:, i], predictions[:, i])
        for i in range(num_classes)
    ]
    return total_loss / len(loader), auc_scores

In [None]:
# Train and validate
# Derive the epoch count from the scheduler's step budget so the loop, the
# progress message and the learning-rate schedule cannot drift apart.
# (The loop previously ran a single epoch while printing "/2" and while the
# scheduler was configured for two passes over the training loader.)
num_epochs = max(1, total_steps // len(train_loader))
for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    train_loss = train(model, train_loader, optimizer, scheduler)
    val_loss, auc_scores = evaluate(model, val_loader)
    print(f'Train Loss: {train_loss:.4f} | Validation Loss: {val_loss:.4f}')
    # One AUC per label column, in the order returned by evaluate().
    for i, score in enumerate(auc_scores):
        print(f'Class {i} AUC: {score:.4f}')

  0%|          | 0/4738 [00:00<?, ?it/s]

Epoch 1/2


 69%|██████▉   | 3291/4738 [40:03<17:35,  1.37it/s]

In [None]:
# Predict on the test set and write the submission file
test_df = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip')
submission = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip')

batch_size = 32
model.eval()
# Iterate by start offset. The previous `len(test_df) // 32 + 1` batch count
# produced an empty trailing batch whenever the row count was an exact
# multiple of the batch size, and an empty list cannot be padded.
for start in tqdm(range(0, len(test_df), batch_size)):
    stop = start + batch_size
    batch_df = test_df.iloc[start:stop]
    texts = [tokenizer.encode(text, add_special_tokens=True, max_length=120, truncation=True) for text in batch_df["comment_text"]]
    x = pad_sequence([torch.tensor(text) for text in texts], batch_first=True, padding_value=tokenizer.pad_token_id).to(device)
    mask = (x != tokenizer.pad_token_id).float()
    with torch.no_grad():
        outputs = model(x, attention_mask=mask).cpu().numpy()
    # Column 0 of the submission frame is left untouched; the model outputs
    # fill every remaining column for the rows of this batch.
    submission.iloc[start:stop, 1:] = outputs

submission.to_csv("submission.csv", index=False)