In [3]:
!pip install -q --upgrade accelerate einops xformers

In [4]:
batch_size = 16 # 16
max_length = 512 # 512
epoch = 15
test_size = 0.01
threshold = 0.02
learning_rate = 5e-5
pos_weight = 472
gamma = 1
seed = 42

import torch
import torch.nn as nn

def get_loss_fn():
    return FocalLoss()

class FocalLoss(nn.Module):
    def __init__(self, gamma=gamma, pos_weight=pos_weight):
        super().__init__()
        self.gamma = gamma
        self.pos_weight = pos_weight

    def __call__(self, output, label):
        alpha = torch.where(label, self.pos_weight, 1)
        p = torch.sigmoid(output)
        pt = torch.where(label, p, 1-p)
        loss = - alpha * (1-pt).pow(self.gamma) * pt.log()
        return loss.mean()

    def __repr__(self):
        return f"FocalLoss(gamma={self.gamma}, pos_weight={self.pos_weight})"

In [5]:
import pandas as pd
import numpy as np
import argparse
import logging
import os
import json
from tqdm.auto import tqdm
from pathlib import Path
import torch.nn as nn
import math

import random
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_metric
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

def set_seed():
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

def load_data():
    category_df = pd.read_csv('/kaggle/input/test-input/category.csv', dtype={'SSno': str})
    train_df = pd.read_csv('/kaggle/input/test-input/train.csv')
    train_set = Dataset.from_pandas(train_df)

    return train_set, category_df

def load_model():
    # hyunwoongko/kobart
    # jaehyeong/koelectra-base-v3-generalized-sentiment-analysis
    category_df = pd.read_csv('/kaggle/input/test-input/category.csv', dtype={'SSno': str})
    idx_to_SS = category_df.SSno.values
    SS_to_idx = {str(cat):idx for idx, cat in enumerate(idx_to_SS)}
    idx_to_SS = {value: key for key, value in SS_to_idx.items()}
    
    model_path = 'hyunwoongko/kobart'
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path = model_path,
        num_labels = 564,
        id2label = idx_to_SS,
        label2id = SS_to_idx,
        ignore_mismatched_sizes=True,
    )
    
    def initialize_linear(layer):
        if isinstance(layer, nn.Linear):
            nn.init.kaiming_uniform_(layer.weight, a=math.sqrt(5))
            if layer.bias is not None:
                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(layer.weight)
                bound = 1 / math.sqrt(fan_in)
                nn.init.uniform_(layer.bias, -bound, bound)

    model.classification_head.dropout = nn.Identity()
    initialize_linear(model.classification_head.out_proj)
    return tokenizer, model

def preprocess_data(dataset, category_df):
    idx_to_SS = category_df.SSno.values
    SS_to_idx = {str(cat):idx for idx, cat in enumerate(idx_to_SS)}

    def preprocess_fn(example):
        title = example['invention_title']
        abstract = example['abstract']
        claims = example['claims']

        #texts = f"{title}"#요약: {abstract} 청구항: {claims}"
        #texts = f"{abstract}"
        texts = f"제목: {title} 요약: {abstract} 자세히: {claims}"
        labels = torch.zeros(len(SS_to_idx), dtype=torch.bool)

        for SSno in example['SSnos'].split():
            labels[SS_to_idx[SSno]] = 1

        return {
            'texts': texts,
            'labels': labels,
        }
    
    preprocessed = dataset.map(
        preprocess_fn,
        remove_columns=[
            col
            for col in dataset.column_names
            if col not in ['documentId']
        ],
    )
    return preprocessed

def tokenize_data(dataset, tokenizer):
    def batch_tokenize(batch):
        tokenized_batch = tokenizer(
            batch['texts'],
            max_length=max_length,
            padding='max_length',
            truncation=True,
        )
        tokenized_batch['documentId'] = batch['documentId']
        tokenized_batch['labels'] = batch['labels']
        return tokenized_batch
    
    tokenized = dataset.map(
        batch_tokenize,
        batched=True,
    )
    
    return tokenized

def split_data(dataset):
    dataset = dataset.train_test_split(
        test_size = test_size,
        seed = 42,
    )
    return dataset
    
class CustomTrainer(Trainer):
    def __init__(self, *args, loss_fn, metric, **kargs):
        super().__init__(*args, **kargs)
        self.loss_fn = loss_fn
        self.metric = metric

    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
        )
        loss = self.loss_fn(outputs.logits, inputs["labels"])
        return (loss, outputs) if return_outputs else loss


def get_trainer(model, tokenizer, dataset, metric):
    training_args = TrainingArguments(
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=epoch,
        per_device_train_batch_size=batch_size,
        optim="adamw_torch",
        learning_rate=learning_rate,
        warmup_steps=200,
        output_dir="./results",
        save_total_limit=3,
        report_to=list([]),
    )
    
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    loss_fn = get_loss_fn()
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        tokenizer=tokenizer,
        loss_fn=loss_fn,
        metric = metric,  # Pass the metric here
    )
    return trainer

from transformers import AdamW
def main():
    set_seed()
    #df = pd.read_csv('/kaggle/input/test-input/train.csv')
    #kf = KFold(n_splits=5, shuffle=True, random_state=42)
    #for train_index, test_index in kf.split(df):
    # train_data = df.iloc[train_index]
    dataset, category_df = load_data()
    tokenizer, model = load_model()

    dataset = preprocess_data(dataset, category_df)
    dataset = tokenize_data(dataset, tokenizer)
    dataset = split_data(dataset)

    metric = load_metric("f1")  # Define the metric
    trainer = get_trainer(model, tokenizer, dataset, metric)  # Pass the metric

    #if os.path.exists('model_checkpoint.pt') and os.path.exists('optimizer_checkpoint.pt'):

    trainer.train('/kaggle/input/gramenandae/kaggle/working/results/checkpoint-37130')

if __name__ == '__main__':
    main()

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading (…)okenizer_config.json:   0%|          | 0.00/337 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/109 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


Downloading pytorch_model.bin:   0%|          | 0.00/496M [00:00<?, ?B/s]

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at hyunwoongko/kobart and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60000 [00:00<?, ?ex/s]

  0%|          | 0/60 [00:00<?, ?ba/s]

Downloading builder script:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
11,0.0096,0.159412
12,0.0086,0.193064
13,0.0071,0.206906
14,0.0063,0.221778
15,0.0054,0.241073


In [8]:
#!zip -r ./checkpoint_5.zip /kaggle/working/results/checkpoint-37130

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  adding: kaggle/working/results/checkpoint-37130/ (stored 0%)
  adding: kaggle/working/results/checkpoint-37130/pytorch_model.bin (deflated 7%)
  adding: kaggle/working/results/checkpoint-37130/tokenizer.json (deflated 75%)
  adding: kaggle/working/results/checkpoint-37130/tokenizer_config.json (deflated 30%)
  adding: kaggle/working/results/checkpoint-37130/special_tokens_map.json (deflated 49%)
  adding: kaggle/working/results/checkpoint-37130/training_args.bin (deflated 49%)
  adding: kaggle/working/results/checkpoint-37130/scheduler.pt (deflated 49%)
  adding: kaggle/working/results/checkpoint-37130/config.json (deflated 73%)
  adding: kaggle/working/results/checkpoint-37130/trainer_state.json (deflated

In [8]:
import pandas as pd
import numpy as np
import argparse
import logging
import os
import json
from tqdm.auto import tqdm
from pathlib import Path

import random
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_metric
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader
from transformers import default_data_collator

category_df = pd.read_csv('/kaggle/input/test-input/category.csv', dtype={'SSno': str})
idx_to_SS = category_df.SSno.values
SS_to_idx = {str(cat):idx for idx, cat in enumerate(idx_to_SS)}
idx_to_SS = {value: key for key, value in SS_to_idx.items()}

def load_data():
    category_df = pd.read_csv('/kaggle/input/test-input/category.csv', dtype={'SSno': str})
    test_df = pd.read_csv('/kaggle/input/test-input/test_input.csv')
    test_set = Dataset.from_pandas(test_df)

    return test_set, category_df

def load_model():
    # hyunwoongko/kobart
    # jaehyeong/koelectra-base-v3-generalized-sentiment-analysis
    model_path = 'hyunwoongko/kobart'
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path = model_path,
        num_labels = 564,
        id2label = idx_to_SS,
        label2id = SS_to_idx,
        ignore_mismatched_sizes=True,
    )
    
    model.classification_head.dropout = nn.Identity()
    model.load_state_dict(torch.load('/kaggle/working/results/checkpoint-55695/pytorch_model.bin'))
    # 7428 9285 11142 12999 14856
    return tokenizer, model

def preprocess_data(dataset):
    def preprocess_fn(example):
        title = example['invention_title']
        abstract = example['abstract']
        claims = example['claims']

        #texts = f"{title}"#요약: {abstract} 청구항: {claims}"
        #texts = f"{abstract}"
        texts = f"제목: {title} 요약: {abstract} 청구항: {claims}"
        return {
            'texts': texts,
        }
    
    preprocessed = dataset.map(
        preprocess_fn,
        remove_columns=[
            col
            for col in dataset.column_names
            if col not in ['documentId']
        ],
    )
    return preprocessed

def tokenize_data(dataset, tokenizer):
    def batch_tokenize(batch):
        tokenized_batch = tokenizer(
            batch['texts'],
            max_length=max_length,
            padding='max_length',
            truncation=True,
        )
        tokenized_batch['documentId'] = batch['documentId']
        return tokenized_batch
    
    tokenized = dataset.map(
        batch_tokenize,
        batched=True,
    )
    
    return tokenized

def pred(dataset, model, tokenizer):
    device = 'cuda'

    test_loader = DataLoader(
        dataset,
        batch_size = batch_size,
        shuffle=False,
        collate_fn=default_data_collator,
    )

    model.to(device)
    model.eval()

    result_ids = []
    result_logits = []
    for batch in tqdm(test_loader):
        with torch.no_grad():
            outputs = model(
                input_ids = batch['input_ids'].to(device),
                attention_mask = batch['attention_mask'].to(device),
            )
            result_ids.append(batch['documentId'].numpy())
            predictions = torch.sigmoid(outputs.logits)
            result_logits.append(predictions.detach().cpu().numpy())
    
    ids = np.concatenate(result_ids)
    logits = np.concatenate(result_logits)

    return ids, logits

def get_true_indices(arr):
    true_indices = []

    for row in arr:
        row_indices = np.where(row)[0]
        value = []
        for v in row_indices.tolist():
            value.append(idx_to_SS[v])
        true_indices.append(' '.join(value))
    return true_indices

def filter_tensor(tensor, th):
    max_values = np.max(tensor, axis=1)  # 텐서에서 가장 큰 값
    filtered_tensor = tensor >= max_values[:, np.newaxis]*(1 - th)
    
    return filtered_tensor

def save_submission(ids, preds, category_df):
    idx_to_SSno = category_df.SSno.values
    
    pred = filter_tensor(preds, 0.025)

    result = get_true_indices(pred)
    
    submission = pd.DataFrame({'documentId':ids, 'SSnos':result})
    submission.to_csv('submission.csv', index=False)
    
    cnt = 0 
    for i in result:
        if len(i) == 5:
            cnt += 1

    print(cnt)

def main():
    dataset, category_df = load_data()
    tokenizer, model = load_model()

    dataset = preprocess_data(dataset)
    dataset = tokenize_data(dataset, tokenizer)

    ids, preds = pred(dataset, model, tokenizer)
    save_submission(ids, preds, category_df)

if __name__ == '__main__':
    main()


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
Some weights of BartForSequenceClassification were not initialized from the model checkpoint at hyunwoongko/kobart and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/10000 [00:00<?, ?ex/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/625 [00:00<?, ?it/s]

8377
