## 낚시성 기사 분류

<br>

[낚시성 기사 탐지 데이터](https://www.aihub.or.kr/aihubdata/data/view.do?currMenu=115&topMenu=100&aihubDataSe=realm&dataSetSn=71338)
<br>[낚시성 기사 분류](https://aifactory.space/task/2663/overview)

<br>

<b>Load Data</b>
<br>구글 드라이브에 저장된 AIHUB Data를 Load하여 압축 해제

<br><b>Model Training</b>
<br>낚시성 기사 분류 Task에 대한 Training 진행

<br><b>Model Test</b>
<br>기사의 낚시성 예측값(0, 1)을 log/pred_y.csv에 저장

<br><b>Submission</b>
<br>pred_y.csv를 리더보드에 제출하여 채점 진행


<br>

### GPU Information

In [None]:
!nvidia-smi

### Load Data

In [None]:
!pip install gdown

In [None]:
!pip install transformers
# !pip install transformers==4.29.2
# !pip install safetensors==0.3.0

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Google Drive file id of the dataset archive, and the direct-download URL
# derived from it.
file_id = "1q1nO2sUI8j_CgIhs79QIYBzhLCZ1O9En"
url = "https://drive.google.com/uc?id=" + file_id

In [None]:
import os

# Snapshot the working directory before the download so the newly created
# file can be identified by a set difference afterwards.
before_files = set(os.listdir())

!gdown {url}

# Snapshot the working directory again; anything not present in the earlier
# snapshot is treated as the downloaded file.
after_files = set(os.listdir())
downloaded_files = after_files - before_files

if not downloaded_files:
    print("No file downloaded.")
else:
    # Take one of the new entries and remember it for the unzip step.
    filename = downloaded_files.pop()
    print(f"Downloaded file: {filename}")
    downloaded_filepath = filename

Downloading...
From: https://drive.google.com/uc?id=1q1nO2sUI8j_CgIhs79QIYBzhLCZ1O9En
To: /content/aihub_clickbait_classification_data.zip
100% 57.3M/57.3M [00:00<00:00, 72.4MB/s]
Downloaded file: aihub_clickbait_classification_data.zip


In [None]:
!unzip {downloaded_filepath}

Archive:  aihub_clickbait_classification_data.zip
   creating: aihub_clickbait_classification_data/
  inflating: aihub_clickbait_classification_data/test_x.csv  
  inflating: aihub_clickbait_classification_data/train.csv  


### Model Training

In [None]:
import sys
import os
import datetime
import argparse
import pandas as pd
from tqdm import tqdm

import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, RobertaForSequenceClassification, AdamW


def main(pretrained_model, train_data_path,  model_save_folder, epoch, batch_size):
    """Fine-tune a RoBERTa sequence classifier on the clickbait training CSV.

    Args:
        pretrained_model: Hugging Face model name or path used for both the
            tokenizer and the classifier weights.
        train_data_path: Path to a CSV file read with ``pandas.read_csv``.
            Column 0 is used as the input text and column 1 as the label
            (assumed to be integer class ids -- TODO confirm against the
            dataset).
        model_save_folder: Directory that receives one checkpoint per epoch.
            It is created when it does not exist yet.
        epoch: Number of passes over the training data.
        batch_size: Number of samples per optimisation step.

    Side effects:
        Prints elapsed time, summed batch loss and accuracy for every epoch,
        and saves the model ``state_dict`` as
        ``clickbait_classifcation_model_<n>.bin`` after epoch ``n``.
    """

    # Create the checkpoint folder up front; otherwise torch.save would fail
    # only after a whole epoch of training has already been spent.
    if model_save_folder:
        os.makedirs(model_save_folder, exist_ok=True)

    class ClickbaitDetectionDataset(Dataset):
        """Tokenise one DataFrame row at a time into fixed-length inputs."""

        def __init__(self, dataset):
            self.dataset = dataset
            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

        def __len__(self):
            return len(self.dataset)

        def __getitem__(self, idx):
            text = self.dataset.iloc[idx, 0]
            y = int(self.dataset.iloc[idx, 1])

            inputs = self.tokenizer(
                text,
                return_tensors='pt',
                truncation=True,
                max_length=256,
                # Replaces the deprecated `pad_to_max_length=True`.
                padding='max_length',
                add_special_tokens=True,
            )

            # The tokenizer returns a batch of one; drop that batch axis.
            input_ids = inputs['input_ids'][0]
            attention_mask = inputs['attention_mask'][0]

            return input_ids, attention_mask, y

    # Pick the device once so the loop needs no separate CUDA/CPU branches.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = RobertaForSequenceClassification.from_pretrained(pretrained_model).to(device)

    train_data = pd.read_csv(train_data_path)
    train_dataset = ClickbaitDetectionDataset(train_data)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    optimizer = AdamW(model.parameters(), lr=1e-5)

    for i in range(epoch):
        total_loss = 0.0
        correct = 0
        total = 0
        # Stays at zero when the loader yields no batch at all.
        elapsed = 0.0

        model.train()

        with tqdm(train_loader) as pbar:
            pbar.set_description("Epoch " + str(i + 1))
            for input_ids_batch, attention_masks_batch, y_batch in pbar:
                optimizer.zero_grad()

                y_batch = y_batch.to(device)
                y_pred = model(
                    input_ids_batch.to(device),
                    attention_mask=attention_masks_batch.to(device),
                )[0]

                one_loss = F.cross_entropy(y_pred, y_batch)
                one_loss.backward()
                optimizer.step()

                total_loss += one_loss.item()

                _, predicted = torch.max(y_pred, 1)
                correct += (predicted == y_batch).sum().item()
                total += len(y_batch)

                elapsed = pbar.format_dict['elapsed']

        # Format as zero-padded HH:MM:SS by hand; the previous strptime
        # round-trip raised for runs of 24 hours or longer.
        hours, remainder = divmod(int(elapsed), 3600)
        minutes, seconds = divmod(remainder, 60)
        elapsed_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"

        total_loss = round(total_loss, 4)
        accuracy = round(correct / total, 4) if total else 0.0
        print("Train Time",  elapsed_str, "  ", "Train Loss:", total_loss,  "  ",  "Train Accuracy:", accuracy)

        # os.path.join tolerates a folder given with or without a trailing
        # separator.
        checkpoint_name = "clickbait_classifcation_model_" + str(i + 1) + ".bin"
        torch.save(model.state_dict(), os.path.join(model_save_folder, checkpoint_name))

if __name__ == '__main__':
    # Backbone checkpoint, training data location and checkpoint folder.
    pretrained_model = 'klue/roberta-small'
    train_data_path = "aihub_clickbait_classification_data/train.csv"
    model_save_folder = "./model/"

    # Number of epochs and samples per batch.
    epoch, batch_size = 10, 64

    main(
        pretrained_model=pretrained_model,
        train_data_path=train_data_path,
        model_save_folder=model_save_folder,
        epoch=epoch,
        batch_size=batch_size,
    )

Downloading pytorch_model.bin:   0%|          | 0.00/273M [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

Downloading (…)okenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/752k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 908/908 [03:08<00:00,  4.83it/s]


Train Time 00:03:08    Train Loss: 587.3607    Train Accuracy: 0.6042


Epoch 2: 100%|██████████| 908/908 [03:08<00:00,  4.81it/s]


Train Time 00:03:08    Train Loss: 518.0104    Train Accuracy: 0.6952


Epoch 3: 100%|██████████| 908/908 [03:08<00:00,  4.81it/s]


Train Time 00:03:08    Train Loss: 467.4494    Train Accuracy: 0.7391


Epoch 4: 100%|██████████| 908/908 [03:08<00:00,  4.82it/s]


Train Time 00:03:08    Train Loss: 413.0478    Train Accuracy: 0.7814


Epoch 5: 100%|██████████| 908/908 [03:08<00:00,  4.81it/s]


Train Time 00:03:08    Train Loss: 353.5881    Train Accuracy: 0.8232


Epoch 6: 100%|██████████| 908/908 [03:08<00:00,  4.81it/s]


Train Time 00:03:08    Train Loss: 295.2479    Train Accuracy: 0.8592


Epoch 7: 100%|██████████| 908/908 [03:09<00:00,  4.79it/s]


Train Time 00:03:09    Train Loss: 237.5199    Train Accuracy: 0.8918


Epoch 8: 100%|██████████| 908/908 [03:09<00:00,  4.78it/s]


Train Time 00:03:09    Train Loss: 191.0845    Train Accuracy: 0.9149


Epoch 9: 100%|██████████| 908/908 [03:08<00:00,  4.81it/s]


Train Time 00:03:08    Train Loss: 153.3742    Train Accuracy: 0.9338


Epoch 10: 100%|██████████| 908/908 [03:08<00:00,  4.81it/s]


Train Time 00:03:08    Train Loss: 121.9972    Train Accuracy: 0.9477


### Model Test

In [None]:
import sys
import os
import datetime
import regex as re
import argparse
import pandas as pd
from tqdm import tqdm

import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, RobertaForSequenceClassification


def main(pretrained_model, test_data_path, load_model_path, predction_csv_path, epoch, batch_size):
    """Predict labels for the test CSV and write them to a prediction CSV.

    Args:
        pretrained_model: Hugging Face model name or path used for both the
            tokenizer and the classifier architecture.
        test_data_path: Path to a CSV file read with ``pandas.read_csv``.
            Column 0 is used as the input text.
        load_model_path: Directory containing the saved checkpoints.
        predction_csv_path: Output CSV path. Its parent directory is created
            when it does not exist yet.
        epoch: Selects the checkpoint file
            ``clickbait_classifcation_model_<epoch>.bin`` to load.
        batch_size: Number of samples per forward pass.

    Side effects:
        Prints the elapsed inference time and writes a CSV with a single
        ``Prediction`` column, one row per input row and in the same order
        as the input file.
    """

    class ClickbaitDetectionDataset(Dataset):
        """Tokenise one DataFrame row at a time into fixed-length inputs."""

        def __init__(self, dataset):
            self.dataset = dataset
            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

        def __len__(self):
            return len(self.dataset)

        def __getitem__(self, idx):
            text = self.dataset.iloc[idx, 0]

            inputs = self.tokenizer(
                text,
                return_tensors='pt',
                truncation=True,
                max_length=256,
                # Replaces the deprecated `pad_to_max_length=True`.
                padding='max_length',
                add_special_tokens=True,
            )

            # The tokenizer returns a batch of one; drop that batch axis.
            input_ids = inputs['input_ids'][0]
            attention_mask = inputs['attention_mask'][0]

            return input_ids, attention_mask

    # Pick the device once so the loop needs no separate CUDA/CPU branches.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = RobertaForSequenceClassification.from_pretrained(pretrained_model).to(device)

    # map_location lets a checkpoint saved on GPU load on a CPU-only runtime.
    checkpoint_path = os.path.join(load_model_path, "clickbait_classifcation_model_" + str(epoch) + ".bin")
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint)
    model.eval()

    test_data = pd.read_csv(test_data_path)
    test_dataset = ClickbaitDetectionDataset(test_data)

    # No shuffling: row k of the output must be the prediction for row k of
    # the input, otherwise the submission is scored against the wrong rows.
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    predicted_list = []
    # Stays at zero when the loader yields no batch at all.
    elapsed = 0.0

    # Gradients are not needed for inference; disabling them saves memory.
    with torch.no_grad(), tqdm(test_loader) as pbar:
        for input_ids_batch, attention_masks_batch in pbar:
            y_pred = model(
                input_ids_batch.to(device),
                attention_mask=attention_masks_batch.to(device),
            )[0]

            _, predicted = torch.max(y_pred, 1)
            predicted_list.extend(predicted.tolist())

            elapsed = pbar.format_dict['elapsed']

    # Zero-padded HH:MM:SS, without the strptime round-trip that raised for
    # durations of 24 hours or longer.
    hours, remainder = divmod(int(elapsed), 3600)
    minutes, seconds = divmod(remainder, 60)
    elapsed_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
    print("Test Time",  elapsed_str)

    # Make sure the target directory exists before writing the CSV.
    output_dir = os.path.dirname(predction_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    y_pred_csv = pd.DataFrame({"Prediction": predicted_list})
    y_pred_csv.to_csv(predction_csv_path, index=False)

if __name__ == '__main__':
    # Backbone checkpoint, test data location, checkpoint folder and output file.
    pretrained_model = 'klue/roberta-small'
    test_data_path = "aihub_clickbait_classification_data/test_x.csv"
    load_model_path = "./model/"
    predction_csv_path = "log/pred_y.csv"

    # `epoch` selects which saved checkpoint is loaded for inference.
    epoch, batch_size = 5, 64

    main(
        pretrained_model=pretrained_model,
        test_data_path=test_data_path,
        load_model_path=load_model_path,
        predction_csv_path=predction_csv_path,
        epoch=epoch,
        batch_size=batch_size,
    )

Some weights of the model checkpoint at klue/roberta-small were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

Test Time 00:00:01





### Gradio

In [None]:
!pip install gradio
!pip install --upgrade typing-extensions

In [None]:
import gradio as gr
import pandas as pd
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

from transformers import AutoTokenizer, AutoModelForSequenceClassification


# Holds the tokenizer, model and device after the first request so the
# checkpoint is not reloaded for every Gradio call.
_CLICKBAIT_CLASSIFIER = {}


def _load_clickbait_classifier():
    """Load the tokenizer and fine-tuned classifier once and return them.

    Returns:
        A ``(tokenizer, model, device)`` tuple; the model is already on
        ``device`` and in eval mode.
    """
    if not _CLICKBAIT_CLASSIFIER:
        # Fall back to CPU so the demo also runs without a GPU.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # The training cell of this notebook fine-tunes 'klue/roberta-small'
        # and saves checkpoints under this file name, so the same backbone is
        # needed here; the 'klue/roberta-base' architecture does not match it.
        backbone = "klue/roberta-small"
        tokenizer = AutoTokenizer.from_pretrained(backbone)
        model = AutoModelForSequenceClassification.from_pretrained(backbone).to(device)

        # map_location lets a checkpoint saved on GPU load on a CPU runtime.
        checkpoint = torch.load("model/clickbait_classifcation_model_5.bin", map_location=device)
        model.load_state_dict(checkpoint)
        model.eval()

        _CLICKBAIT_CLASSIFIER.update(tokenizer=tokenizer, model=model, device=device)

    return (
        _CLICKBAIT_CLASSIFIER["tokenizer"],
        _CLICKBAIT_CLASSIFIER["model"],
        _CLICKBAIT_CLASSIFIER["device"],
    )


def main(text):
    """Classify a single article text as clickbait or not.

    Args:
        text: The article text entered in the Gradio text box. ``None`` is
            treated as an empty string.

    Returns:
        ``"It's Clickbait Article"`` when the predicted class is 1,
        otherwise ``"It's Not Clickbait Article"``.
    """
    tokenizer, model, device = _load_clickbait_classifier()

    # A single text needs no Dataset/DataLoader; tokenise it directly with
    # the same settings the training and test cells use.
    inputs = tokenizer(
        "" if text is None else str(text),
        return_tensors='pt',
        truncation=True,
        max_length=256,
        padding='max_length',
        add_special_tokens=True,
    )

    with torch.no_grad():
        y_pred = model(
            inputs['input_ids'].to(device),
            attention_mask=inputs['attention_mask'].to(device),
        )[0]

    _, predicted = torch.max(y_pred, 1)
    prediction = predicted.tolist()[0]

    if prediction == 1:
        classfication_result = "It's Clickbait Article"
    else:
        classfication_result = "It's Not Clickbait Article"

    return classfication_result


if __name__ == '__main__':
    # Plain text in, plain text out, served by the classifier function above.
    demo = gr.Interface(
        fn=main,
        inputs="text",
        outputs="text",
    )
    demo.launch()
    # demo.launch( share = True , debug = True)