In [1]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoModel, AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset
from sklearn.model_selection import train_test_split, StratifiedKFold
from datasets import load_dataset
from transformers import DataCollatorWithPadding
import random
from tqdm import tqdm

2022-08-17 17:36:43.473002: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


In [2]:
# normal data load
strongbuy = pd.read_csv("./data/strongbuy.csv")
sell = pd.read_csv("./data/sell.csv")
holddown = pd.read_csv("./data/holddown.csv")

In [3]:
# expand eata load
strongbuy_expand_google_en = pd.read_csv("./data/strongbuy_expand_google_en.csv")
strongbuy_expand_google_zhcn = pd.read_csv("./data/strongbuy_expand_google_zhcn.csv")
strongbuy_expand_pororo_en = pd.read_csv("./data/strongbuy_expand_pororo_en.csv")

sell_expand_google_en = pd.read_csv("./data/sell_expand_google_en.csv")
sell_expand_google_zhcn = pd.read_csv("./data/sell_expand_google_zhcn.csv")
sell_expand_pororo_en = pd.read_csv("./data/sell_expand_pororo_en.csv")

holddown_expand_google_en = pd.read_csv("./data/holddown_expand_google_en.csv")
holddown_expand_google_zhcn = pd.read_csv("./data/holddown_expand_google_zhcn.csv")
holddown_expand_pororo_en = pd.read_csv("./data/holddown_expand_pororo_en.csv")

In [4]:
# all_data = pd.concat([strongbuy, sell, holddown])

all_data = pd.concat([strongbuy, strongbuy_expand_google_en, strongbuy_expand_google_zhcn, strongbuy_expand_pororo_en,
                      sell, sell_expand_google_en, sell_expand_google_zhcn, sell_expand_pororo_en,
                      holddown, holddown_expand_google_en, holddown_expand_google_zhcn, holddown_expand_pororo_en])
all_data.sample(frac=1).reset_index(drop=True)

Unnamed: 0,filename,article,length,label
0,strongbuy65.txt,"상승, 2Q15의 주가 : KRW 365.3 억 유로의 매출, 421 억 유로 (O...",382,1
1,strongbuy234.txt,"2021년 배터리 소재인데, 아직도 PER 7배? 2020년 위기에도 강한 실적 복...",362,1
2,strongbuy44.txt,"6년 만에 처음으로 소송을 마무리하면서 29억 달러를 배포하고, 2009년부터 시작...",199,1
3,holddown126.txt,1Q는 약간 낮습니다.좋은 외모 성장.170 만 원의 낮은 이익 마진 중에 회사는 ...,429,0
4,302.txt,이 회사는 4 분기에 상당한 선제 명령을 받았으며 장기 헌장 선박의 평균 비용은 현...,347,0
...,...,...,...,...
10995,보유하향57.txt,"2분기 대형 손실회사는 1조6,94억원의 순손실, 영업손실 17억4,000억원, 순...",218,0
10996,holddown113.txt,"3Q21 Review: 특이사항 없음 3Q21 매출액 606억원(YoY +13%),...",728,0
10997,strongbuy432.txt,"렌터카 업체 대부분이 EV,EBITDA 기준으로 5배 이상 평가받는 이유는 핵심 자...",404,1
10998,strongbuy410.txt,"투자 의견은 강력하게 구입하고 목표 가격은 25,000 원, 목표 가격은 25,00...",340,1


In [5]:
print(f"{len(strongbuy)/len(strongbuy)} : {(len(sell) + len(holddown)) / len(strongbuy):.4f}")

1.0 : 0.6799


In [6]:
train_data, test_data = train_test_split(all_data, test_size=0.2, random_state=42, stratify=all_data[["label"]])

test_data["labels"] = test_data["label"]
test_data = test_data.drop(labels=["filename", "length", "label"], axis=1)

In [7]:
fold_length = 5
X = train_data.drop(labels=["filename", "length", "label"], axis=1)
y = train_data["label"]

skf = StratifiedKFold(n_splits=fold_length, shuffle=True, random_state=42)
fold_dataframe = list()

for fold_number, (train, valid) in enumerate(skf.split(X, y), 1):
    X_train, X_valid = X.iloc[train], X.iloc[valid]
    y_train, y_valid = y.iloc[train], y.iloc[valid]
    
    fold_train = X_train.loc[:]
    fold_train["labels"] = y_train
    
    fold_valid = X_valid.loc[:]
    fold_valid["labels"] = y_valid
    
    fold_train.to_csv(f"./data/train_data_fold{fold_number}.csv", index=False)
    fold_valid.to_csv(f"./data/valid_data_fold{fold_number}.csv", index=False)

test_data.to_csv(f"./data/test_data.csv", index=False)

In [8]:
kb_albert_model_path = "./models/kb-albert-char-base-v2"
albert = AutoModel.from_pretrained(kb_albert_model_path)
tokenizer = AutoTokenizer.from_pretrained(kb_albert_model_path)

tokenizer.truncation_side = "left"

Some weights of the model checkpoint at ./models/kb-albert-char-base-v2 were not used when initializing AlbertModel: ['predictions.dense.weight', 'sop_classifier.classifier.weight', 'predictions.LayerNorm.bias', 'predictions.decoder.weight', 'predictions.decoder.bias', 'sop_classifier.classifier.bias', 'predictions.bias', 'predictions.LayerNorm.weight', 'predictions.dense.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
MAX_LEN = 512

def tokenized_fn(data):
    outputs = tokenizer(data["article"], padding="max_length", max_length=MAX_LEN, truncation=True)
    outputs["labels"] = data["labels"]
    return outputs

dataset_list = list()
for fold_number in range(1, fold_length+1):
    train_dataset = load_dataset("csv", data_files=f"./data/train_data_fold{fold_number}.csv")["train"]
    valid_dataset = load_dataset("csv", data_files=f"./data/valid_data_fold{fold_number}.csv")["train"]

    train_dataset = train_dataset.map(tokenized_fn, remove_columns=["article"])
    valid_dataset = valid_dataset.map(tokenized_fn, remove_columns=["article"])
    
    dataset_list.append([train_dataset, valid_dataset])

test_dataset = load_dataset("csv", data_files=f"./data/test_data.csv")["train"]
test_dataset = test_dataset.map(tokenized_fn, remove_columns=["article"])

Using custom data configuration default-fdc6debd7b5f9adb


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-fdc6debd7b5f9adb/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-fdc6debd7b5f9adb/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-2de449cdcd272d92


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-2de449cdcd272d92/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-2de449cdcd272d92/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7040 [00:00<?, ?ex/s]

  0%|          | 0/1760 [00:00<?, ?ex/s]

Using custom data configuration default-ec0754f03e0c9f9e


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-ec0754f03e0c9f9e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-ec0754f03e0c9f9e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-d9179eb504127ae8


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-d9179eb504127ae8/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-d9179eb504127ae8/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7040 [00:00<?, ?ex/s]

  0%|          | 0/1760 [00:00<?, ?ex/s]

Using custom data configuration default-0f3ff9c690ca1eab


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-0f3ff9c690ca1eab/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-0f3ff9c690ca1eab/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-c1f87acf2f374cdc


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-c1f87acf2f374cdc/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-c1f87acf2f374cdc/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7040 [00:00<?, ?ex/s]

  0%|          | 0/1760 [00:00<?, ?ex/s]

Using custom data configuration default-92015d5eb36925a2


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-92015d5eb36925a2/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-92015d5eb36925a2/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-cd94f5aeae548ad8


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-cd94f5aeae548ad8/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-cd94f5aeae548ad8/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7040 [00:00<?, ?ex/s]

  0%|          | 0/1760 [00:00<?, ?ex/s]

Using custom data configuration default-4a04d4856deb042f


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-4a04d4856deb042f/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-4a04d4856deb042f/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-9ba365238ea8e24d


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-9ba365238ea8e24d/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-9ba365238ea8e24d/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7040 [00:00<?, ?ex/s]

  0%|          | 0/1760 [00:00<?, ?ex/s]

Using custom data configuration default-2aba08cc8e91b411


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-2aba08cc8e91b411/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-2aba08cc8e91b411/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2200 [00:00<?, ?ex/s]

In [10]:
batch_size = 16

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

dataloader_list = list()
for train_fold, valid_fold in dataset_list:
    train_dataloader = torch.utils.data.DataLoader(
        train_fold,
        sampler = torch.utils.data.RandomSampler(train_fold),
        batch_size = batch_size,
        collate_fn = data_collator,
    )

    validation_dataloader = torch.utils.data.DataLoader(
        valid_fold,
        sampler = torch.utils.data.SequentialSampler(valid_fold),
        batch_size = batch_size,
        collate_fn = data_collator,
    )
    
    dataloader_list.append([train_dataloader, validation_dataloader])

test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    sampler = torch.utils.data.SequentialSampler(test_dataset),
    batch_size = batch_size,
    collate_fn = data_collator,
)

In [11]:
class ClassificationHead(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.dropout = torch.nn.Dropout(0.25)
        self.out_proj = torch.nn.Linear(768, 2)
    
    def forward(self, features):
        # 보통 분류기에선 start 토큰에 분류 결과를 담음
        x = features[:, 0, :]    # take <s> token (equiv. to [CLS])
        x = x.reshape(-1, x.size(-1))
        x = self.dropout(x)

        x = self.out_proj(x)
        return x

class AInalyst(torch.nn.Module):
    def __init__(self, pretrained_model):
        super(AInalyst, self).__init__()
        self.pretrained = pretrained_model
        self.classifier = ClassificationHead()
    
    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.pretrained(
            input_ids=input_ids,
            attention_mask=attention_mask,
            # labels=labels
        )
        self.labels = labels
        logits = self.classifier(outputs["last_hidden_state"])
        # prob = torch.nn.functional.softmax(logits, dim=-1)
        
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)
            return logits, loss
        else:
            return logits

In [12]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AInalyst(pretrained_model=albert)
model.to(device)
model = torch.nn.DataParallel(model)
isParallel = True

In [14]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [15]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [16]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

epochs = 1
for epoch in range(epochs):
    print(f"============ Epoch {epoch+1}/{epochs} ============")
    print("Training...")
    
    for fold_number, (train_dataloader, validation_dataloader) in enumerate(dataloader_list, 1):
        print(f"===== Epoch {epoch+1}/{epochs} - Fold {fold_number}/{fold_length} =====")
        total_train_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_input_ids = batch["input_ids"].to(device)
            batch_attention_mask = batch["attention_mask"].to(device)
            batch_labels = batch["labels"].to(device)

            model.zero_grad()

            logits, loss = model(
                input_ids = batch_input_ids,
                attention_mask = batch_attention_mask,
                labels = batch_labels,
            )

            if isParallel:
                loss = loss.mean()

            total_train_loss += loss.item()
            loss.backward()
            optimizer.step()

            # if step % 1000 == 0 and not step == 0:
            #     print("step : {:>5,} of {:>5,} loss: {:.5f}".format(step, len(train_dataloader), loss.item()))

        avg_train_loss = total_train_loss / len(train_dataloader)
        print()
        print(" Average training loss: {0:.5f}".format(avg_train_loss))

        # Validation
        print()
        print("Running Validation...")

        model.eval()
        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        for step, batch in enumerate(validation_dataloader):
            batch_input_ids = batch["input_ids"].to(device)
            batch_attention_mask = batch["attention_mask"].to(device)
            batch_labels = batch["labels"].to(device)

            with torch.no_grad():
                logits, loss = model(
                    input_ids = batch_input_ids,
                    attention_mask = batch_attention_mask,
                    labels = batch_labels,
                )

                if isParallel:
                    loss = loss.mean()

                total_eval_loss += loss.item()
                logits = logits.detach().cpu().numpy()
                label_ids = batch_labels.to("cpu").numpy()
                total_eval_accuracy += flat_accuracy(logits, label_ids)

        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print()
        print(" Valid Accuracy: {0:.5f}".format(avg_val_accuracy))
    
    # Test
    print(f"===== Epoch {epoch+1}/{epochs} - Test =====")
    print()
    print("Running Test...")

    model.eval()
    total_test_accuracy = 0
    total_test_loss = 0
    nb_test_steps = 0
    
    for step, batch in enumerate(test_dataloader):
        batch_input_ids = batch["input_ids"].to(device)
        batch_attention_mask = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)

        with torch.no_grad():
            logits, loss = model(
                input_ids = batch_input_ids,
                attention_mask = batch_attention_mask,
                labels = batch_labels,
            )

            if isParallel:
                loss = loss.mean()

            total_test_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            label_ids = batch_labels.to("cpu").numpy()
            total_test_accuracy += flat_accuracy(logits, label_ids)

    avg_test_accuracy = total_test_accuracy / len(test_dataloader)
    print()
    print(" Test Accuracy: {0:.5f}".format(avg_test_accuracy))
    print()

Training...
===== Epoch 1/1 - Fold 1/5 =====





 Average training loss: 0.41387

Running Validation...

 Valid Accuracy: 0.90170
===== Epoch 1/1 - Fold 2/5 =====

 Average training loss: 0.16961

Running Validation...

 Valid Accuracy: 0.94091
===== Epoch 1/1 - Fold 3/5 =====

 Average training loss: 0.08203

Running Validation...

 Valid Accuracy: 0.98750
===== Epoch 1/1 - Fold 4/5 =====

 Average training loss: 0.03474

Running Validation...

 Valid Accuracy: 0.99659
===== Epoch 1/1 - Fold 5/5 =====

 Average training loss: 0.02120

Running Validation...

 Valid Accuracy: 0.99545
===== Epoch 1/1 - Test =====

Running Test...

 Test Accuracy: 0.94973



## Model Save

In [17]:
torch.save(model.state_dict(), "./models/kbalbert_after_transfer_learning.pt")

## Inference

In [18]:
load_model = AInalyst(pretrained_model=albert)
load_model.to(device)
load_model = torch.nn.DataParallel(load_model)
load_model.load_state_dict(torch.load("./models/kbalbert_after_transfer_learning.pt"))

<All keys matched successfully>

In [19]:
def inference_tokenized_fn(data):
    outputs = tokenizer(data["article"], padding="max_length", max_length=MAX_LEN, truncation=True)
    return outputs

inference_dataset = load_dataset("csv", data_files=f"./data/inference_data.csv")["train"]
inference_dataset = inference_dataset.map(inference_tokenized_fn,
                                          remove_columns=["Unnamed: 0", "company", "title", "opinion", "firm", "date", "article"])

Using custom data configuration default-68bfb0a5382dece9
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-68bfb0a5382dece9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/piai/.cache/huggingface/datasets/csv/default-68bfb0a5382dece9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-d946ad765fea21f9.arrow


In [20]:
inference_dataloader = torch.utils.data.DataLoader(
    inference_dataset,
    sampler = torch.utils.data.SequentialSampler(inference_dataset),
    batch_size = 1,
    collate_fn = data_collator,
)

In [21]:
load_model.eval()

probabilities = list()
predictions = list()

for step, batch in enumerate(tqdm(inference_dataloader, desc="inference", mininterval=0.1)):
    batch_input_ids = batch["input_ids"].to(device)
    batch_attention_mask = batch["attention_mask"].to(device)

    with torch.no_grad():
        logits = load_model(
            input_ids = batch_input_ids,
            attention_mask = batch_attention_mask,
        )
        
        prob = torch.nn.functional.softmax(logits, dim=-1)
        predict = torch.argmax(prob, axis=1)
        
        prob = np.round(np.max(prob.detach().cpu().numpy(), axis=1) * 100, 2)
        predict = predict.detach().cpu().numpy()
        
        probabilities.append(prob[0])
        predictions.append(predict[0])

inference: 100%|██████████████████████████| 50083/50083 [19:41<00:00, 42.40it/s]


In [22]:
inference_dataframe = pd.read_csv("./data/inference_data.csv")

convert_predictions = list(map(lambda x: "매수" if x == 1 else "매도", predictions))
inference_dataframe = inference_dataframe.drop(labels="Unnamed: 0", axis=1)
inference_dataframe["predictions"] = convert_predictions
inference_dataframe["pred_rate"] = probabilities
inference_dataframe.to_csv(f"./data/convert_inference_data.csv", index=False)

In [23]:
inference_dataframe["predictions"].value_counts()

매수    37901
매도    12182
Name: predictions, dtype: int64

In [24]:
ll_num = 0
rl_num = 20

# for arti, pred, pred_rate in zip(inference_dataframe["article"][ll_num:rl_num], inference_dataframe["predictions"][ll_num:rl_num], inference_dataframe["pred_rate"][ll_num:rl_num]):
#     print(pred, pred_rate)
#     print(arti)
#     print("=" * 50)

# for arti, pred, pred_rate in zip(inference_dataframe["article"], inference_dataframe["predictions"], inference_dataframe["pred_rate"]):
#     if pred_rate < 90:
#         print(pred, pred_rate)
#         print(arti)
#         print("=" * 50)

## Inference Data들 Merge해서 확인

In [25]:
inference_dataframe = pd.read_csv("./data/inference_data.csv")
inf_origin_case1 = pd.read_csv("./data/Inference_OnlyOrigindata_1Epoch_5Fold.csv")
inf_origin_case2 = pd.read_csv("./data/Inference_OnlyOrigindata_5Epoch_5Fold.csv")
inf_agument_case1 = pd.read_csv("./data/Inference_AllArgumentationData_5Epoch_5Fold.csv")

FileNotFoundError: [Errno 2] No such file or directory: './data/Inference_AllArgumentationData_5Epoch_5Fold.csv'

In [None]:
inference_dataframe["origin_case1_predictions"] = inf_origin_case1["predictions"]
inference_dataframe["origin_case1_pred_rate"] = inf_origin_case1["pred_rate"]

inference_dataframe["origin_case2_predictions"] = inf_origin_case2["predictions"]
inference_dataframe["origin_case2_pred_rate"] = inf_origin_case2["pred_rate"]

inference_dataframe["agument_case1_predictions"] = inf_agument_case1["predictions"]
inference_dataframe["agument_case1_pred_rate"] = inf_agument_case1["pred_rate"]

# inference_dataframe.to_csv(f"./data/merge_inference_result.csv", index=False)

In [None]:
merged_inference = pd.read_csv("./data/merge_inference_result.csv")

In [None]:
want_view_data = merged_inference[["article",
                                   "origin_case1_predictions", "origin_case1_pred_rate",
                                   "origin_case2_predictions", "origin_case2_pred_rate",
                                   "agument_case1_predictions", "agument_case1_pred_rate"]]

for idx, (item_idx, item_data) in enumerate(want_view_data.iterrows()):
    if idx == 100:
        break
        
    if (item_data["origin_case1_predictions"] != item_data["origin_case2_predictions"]) or \
        (item_data["origin_case2_predictions"] != item_data["agument_case1_predictions"]):
        print("origin_case1 result : ", item_data["origin_case1_predictions"], item_data["origin_case1_pred_rate"])
        print("origin_case2 result : ", item_data["origin_case2_predictions"], item_data["origin_case2_pred_rate"])
        print("agument_case1 result : ", item_data["agument_case1_predictions"], item_data["agument_case1_pred_rate"])
        print(item_data["article"])
        print("=" * 50)