In [1]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoModel, AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset
from sklearn.model_selection import train_test_split, StratifiedKFold
from datasets import load_dataset
from transformers import DataCollatorWithPadding
import random
from tqdm import tqdm

2022-08-17 11:06:28.857275: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


In [2]:
# normal data load
strongbuy = pd.read_csv("./data/strongbuy.csv")
sell = pd.read_csv("./data/sell.csv")
holddown = pd.read_csv("./data/holddown.csv")

In [3]:
# expand eata load
strongbuy_expand_google_en = pd.read_csv("./data/strongbuy_expand_google_en.csv")
strongbuy_expand_google_zhcn = pd.read_csv("./data/strongbuy_expand_google_zhcn.csv")
strongbuy_expand_pororo_en = pd.read_csv("./data/strongbuy_expand_pororo_en.csv")

sell_expand_google_en = pd.read_csv("./data/sell_expand_google_en.csv")
sell_expand_google_zhcn = pd.read_csv("./data/sell_expand_google_zhcn.csv")
sell_expand_pororo_en = pd.read_csv("./data/sell_expand_pororo_en.csv")

holddown_expand_google_en = pd.read_csv("./data/holddown_expand_google_en.csv")
holddown_expand_google_zhcn = pd.read_csv("./data/holddown_expand_google_zhcn.csv")
holddown_expand_pororo_en = pd.read_csv("./data/holddown_expand_pororo_en.csv")

In [4]:
all_sell = pd.concat([sell, holddown])
all_data = pd.concat([strongbuy, all_sell])

# all_data = pd.concat([strongbuy, strongbuy_expand_google_en, strongbuy_expand_google_zhcn, strongbuy_expand_pororo_en,
#                       sell, sell_expand_google_en, sell_expand_google_zhcn, sell_expand_pororo_en,
#                       holddown, holddown_expand_google_en, holddown_expand_google_zhcn, holddown_expand_pororo_en])
all_data.sample(frac=1).reset_index(drop=True)

Unnamed: 0,filename,article,length,label
0,strongbuy65.txt," 현대엘리베이터 투자의견 STRONG BUY를 유지하며, 목표주가를 기존 108,...",519,1
1,strongbuy50.txt,"(cid:1) 국제유가 급락 영향으로, 2014년 4분기 실적은 부진할 것으로 전망...",241,1
2,strongbuy466.txt,기타 사업장의 수익성 예상치 또한 기대이상으로 영업이익률을 17.5%까지 기록했다는...,354,1
3,strongbuy301.txt,"투자의견 STRONG BUY 유지, 목표주가 150,000원 상향 한국항공우주 투자...",539,1
4,strongbuy101.txt,LG디스플레이 (034220) - Strong Buy 17년 1분기 Preview ...,373,1
...,...,...,...,...
2745,se11_content14.txt,"삼성중공업 (010140 KS, REDUCE, TP: 12,100 원): 부진한 3...",798,0
2746,strongbuy126.txt,녹십자 (006280) - STRONG BUY Investment Point - 3...,573,1
2747,strongbuy59.txt,"부문별 영업이익의 경우, ‘기초소재부문 3,158억원(전분기 3,212억원), 전지...",325,1
2748,strongbuy396.txt,"롯데케미칼(011170) : 다시 돌아온 그대 투자의견 Strong Buy, 목표주...",769,1


In [5]:
print(f"{len(strongbuy)/len(strongbuy)} : {(len(sell) + len(holddown)) / len(strongbuy):.4f}")

1.0 : 0.6799


In [6]:
train_data, test_data = train_test_split(all_data, test_size=0.2, random_state=42, stratify=all_data[["label"]])

test_data["labels"] = test_data["label"]
test_data = test_data.drop(labels=["filename", "length", "label"], axis=1)

In [7]:
fold_length = 5
X = train_data.drop(labels=["filename", "length", "label"], axis=1)
y = train_data["label"]

skf = StratifiedKFold(n_splits=fold_length, shuffle=True, random_state=42)
fold_dataframe = list()

for fold_number, (train, valid) in enumerate(skf.split(X, y), 1):
    X_train, X_valid = X.iloc[train], X.iloc[valid]
    y_train, y_valid = y.iloc[train], y.iloc[valid]
    
    fold_train = X_train.loc[:]
    fold_train["labels"] = y_train
    
    fold_valid = X_valid.loc[:]
    fold_valid["labels"] = y_valid
    
    fold_train.to_csv(f"./data/train_data_fold{fold_number}.csv", index=False)
    fold_valid.to_csv(f"./data/valid_data_fold{fold_number}.csv", index=False)

test_data.to_csv(f"./data/test_data.csv", index=False)

In [8]:
kb_albert_model_path = "./model/kb-albert-char-base-v2"
albert = AutoModel.from_pretrained(kb_albert_model_path)
tokenizer = AutoTokenizer.from_pretrained(kb_albert_model_path)

tokenizer.truncation_side = "left"

Some weights of the model checkpoint at ./model/kb-albert-char-base-v2 were not used when initializing AlbertModel: ['predictions.decoder.weight', 'predictions.LayerNorm.weight', 'sop_classifier.classifier.weight', 'predictions.decoder.bias', 'predictions.dense.weight', 'predictions.LayerNorm.bias', 'predictions.bias', 'predictions.dense.bias', 'sop_classifier.classifier.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
MAX_LEN = 512

def tokenized_fn(data):
    outputs = tokenizer(data["article"], padding="max_length", max_length=MAX_LEN, truncation=True)
    outputs["labels"] = data["labels"]
    return outputs

dataset_list = list()
for fold_number in range(1, fold_length+1):
    train_dataset = load_dataset("csv", data_files=f"./data/train_data_fold{fold_number}.csv")["train"]
    valid_dataset = load_dataset("csv", data_files=f"./data/valid_data_fold{fold_number}.csv")["train"]

    train_dataset = train_dataset.map(tokenized_fn, remove_columns=["article"])
    valid_dataset = valid_dataset.map(tokenized_fn, remove_columns=["article"])
    
    dataset_list.append([train_dataset, valid_dataset])

test_dataset = load_dataset("csv", data_files=f"./data/test_data.csv")["train"]
test_dataset = test_dataset.map(tokenized_fn, remove_columns=["article"])

Using custom data configuration default-7fd20440fb06147e


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-7fd20440fb06147e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-7fd20440fb06147e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-7038ac638b89b583


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-7038ac638b89b583/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-7038ac638b89b583/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1760 [00:00<?, ?ex/s]

  0%|          | 0/440 [00:00<?, ?ex/s]

Using custom data configuration default-c1f9d18edec93a8e


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-c1f9d18edec93a8e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-c1f9d18edec93a8e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-3c2c95d82e9ad4c4


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-3c2c95d82e9ad4c4/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-3c2c95d82e9ad4c4/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1760 [00:00<?, ?ex/s]

  0%|          | 0/440 [00:00<?, ?ex/s]

Using custom data configuration default-d1fffbdb0a9c391f


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-d1fffbdb0a9c391f/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-d1fffbdb0a9c391f/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-8859db2e9a316acb


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-8859db2e9a316acb/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-8859db2e9a316acb/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1760 [00:00<?, ?ex/s]

  0%|          | 0/440 [00:00<?, ?ex/s]

Using custom data configuration default-0fe463e97c52e7a1


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-0fe463e97c52e7a1/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-0fe463e97c52e7a1/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-cf9f7b8a5024fd69


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-cf9f7b8a5024fd69/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-cf9f7b8a5024fd69/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1760 [00:00<?, ?ex/s]

  0%|          | 0/440 [00:00<?, ?ex/s]

Using custom data configuration default-d6f7c2958b83ce76


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-d6f7c2958b83ce76/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-d6f7c2958b83ce76/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-d7e5708eb5c5cb4e


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-d7e5708eb5c5cb4e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-d7e5708eb5c5cb4e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1760 [00:00<?, ?ex/s]

  0%|          | 0/440 [00:00<?, ?ex/s]

Using custom data configuration default-14054ea53b9fd6a7


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-14054ea53b9fd6a7/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-14054ea53b9fd6a7/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/550 [00:00<?, ?ex/s]

In [10]:
batch_size = 16

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

dataloader_list = list()
for train_fold, valid_fold in dataset_list:
    train_dataloader = torch.utils.data.DataLoader(
        train_fold,
        sampler = torch.utils.data.RandomSampler(train_fold),
        batch_size = batch_size,
        collate_fn = data_collator,
    )

    validation_dataloader = torch.utils.data.DataLoader(
        valid_fold,
        sampler = torch.utils.data.SequentialSampler(valid_fold),
        batch_size = batch_size,
        collate_fn = data_collator,
    )
    
    dataloader_list.append([train_dataloader, validation_dataloader])

test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    sampler = torch.utils.data.SequentialSampler(test_dataset),
    batch_size = batch_size,
    collate_fn = data_collator,
)

In [11]:
class ClassificationHead(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.dropout = torch.nn.Dropout(0.25)
        self.out_proj = torch.nn.Linear(768, 2)
    
    def forward(self, features):
        # 보통 분류기에선 start 토큰에 분류 결과를 담음
        x = features[:, 0, :]    # take <s> token (equiv. to [CLS])
        x = x.reshape(-1, x.size(-1))
        x = self.dropout(x)

        x = self.out_proj(x)
        return x

class AInalyst(torch.nn.Module):
    def __init__(self, pretrained_model):
        super(AInalyst, self).__init__()
        self.pretrained = pretrained_model
        self.classifier = ClassificationHead()
    
    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.pretrained(
            input_ids=input_ids,
            attention_mask=attention_mask,
            # labels=labels
        )
        self.labels = labels
        logits = self.classifier(outputs["last_hidden_state"])
        # prob = torch.nn.functional.softmax(logits, dim=-1)
        
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)
            return logits, loss
        else:
            return logits

In [12]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AInalyst(pretrained_model=albert)
model.to(device)
model = torch.nn.DataParallel(model)
isParallel = True

In [14]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [15]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [16]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

epochs = 3
for epoch in range(epochs):
    print(f"============ Epoch {epoch+1}/{epochs} ============")
    print("Training...")
    
    for fold_number, (train_dataloader, validation_dataloader) in enumerate(dataloader_list, 1):
        print(f"===== Epoch {epoch+1}/{epochs} - Fold {fold_number}/{fold_length} =====")
        total_train_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_input_ids = batch["input_ids"].to(device)
            batch_attention_mask = batch["attention_mask"].to(device)
            batch_labels = batch["labels"].to(device)

            model.zero_grad()

            logits, loss = model(
                input_ids = batch_input_ids,
                attention_mask = batch_attention_mask,
                labels = batch_labels,
            )

            if isParallel:
                loss = loss.mean()

            total_train_loss += loss.item()
            loss.backward()
            optimizer.step()

            # if step % 1000 == 0 and not step == 0:
            #     print("step : {:>5,} of {:>5,} loss: {:.5f}".format(step, len(train_dataloader), loss.item()))

        avg_train_loss = total_train_loss / len(train_dataloader)
        print()
        print(" Average training loss: {0:.5f}".format(avg_train_loss))

        # Validation
        print()
        print("Running Validation...")

        model.eval()
        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        for step, batch in enumerate(validation_dataloader):
            batch_input_ids = batch["input_ids"].to(device)
            batch_attention_mask = batch["attention_mask"].to(device)
            batch_labels = batch["labels"].to(device)

            with torch.no_grad():
                logits, loss = model(
                    input_ids = batch_input_ids,
                    attention_mask = batch_attention_mask,
                    labels = batch_labels,
                )

                if isParallel:
                    loss = loss.mean()

                total_eval_loss += loss.item()
                logits = logits.detach().cpu().numpy()
                label_ids = batch_labels.to("cpu").numpy()
                total_eval_accuracy += flat_accuracy(logits, label_ids)

        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print()
        print(" Valid Accuracy: {0:.5f}".format(avg_val_accuracy))
    
    # Test
    print(f"===== Epoch {epoch+1}/{epochs} - Test =====")
    print()
    print("Running Test...")

    model.eval()
    total_test_accuracy = 0
    total_test_loss = 0
    nb_test_steps = 0
    
    for step, batch in enumerate(test_dataloader):
        batch_input_ids = batch["input_ids"].to(device)
        batch_attention_mask = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)

        with torch.no_grad():
            logits, loss = model(
                input_ids = batch_input_ids,
                attention_mask = batch_attention_mask,
                labels = batch_labels,
            )

            if isParallel:
                loss = loss.mean()

            total_test_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            label_ids = batch_labels.to("cpu").numpy()
            total_test_accuracy += flat_accuracy(logits, label_ids)

    avg_test_accuracy = total_test_accuracy / len(test_dataloader)
    print()
    print(" Test Accuracy: {0:.5f}".format(avg_test_accuracy))
    print()

Training...
===== Epoch 1/5 - Fold 1/5 =====





 Average training loss: 0.60048

Running Validation...

 Valid Accuracy: 0.78125
===== Epoch 1/5 - Fold 2/5 =====

 Average training loss: 0.35368

Running Validation...

 Valid Accuracy: 0.89062
===== Epoch 1/5 - Fold 3/5 =====

 Average training loss: 0.18220

Running Validation...

 Valid Accuracy: 0.95312
===== Epoch 1/5 - Fold 4/5 =====

 Average training loss: 0.09401

Running Validation...

 Valid Accuracy: 0.98214
===== Epoch 1/5 - Fold 5/5 =====

 Average training loss: 0.04777

Running Validation...

 Valid Accuracy: 0.99554
===== Epoch 1/5 - Test =====

Running Test...

 Test Accuracy: 0.92024

Training...
===== Epoch 2/5 - Fold 1/5 =====

 Average training loss: 0.01887

Running Validation...

 Valid Accuracy: 0.99777
===== Epoch 2/5 - Fold 2/5 =====

 Average training loss: 0.00897

Running Validation...

 Valid Accuracy: 1.00000
===== Epoch 2/5 - Fold 3/5 =====

 Average training loss: 0.00490

Running Validation...

 Valid Accuracy: 1.00000
===== Epoch 2/5 - Fold 4/5 ==

KeyboardInterrupt: 

## Model Save

In [None]:
torch.save(model.state_dict(), "./model/kbalbert_epoch5_fold5.pt")

## Inference

In [None]:
load_model = AInalyst(pretrained_model=albert)
load_model.to(device)
load_model = torch.nn.DataParallel(load_model)
load_model.load_state_dict(torch.load("./model/kbalbert_epoch5_fold5.pt"))

In [None]:
def inference_tokenized_fn(data):
    outputs = tokenizer(data["article"], padding="max_length", max_length=MAX_LEN, truncation=True)
    return outputs

inference_dataset = load_dataset("csv", data_files=f"./data/inference_data.csv")["train"]
inference_dataset = inference_dataset.map(inference_tokenized_fn,
                                          remove_columns=["Unnamed: 0", "company", "title", "opinion", "firm", "date", "article"])

In [None]:
inference_dataloader = torch.utils.data.DataLoader(
    inference_dataset,
    sampler = torch.utils.data.SequentialSampler(inference_dataset),
    batch_size = 1,
    collate_fn = data_collator,
)

In [None]:
load_model.eval()

probabilities = list()
predictions = list()

for step, batch in enumerate(tqdm(inference_dataloader, desc="inference", mininterval=0.1)):
    batch_input_ids = batch["input_ids"].to(device)
    batch_attention_mask = batch["attention_mask"].to(device)

    with torch.no_grad():
        logits = load_model(
            input_ids = batch_input_ids,
            attention_mask = batch_attention_mask,
        )
        
        prob = torch.nn.functional.softmax(logits, dim=-1)
        predict = torch.argmax(prob, axis=1)
        
        prob = np.round(np.max(prob.detach().cpu().numpy(), axis=1) * 100, 2)
        predict = predict.detach().cpu().numpy()
        
        probabilities.append(prob[0])
        predictions.append(predict[0])

In [None]:
inference_dataframe = pd.read_csv("./data/inference_data.csv")

convert_predictions = list(map(lambda x: "매수" if x == 1 else "매도", predictions))
inference_dataframe = inference_dataframe.drop(labels="Unnamed: 0", axis=1)
inference_dataframe["predictions"] = convert_predictions
inference_dataframe["pred_rate"] = probabilities
inference_dataframe.to_csv(f"./data/convert_inference_data.csv", index=False)

In [None]:
inference_dataframe["predictions"].value_counts()

In [None]:
ll_num = 0
rl_num = 20

# for arti, pred, pred_rate in zip(inference_dataframe["article"][ll_num:rl_num], inference_dataframe["predictions"][ll_num:rl_num], inference_dataframe["pred_rate"][ll_num:rl_num]):
#     print(pred, pred_rate)
#     print(arti)
#     print("=" * 50)

# for arti, pred, pred_rate in zip(inference_dataframe["article"], inference_dataframe["predictions"], inference_dataframe["pred_rate"]):
#     if pred_rate < 90:
#         print(pred, pred_rate)
#         print(arti)
#         print("=" * 50)

## Inference Data들 Merge해서 확인

In [27]:
inference_dataframe = pd.read_csv("./data/inference_data.csv")
inf_origin_case1 = pd.read_csv("./data/Inference_OnlyOrigindata_1Epoch_5Fold.csv")
inf_origin_case2 = pd.read_csv("./data/Inference_OnlyOrigindata_5Epoch_5Fold.csv")
inf_agument_case1 = pd.read_csv("./data/Inference_AllArgumentationData_5Epoch_5Fold.csv")

In [28]:
inference_dataframe["origin_case1_predictions"] = inf_origin_case1["predictions"]
inference_dataframe["origin_case1_pred_rate"] = inf_origin_case1["pred_rate"]

inference_dataframe["origin_case2_predictions"] = inf_origin_case2["predictions"]
inference_dataframe["origin_case2_pred_rate"] = inf_origin_case2["pred_rate"]

inference_dataframe["agument_case1_predictions"] = inf_agument_case1["predictions"]
inference_dataframe["agument_case1_pred_rate"] = inf_agument_case1["pred_rate"]

inference_dataframe.to_csv(f"./data/merge_inference_result.csv", index=False)

In [30]:
merged_inference = pd.read_csv("./data/merge_inference_result.csv")

In [59]:
want_view_data = merged_inference[["article",
                                   "origin_case1_predictions", "origin_case1_pred_rate",
                                   "origin_case2_predictions", "origin_case2_pred_rate",
                                   "agument_case1_predictions", "agument_case1_pred_rate"]]

for idx, (item_idx, item_data) in enumerate(want_view_data.iterrows()):
    if idx == 100:
        break
        
    if (item_data["origin_case1_predictions"] != item_data["origin_case2_predictions"]) or \
        (item_data["origin_case2_predictions"] != item_data["agument_case1_predictions"]):
        print("origin_case1 result : ", item_data["origin_case1_predictions"], item_data["origin_case1_pred_rate"])
        print("origin_case2 result : ", item_data["origin_case2_predictions"], item_data["origin_case2_pred_rate"])
        print("agument_case1 result : ", item_data["agument_case1_predictions"], item_data["agument_case1_pred_rate"])
        print(item_data["article"])
        print("=" * 50)

origin_case1 result :  매도 86.0
origin_case2 result :  매도 90.0
agument_case1 result :  매수 99.91


2Q22 Review : 높아진 컨센서스 상회LG이노텍의 2분기 매출액 3조 7,026억원(YoY+57.2%, QoQ-31.0%), 영업이익 2,899억원(YoY+83.7%, QoQ-21.0%) 최근 높아진 컨센서스 영업이익(매출액 3조 2,783억원, 영업이익2,545억원)을 상회하는 호실적을 발표. 광학솔루션사업부는 우호적인 환율속에 고객사의 판매량 호조와 높은 고부가 제품 비중효과가 작용. 기판소재사 업부는 1Q22부진했던 디스플레이 제품군의 양호한 실적과 반도체 패키지 기판의 견조한 수요와 생산능력 확대 효과 반영. 전장부품사업부는 차량용 통신모듈 및 모터가 증가 했으나 원자재 가격상승영향으로 수익성 개선이 지연.

origin_case1 result :  매수 99.0
origin_case2 result :  매수 97.0
agument_case1 result :  매도 97.92


투자의견 ‘매수’ 및 목표주가 150,000원 유지. 최대 실적 다시 경신삼성물산에 대해 투자의견 ‘매수’ 및 목표주가 150,000원 유지. 2Q22 실적은 글로벌 원자재 가격 상승•공급망 훼손 등 경영환경 악화 불구, 경영체질 개선 및 사업 경쟁력 강화 노력에 힘입어 외형과 수익성 모두 크게 개선, 역대 최대 영업이익 경신. 2분기 연속 실적 서프라이즈로 목표주가 상향요인 있으나, 여전한 목표 주가 괴리율과 최근 글로벌 경기 침체 우려에 따른 자회사 지분 가치 하락 리스크를 감안 목표주가 유지. ① 1H22 건설부문 신규 수주는 계열사 신규 투자, 베트남 복합 발전 등 8.6조원으로 연간목표 11조원 대비 73.5% 달성. ② 매출화 빠른 하이테크 공사 진행률 증가, 상사를 포함한 사업 전부문 안정적 이익 체력 구축으로 합병 이후 최초로 연간 영업이익 2조원 달성 전망. 매수 추천.

origin