In [38]:
import torch
import numpy as np
import pandas as pd
from transformers import AutoModel, AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset
from sklearn.model_selection import train_test_split, StratifiedKFold
from datasets import load_dataset
from transformers import DataCollatorWithPadding
import random
from tqdm import tqdm

In [2]:
strongbuy = pd.read_csv("./data/strongbuy.csv")
sell = pd.read_csv("./data/sell.csv")
holddown = pd.read_csv("./data/holddown.csv")

In [3]:
all_sell = pd.concat([sell, holddown])
all_data = pd.concat([strongbuy, all_sell])
all_data.sample(frac=1).reset_index(drop=True)

Unnamed: 0,filename,article,length,label
0,strongbuy176.txt,"신규설비 PO,PP 출하 개시! 2018년 8월 말 ~ 9월 초, PO/PP 신규제...",490,1
1,strongbuy387.txt,"16년 신규 수주금액 1,900억원 예상 16년 디스플레이부분의 신규 수주금액으로 ...",711,1
2,strongbuy105.txt,"▶ Investment Point 1. 1 분기 영업이익 1,780 억원(OPM 6...",1225,1
3,strongbuy454.txt,"횟감참치 가격 반등, 유가하락으로 인한 원가 절감 등에 힘입어 매출/이익 모두 성장...",434,1
4,strongbuy199.txt,- 미국 ECC 2차 증설의 의미: 장기적으로 세계 에틸렌 수익성이 좋다는 반증 D...,377,1
...,...,...,...,...
2745,holddown32.txt,엑스코프리 판매 증가 속도 높아져 1 분기 마일스톤으로 흑자전환 동사의 1 분기 매...,226,0
2746,holddown217.txt,한국타이어앤테크놀로지 (161390) 4Q19 Review: 안개속 방향성과 주주환...,292,0
2747,holddown147.txt,1분기 호실적에도 기존 HOLD의견을 유지. 이는 해당 실적이 올해 이익전망 혹은 ...,405,0
2748,holddown50.txt,"유상증자로 주주가치 희석 투자의견 시장수익률 유지, 목표주가 3,500원으로 12....",631,0


In [4]:
train_data, test_data = train_test_split(all_data, test_size=0.2, random_state=42, stratify=all_data[["label"]])

test_data["labels"] = test_data["label"]
test_data = test_data.drop(labels=["filename", "length", "label"], axis=1)

In [5]:
fold_length = 5
X = train_data.drop(labels=["filename", "length", "label"], axis=1)
y = train_data["label"]

skf = StratifiedKFold(n_splits=fold_length, shuffle=True, random_state=42)
fold_dataframe = list()

for fold_number, (train, valid) in enumerate(skf.split(X, y), 1):
    X_train, X_valid = X.iloc[train], X.iloc[valid]
    y_train, y_valid = y.iloc[train], y.iloc[valid]
    
    fold_train = X_train.loc[:]
    fold_train["labels"] = y_train
    
    fold_valid = X_valid.loc[:]
    fold_valid["labels"] = y_valid
    
    fold_train.to_csv(f"./data/train_data_fold{fold_number}.csv", index=False)
    fold_valid.to_csv(f"./data/valid_data_fold{fold_number}.csv", index=False)

test_data.to_csv(f"./data/test_data.csv", index=False)

In [6]:
kb_albert_model_path = "./model/kb-albert-char-base-v2"
albert = AutoModel.from_pretrained(kb_albert_model_path)
tokenizer = AutoTokenizer.from_pretrained(kb_albert_model_path)

tokenizer.truncation_side = "left"

Some weights of the model checkpoint at ./model/kb-albert-char-base-v2 were not used when initializing AlbertModel: ['predictions.bias', 'predictions.dense.bias', 'predictions.dense.weight', 'sop_classifier.classifier.bias', 'predictions.LayerNorm.bias', 'predictions.decoder.weight', 'predictions.decoder.bias', 'predictions.LayerNorm.weight', 'sop_classifier.classifier.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
MAX_LEN = 512

def tokenized_fn(data):
    outputs = tokenizer(data["article"], padding="max_length", max_length=MAX_LEN, truncation=True)
    outputs["labels"] = data["labels"]
    return outputs

dataset_list = list()
for fold_number in range(1, fold_length+1):
    train_dataset = load_dataset("csv", data_files=f"./data/train_data_fold{fold_number}.csv")["train"]
    valid_dataset = load_dataset("csv", data_files=f"./data/valid_data_fold{fold_number}.csv")["train"]

    train_dataset = train_dataset.map(tokenized_fn, remove_columns=["article"])
    valid_dataset = valid_dataset.map(tokenized_fn, remove_columns=["article"])
    
    dataset_list.append([train_dataset, valid_dataset])

test_dataset = load_dataset("csv", data_files=f"./data/test_data.csv")["train"]
test_dataset = test_dataset.map(tokenized_fn, remove_columns=["article"])

Using custom data configuration default-45fad79363d31e69


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-45fad79363d31e69/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-45fad79363d31e69/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-098db09835a40860


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-098db09835a40860/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-098db09835a40860/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1760 [00:00<?, ?ex/s]

  0%|          | 0/440 [00:00<?, ?ex/s]

Using custom data configuration default-f921ea277fbe6e80


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-f921ea277fbe6e80/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-f921ea277fbe6e80/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-967dc1f774a57ea8


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-967dc1f774a57ea8/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-967dc1f774a57ea8/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1760 [00:00<?, ?ex/s]

  0%|          | 0/440 [00:00<?, ?ex/s]

Using custom data configuration default-025afa39e23e8592


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-025afa39e23e8592/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-025afa39e23e8592/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-67aff40025cf923d


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-67aff40025cf923d/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-67aff40025cf923d/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1760 [00:00<?, ?ex/s]

  0%|          | 0/440 [00:00<?, ?ex/s]

Using custom data configuration default-daa17a2c7ea3fa0b


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-daa17a2c7ea3fa0b/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-daa17a2c7ea3fa0b/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-cc2c34594246d9a5


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-cc2c34594246d9a5/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-cc2c34594246d9a5/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1760 [00:00<?, ?ex/s]

  0%|          | 0/440 [00:00<?, ?ex/s]

Using custom data configuration default-3397bf8884bcce67


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-3397bf8884bcce67/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-3397bf8884bcce67/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-c3b3e30ff96a66bf


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-c3b3e30ff96a66bf/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-c3b3e30ff96a66bf/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1760 [00:00<?, ?ex/s]

  0%|          | 0/440 [00:00<?, ?ex/s]

Using custom data configuration default-443f1b8ad4a48e7b


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-443f1b8ad4a48e7b/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-443f1b8ad4a48e7b/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/550 [00:00<?, ?ex/s]

In [8]:
batch_size = 4

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

dataloader_list = list()
for train_fold, valid_fold in dataset_list:
    train_dataloader = torch.utils.data.DataLoader(
        train_fold,
        sampler = torch.utils.data.RandomSampler(train_fold),
        batch_size = batch_size,
        collate_fn = data_collator,
    )

    validation_dataloader = torch.utils.data.DataLoader(
        valid_fold,
        sampler = torch.utils.data.SequentialSampler(valid_fold),
        batch_size = batch_size,
        collate_fn = data_collator,
    )
    
    dataloader_list.append([train_dataloader, validation_dataloader])

test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    sampler = torch.utils.data.SequentialSampler(test_dataset),
    batch_size = batch_size,
    collate_fn = data_collator,
)

In [9]:
class ClassificationHead(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.dropout = torch.nn.Dropout(0.25)
        self.out_proj = torch.nn.Linear(768, 2)
    
    def forward(self, features):
        # 보통 분류기에선 start 토큰에 분류 결과를 담음
        x = features[:, 0, :]    # take <s> token (equiv. to [CLS])
        x = x.reshape(-1, x.size(-1))
        x = self.dropout(x)

        x = self.out_proj(x)
        return x

class AInalyst(torch.nn.Module):
    def __init__(self, pretrained_model):
        super(AInalyst, self).__init__()
        self.pretrained = pretrained_model
        self.classifier = ClassificationHead()
    
    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.pretrained(
            input_ids=input_ids,
            attention_mask=attention_mask,
            # labels=labels
        )
        self.labels = labels
        logits = self.classifier(outputs["last_hidden_state"])
        # prob = torch.nn.functional.softmax(logits, dim=-1)
        
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits, labels)
            return logits, loss
        else:
            return logits

In [10]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AInalyst(pretrained_model=albert)
model.to(device)
model = torch.nn.DataParallel(model)
isParallel = True

In [12]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

In [13]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [14]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

epochs = 1
for epoch in range(epochs):
    print(f"============ Epoch {epoch+1}/{epochs} ============")
    print("Training...")
    
    for fold_number, (train_dataloader, validation_dataloader) in enumerate(dataloader_list, 1):
        print(f"===== Epoch {epoch+1}/{epochs} - Fold {fold_number}/{fold_length} =====")
        total_train_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_input_ids = batch["input_ids"].to(device)
            batch_attention_mask = batch["attention_mask"].to(device)
            batch_labels = batch["labels"].to(device)

            model.zero_grad()

            logits, loss = model(
                input_ids = batch_input_ids,
                attention_mask = batch_attention_mask,
                labels = batch_labels,
            )

            if isParallel:
                loss = loss.mean()

            total_train_loss += loss.item()
            loss.backward()
            optimizer.step()

            # if step % 1000 == 0 and not step == 0:
            #     print("step : {:>5,} of {:>5,} loss: {:.5f}".format(step, len(train_dataloader), loss.item()))

        avg_train_loss = total_train_loss / len(train_dataloader)
        print()
        print(" Average training loss: {0:.5f}".format(avg_train_loss))

        # Validation
        print()
        print("Running Validation...")

        model.eval()
        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        for step, batch in enumerate(validation_dataloader):
            batch_input_ids = batch["input_ids"].to(device)
            batch_attention_mask = batch["attention_mask"].to(device)
            batch_labels = batch["labels"].to(device)

            with torch.no_grad():
                logits, loss = model(
                    input_ids = batch_input_ids,
                    attention_mask = batch_attention_mask,
                    labels = batch_labels,
                )

                if isParallel:
                    loss = loss.mean()

                total_eval_loss += loss.item()
                logits = logits.detach().cpu().numpy()
                label_ids = batch_labels.to("cpu").numpy()
                total_eval_accuracy += flat_accuracy(logits, label_ids)

        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print()
        print(" Valid Accuracy: {0:.5f}".format(avg_val_accuracy))
    
    # Test
    print(f"===== Epoch {epoch+1}/{epochs} - Test =====")
    print()
    print("Running Test...")

    model.eval()
    total_test_accuracy = 0
    total_test_loss = 0
    nb_test_steps = 0
    
    for step, batch in enumerate(test_dataloader):
        batch_input_ids = batch["input_ids"].to(device)
        batch_attention_mask = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)

        with torch.no_grad():
            logits, loss = model(
                input_ids = batch_input_ids,
                attention_mask = batch_attention_mask,
                labels = batch_labels,
            )

            if isParallel:
                loss = loss.mean()

            total_test_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            label_ids = batch_labels.to("cpu").numpy()
            total_test_accuracy += flat_accuracy(logits, label_ids)

    avg_test_accuracy = total_test_accuracy / len(test_dataloader)
    print(" Test Accuracy: {0:.5f}".format(avg_test_accuracy))
    print()

Training...
===== Epoch 1/1 - Fold 1/5 =====





 Average training loss: 0.47363

Running Validation...

 Valid Accuracy: 0.86136
===== Epoch 1/1 - Fold 2/5 =====

 Average training loss: 0.20610

Running Validation...

 Valid Accuracy: 0.93864
===== Epoch 1/1 - Fold 3/5 =====

 Average training loss: 0.10003

Running Validation...

 Valid Accuracy: 0.99318
===== Epoch 1/1 - Fold 4/5 =====

 Average training loss: 0.03394

Running Validation...

 Valid Accuracy: 0.99318
===== Epoch 1/1 - Fold 5/5 =====

 Average training loss: 0.05380

Running Validation...

 Valid Accuracy: 0.98864
===== Epoch 1/1 - Test =====

Running Test...
 Test Accuracy: 0.94203



## Model Save

In [15]:
# torch.save(model.state_dict(), "./model/kbalbert_epoch1_fold5.pt")

## Inference

In [16]:
load_model = AInalyst(pretrained_model=albert)
load_model.to(device)
load_model = torch.nn.DataParallel(load_model)
load_model.load_state_dict(torch.load("./model/kbalbert_epoch5_fold5.pt"))

<All keys matched successfully>

In [17]:
def inference_tokenized_fn(data):
    outputs = tokenizer(data["article"], padding="max_length", max_length=MAX_LEN, truncation=True)
    return outputs

inference_dataset = load_dataset("csv", data_files=f"./data/inference_data.csv")["train"]
inference_dataset = inference_dataset.map(inference_tokenized_fn,
                                          remove_columns=["Unnamed: 0", "company", "title", "opinion", "firm", "date", "article"])

Using custom data configuration default-68bfb0a5382dece9
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-68bfb0a5382dece9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/piai/.cache/huggingface/datasets/csv/default-68bfb0a5382dece9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-0c67678741f0cfd7.arrow


In [68]:
inference_dataloader = torch.utils.data.DataLoader(
    inference_dataset,
    sampler = torch.utils.data.SequentialSampler(inference_dataset),
    batch_size = 1,
    collate_fn = data_collator,
)

In [70]:
load_model.eval()

probabilities = list()
predictions = list()

for step, batch in enumerate(tqdm(inference_dataloader, desc="inference", mininterval=0.1)):
    batch_input_ids = batch["input_ids"].to(device)
    batch_attention_mask = batch["attention_mask"].to(device)

    with torch.no_grad():
        logits = load_model(
            input_ids = batch_input_ids,
            attention_mask = batch_attention_mask,
        )
        
        prob = torch.nn.functional.softmax(logits, dim=-1)
        predict = torch.argmax(prob, axis=1)
        
        prob = np.trunc(np.max(prob.detach().cpu().numpy(), axis=1) * 100)
        predict = predict.detach().cpu().numpy()
        
        probabilities.append(prob[0])
        predictions.append(predict[0])

inference: 100%|██████████████████████████| 50083/50083 [20:16<00:00, 41.17it/s]


In [78]:
inference_dataframe = pd.read_csv("./data/inference_data.csv")

convert_predictions = list(map(lambda x: "매수" if x == 1 else "매도", predictions))
inference_dataframe = inference_dataframe.drop(labels="Unnamed: 0", axis=1)
inference_dataframe["predictions"] = convert_predictions
inference_dataframe["pred_rate"] = probabilities
inference_dataframe.to_csv(f"./data/convert_inference_data.csv", index=False)