In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write project files
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Select the compute device: the first CUDA GPU when one is available,
# otherwise the CPU so the notebook still runs on a runtime without a GPU.
import torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Base directory (on the mounted Drive) that every input/output path below is built from
path = '/content/drive/MyDrive/nlp_c/'

# **로그파일 연동 시키기**

In [None]:
import logging

def make_logger(name=None, log_file=None):
    """Return a logger that writes to the console and to a UTF-8 log file.

    The function is safe to call repeatedly (e.g. when the cell is re-run):
    handlers it has already attached to the logger are not attached again,
    so each record is emitted only once.

    Args:
        name: Name passed to ``logging.getLogger``; ``None`` selects the root logger.
        log_file: Path of the log file. When ``None``, the notebook-level ``path``
            is used as ``path + "logs/correct_final.log"``.

    Returns:
        logging.Logger: logger with level DEBUG, a console handler at INFO
        and a file handler at DEBUG, both using the same format.
    """
    import os

    if log_file is None:
        log_file = path + "logs/correct_final.log"
    log_file_abs = os.path.abspath(log_file)

    # 1. get the logger instance and let it pass every record on to its handlers
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)

    # 2. shared output format
    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    # Roles of handlers attached by earlier calls of this function; only these
    # are considered, so handlers installed by the environment are left alone.
    attached_roles = {getattr(h, "_make_logger_role", None) for h in logger.handlers}

    # 3. console handler: INFO and above
    if "console" not in attached_roles:
        console = logging.StreamHandler()
        console.setLevel(logging.INFO)
        console.setFormatter(formatter)
        console._make_logger_role = "console"
        logger.addHandler(console)

    # 4. file handler: DEBUG and above
    file_role = "file:" + log_file_abs
    if file_role not in attached_roles:
        # FileHandler does not create missing parent directories
        os.makedirs(os.path.dirname(log_file_abs), exist_ok=True)
        file_handler = logging.FileHandler(filename=log_file, encoding='utf-8')
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        file_handler._make_logger_role = file_role
        logger.addHandler(file_handler)

    return logger

In [None]:
# Build the notebook logger and emit one record per severity level as a smoke test
logger = make_logger()

for _emit, _text in (
    (logger.debug, "debug logging"),
    (logger.info, "info logging"),
    (logger.warning, "warning logging"),
    (logger.error, "error logging"),
    (logger.critical, "critical logging"),
):
    _emit(_text)

# **필요한 환경 다운 및 구축**

## 학습모델 패키지 다운 및 구축(KoBERT)

In [None]:
# Install ipywidgets and the KoBERT package straight from its GitHub repository (master branch)
!pip install ipywidgets  # for vscode
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [None]:
# 필요한 모듈 로딩
import pandas as pd
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm.notebook import tqdm
from tqdm import tqdm_notebook

#kobert
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

#transformers
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

In [None]:
# Load the KoBERT model together with its vocabulary
bertmodel, vocab = get_pytorch_kobert_model()

# **맞춤법 검사한 데이터 불러온 뒤 전처리(EDA 방법 포함)**

In [None]:
# Load the spell-checked training file and print its number of rows
train_d = pd.read_csv(path + 'final_data/' + 'correct_train_fin.csv', encoding = 'utf-8-sig')
print(len(train_d))

In [None]:
# Re-index the target: the raw `y` codes caused index errors in the model below,
# so every distinct code is mapped to a consecutive id starting at 0.
# After prediction the ids are mapped back to the original `y` codes.
# (Both directions are kept as str -> str dictionaries.)
y_dict = (
    pd.DataFrame({'origin_y': train_d['y'].unique()})
    .sort_values(by='origin_y')
)
y_dict['y'] = np.arange(len(y_dict))
y_dict = y_dict.astype('str')

# original code -> training id (used to build the training label)
s_dict0 = dict(zip(y_dict['origin_y'], y_dict['y']))
# training id -> original code (used to decode predictions later)
s_dict1 = dict(zip(y_dict['y'], y_dict['origin_y']))

In [None]:
# Persist the id -> original-code dictionary so that model predictions can be
# converted back to the original y values after inference
import pickle
with open(path+ 'final_data/' + 's_dictionary', 'wb') as f:
    pickle.dump(s_dict1, f)

In [None]:
# y_s: original code as text; label_s: the re-indexed training id looked up in s_dict0
_y_text = train_d['y'].astype('str')
train_d['y_s'] = _y_text
train_d['label_s'] = _y_text.map(s_dict0)

## EDA 부분

In [None]:
# Text augmentation setup: download the NLTK 'omw-1.4' resource
# pip install -U nltk
import nltk; 
nltk.download('omw-1.4');
# nltk.download('wordnet') # English version

In [None]:
# eda 폴더 생성
% cd /content/drive/MyDrive/nlp_c
# !git clone https://github.com/jasonwei20/eda_nlp
# !git clone https://github.com/catSirup/KorEDA
# eda는 eda_nlp/code 폴더에, wordnet.pickle 은 eda_nlp 폴더로 이동시키고, 진행
% cd eda_nlp/
# 추가적으로 augment.py 64번째 항에 eda -> EDA로 변경해야 실행됨

In [None]:
# Per-class sample counts in ascending order, then the classes with fewer than 500 samples.
# The counts are read positionally (first column) because the column name that
# value_counts() produces differs between pandas versions ('y_s' vs 'count').
s_class_n = pd.DataFrame(train_d['y_s'].value_counts().sort_values())
# s_class_n.to_csv(path + 'testset_class.csv', index=False, encoding='EUC-KR')
s_class = s_class_n[s_class_n.iloc[:, 0] < 500].index.tolist()
len(s_class)

In [None]:
# Split the rare classes (ordered from fewest samples) into three tiers that are augmented
# with different aug_num values (20, 10, 5), so the rarest classes receive the most
# augmentation and the imbalance between rare classes is reduced a little
s_class1 = s_class[:30]
s_class2 = s_class[30:60]
s_class3 = s_class[60:]

In [None]:
# Training rows belonging to each rare-class tier
few_d1 = train_d[train_d['y_s'].isin(s_class1)]
few_d2 = train_d[train_d['y_s'].isin(s_class2)]
few_d3 = train_d[train_d['y_s'].isin(s_class3)]

In [None]:
# n이 100개 이하인 클래스 뽑아서 augmentation 가능한 파일 형태로 만들어주기
txt_aug_list = [str(a) + '\t' + str(b) for a, b in zip(few_d1['label_s'], few_d1['clean_done'])]
with open(path + 'final_data/' + 'text_aug_1.txt', 'w') as f:
  f.write('\n'.join(txt_aug_list) + '\n')

txt_aug_list = [str(a) + '\t' + str(b) for a, b in zip(few_d2['label_s'], few_d2['clean_done'])]
with open(path + 'final_data/' + 'text_aug_2.txt', 'w') as f:
  f.write('\n'.join(txt_aug_list) + '\n')

txt_aug_list = [str(a) + '\t' + str(b) for a, b in zip(few_d3['label_s'], few_d3['clean_done'])]
with open(path + 'final_data/' + 'text_aug_3.txt', 'w') as f:
  f.write('\n'.join(txt_aug_list) + '\n')

# input file 형식 -> txt 파일 내 한 행 당 label + \t + text 형태로 들어간 파일 
# SR: Synonym Replacement, 특정 단어를 유의어로 교체
# RI: Random Insertion, 임의의 단어를 삽입
# RS: Random Swap, 문장 내 임의의 두 단어의 위치를 바꿈
# RD: Random Deletion: 임의의 단어를 삭제
!python code/augment.py --input=/content/drive/MyDrive/nlp_c/final_data/text_aug_1.txt --output=/content/drive/MyDrive/nlp_c/final_data/test_aug_eda_1.txt --num_aug=20 --alpha_sr=0.1 --alpha_rd=0.2 --alpha_ri=0.1 --alpha_rs=0.0
!python code/augment.py --input=/content/drive/MyDrive/nlp_c/final_data/text_aug_2.txt --output=/content/drive/MyDrive/nlp_c/final_data/test_aug_eda_2.txt --num_aug=10 --alpha_sr=0.1 --alpha_rd=0.2 --alpha_ri=0.1 --alpha_rs=0.0
!python code/augment.py --input=/content/drive/MyDrive/nlp_c/final_data/text_aug_3.txt --output=/content/drive/MyDrive/nlp_c/final_data/test_aug_eda_3.txt --num_aug=5 --alpha_sr=0.1 --alpha_rd=0.2 --alpha_ri=0.1 --alpha_rs=0.0

In [None]:
# Load the augmented files and append them to the rows of the remaining classes
# (the original rows of the augmented classes are dropped from train_d).
def _read_augmented(file_path):
    """Read a "label<TAB>text" file into a DataFrame with columns label_s / clean_done.

    Blank lines are skipped and each line is split on the first tab only, so a
    tab inside the text cannot produce extra columns.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        rows = [line.rstrip('\n').split('\t', 1) for line in file if line.strip()]
    return pd.DataFrame(rows, columns=['label_s', 'clean_done'])


aug_d1 = _read_augmented(path + 'final_data/' + 'test_aug_eda_1.txt')
aug_d2 = _read_augmented(path + 'final_data/' + 'test_aug_eda_2.txt')
aug_d3 = _read_augmented(path + 'final_data/' + 'test_aug_eda_3.txt')

train_d = train_d[~train_d['y_s'].isin(s_class)]
sample_d = pd.concat([train_d[['label_s', 'clean_done']], aug_d1, aug_d2, aug_d3], axis = 0, ignore_index = True)

In [None]:
# Character length of every text (values are cast to str first)
sample_d['len'] = sample_d['clean_done'].astype(str).map(len)

In [None]:
# Display 50 consecutive rows starting at row position i
i = 798000
sample_d.iloc[i:i+50]

# **처리한 데이터 kobert 모델에 학습**

In [None]:
# Split into train and test sets
from sklearn.model_selection import train_test_split
# dataset_train, dataset_test = train_test_split(train_d, test_size=0.2, shuffle=True, random_state=30)
# dataset_test.to_csv('/content/drive/MyDrive/nlp_c/testset.csv', index=False, encoding = 'EUC-KR')

# Stratifying on the target keeps the class proportions equal in both splits, which can help performance.
# Author's note: stratification raised an error while the target still had many classes with only 1-2 samples.
dataset_train, dataset_test, y_train, y_test = train_test_split(sample_d['clean_done'],
                               sample_d['label_s'], random_state=132, stratify=sample_d['label_s']) 
# Save the held-out split up front so it can be used for model validation later
dataset_test.to_csv(path + 'final_data/' + 'testset.csv', index=False, encoding = 'utf-8-sig')
y_test.to_csv(path + 'final_data/' + 'testset_y.csv', index=False, encoding = 'utf-8-sig')

In [None]:
# Turn each split into a list of [text, label] pairs with both entries as str
dataset_train = [list(map(str, pair)) for pair in zip(dataset_train, y_train)]
dataset_test = [list(map(str, pair)) for pair in zip(dataset_test, y_test)]

In [None]:
# Number of samples in each split
print(len(dataset_train))
print(len(dataset_test))

In [None]:
# First ten [text, label] pairs of each split
print(dataset_train[:10])
print(dataset_test[:10])

In [None]:
# Dataset that prepares rows for KoBERT
class BERTDataset(Dataset):
    """Rows of a dataset transformed with ``nlp.data.BERTSentenceTransform`` plus their labels.

    Args:
        dataset: sequence of rows; ``row[sent_idx]`` is the sentence and
            ``row[label_idx]`` the label (converted with ``np.int32``).
        sent_idx: position of the sentence inside a row.
        label_idx: position of the label inside a row.
        bert_tokenizer: tokenizer handed to ``BERTSentenceTransform``.
        max_len: forwarded as ``max_seq_length``.
        pad: forwarded as ``pad``.
        pair: forwarded as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_model_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [to_model_input([row[sent_idx]]) for row in dataset]
        self.labels = [np.int32(row[label_idx]) for row in dataset]

    def __getitem__(self, i):
        # transformed sentence fields followed by the label, as one flat tuple
        return (*self.sentences[i], self.labels[i])

    def __len__(self):
        return len(self.labels)

In [None]:
# Setting parameters
# TODO: these values have not been tuned yet; search for better ones later
max_len = 64
batch_size = 128
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5 # 0.0001 # 

# Build the tokenizer
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

In [None]:
# Transform both splits into model inputs (sentence at index 0, label at index 1; padded, single sentence)
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, pad = True, pair = False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, pad = True, pair = False)

In [None]:
# Inspect one training sentence: raw text and its tokens
sentence = dataset_train[30][0] 
print(sentence)
print(tok(sentence))

# The same sample after tokenization and padding
print(data_train[30])

In [None]:
# Wrap the datasets in PyTorch DataLoaders.
# num_workers is the number of worker subprocesses used for data loading;
# 5 overloaded the runtime, so it was lowered to 4.
# The training loader reshuffles the samples every epoch; the held-out loader keeps its order.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=4, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=4)

In [None]:
# Number of distinct classes; the classifier's num_classes has to match this value
print(len(y_dict))

In [None]:
# KoBERT classification model
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classification layer.

    Args:
        bert: module called as ``bert(input_ids=..., token_type_ids=..., attention_mask=...)``
            that returns a pair whose second element is the pooled output.
        hidden_size: input width of the linear layer (size of the pooled output).
        num_classes: number of output classes; defaults to ``len(y_dict)`` at
            class-definition time and has to match the number of labels.
        dr_rate: dropout probability applied to the pooled output; a falsy
            value (``None`` / ``0``) disables dropout.
        params: unused; kept so existing calls keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=len(y_dict),
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``: 1 for the first ``valid_length[i]`` positions of row i, else 0."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classifier output for a batch of token ids."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Without dropout the pooled output is classified directly
        # (previously `out` was undefined in that case).
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [None]:
# GPU 실행 오류 나면 사용
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# Build the classifier on top of the loaded KoBERT model (dropout 0.5) and move it to the device
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

In [None]:
# Parameter groups for the optimizer: parameters whose name contains an entry of
# `no_decay` get weight_decay 0.0, every other parameter gets 0.01
no_decay = ['bias', 'LayerNorm.weight']
_decayed, _undecayed = [], []
for _name, _param in model.named_parameters():
    if any(nd in _name for nd in no_decay):
        _undecayed.append(_param)
    else:
        _decayed.append(_param)
optimizer_grouped_parameters = [
    {'params': _decayed, 'weight_decay': 0.01},
    {'params': _undecayed, 'weight_decay': 0.0},
]

In [None]:
# AdamW over the two parameter groups; cross-entropy as the training loss
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Total number of optimizer steps (batches per epoch * epochs) and the share used for warmup
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [None]:
# Cosine learning-rate schedule with warmup, stepped once per batch in the training loop
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

In [None]:
def calc_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose arg-max along dim 1 equals ``Y``.

    Args:
        X: 2-D tensor of per-class scores, one row per sample.
        Y: tensor of target class indices, one per row of ``X``.

    Returns:
        Number of matches divided by the number of rows (a NumPy scalar).
    """
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_rows = predicted.size()[0]
    return n_correct / n_rows

In [None]:
# pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
highest_acc = 0
patience = 0

# Train the final model. After every epoch the model is evaluated on the held-out
# split, and a checkpoint is written whenever the mean held-out accuracy improves
# on the best value seen so far.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    model.eval()
    test_loss_sum = 0.0
    # no_grad: evaluation needs no gradients, which saves memory and time
    with torch.no_grad():
        for test_batch_id, (test_token_ids, test_valid_length, test_segment_ids, test_label) in enumerate(tqdm(test_dataloader)):
            test_token_ids = test_token_ids.long().to(device)
            test_segment_ids = test_segment_ids.long().to(device)
            test_label = test_label.long().to(device)
            # evaluate on the held-out batch itself (not on the last training batch)
            test_out = model(test_token_ids, test_valid_length, test_segment_ids)
            test_loss = loss_fn(test_out, test_label)
            test_loss_sum += test_loss.item()
            test_acc += calc_accuracy(test_out, test_label)
    # turn the running sums into per-batch means so epochs are comparable
    test_acc = test_acc / (test_batch_id+1)
    test_loss_mean = test_loss_sum / (test_batch_id+1)
    print("epoch {} test acc {}".format(e+1, test_acc))

    if test_acc > highest_acc:
        # remember the new best so later epochs only overwrite the checkpoint when they improve on it
        highest_acc = test_acc
        torch.save({
            'epoch': e,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': test_loss_mean,  # mean held-out loss of this epoch (plain float)
            }, path + 'final_data/' + 'correct_model_fin.pt')
        patience = 0
    else:
        print("test acc did not improved. best:{} current:{}".format(highest_acc, test_acc))
        patience += 1
        # early stopping: give up after more than 5 consecutive epochs without improvement
        if patience > 5:
            break
    print('current patience: {}'.format(patience))
    print("************************************************************************************")

In [None]:
# # 학습한 모델 pickle 형태로 저장

# import pickle
# # path = '/content/drive/MyDrive/nlp_c/'

# with open(path+'model_trial_fin_noclean.pickle', 'wb') as f:
#     pickle.dump(model, f)

# **코드북으로 제출형식 만들기**

## 한국표준산업분류(10차)_국문 자료 이용해서 코드 북 만들기

In [None]:
import numpy as np 
import pandas as pd

In [None]:
# Read the classification workbook from the project folder; the second sheet row (header=1) holds the column names
path = '/content/drive/MyDrive/nlp_c/'
code_book = pd.read_excel(path + '한국표준산업분류(10차)_국문.xlsx', header = 1)

In [None]:
# Keep only the rows that have a value in the '소분류(232)' column, then show the first ten
code_book = code_book.dropna(subset = ['소분류(232)'])
code_book[:10]

In [None]:
# Select the three code columns plus 'Unnamed: 5', skip the first row and renumber the index from 0
code = code_book[['대분류(21)', '중분류(77)', '소분류(232)', 'Unnamed: 5']][1:].reset_index(drop=True)

In [None]:
def na_to_code(data):
    """Forward-fill the missing entries of a sequence.

    Every entry for which ``pd.isna`` is true is replaced by the most recent
    non-missing entry before it. Missing entries at the very start stay
    missing because there is nothing to carry forward yet.

    Args:
        data: sequence of scalar values; it is not modified.

    Returns:
        list: new list of the same length with the gaps filled
        (an empty list for empty input).
    """
    filled = []
    # seed with the first entry so leading missing values are passed through unchanged
    last_seen = data[0] if len(data) else None
    for value in data:
        if not pd.isna(value):
            last_seen = value
        filled.append(last_seen)
    return filled

In [None]:
# Fill the gaps in each of the first three columns of `code` with na_to_code
big = na_to_code(code.iloc[:,0].tolist())
middle = na_to_code(code.iloc[:,1].tolist())
small = na_to_code(code.iloc[:,2].tolist())

In [None]:
# Assemble the code book: the three filled code levels, the small code as int64 (`y`)
# and the description column, then save it as an Excel workbook.
code_b = pd.DataFrame(zip(big,middle,small), columns = ['big', 'middle', 'small'])
code_b['y'] = code_b['small'].astype('int64')
code_b['name'] = code['Unnamed: 5']
# `encoding` is not passed: DataFrame.to_excel no longer accepts it in pandas 2.x
code_b.to_excel(path + 'codebook_dict.xlsx', index=False)

In [None]:
# Reload the saved code book, reading the three code columns as text
code_b = pd.read_excel(path + 'codebook_dict.xlsx', dtype = {'big': str, 'middle': str, 'small': str})

In [None]:
# `name` is for reference only, so it is left out. It is dropped by label (and
# ignored when already absent) so that re-running this cell does not strip
# another column each time.
code_b = code_b.drop(columns=['name'], errors='ignore')
# Dictionary keyed by the small-category value `y`; each value is the list of the remaining columns
dict_fin = code_b.set_index('y').T.to_dict('list')

## 학습한 모델 불러오기 및 환경 구축

In [None]:
# # train 할 때, 메모리를 많이 사용하여, 비우기
# import gc
# gc.collect()
# del model
# torch.cuda.empty_cache()

In [None]:
# Definitions needed before inference
# (same dataset wrapper as the one used for training)
class BERTDataset(Dataset):
    """Sentences transformed with ``nlp.data.BERTSentenceTransform`` together with their labels.

    Each row of ``dataset`` supplies the sentence at ``sent_idx`` and the label
    at ``label_idx``; labels are converted with ``np.int32``. ``max_len``,
    ``pad`` and ``pair`` are forwarded to the transform.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        sentence_transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [sentence_transform([record[sent_idx]]) for record in dataset]
        self.labels = [np.int32(record[label_idx]) for record in dataset]

    def __getitem__(self, i):
        # flat tuple: transformed sentence fields, then the label
        return (*self.sentences[i], self.labels[i])

    def __len__(self):
        return len(self.labels)

# Build the tokenizer
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# KoBERT classification model
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classification layer.

    Args:
        bert: module called as ``bert(input_ids=..., token_type_ids=..., attention_mask=...)``
            that returns a pair whose second element is the pooled output.
        hidden_size: input width of the linear layer (size of the pooled output).
        num_classes: number of output classes; has to match the number of
            classes the checkpoint was trained with.
        dr_rate: dropout probability applied to the pooled output; a falsy
            value (``None`` / ``0``) disables dropout.
        params: unused; kept so existing calls keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=225,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``: 1 for the first ``valid_length[i]`` positions of row i, else 0."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classifier output for a batch of token ids."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Without dropout the pooled output is classified directly
        # (previously `out` was undefined in that case).
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [None]:
# import pickle
# # 학습한 model 열기

# with open(path+'model_trial_fin_noclean.pickle', 'rb') as f:
#     model = pickle.load(f)

In [None]:
# Setting parameters
# TODO: these values have not been tuned yet; search for better ones later
max_len = 64
batch_size = 128
warmup_ratio = 0.1
num_epochs = 6
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5 # 0.0001 # 

In [None]:
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
# Rebuild the optimizer with the same parameter grouping that was used for training,
# so that its saved state can be restored
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# Path of the trained checkpoint. map_location places the stored tensors on `device`,
# so a checkpoint written on a GPU can also be restored on a CPU-only runtime.
checkpoint = torch.load(path + 'final_data/' + 'correct_model_fin.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

In [None]:
# Switch the model to evaluation mode before predicting
model.eval()

## 맞춤법 처리한 제출파일 제출 형식으로 바꾸기

In [None]:
# Load the spell-checked submission file
test = pd.read_csv(path + 'final_data/' + 'correct_sub_fin.csv', encoding = 'utf-8-sig')

In [None]:
# Build [text, label] rows for prediction; the label is the constant placeholder '0'
dataset_test = [[str(a), '0'] for a in test['clean_done']]
dataset_test[:20]

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Prediction function
# Relies on the notebook-level settings, which must be the same as during training
def predict_set(dataset_test):
    """Predict a class index for every row of ``dataset_test``.

    Uses the notebook-level ``model``, ``vocab``, ``max_len``, ``batch_size``
    and ``device``.

    Args:
        dataset_test: sequence of ``[sentence, label]`` rows. The label is
            required by ``BERTDataset`` but is not used for the prediction.

    Returns:
        list of int: index of the highest model output for each row, in input order.
    """
    tokenizer = get_tokenizer()
    tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

    data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=4)

    pd_list = []

    model.eval()  # make sure dropout is disabled while predicting
    # no_grad: inference needs no gradients, which saves memory and time
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _label in tqdm(test_dataloader):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)

            out = model(token_ids, valid_length, segment_ids)
            # arg-max per row on the tensor, collected batch by batch
            pd_list.extend(out.argmax(dim=1).cpu().tolist())

    return pd_list

In [None]:
# Predicted class index for every submission row
p_test = predict_set(dataset_test)

In [None]:
import pickle
# Load the dictionary that maps the training ids back to the original y values
# NOTE(review): pickle.load executes arbitrary code from the file; only load files this project wrote itself
with open(path+'final_data/' + 's_dictionary', 'rb') as f:
    s_dict = pickle.load(f)

In [None]:
test['predict_y'] = p_test
# 1. Convert the predicted ids back to the original y values used before training
test['predict_y'] = test['predict_y'].astype('str').map(s_dict) 
cols = ['digit_1', 'digit_2', 'digit_3']
# 2. Look up the code book by the small category to obtain the big/middle categories as well
test[cols] = test['predict_y'].astype('int64').map(dict_fin).apply(lambda x: pd.Series(x))

In [None]:
# Keep the submission columns and write the submission file (EUC-KR encoded CSV)
test_fin = test[['AI_id', 'digit_1', 'digit_2', 'digit_3', 'text_obj', 'text_mthd', 'text_deal']]
test_fin.to_csv(path + 'final_data/' + 'submission_fin_0413.csv', index=False, encoding='EUC-KR')

In [None]:
# Show the first 50 rows of the submission
test_fin[:50]