In [None]:
# Install required packages
!pip install mxnet
!pip install gluonnlp pandas tqdm
!pip install sentencepiece
!pip install transformers==3
!pip install torch
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [None]:
# Import libraries and the KoBERT model/tokenizer helpers
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, notebook

from torch.nn import init

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
import matplotlib.pyplot as plt

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit,StratifiedKFold
import random
import gc
import unicodedata

gc.collect()

In [None]:
# ---------------------------------------------------------------------------
# Runtime setup: target GPU, pretrained KoBERT encoder and its tokenizer.
# ---------------------------------------------------------------------------

# GPU to use. The server has five GPUs (indices 0-4); this run is pinned to
# index 4.
device = torch.device("cuda", 4)
gc.collect()

# Load the pretrained KoBERT model together with its vocabulary.
bertmodel, vocab = get_pytorch_kobert_model()

# Build the gluonnlp BERT tokenizer from the KoBERT tokenizer handle and the
# vocabulary loaded above; lower-casing is disabled.
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

KoBert 모델 클래스 생성

In [None]:
class BERTDataset(Dataset):
    """Dataset of BERT-transformed sentences paired with their labels.

    Every record of ``dataset`` is indexed with ``sent_idx`` to obtain the
    text and with ``label_idx`` to obtain the label. The text is cut to at
    most ``max_len`` characters and then passed, wrapped in a one-element
    list, to ``nlp.data.BERTSentenceTransform``.
    """

    def __init__(self, dataset, sent_idx,label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = []
        self.labels = []
        for record in dataset:
            text = record[sent_idx]
            # Slicing leaves texts of length <= max_len unchanged, so a single
            # code path covers both the short and the over-long case.
            self.sentences.append(to_bert_input([text[:max_len]]))
            self.labels.append(record[label_idx])

    def __getitem__(self, i):
        """Return the transformed sentence tuple with the label appended."""
        label_part = (self.labels[i], )
        return self.sentences[i] + label_part

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.labels)
    
class BERTClassifier(nn.Module):
    """Classification head on top of a BERT encoder.

    The encoder's pooled output is optionally passed through dropout and then
    projected to ``num_classes`` logits by a single linear layer.

    Args:
        bert: Encoder module. It is called with the keyword arguments
            ``input_ids``, ``token_type_ids`` and ``attention_mask`` and must
            return a 2-tuple whose second element is the pooled output.
        hidden_size: Input width of the linear classifier.
        num_classes: Number of output classes.
        dr_rate: Dropout probability applied to the pooled output. ``None``
            (the default) or ``0`` disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=11,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with ones on the first ``valid_length[i]`` positions.

        Args:
            token_ids: 2-D tensor ``(batch, seq_len)``; only its shape and
                device are used.
            valid_length: Per-row count of real (non-padding) tokens.

        Returns:
            Float tensor shaped like ``token_ids`` and on the same device.
        """
        lengths = torch.as_tensor(valid_length, device=token_ids.device).reshape(-1, 1)
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        # Broadcasting (seq_len,) against (batch, 1) yields the (batch, seq_len)
        # mask in one operation instead of one slice assignment per row.
        return (positions < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return classification logits for a batch.

        Args:
            token_ids: Token id tensor passed to the encoder as ``input_ids``.
            valid_length: Per-row count of real tokens, used for the mask.
            segment_ids: Segment ids, cast to ``long`` and passed as
                ``token_type_ids``.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids=token_ids,
                              token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask)
        # Without dropout the pooled output is classified directly. Previously
        # ``out`` was only assigned inside the dropout branch, so the default
        # ``dr_rate=None`` raised UnboundLocalError.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [None]:
# ---- Fine-tuning hyperparameters ----
max_len = 512
batch_size = 6
warmup_ratio = 0.1
num_epochs = 20
max_grad_norm = 1
log_interval = 20
learning_rate = 5e-6  # other values noted by the author: 5e-5, 2e-5
num_workers = 2
n_splits = 5
model_name = 'kobertbest_512.pt'

device = torch.device("cuda", 4)

# ---- Model ----
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)

# ---- Optimizer ----
# Parameters whose name contains one of the ``no_decay`` markers go into a
# group without weight decay; all others use a weight decay of 0.01.
no_decay = ['bias', 'LayerNorm.weight']
decay_params = []
no_decay_params = []
for param_name, param in model.named_parameters():
    if any(marker in param_name for marker in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

# Persist the initial model and optimizer state to 'kobert.pt'.
initial_state = {'modelA': model.state_dict(), 'optimizerA': optimizer.state_dict()}
torch.save(initial_state, 'kobert.pt')

loss_fn = nn.CrossEntropyLoss()

def calc_accuracy(X,Y):
    """Return the fraction of rows of ``X`` whose arg-max column equals ``Y``.

    Args:
        X: 2-D tensor of scores; the maximum is taken along dimension 1.
        Y: Tensor of target class indices, compared element-wise with the
            predicted indices.

    Returns:
        Number of matches divided by the number of rows of ``X``.
    """
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_rows = predicted.size()[0]
    return n_correct / n_rows

##################################################################################################
# Load the labelled captions and build the train / hold-out test arrays.
import re

df_raw = pd.read_excel('11classesdatsetnew_20220208.xlsx', engine='openpyxl', sheet_name="classesdatsetnew", header=0)

# Category names; the position in this list is the integer class id.
categorylist = ["화장품","패션","요리음식","여행아웃도어","인테리어","엔터테인먼트","육아","아이티","자동차","헬스/피트니스","반려동물"]

# Replace the textual labels by their class ids. ``list.index`` raises
# ValueError for a label that is not in ``categorylist``.
temp_label = [categorylist.index(label_name) for label_name in df_raw['label']]
df_raw['label'] = temp_label

# Split every class separately so each one keeps the same train/test ratio.
# The containers are sized from ``categorylist`` instead of a hard-coded 11.
train_test_ratio = 0.8
df_file = [df_raw[df_raw['label'] == class_id] for class_id in range(len(categorylist))]
df_train_list = []
df_test_list = []
for class_frame in df_file:
    class_train, class_test = train_test_split(class_frame, train_size=train_test_ratio, random_state=1)
    df_train_list.append(class_train)
    df_test_list.append(class_test)

df_train = pd.concat(df_train_list, ignore_index=True, sort=False)
df_test = pd.concat(df_test_list, ignore_index=True, sort=False)

# Compiled once instead of once per row; matches runs of Hangul syllables.
_HANGUL_RUN = re.compile('[가-힣]+')


def _keep_hangul(text):
    """NFC-normalize ``text`` and return its Hangul runs joined by spaces."""
    return ' '.join(_HANGUL_RUN.findall(unicodedata.normalize('NFC', text)))


def _clean_captions(rows):
    """Replace column 1 of every row of ``rows`` in place by its cleaned text."""
    for row in rows:
        row[1] = _keep_hangul(row[1])


# Training arrays: column 0 is the class id, column 1 the caption.
# (To train on hashtags instead, reindex with columns=['label', 'hashtag'].)
df_traindata = df_train.reindex(columns=['label', 'caption'])
df_trainlabel = df_train.reindex(columns=['label'])
np_data = df_traindata.to_numpy()
np_label = df_trainlabel.to_numpy()
_clean_captions(np_data)

# Hold-out test arrays, built the same way as the training arrays.
df_testdata = df_test.reindex(columns=['label', 'caption'])
df_testlabel = df_test.reindex(columns=['label'])
np_testdata = df_testdata.to_numpy()
np_testlabel = df_testlabel.to_numpy()
_clean_captions(np_testdata)

# Show the shape and a short preview rather than dumping the whole array.
print(np_testdata.shape)
print(np_testdata[:5])


In [None]:
# Stratified K-fold cross-validation over the training split.
skf = StratifiedKFold(n_splits=n_splits)
nsplit = 0        # index of the current fold; also names the saved plot file
oldaccu = 0.0     # best validation accuracy so far; NOT reset between folds
trainarray = []   # final-epoch training accuracy of every fold
testarray = []    # final-epoch validation accuracy of every fold

for trainidx, testidx in skf.split(np_data, np_label):
    x_train, x_test, y_train, y_test = np_data[trainidx], np_data[testidx], np_label[trainidx], np_label[testidx]
    data_train = BERTDataset(x_train, 1, 0, tok, max_len, True, False)
    data_test = BERTDataset(x_test, 1, 0, tok, max_len, True, False)
    train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    # Validation order does not influence the metrics, so it is not shuffled.
    test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    # Every fold restarts from the model and optimizer state stored in
    # 'kobert.pt'.
    checkpoint = torch.load('kobert.pt')
    model.load_state_dict(checkpoint['modelA'])
    optimizer.load_state_dict(checkpoint['optimizerA'])
    model.to(device)

    # Cosine schedule with warmup over all optimizer steps of this fold.
    t_total = len(train_dataloader) * num_epochs
    warmup_step = int(t_total * warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)
    trainacclist = []
    testacclist = []

    for e in range(num_epochs):
        train_acc = 0.0
        test_acc = 0.0

        # ---- training pass ----
        model.train()
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(notebook.tqdm(train_dataloader)):
            optimizer.zero_grad()
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            loss = loss_fn(out, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # update the learning rate after every optimizer step
            train_acc += calc_accuracy(out, label)
            if batch_id % log_interval == 0:
                print('Epoch {}/{} Batch {}/{} Cost: {:.6f} Train Acc {}'.format(e+1, num_epochs, batch_id+1, len(train_dataloader), loss.item(), train_acc / (batch_id+1)))
        # "Cost" here is the loss of the last training batch of the epoch.
        print('\nEpoch {}/{} Cost: {:.6f} Train Acc {}'.format(e+1, num_epochs, loss.item(), train_acc / len(train_dataloader)))
        trainacclist.append(train_acc / (batch_id+1))

        # ---- validation pass ----
        model.eval()
        val_loss = 0.0
        # No gradients are needed for validation; skipping the autograd graph
        # saves memory.
        with torch.no_grad():
            for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(notebook.tqdm(test_dataloader)):
                token_ids = token_ids.long().to(device)
                segment_ids = segment_ids.long().to(device)
                label = label.long().to(device)
                out = model(token_ids, valid_length, segment_ids)
                val_loss += loss_fn(out, label).item()
                test_acc += calc_accuracy(out, label)
        n_val_batches = len(test_dataloader)
        # "Cost" is now the mean validation loss; previously this line printed
        # the stale loss of the last training batch.
        print('\nEpoch {}/{} Cost: {:.6f} Test Acc {}'.format(e+1, num_epochs, val_loss / n_val_batches, test_acc / n_val_batches))
        newaccu = test_acc / n_val_batches
        testacclist.append(newaccu)

        # Keep the best model seen so far. ``oldaccu`` carries over between
        # folds, so ``model_name`` ends up holding the best epoch of all folds.
        if newaccu > oldaccu:
            oldaccu = newaccu
            torch.save(model, model_name)
        gc.collect()

    # ---- per-fold accuracy curves ----
    plt.figure()
    print(trainacclist)
    trainarray.append(trainacclist[-1])
    print(testacclist)
    testarray.append(testacclist[-1])
    x = np.arange(len(trainacclist))
    plt.plot(x, trainacclist, 'b', label='train')
    plt.plot(x, testacclist, 'g', label='validate')
    plt.xlabel('epochs')
    plt.ylabel('acc')
    plt.title('SNS dataset with caption')
    plt.legend()
    plt.savefig('{}.png'.format(nsplit), format='png')
    nsplit += 1

# Summary figure: final-epoch train / validation accuracy of every fold.
plt.figure()
x = np.arange(len(trainarray))
for fold_scores, line_style, legend_name in (
        (trainarray, 'b', 'train'),
        (testarray, 'g', 'validate'),
):
    plt.plot(x, fold_scores, line_style, label=legend_name)
plt.xlabel('K Folds')
plt.ylabel('avg. acc')
plt.title('SNS dataset with caption according to K-Folds')
plt.legend()
# The file is named after the running figure counter, which is then advanced.
plt.savefig('{}.png'.format(nsplit), format='png')
nsplit += 1

for fold_scores in (trainarray, testarray):
    print(np.array(fold_scores))

# pd.DataFrame(np.array(trainarray)).to_csv('trainaccu.csv')
# pd.DataFrame(np.array(testarray)).to_csv('testaccu.csv')

In [None]:
def precision_at_K (X,Y,K, np_testdata):
    """Count how many labels in ``Y`` are among the top-``K`` scores of ``X``.

    Every miss is printed together with the matching entry of ``np_testdata``.

    Args:
        X: Score tensor with one row per sample; ``torch.topk`` is taken along
            its last dimension.
        Y: Iterable of true class indices aligned with the rows of ``X``.
        K: Number of highest-scoring classes considered per sample.
        np_testdata: Indexable collection aligned with the rows of ``X``;
            entry ``i`` is printed when sample ``i`` is a miss.

    Returns:
        int: Number of samples whose label is within the top-``K`` indices.
    """
    _, max_indicesK = torch.topk(X, K)
    precisionnum = 0
    for index, (yi, maxindicesi) in enumerate(zip(Y, max_indicesK)):
        # Membership test that is valid for any K. The previous
        # ``if yi != maxindicesi`` compared a scalar with a K-element tensor
        # and raised RuntimeError (ambiguous truth value) whenever K > 1.
        hit = bool((maxindicesi == yi).any())
        if hit:
            precisionnum += 1
        else:
            print("")
            print(yi, maxindicesi)
            print(np_testdata[index])
            print("")

    return precisionnum

In [None]:
from sklearn.metrics import confusion_matrix

# Evaluate the saved best model on the hold-out test split.
data_test = BERTDataset(np_testdata, 1, 0, tok, max_len, True, False)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=num_workers)
gc.collect()

wholelabel = []   # true category names of all test samples
wholeout = []     # predicted category names of all test samples
categorylist = ["화장품","패션","요리음식","여행아웃도어","인테리어","엔터테인먼트","육아","아이티","자동차","헬스/피트니스","반려동물"]

# NOTE(review): ``model_name`` holds a fully pickled model written with
# ``torch.save(model, ...)``. ``torch.load`` unpickles it, which can execute
# arbitrary code; only load files produced by a trusted run.
modelbest = torch.load(model_name)
modelbest.to(device)
modelbest.eval()

# Despite its name, ``precision_at_3`` is accumulated with K=1 below, i.e. it
# is the number of correct top-1 predictions.
precision_at_3 = 0.0
test_acc1 = 0.0
start = 0
end = start + batch_size
# Inference only: disabling autograd avoids keeping activations for backward.
with torch.no_grad():
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(notebook.tqdm(test_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = modelbest(token_ids, valid_length, segment_ids)
        wholelabel += [categorylist[labeli] for labeli in label]
        wholeout += [categorylist[outi.argmax()] for outi in out]
        test_acc1 += calc_accuracy(out, label)
        # The loader is not shuffled, so rows start:end of ``np_testdata``
        # are the samples of this batch.
        precision_at_3 += precision_at_K (out,label,1, np_testdata[start:end])
        start += batch_size
        end = start + batch_size

print(precision_at_3)
print((len(data_test)))
print("Total Acc", precision_at_3/len(data_test))

# Compute the confusion matrix once, save it, and display it as cell output.
confusion_mat = confusion_matrix(wholelabel, wholeout, labels=categorylist)
np.save('confusionmatrix', confusion_mat)
confusion_mat