In [1]:
import argparse
import json
from copy import deepcopy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
# Apply matplotlib's "ggplot" style sheet to every figure drawn in this notebook.
mpl.style.use("ggplot")

from Utils.dataset import *
from Utils.utils import *
from Models.BertClf import *
from Models.LstmClf import *
from Models.ElectraClf import *
# Pick the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# torch.cuda.get_device_name() raises when CUDA is unavailable, so it is only
# queried for a CUDA device; this keeps the cell runnable on CPU-only machines.
device_name = torch.cuda.get_device_name(device) if device.type == "cuda" else "CPU"
print(f'training device: {device, device_name}')

training device: (device(type='cuda', index=0), 'GeForce GTX 1660 Ti')


In [50]:
# Define signature (BiLSTM run).
# The trailing `_opt` outside the string literal was a syntax error; the
# '_opt.txt' suffix is appended where the options file is opened, so the
# signature itself must not carry it.
signature = "jh_BILSTM_6M_1D_15H_10M"

In [60]:
# Define signature (BERT run); it is used as the file-name stem when the saved
# options file is opened further down.
# NOTE(review): `signature` is reassigned by a later cell, so only the last
# assignment executed before the options are loaded takes effect.
signature = "jh_BERT_6M_1D_20H_8M"

In [33]:
# Define signature   ----> needs to be edited for each run
# The active value is the file-name stem used to open
# './Saved_models/<signature>_opt.txt' when the options are loaded.
# Previously used ELECTRA signature, kept for reference:
#signature = "jh_ELECTRA_5M_31D_17H_5M"
signature = "sw_focal_ELECTRA_6M_1D_18H_48M"

In [34]:
# Restore the options that were saved as JSON for the selected signature.
parser = argparse.ArgumentParser()
# opt = parser.parse_args()  # use this form in a .py script
opt, _ = parser.parse_known_args()  # notebook form: tolerates the kernel's own argv
opt_json_path = './Saved_models/' + signature + '_opt.txt'
with open(opt_json_path, 'r') as opt_file:
    saved_options = json.load(opt_file)
# Replace the (empty) namespace contents with the saved key/value pairs.
opt.__dict__ = saved_options
print(opt)

Namespace(aug=0, author='sw_focal', batch_size=16, data_path='./Dataset', dropout=0.5, embedding_dim=256, eps=1e-08, freeze_pretrained=0, gpu=0, hidden_dim=768, kernel_depth=500, kernel_sizes=[3, 4, 5], lr_clf=0.0001, lr_pretrained=1e-05, max_epoch=30, max_len=50, model='ELECTRA', num_layer=2, save=1, save_model_path='./Saved_models', save_submission_path='./Submissions', sent_embedding=0, signature='sw_focal_ELECTRA_6M_1D_18H_48M', split_ratio=1, weight_decay=0.0005)


In [35]:
# Build the validation dataset from the loaded options. The recorded output of
# this cell shows the constructor tokenizing the data and printing tensor shapes.
# NOTE(review): ValidDataset arrives through a star import (presumably
# Utils.dataset) — confirm.
valid_dataset = ValidDataset(opt)

Tokenizing data...
Apply the ElectraTokenizer...


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


valid_X_ids_tsr.shape: torch.Size([1748, 50])
valid_X_masks_tsr.shape: torch.Size([1748, 50])


In [36]:
# Batch the validation set without shuffling: the predictions collected later
# are written straight into valid_dataset.df, so row order must be preserved.
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=opt.batch_size,
    shuffle=False,
)

In [37]:
# Load untrained model
# Instantiate the architecture named by opt.model and move it to `device`.
# Second constructor argument of the LSTM/conv classifiers.
# NOTE(review): presumably the tokenizer vocabulary size (the recorded ELECTRA
# repr shows a 30522-row word embedding) — confirm.
VOCAB_SIZE = 30522
if opt.model == "BERT":
    model = BertClassifier(opt).to(device)
elif opt.model == "ELECTRA":
    model = ElectraClassifier(opt).to(device)
elif opt.model == "BILSTM":
    model = LstmClassifier(opt, VOCAB_SIZE).to(device)
elif opt.model in ("ConvClassifier", "CNN"):
    # The inference loop refers to this model as "CNN", so both spellings are accepted.
    model = ConvClassifier(opt, VOCAB_SIZE).to(device)
else:
    # Fail loudly instead of silently leaving `model` undefined, or bound to a
    # model left over from an earlier run of this cell.
    raise ValueError(f"Unsupported opt.model: {opt.model!r}")

In [38]:
# Load trained model
model_save_path = str(opt.save_model_path) + "/" + opt.signature + '.model'
# map_location remaps the stored tensors onto the active device, so a checkpoint
# saved on a GPU still loads when this notebook falls back to the CPU.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
state_dict = torch.load(model_save_path, map_location=device)
model.load_state_dict(state_dict)
# Last expression of the cell: moves the model to `device` and displays its repr.
model.to(device)

ElectraClassifier(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [39]:
# Run the model over the whole validation loader, collecting logits and labels.
# Models that are called with token ids only (no attention mask). The model
# selection cell spells the conv model 'ConvClassifier' while this loop used
# "CNN"; both are listed so a conv model is not called with a mask argument.
IDS_ONLY_MODELS = ("BILSTM", "CNN", "ConvClassifier")

model.eval()  # switch the model to evaluation mode
all_logits = []
all_label = []
with torch.no_grad():  # inference only: no gradients are needed
    for batch in valid_dataloader:
        # Load batch to the active device
        b_ids_tsr, b_masks_tsr, b_label_tsr = (tsr.to(device) for tsr in batch)
        if opt.model in IDS_ONLY_MODELS:
            logits = model(b_ids_tsr)
        else:
            logits = model(b_ids_tsr, b_masks_tsr)
        all_logits.append(logits)
        all_label.append(b_label_tsr)
# Concatenate the per-batch results along the batch dimension.
all_logits = torch.cat(all_logits, dim=0)
all_label = torch.cat(all_label, dim=0)

In [40]:
# Turn the logits into per-class probabilities, then take the most probable
# class of each row as the prediction.
probs = F.softmax(all_logits, dim=1).cpu().numpy()
preds = probs.argmax(axis=1)
# Store the predictions next to the validation rows they belong to.
valid_dataset.df['Pred'] = preds

In [41]:
# Save the predictions
# One CSV per model family. The comparison cells further down read
# './Dataset/ELECTRA_pred.csv', so ELECTRA is written to that exact path
# (it was previously written to 'ELECTRA_pred_1.csv', which nothing reads).
PRED_CSV_BY_MODEL = {
    "BERT": './Dataset/BERT_pred.csv',
    "ELECTRA": './Dataset/ELECTRA_pred.csv',
    "BILSTM": './Dataset/BILSTM_pred.csv',
}
pred_csv_path = PRED_CSV_BY_MODEL.get(opt.model)
# Other model names are skipped without saving, as before.
if pred_csv_path is not None:
    valid_dataset.df.to_csv(pred_csv_path, index=False)

In [2]:
# Load the saved prediction tables of the three models.
pred_csv_paths = (
    './Dataset/BERT_pred.csv',
    './Dataset/ELECTRA_pred.csv',
    './Dataset/BILSTM_pred.csv',
)
df_bert, df_electra, df_bilstm = map(pd.read_csv, pred_csv_paths)

In [3]:
# Give each model's prediction column its own name so the tables can be merged
# side by side: b_pred (BERT), e_pred (ELECTRA), l_pred (BiLSTM).
df_bert = df_bert.rename(columns={'Pred': 'b_pred'})
df_electra = df_electra.rename(columns={'Pred': 'e_pred'})
df_bilstm = df_bilstm.rename(columns={'Pred': 'l_pred'})

In [4]:
# Inner-join the three prediction tables on the columns that identify a row.
merge_keys = ['Sentence', 'Category', 'Id']
merge_bert_electra = df_bert.merge(df_electra, how='inner', on=merge_keys)
merge_total = merge_bert_electra.merge(df_bilstm, how='inner', on=merge_keys)

In [7]:
## Rows that ELECTRA gets right but BiLSTM gets wrong
electra_hit = merge_total['e_pred'] == merge_total['Category']   # ELECTRA predicted the label
bilstm_miss = merge_total['l_pred'] != merge_total['Category']   # BiLSTM did not
merge_t_f = merge_total[electra_hit & bilstm_miss]
merge_t_f.to_csv('./Dataset/ELECTRA_CORRECT.csv', index=False)

In [8]:
## Rows that ELECTRA gets wrong but BiLSTM gets right
electra_miss = merge_total['e_pred'] != merge_total['Category']   # ELECTRA missed the label
bilstm_hit = merge_total['l_pred'] == merge_total['Category']     # BiLSTM predicted it
merge_f_t = merge_total[electra_miss & bilstm_hit]
merge_f_t.to_csv('./Dataset/BILSTM_CORRECT.csv', index=False)

In [45]:
## Rows that BERT and ELECTRA both get right but BiLSTM gets wrong
bert_electra_agree = merge_total['b_pred'] == merge_total['e_pred']   # same prediction from both
bert_hit = merge_total['b_pred'] == merge_total['Category']           # ...and that prediction is the label
bilstm_miss = merge_total['l_pred'] != merge_total['Category']        # BiLSTM missed the label
merge_t_f = merge_total[bert_electra_agree & bert_hit & bilstm_miss]
merge_t_f.to_csv('./Dataset/BERT_ELECTRA_CORRECT.csv', index=False)

In [46]:
## Rows that BERT, ELECTRA and BiLSTM all get right
bert_electra_agree = merge_total['b_pred'] == merge_total['e_pred']   # same prediction from both
bert_hit = merge_total['b_pred'] == merge_total['Category']           # ...and that prediction is the label
bilstm_hit = merge_total['l_pred'] == merge_total['Category']         # BiLSTM predicted the label too
merge_t_t = merge_total[bert_electra_agree & bert_hit & bilstm_hit]
merge_t_t.to_csv('./Dataset/BERT_ELECTRA_BILSTM_CORRECT.csv', index=False)

In [47]:
## Rows that BERT and ELECTRA both get wrong (with the same prediction) but BiLSTM gets right
merge_f_t = merge_total[merge_total['b_pred'] == merge_total['e_pred']]   # BERT and ELECTRA predict the same class
merge_f_t = merge_f_t[merge_f_t['b_pred'] != merge_f_t['Category']]       # ...and that shared prediction is wrong
merge_f_t = merge_f_t[merge_f_t['l_pred'] == merge_f_t['Category']]       # BiLSTM predicts the label
# Written under its own name: './Dataset/BILSTM_CORRECT.csv' is already produced
# by the ELECTRA-vs-BiLSTM cell with a different selection, and reusing that path
# here made whichever cell ran last silently overwrite the other's result.
merge_f_t.to_csv('./Dataset/BERT_ELECTRA_WRONG_BILSTM_CORRECT.csv', index=False)

In [None]:
# Rebuild the validation inputs by hand from the split CSV.
valid_csv_path = opt.data_path +'/valid_dd_ratio_' + str(opt.split_ratio) + '.csv'
valid_dataset_df = pd.read_csv(valid_csv_path)
valid_X_arr = valid_dataset_df['Sentence'].values
valid_y_arr = valid_dataset_df['Category'].values
## The ValidDataset class could store this dataframe itself instead.
valid_X_ids_tsr, valid_X_masks_tsr = preprocessing_for_bert(valid_X_arr, opt)
ids_tsr, masks_tsr = valid_X_ids_tsr, valid_X_masks_tsr
labels = torch.LongTensor(valid_y_arr)