In [None]:
!pip install mxnet-cu101
!pip install gluonnlp pandas tqdm
!pip install sentencepiece==0.1.85
!pip install transformers==2.1.1
!pip install torch==1.3.1

Collecting mxnet-cu101
  Downloading mxnet_cu101-1.7.0-py2.py3-none-manylinux2014_x86_64.whl (846.0 MB)
[K     |████████████████████████████████| 846.0 MB 15 kB/s s eta 0:00:01     |███████▎                        | 193.6 MB 1.6 MB/s eta 0:06:44     |████████▎                       | 219.2 MB 4.1 MB/s eta 0:02:35     |████████████▍                   | 327.6 MB 8.0 MB/s eta 0:01:06
[?25hCollecting graphviz<0.9.0,>=0.8.1
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet-cu101


In [None]:
!pip install git+https://git@github.com/SKTBrain/KoBERT.git@master

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook

In [None]:
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

In [None]:
from transformers import AdamW
from transformers.optimization import WarmupLinearSchedule

In [None]:
# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the notebook still runs (slowly) on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Obtain the KoBERT PyTorch model and its vocabulary from the kobert helper.
# `bertmodel` becomes the encoder inside BERTClassifier; `vocab` is handed to
# the tokenizer.
bertmodel, vocab = get_pytorch_kobert_model()

In [None]:
# Download the ratings_train / ratings_test text files from Dropbox.
# NOTE(review): no later cell in this notebook reads these files (the datasets
# below are loaded from dacon_*.txt) — confirm whether this download is needed.
!wget https://www.dropbox.com/s/374ftkec978br3d/ratings_train.txt?dl=1
!wget https://www.dropbox.com/s/977gbwh542gdy94/ratings_test.txt?dl=1

In [None]:
# Load the train / test TSV files, keeping only columns 1 and 2 of each row.
# num_discard_samples=1 presumably skips a header row — TODO confirm against the files.
# NOTE(review): the paths are absolute and machine-specific.
dataset_train = nlp.data.TSVDataset("/home/miyoun/sangwon.joo/dacon/KoBERT/dacon_ext_summary/data/dacon_train.txt", field_indices=[1,2], num_discard_samples=1)
dataset_test = nlp.data.TSVDataset("/home/miyoun/sangwon.joo/dacon/KoBERT/dacon_ext_summary/data/dacon_test.txt", field_indices=[1,2], num_discard_samples=1)
# The submission TSV (dacon_submission.txt) is loaded further down, in the submission section.

In [None]:
# Build the tokenizer used by BERTDataset: the kobert helper supplies the
# tokenizer resource, which is wrapped together with the KoBERT vocabulary.
# lower=False: no lowercasing is requested.
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

In [None]:
class BERTDataset(Dataset):
    """Dataset that eagerly converts every record into BERT inputs plus a label.

    All records are transformed once, at construction time, and kept in memory.

    Args:
        dataset: Iterable of indexable records.
        sent_idx: Position of the sentence text inside a record.
        label_idx: Position of the label inside a record; the value is
            converted with ``np.int32``.
        bert_tokenizer: Tokenizer handed to ``BERTSentenceTransform``.
        max_len: Forwarded as ``max_seq_length``.
        pad: Forwarded as ``pad``.
        pair: Forwarded as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # First pass: transform every sentence (wrapped in a one-element list).
        encoded = []
        for record in dataset:
            encoded.append(to_bert_input([record[sent_idx]]))

        # Second pass: collect the labels as int32 scalars.
        targets = []
        for record in dataset:
            targets.append(np.int32(record[label_idx]))

        self.sentences = encoded
        self.labels = targets

    def __getitem__(self, i):
        """Return the transformed sentence tuple with the label appended."""
        features = self.sentences[i]
        target = self.labels[i]
        return features + (target, )

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)


In [None]:
## Setting parameters
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [None]:
# Pre-tokenize both splits: field 0 of each record is the sentence, field 1 the
# label; pad=True and pair=False are forwarded to the BERT sentence transform.
data_train = BERTDataset(dataset_train, 0, 1, tok, max_len, True, False)
data_test = BERTDataset(dataset_test, 0, 1, tok, max_len, True, False)

In [None]:
# Shuffle the training samples every epoch so batches are not always built from
# the same neighbouring rows of the file; the test loader keeps a fixed order.
train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5, shuffle=False)

In [None]:
class BERTClassifier(nn.Module):
    """BERT encoder followed by optional dropout and a linear classification head.

    Args:
        bert: Encoder module. It is called with the keyword arguments
            ``input_ids``, ``token_type_ids`` and ``attention_mask`` and must
            return a pair whose second element is the pooled representation.
        hidden_size: Width of the pooled representation fed to the linear head.
        num_classes: Number of output logits.
        dr_rate: Dropout probability applied to the pooled output. ``None`` or
            ``0`` disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask with 1.0 on the first ``valid_length[i]`` positions of row ``i``.

        Args:
            token_ids: 2-D tensor (batch, sequence); only its shape and device are used.
            valid_length: One length per row (tensor, array or sequence of ints).

        Returns:
            Float tensor shaped like ``token_ids``, on the same device.
        """
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        lengths = torch.as_tensor(valid_length, device=token_ids.device).long().reshape(-1, 1)
        # Broadcast (1, seq) against (batch, 1): position j of row i is valid when j < length_i.
        return (positions.unsqueeze(0) < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return classification logits of shape (batch, num_classes)."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask)
        # Without a dropout rate the pooled output goes straight to the classifier;
        # previously `out` was left undefined in that case.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

In [None]:
# Wrap the KoBERT encoder in the classifier (dropout 0.5 on the pooled output)
# and move all parameters to `device`.
model = BERTClassifier(bertmodel,  dr_rate=0.5).to(device)

In [None]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

In [None]:
# AdamW over the two parameter groups, and cross-entropy on the classifier
# logits as the training loss.
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Total number of optimizer steps (batches per epoch * epochs) and the number
# of those steps spent warming up the learning rate.
t_total = len(train_dataloader) * num_epochs
warmup_step = int(t_total * warmup_ratio)

In [None]:
# Learning-rate schedule over `t_total` steps with `warmup_step` warm-up steps;
# it is advanced once per batch in the training loop.
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_step, t_total=t_total)

In [None]:
def calc_accuracy(X,Y):
    """Return the fraction of rows in ``X`` whose highest-scoring column equals ``Y``.

    Args:
        X: 2-D tensor of scores, one row per sample.
        Y: 1-D tensor holding the expected column index of each row.

    Returns:
        NumPy scalar: matching rows divided by the number of rows.
    """
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_samples = predicted.size()[0]
    return n_correct / n_samples

In [None]:
# Fine-tune for `num_epochs` epochs. After each epoch, print the mean per-batch
# accuracy over the training batches and over the test loader.
for e in range(num_epochs):
    train_acc = 0.0
    test_acc = 0.0

    # --- training pass ---
    model.train()
    for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):
        optimizer.zero_grad()
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        label = label.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        loss = loss_fn(out, label)
        loss.backward()
        # Clip the gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        train_acc += calc_accuracy(out, label)
        if batch_id % log_interval == 0:
            print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
    print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

    # --- evaluation pass ---
    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids building
    # the graph and keeps memory use down.
    with torch.no_grad():
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            test_acc += calc_accuracy(out, label)
    print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

## submission data 

In [231]:
# Load the submission TSV, keeping columns 1 and 2 of each row. Unlike the
# train / test files, no leading row is discarded here.
dataset_submission = nlp.data.TSVDataset("/home/miyoun/sangwon.joo/dacon/KoBERT/dacon_ext_summary/data/dacon_submission.txt", field_indices=[1,2])

In [233]:
# Pre-tokenize the submission sentences with the same settings as train / test.
# NOTE(review): label_idx=1 points at the second selected column, which the
# scoring loop later uses as the sentence-id key — confirm these values are
# meant to be converted with np.int32 as labels.
data_submission = BERTDataset(dataset_submission, 0, 1, tok, max_len, True, False)

In [234]:
# batch_size=1 with shuffle=False: the scoring loop indexes `dataset_submission`
# by batch number, so batch i must contain exactly row i.
submission_dataloader = torch.utils.data.DataLoader(data_submission, batch_size=1, num_workers=5, shuffle=False)

In [248]:
# Score every submission sentence with the fine-tuned model. `score_dict` maps
# the second selected TSV field of each row (the sentence id) to the class-1 logit.
score_dict = {}

# Inference only: make sure dropout is disabled and skip gradient tracking.
model.eval()
with torch.no_grad():
    # The loader yields one row per batch in file order, so `batch_id` is also
    # the row index into `dataset_submission`.
    for batch_id, (token_ids, valid_length, segment_ids, _label) in enumerate(tqdm_notebook(submission_dataloader)):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        out = model(token_ids, valid_length, segment_ids)

        score_dict[dataset_submission[batch_id][1]] = out.detach().cpu().numpy()[0][1]

HBox(children=(IntProgress(value=0, max=133864), HTML(value='')))

0.0
0.07470268332038488
0.14940536664076975
0.2241080499611546
0.2988107332815395
0.37351341660192433
0.4482160999223092
0.522918783242694
0.597621466563079
0.6723241498834638
0.7470268332038487
0.8217295165242335
0.8964321998446184
0.9711348831650033
1.045837566485388
1.120540249805773
1.195242933126158
1.2699456164465428
1.3446482997669276
1.4193509830873126
1.4940536664076973
1.5687563497280823
1.643459033048467
1.718161716368852
1.7928643996892368
1.8675670830096216
1.9422697663300066
2.0169724496503916
2.091675132970776
2.166377816291161
2.241080499611546
2.3157831829319306
2.390485866252316
2.4651885495727006
2.5398912328930856
2.6145939162134706
2.689296599533855
2.76399928285424
2.838701966174625
2.9134046494950097
2.9881073328153946
3.0628100161357796
3.1375126994561646
3.2122153827765496
3.286918066096934
3.361620749417319
3.436323432737704
3.5110261160580887
3.5857287993784737
3.6604314826988587
3.735134166019243
3.809836849339628
3.884539532660013
3.9592422159803977
4.03394

In [250]:
# Preview the first rows of the submission frame.
# NOTE(review): `sub_data` is only created by a later cell of this notebook, so
# this cell fails on a fresh top-to-bottom run — confirm it should move below it.
sub_data.head()

Unnamed: 0,0,1,2
0,500733466000,▲ 석문간척지 임차법인협의회가 한국농어촌공사 당진지사 앞에 공공비축벼 320t을 쌓...,500733466000
1,500733466001,석문간척지 임차법인협의회(이하 간척지협의회)가 농림축산식품부의 부당한 간척지 임대료...,500733466001
2,500733466002,43개 영농조합법인이 소속된 간척지협의회는 이번 벼 야적 시위를 통해 현재 1kg당...,500733466002
3,500733466003,이들은 지난 12월 7일 농림축산식품부에 탄원서를 제출했지만 “임대료 인하는 올해 ...,500733466003
4,500733466004,게다가 임차법인들의 계약기간이 올해 만료되기 때문에 임대료를 인하해도 지난 2년 동...,500733466004


In [262]:
# Build a three-sentence summary per article from the per-sentence scores.
# The TSV is read without a header, so columns are addressed by position:
# column 1 holds the sentence text and column 2 the sentence id.
sub_data = pd.read_csv("/home/miyoun/sangwon.joo/dacon/KoBERT/dacon_ext_summary/data/dacon_submission.txt", sep="\t", header=None)
sub_data["score"] = sub_data[2].map(lambda x: score_dict[str(x)])
# The article id is the sentence id with its last three characters removed.
sub_data["sent_num"] = sub_data[2].map(lambda x: str(x)[:-3])

# Rank sentences within each article, highest score first (1 = best).
sub_data["rank"] = sub_data.groupby("sent_num")["score"].rank(method="min", ascending=False)

sent_list = list(set(sub_data["sent_num"]))

# Take the three best-scoring sentences of every article, best first. A stable
# sort plus head(3) still works when scores tie or an article has fewer than
# three sentences, and one groupby pass avoids re-filtering the whole frame
# once per article.
result_dict = {}
for sent, article in sub_data.groupby("sent_num", sort=False):
    top_sentences = article.sort_values("score", ascending=False, kind="mergesort").head(3)
    result_dict[sent] = "\n".join(top_sentences[1])

  from ipykernel import kernelapp as app


In [261]:
# Inspect the first rows, now including the score, sent_num and rank columns.
sub_data.head()

Unnamed: 0,0,1,2,score,sent_num,rank
0,500733466000,▲ 석문간척지 임차법인협의회가 한국농어촌공사 당진지사 앞에 공공비축벼 320t을 쌓...,500733466000,-1.49438,500733466,8.0
1,500733466001,석문간척지 임차법인협의회(이하 간척지협의회)가 농림축산식품부의 부당한 간척지 임대료...,500733466001,0.613837,500733466,1.0
2,500733466002,43개 영농조합법인이 소속된 간척지협의회는 이번 벼 야적 시위를 통해 현재 1kg당...,500733466002,-0.378298,500733466,2.0
3,500733466003,이들은 지난 12월 7일 농림축산식품부에 탄원서를 제출했지만 “임대료 인하는 올해 ...,500733466003,-1.192961,500733466,6.0
4,500733466004,게다가 임차법인들의 계약기간이 올해 만료되기 때문에 임대료를 인하해도 지난 2년 동...,500733466004,-1.013031,500733466,4.0


In [270]:
# Fill the sample submission: look up each article id in `result_dict` and
# store the joined sentences in the "summary" column, then write the CSV.
# An id missing from `result_dict` raises KeyError.
sub_fi = pd.read_csv("/home/miyoun/sangwon.joo/dacon/KoBERT/dacon_ext_summary/data/extractive_sample_submission_v2.csv")
sub_fi["summary"] = sub_fi["id"].map(lambda x: result_dict[str(x)])
sub_fi.to_csv("/home/miyoun/sangwon.joo/dacon/KoBERT/dacon_ext_summary/data/ext_sub_order.csv", index=False)

### 순서대로 변경 

In [266]:
# Flag sentences whose within-article rank is 3 or better with 1, all others with 0.
sub_data["imp_sentence"] = sub_data["rank"].le(3).astype("int64")

In [271]:
# Rebuild every article's summary from the flagged sentences, joined in the row
# order of `sub_data` instead of in score order. A single groupby pass visits
# each article once rather than re-filtering the whole frame per article.
for sent, article in sub_data.groupby("sent_num", sort=False):
    result_dict[sent] = "\n".join(article.loc[article["imp_sentence"] == 1, 1])

In [272]:
# Fill the sample submission again from the current contents of `result_dict`
# and write it to a second CSV. An id missing from `result_dict` raises KeyError.
sub_fi = pd.read_csv("/home/miyoun/sangwon.joo/dacon/KoBERT/dacon_ext_summary/data/extractive_sample_submission_v2.csv")
sub_fi["summary"] = sub_fi["id"].map(lambda x: result_dict[str(x)])
sub_fi.to_csv("/home/miyoun/sangwon.joo/dacon/KoBERT/dacon_ext_summary/data/ext_sub_nonorder.csv", index=False)

In [269]:
# Display the final submission frame (id and summary columns).
sub_fi

Unnamed: 0,id,summary
0,500733466,석문간척지 임차법인협의회(이하 간척지협의회)가 농림축산식품부의 부당한 간척지 임대료...
1,500742482,신 벌떼해장국이 손님들의 성원에 보답하고자 24시간 영업을 재개한다.\nIMF 당시...
2,500742484,한국석유공사가 운영하는 오피넷(www.opinet.co.kr)에 따르면 최근 송악읍...
3,504213810,어기구 국회의원이 천연가스의 안정적 수급을 위해 2020년 착공이 예정돼 있는 액화...
4,505279620,당진시정책자문위원회가 유명무실하다는 비판이 지속적으로 제기되고 있다.\n민선6기가 ...
...,...,...
9982,745338220,시는 2022년 준공을 목표로 내년 설계를 시작으로 2021년부터 한국해양교통안전공...
9983,745367988,청와대는 30일 문재인 대통령의 ‘1호 공약’인 고위공직자범죄수사처(공수처) 설치법...
9984,745368130,광주지역 광공업 생산 감소율이 14개월 만에 최고를 기록했다.\n30일 호남지방통계...
9985,745368136,아름다운 가게 용봉점 헌책방이 개점 10년만에 문을 닫는 다.\n2019년의 마지막...
