In [1]:
import pandas as pd
import numpy as np
import collections
import re
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import log_loss, roc_auc_score
import warnings
import torch.nn as nn
from tqdm import tqdm
import random
import gensim
from torchcontrib.optim import SWA
import os
from torch.utils import data
from torch import nn
import torch.nn.functional as F
from torch.optim import *
torch.set_printoptions(edgeitems=768)
warnings.filterwarnings("ignore")
np.set_printoptions(threshold=np.inf)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# 设置基本参数
MAX_LEN = 100
BATCH_SIZE = 16
SEED = 9797
NAME = 'HAN'
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if DEVICE=='cuda':
    torch.cuda.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
DEVICE

'cuda'

In [2]:
train_data = pd.read_csv('../data/track1_round1_train_20210222.csv',header=None)
test_data = pd.read_csv('../data/track1_round1_testB.csv',header=None) 
train_data.columns=['report_ID','description','label']
test_data.columns=['report_ID','description']

temp=[i[:-1] for i in train_data['report_ID'].values]
train_data['report_ID']=temp
temp=[i[:-1] for i in test_data['report_ID'].values]
test_data['report_ID']=temp

temp=[i.strip('|').strip() for i in train_data['description'].values]
train_data['description']=temp
temp=[i.strip('|').strip() for i in test_data['description'].values]
test_data['description']=temp

temp_label=[i.strip('|').strip() for i in train_data['label'].values]
train_data['label']=temp_label
train_data

Unnamed: 0,report_ID,description,label
0,0,623 328 538 382 399 400 478 842 698 137 492 26...,2
1,1,48 328 538 382 809 623 434 355 382 382 363 145...,
2,2,623 656 293 851 636 842 698 493 338 266 369 69...,15
3,3,48 328 380 259 439 107 380 265 172 470 290 693...,
4,4,623 328 399 698 493 338 266 14 177 415 511 647...,16
...,...,...,...
9995,9995,290 380 247 263 48 328 697 582 91 400 478 842 ...,0 7 15
9996,9996,852 611 501 582 177 230 294 39 363 180 519 421...,10
9997,9997,852 328 290 380 256 544 636 90 735 374 698 116...,
9998,9998,852 328 305 461 382 697 259 779 59 261 589 693...,16


In [3]:
all_sentences = pd.concat([train_data['description'],test_data['description']]).reset_index(drop=True)
all_sentences.drop_duplicates().reset_index(drop=True, inplace=True)
all_sentences = all_sentences.apply(lambda x:x.split(' ')).tolist()
if not os.path.exists('w2v.model'): 
    w2v_model = gensim.models.word2vec.Word2Vec(all_sentences, sg=1, size=300,window=5,min_count=1,negative=3,sample=0.001,hs=1,seed=452)
    w2v_model.save('w2v.model')
else:
    w2v_model = gensim.models.word2vec.Word2Vec.load("w2v.model")
    
if not os.path.exists('fasttext.model'): 
    fasttext_model = gensim.models.FastText(all_sentences, seed=452, size=100, min_count=1, iter=20, window=3)
    fasttext_model.save('fasttext.model')
else:
    fasttext_model = gensim.models.word2vec.Word2Vec.load("fasttext.model")

In [4]:
train_dataset = []
for i in tqdm(range(len(train_data))):
    train_dict = {}
    train_dict['report_ID'] = train_data.loc[i, 'report_ID']
    train_dict['description'] = train_data.loc[i, 'description']
    train_dict['label'] = train_data.loc[i, 'label']
    train_dataset.append(train_dict)
test_dataset = []
for i in tqdm(range(len(test_data))):
    test_dict = {}
    test_dict['report_ID'] = test_data.loc[i, 'report_ID']
    test_dict['description'] = test_data.loc[i, 'description']
    test_dict['label'] = ''
    test_dataset.append(test_dict)

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




In [5]:
class DataSet(data.Dataset):
    def __init__(self, data, mode='train'):
        self.data = data
        self.mode = mode
        self.dataset = self.get_data(self.data,self.mode)
        
    def get_data(self, data, mode):
        dataset = []
        global s
        for data_li in tqdm(data):
            description = data_li['description'].split(' ')
            description = [w2v_model.wv.vocab[s].index+1 if s in w2v_model.wv.vocab else 0 for s in description]
            if len(description) < MAX_LEN:
                description += [0] * (MAX_LEN - len(description))
            else:
                description = description[:MAX_LEN]
            label = self.get_dumm(data_li['label'])
            dataset_dict = {'description':description, 'label':label}
            dataset.append(dataset_dict)
        return dataset
    
    def get_dumm(self,s):
        re = [0] * 17
        if s == '':
            return re
        else:
            tmp = [int(i) for i in s.split(' ')]
            for i in tmp:
                re[i] = 1
        return re
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        data = self.dataset[idx]
        description = torch.tensor(data['description'])
        if self.mode == 'test':
            return description
        else:
            label = torch.tensor(data['label'])
            return description, label

def get_dataloader(dataset, mode):
    torchdata = DataSet(dataset, mode=mode)
    if mode == 'train':
        dataloader = torch.utils.data.DataLoader(torchdata, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, drop_last=True)
    elif mode == 'test':
        dataloader = torch.utils.data.DataLoader(torchdata, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, drop_last=False)
    elif mode == 'valid':
        dataloader = torch.utils.data.DataLoader(torchdata, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, drop_last=False)
    return dataloader, torchdata

train_dataloader, train_torchdata = get_dataloader(train_dataset, mode='train')
test_dataloader, test_torchdata = get_dataloader(test_dataset, mode='test')

HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




In [6]:
import torch.nn.functional as F
from torch import nn
class SelfAttention(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(SelfAttention, self).__init__()
        self.W = nn.Linear(input_size, hidden_size, True)
        self.u = nn.Linear(hidden_size, 1)
    def forward(self, x):
        u = torch.tanh(self.W(x))
        a = F.softmax(self.u(u), dim=1)
        x = a.mul(x).sum(1)
        return x

class HAN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_classes=17, embeddings=None):
        super(HAN, self).__init__()
        self.num_classes = num_classes
        hidden_size_gru = 256
        hidden_size_att = 512
        hidden_size = 128
        self.num_words = MAX_LEN
        self.embed = nn.Embedding(vocab_size, embedding_dim)
        if embeddings:
            w2v_model = gensim.models.word2vec.Word2Vec.load("w2v.model").wv
            fasttext_model = gensim.models.word2vec.Word2Vec.load("fasttext.model").wv
            w2v_embed_matrix = w2v_model.vectors
            fasttext_embed_matrix = fasttext_model.vectors
#             embed_matrix = w2v_embed_matrix         
            embed_matrix = np.concatenate([w2v_embed_matrix, fasttext_embed_matrix], axis=1)
            oov_embed = np.zeros((1, embed_matrix.shape[1]))
            embed_matrix = torch.from_numpy(np.vstack((oov_embed,embed_matrix)))
            self.embed.weight.data.copy_(embed_matrix)
            self.embed.weight.requires_grad = False
        self.gru1 = nn.GRU(embedding_dim, hidden_size_gru, bidirectional=True, batch_first=True)
        self.att1 = SelfAttention(hidden_size_gru * 2, hidden_size_att)
        self.gru2 = nn.GRU(hidden_size_att, hidden_size_gru, bidirectional=True, batch_first=True)
        self.att2 = SelfAttention(hidden_size_gru * 2, hidden_size_att)
        self.tdfc = nn.Linear(embedding_dim, embedding_dim)
        self.tdbn = nn.BatchNorm2d(1)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size_att,hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_size,num_classes)
        )
        self.dropout = nn.Dropout(0.5)
    def forward(self, x, label=None):
        # 64 512 200
        x = x.view(x.size(0) * self.num_words, -1).contiguous()
        x = self.dropout(self.embed(x))
        x = self.tdfc(x).unsqueeze(1)
        x = self.tdbn(x).squeeze(1)
        x, _ = self.gru1(x)
        x = self.att1(x)
        x = x.view(x.size(0) // self.num_words, self.num_words, -1).contiguous()
        x, _ = self.gru2(x)
        x = self.att2(x)
        out = self.dropout(self.fc(x))
        if label is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(out.view(-1,self.num_classes).float(), label.view(-1,self.num_classes).float())
            return loss
        else:
            return out

In [7]:
def metric_mlogloss(label,pred):
    score = 0
    for i in range(len(pred)):
        for j in range(17):
            if pred[i][j] == 0:
                pred[i][j] +=1e-10
            elif pred[i][j] == 1:
                pred[i][j] -=1e-10
            score += label[i][j]*np.log(pred[i][j])+(1-label[i][j])*np.log(1-pred[i][j])
    score /= (len(pred)*17*(-1))
    return 1-score

def validation_funtion(model, valid_dataloader, valid_torchdata, mode='valid'):
    model.eval()
    pred_list = []
    labels_list = []
    if mode == 'valid':
        for i, (description, label) in enumerate(tqdm(valid_dataloader)):
            output = model(description.to(DEVICE))
            pred_list += output.sigmoid().detach().cpu().numpy().tolist()
            labels_list += label.detach().cpu().numpy().tolist()
        auc = roc_auc_score(labels_list,pred_list, multi_class='ovo')
        logloss = log_loss(labels_list, pred_list)
        mlogloss = metric_mlogloss(labels_list, pred_list)
        return mlogloss, auc, logloss
    else:
        for i, (description) in enumerate(tqdm(valid_dataloader)):
            output = model(description.to(DEVICE))
            pred_list += output.sigmoid().detach().cpu().numpy().tolist()
        return pred_list
    
                            
def train(model, train_dataloader, valid_dataloader, valid_torchdata, epochs, early_stop=None):
    global logger
#     ema = EMA(model, 0.999)
#     ema.register()
    param_optimizer = list(model.named_parameters())
    embed_pa = ['embed.weight']
    optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer if not any(nd in n for nd in embed_pa)]},
                                    {'params': model.embed.parameters(), 'lr': 2e-5}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=1e-3, amsgrad=True, weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=3, T_mult=2, eta_min=1e-5, last_epoch=-1)
#     opt = SWA(optimizer, swa_start=100, swa_freq=5, swa_lr=1e-4)
    total_loss = []
    train_loss = []
    best_mlogloss = -np.inf
    best_auc = -np.inf
    best_loss = np.inf
    no_improve = 0
    for epoch in range(epochs):
        model.train()
        if epoch > 2:
            for param in model.named_parameters():
                if param[0] == 'embed.weight':
                    param[1].requires_grad = True
                    break
#         fgm = FGM(model)
        bar = tqdm(train_dataloader)
        for i, (description, label) in enumerate(bar):
            optimizer.zero_grad()
            output = model(description.to(DEVICE), label.to(DEVICE))
            loss = output
            loss.backward()
            train_loss.append(loss.item())
            
#             fgm.attack()
#             loss_adv = model(describe.to(DEVICE), label.to(DEVICE))
#             loss_ad = loss_adv
#             loss_ad.backward()
#             fgm.restore()
            
            scheduler.step(epochs + i / len(train_dataloader))
            optimizer.step()
#             ema.update()
            bar.set_postfix(tloss=np.array(train_loss).mean())
#         opt.swap_swa_sgd()
#         ema.apply_shadow()
        mlogloss, auc, logloss = validation_funtion(model, valid_dataloader, valid_torchdata, 'valid')
#         ema.restore()
        print('train_loss: {:.5f}, mlogloss: {:.5f}, auc: {:.5f}, log_loss: {:.5f}\n'.format(train_loss[-1],mlogloss,auc,logloss))
        logger.info('Epoch:[{}]\t mlogloss={:.5f}\t auc={:.5f}\t log_loss={:.5f}\t'.format(epoch,mlogloss,auc,logloss))
        global model_num
        if early_stop:
            if mlogloss > best_mlogloss:
                best_mlogloss = mlogloss
                best_auc = auc
                best_loss = train_loss[-1]
#                 ema.apply_shadow()
                torch.save(model.state_dict(), '{}_model_{}.bin'.format(NAME, model_num))
#                 ema.restore()
            else:
                no_improve += 1
            if no_improve == early_stop:
                model_num += 1
                break
            if epoch == epochs-1:
                model_num += 1
        else:
            if epoch >= epochs-1:
                torch.save(model.state_dict(), '{}_model_{}.bin'.format(NAME, model_num))
                model_num += 1
    return best_mlogloss, best_auc, best_loss

In [8]:
import logging
def get_logger(filename, verbosity=1, name=None):
    level_dict = {0: logging.DEBUG, 1: logging.INFO, 2: logging.WARNING}
    formatter = logging.Formatter(
        "[%(asctime)s][%(filename)s][line:%(lineno)d][%(levelname)s] %(message)s"
    )
    logger = logging.getLogger(name)
    logger.setLevel(level_dict[verbosity])
    fh = logging.FileHandler(filename, "w")
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    sh = logging.StreamHandler()
    sh.setFormatter(formatter)
    logger.addHandler(sh)
    logger.removeHandler(sh)
    return logger

In [9]:
FOLD = 10
kf = StratifiedKFold(n_splits=FOLD, shuffle=True, random_state=SEED)
model_num = 1
test_preds_total = collections.defaultdict(list)
logger = get_logger('{}.log'.format(NAME))
best_mlogloss = []
best_auc = []
best_loss = []
for i, (train_index, test_index) in enumerate(kf.split(np.arange(train_data.shape[0]), train_data.label.values)):
    print(str(i+1), '-'*50)
    tra = [train_dataset[index] for index in train_index]
    val = [train_dataset[index] for index in test_index]
    print(len(tra))
    print(len(val))
    train_dataloader, train_torchdata = get_dataloader(tra, mode='train')
    valid_dataloader, valid_torchdata = get_dataloader(val, mode='valid')
    model = HAN(w2v_model.wv.vectors.shape[0]+1,w2v_model.wv.vectors.shape[1]+fasttext_model.wv.vectors.shape[1],embeddings=True)
    model.to(DEVICE)
    mlogloss,auc,loss = train(model,train_dataloader,
                    valid_dataloader,
                    valid_torchdata,
                    epochs=100,
                    early_stop=5)
    torch.cuda.empty_cache()
    best_mlogloss.append(mlogloss)
    best_auc.append(auc)
    best_loss.append(loss)
for i in range(FOLD):
    print('- 第{}折中，best mlogloss: {}   best auc: {}   best loss: {}'.format(i+1, best_mlogloss[i], best_auc[i], best_loss[i]))

- 第1折中，best mlogloss: 0.9763843304205626   best auc: 0.9985805812897178   best loss: 0.3552546501159668
- 第2折中，best mlogloss: 0.9774165140622638   best auc: 0.997005073232342   best loss: 0.3522224426269531
- 第3折中，best mlogloss: 0.9772456548530228   best auc: 0.9980179860212681   best loss: 0.34420496225357056
- 第4折中，best mlogloss: 0.9804633824677317   best auc: 0.9982196812576855   best loss: 0.32889825105667114
- 第5折中，best mlogloss: 0.9783214461075509   best auc: 0.9982687191009312   best loss: 0.36440154910087585
- 第6折中，best mlogloss: 0.9809039671615718   best auc: 0.9970804967779818   best loss: 0.3448050916194916
- 第7折中，best mlogloss: 0.9774368164840502   best auc: 0.9980991573901716   best loss: 0.34063270688056946
- 第8折中，best mlogloss: 0.973417946573173   best auc: 0.9983514048770021   best loss: 0.3517434895038605
- 第9折中，best mlogloss: 0.9746748116688652   best auc: 0.9981369511507947   best loss: 0.3631766140460968
- 第10折中，best mlogloss: 0.9709233344560553   best auc: 0.9974890027782815   best loss: 0.3433622121810913

In [10]:
# model_num = 11
model = HAN(w2v_model.wv.vectors.shape[0]+1,w2v_model.wv.vectors.shape[1]+fasttext_model.wv.vectors.shape[1],embeddings=True)
model.to(DEVICE)
test_preds_total = []
test_dataloader, test_torchdata = get_dataloader(test_dataset, mode='test')
for i in range(1,model_num):
    model.load_state_dict(torch.load('{}_model_{}.bin'.format(NAME, i)))
    test_pred_results = validation_funtion(model, test_dataloader, test_torchdata, 'test')
    test_preds_total.append(test_pred_results)
test_preds_merge = np.sum(test_preds_total, axis=0) / (model_num-1)
pres_fold = [[str(p) for p in li] for li in test_preds_merge]
pres_all = [' '.join(p) for p in pres_fold]
str_w = ''
sub_id = test_data['report_ID'].values
with open('submit.csv','w') as f:
    for i in range(len(sub_id)):
        str_w += sub_id[i] + '|,' + '|' + pres_all[i] + '\n'
    str_w = str_w.strip('\n')
    f.write(str_w)

HBox(children=(FloatProgress(value=0.0, max=3000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))


