In [None]:
import os
import sys
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers as T
from sklearn.metrics import fbeta_score
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader, Dataset

from transformers import logging
logging.set_verbosity_error()

# 前処理

## 訓練データと予測データのチェック

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs (slowly)
# on a machine without CUDA instead of failing at the first `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Seed every torch RNG entry point (CPU, current GPU, all GPUs) with the same
# value and request deterministic cuDNN so repeated runs are reproducible.
seed = 42
for _seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(seed)
torch.backends.cudnn.deterministic = True

In [None]:
# Load the training and test tables from the local ./data directory.
# Later cells read the "title", "abstract" and (train only) "judgement" columns.
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

In [None]:
# Fill missing abstracts with the placeholder string "0" so the concatenation
# below never produces NaN. The result is assigned back to the column instead of
# calling `fillna(..., inplace=True)` on a column selection: under pandas
# copy-on-write the in-place form only modifies a temporary and leaves the
# frame unchanged.
train["abstract"] = train["abstract"].fillna("0")
test["abstract"] = test["abstract"].fillna("0")

# Build the model input as "<title> <abstract>". The explicit space keeps the
# last word of the title from being fused with the first word of the abstract.
train["text"] = train["title"] + " " + train["abstract"]
train = train.drop(["abstract", "title"], axis=1)
test["text"] = test["title"] + " " + test["abstract"]
test = test.drop(["abstract", "title"], axis=1)

In [None]:
# Display the training frame after the title/abstract columns were merged into "text".
train

In [None]:
# Display the test frame after the title/abstract columns were merged into "text".
test

## 閾値のチェック

In [None]:
# Fraction of training rows labelled judgement == 1; printed for reference only.
border = int((train["judgement"] == 1).sum()) / len(train)
print(border)
# The decision threshold actually used downstream is this fixed value, which
# replaces the ratio computed above.
border = 0.02

# 訓練と予測

## 訓練

In [None]:
k_fold = 5
Fold = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=seed)

# `split` yields *positional* row indices. Collect the fold ids in a plain
# array and attach it as a column in one step, rather than writing through the
# label-based `.loc`, which is only correct when the frame has a default
# 0..n-1 index. This also creates the column directly as uint8.
fold_ids = np.zeros(len(train), dtype=np.uint8)
for n, (_, val_index) in enumerate(Fold.split(train, train["judgement"])):
    fold_ids[val_index] = n
train["fold"] = fold_ids

train

In [None]:
class BaseDataset(Dataset):
    """Torch dataset serving BERT-tokenised rows of a DataFrame.

    The "text" column is tokenised once at construction time, padded/truncated
    to 512 tokens. When ``include_labels`` is true, the "judgement" column is
    served alongside the features as a float tensor.
    """

    def __init__(self, data, model_name, include_labels=True):
        self.data = data
        self.include_labels = include_labels
        self.text = data["text"].tolist()

        bert_tokenizer = T.BertTokenizer.from_pretrained(model_name)
        self.encoded = bert_tokenizer.batch_encode_plus(
            self.text,
            padding="max_length",
            max_length=512,
            truncation=True,
            return_attention_mask=True,
        )

        if include_labels:
            self.labels = data["judgement"].values

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask[, label])`` for row ``idx``."""
        features = tuple(
            torch.tensor(self.encoded[key][idx])
            for key in ("input_ids", "attention_mask")
        )
        if not self.include_labels:
            return features
        return (*features, torch.tensor(self.labels[idx]).float())

In [None]:
class BaseModel(nn.Module):
    """BERT sequence classifier with one sigmoid output per example.

    ``forward`` returns a 1-D tensor of probabilities with one entry per row of
    the batch, including when the batch holds a single row.
    """

    def __init__(self, model_name):
        super().__init__()
        self.model = T.BertForSequenceClassification.from_pretrained(model_name, num_labels=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        out = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Drop only the trailing num_labels=1 axis. A bare `.squeeze()` would
        # also drop the batch axis for a batch of one, giving a 0-d tensor that
        # breaks BCELoss (size mismatch with the labels) and np.concatenate.
        return self.sigmoid(out.logits).squeeze(-1)

In [None]:
def train_loop(train, fold,trainepoch,batchsize):
    """Fine-tune one model for a single CV fold and checkpoint its best epoch.

    Rows of ``train`` whose "fold" value equals ``fold`` form the validation
    set; every other row is used for training. After each epoch the validation
    predictions are thresholded at the module-level ``border`` and scored with
    F-beta (beta=7). Whenever the score improves, the model weights and the
    validation predictions are saved to
    ``./PubMedBERT_base_uncased_fold{fold}_best.pth``.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with "text", "judgement" and "fold" columns.
    fold : int
        Fold id to hold out for validation.
    trainepoch : int
        Number of epochs to train.
    batchsize : int
        Batch size for both the training and the validation loader.

    Returns
    -------
    float
        Best validation score over all epochs (-1 if no epoch was run).
    """
    print(f"========== fold: {fold} training ==========")

    model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

    # ====================================================
    # Data Loader
    # ====================================================
    train_folds = train[train["fold"] != fold].reset_index(drop=True)
    valid_folds = train[train["fold"] == fold].reset_index(drop=True)

    train_dataset = BaseDataset(train_folds, model_name)
    valid_dataset = BaseDataset(valid_folds, model_name)

    train_loader = DataLoader(
        train_dataset,
        batch_size=batchsize,
        shuffle=True,
        num_workers=0,
        pin_memory=True,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=batchsize,
        shuffle=False,
        num_workers=0,
        pin_memory=True,
    )

    # ====================================================
    # Model
    # ====================================================
    model = BaseModel(model_name)
    model.to(device)

    # `transformers.AdamW` is deprecated and absent from recent transformers
    # releases. torch's AdamW is used instead, with eps / weight_decay set to
    # the values the transformers implementation used by default so the
    # optimisation itself is unchanged.
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=2e-5,
        eps=1e-6,
        weight_decay=0.0,
    )

    criterion = nn.BCELoss()

    # ====================================================
    # Loop
    # ====================================================
    best_score = -1
    for epoch in range(trainepoch):
        # train
        model.train()
        for input_ids, attention_mask, labels in train_loader:
            optimizer.zero_grad()
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)
            # reshape(-1) keeps the output 1-D even when the final batch holds a
            # single row and the model has squeezed it down to a scalar.
            y_preds = model(input_ids, attention_mask).reshape(-1)
            loss = criterion(y_preds, labels)
            loss.backward()
            optimizer.step()

        # eval
        model.eval()
        preds = []
        with torch.no_grad():
            for input_ids, attention_mask, _ in valid_loader:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                y_preds = model(input_ids, attention_mask).reshape(-1)
                preds.append(y_preds.to("cpu").numpy())

        preds = np.concatenate(preds)
        valid_labels = valid_folds["judgement"].values
        score = fbeta_score(valid_labels, np.where(preds < border, 0, 1), beta=7.0)
        print(f"Epoch {epoch+1} -Score: {score:.4f}")

        if score > best_score:
            best_score = score
            torch.save({"model": model.state_dict(), "preds": preds}, f"./PubMedBERT_base_uncased_fold{fold}_best.pth")
    print(f"Best Score: {best_score:.4f}")

    return best_score

In [None]:
def predict(batchsize):
    """Predict on the module-level ``test`` frame with every fold's best checkpoint.

    For each of the ``k_fold`` folds, loads
    ``./PubMedBERT_base_uncased_fold{fold}_best.pth`` into a fresh model, runs
    it over the test set, and finally averages the per-fold probabilities.

    Parameters
    ----------
    batchsize : int
        Batch size for the test loader.

    Returns
    -------
    numpy.ndarray
        1-D array with one averaged probability per test row.
    """
    model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"

    test_dataset = BaseDataset(test, model_name, include_labels=False)
    test_loader = DataLoader(
        test_dataset,
        batch_size=batchsize,
        shuffle=False,
        num_workers=0,
        pin_memory=True,
        drop_last=False,
    )

    predictions = []
    for fold in range(k_fold):
        print(f"==========fold: {fold} predict ==========")
        model = BaseModel(model_name)
        model.to(device)

        # load best parameters
        # map_location lets a checkpoint written on GPU load on whatever
        # `device` is now. weights_only=False is needed because the checkpoint
        # also stores a numpy array ("preds"), which weights-only loading
        # rejects; the file is produced by this notebook's own training loop,
        # so full unpickling is acceptable here.
        checkpoint = torch.load(
            f"./PubMedBERT_base_uncased_fold{fold}_best.pth",
            map_location=device,
            weights_only=False,
        )
        model.load_state_dict(checkpoint["model"])
        model.eval()

        fold_preds = []
        with torch.no_grad():
            for input_ids, attention_mask in test_loader:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                # reshape(-1) keeps a final batch of one row 1-D so that
                # np.concatenate below does not fail on a 0-d array.
                y_preds = model(input_ids, attention_mask).reshape(-1)
                fold_preds.append(y_preds.to("cpu").numpy())

        predictions.append(np.concatenate(fold_preds))

    # Average the per-fold probabilities row by row.
    return np.mean(predictions, axis=0)

In [None]:
# Settings for the first (supervised-only) training round.
batchsize = 3
trainepoch = 10

# Train one model per CV fold, keeping each fold's best validation score.
scores = []
for fold in range(k_fold):
    score = train_loop(train, fold, trainepoch, batchsize)
    scores.append(score)

# Report the plain average of the per-fold best scores.
cv_score = float(sum(scores) / len(scores))
print("========== CV ==========")
print(f"SCORE: {cv_score:<.5f}")

## 予測

In [None]:
# Predict on the test set with the checkpoints saved by the training round above.
# `predictions` holds the probabilities averaged over all folds.
batchsize = 3
predictions = predict(batchsize)

# 半教師あり学習を使う

## 疑似ラベル

In [None]:
# Turn the averaged probabilities into 0/1 pseudo-labels: values below `border` become 0.
preds = np.where(predictions < border, 0, 1)
# The sample submission file has no header row, so name its two columns explicitly.
sub = pd.read_csv("./data/sample_submit.csv", header=None)
sub.columns = ["id","judgement"]

In [None]:
# Pseudo-labelled copy of the test set, with the same columns as `train`.
df2 = pd.DataFrame(
    {
        "id": sub["id"],
        "judgement": preds,
        "text": test["text"],
        # A string that never equals an integer fold id: `train_loop` validates
        # only on rows whose fold equals the current fold number, so these rows
        # always stay on the training side.
        "fold": "000",
    }
)
df2

In [None]:
# Show how many test rows received each pseudo-label (0 / 1).
df2["judgement"].value_counts()

## 訓練データの"fold"値をランダムで更新

In [None]:
k_fold = 5
Fold = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=seed)

# `split` yields *positional* row indices. Collect the fold ids in a plain
# array and attach it as a column in one step, rather than writing through the
# label-based `.loc`, which is only correct when the frame has a default
# 0..n-1 index.
# NOTE(review): the splitter uses the same settings and seed as the first fold
# assignment, so on unchanged data this should reproduce the same folds.
fold_ids = np.zeros(len(train), dtype=np.uint8)
for n, (_, val_index) in enumerate(Fold.split(train, train["judgement"])):
    fold_ids[val_index] = n
train["fold"] = fold_ids

train

## 疑似ラベル付きテストデータを訓練データと混合

In [None]:
# Append the pseudo-labelled test rows to the training frame and renumber the
# rows 0..n-1 in the same step.
# NOTE: re-running this cell appends `df2` again.
train = pd.concat([train, df2], ignore_index=True)
train

## 訓練

In [None]:
# Settings for the second training round (training rows + pseudo-labelled test rows).
batchsize = 3
trainepoch = 10

# Retrain one model per CV fold; the checkpoints are written to the same paths
# as in the first round and therefore replace them.
scores = []
for fold in range(k_fold):
    score = train_loop(train, fold, trainepoch, batchsize)
    scores.append(score)

# Report the plain average of the per-fold best scores.
cv_score = float(sum(scores) / len(scores))
print("========== CV ==========")
print(f"SCORE: {cv_score:<.5f}")

## 予測

In [None]:
# Predict on the test set again, now with the checkpoints from the second
# (pseudo-label) training round. `predictions` is the average over all folds.
batchsize = 3
predictions = predict(batchsize)

# 出力

In [None]:
# Threshold the averaged probabilities at `border`: values below it become 0, the rest 1.
preds = np.where(predictions < border, 0, 1)
# The sample submission file has no header row, so name its two columns explicitly.
sub = pd.read_csv("./data/sample_submit.csv", header=None)
sub.columns = ["id", "judgement"]
# Replace the placeholder labels with the final predictions.
sub["judgement"] = preds
# Write the submission without an index column or header row.
sub.to_csv("./submission.csv", index=False, header=False)