In [1]:
!nvidia-smi

Wed Feb  9 12:22:45 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.86       Driver Version: 470.86       CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  On   | 00000000:01:00.0 Off |                  N/A |
| 21%   32C    P8    13W / 125W |      1MiB /  7981MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  On   | 00000000:02:00.0 Off |                  N/A |
| 20%   31C    P8     1W / 125W |      1MiB /  7982MiB |      0%      Default |
|       

In [2]:
import os
import math
import random
import time
from tqdm.notebook import tqdm
from datetime import datetime as dt

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error

import category_encoders as ce
import joblib

import gc
gc.enable()

  from pandas import Int64Index as NumericIndex


## パラメーター

In [3]:
# Cross-validation and optimisation hyper-parameters.
NUM_FOLDS = 5
NUM_EPOCHS = 10
BATCH_SIZE = 32
# Every description is padded / truncated to this many tokens by the tokenizer.
MAX_LEN = 160
# Input width of the LayerNorm and first Linear layer applied to the backbone's
# CLS vector; assumed to equal the hidden size of MODEL_NAME — TODO confirm
# whenever MODEL_NAME is changed.
HIDDEN_SIZE = 768
MODEL_NAME = 'microsoft/deberta-base'
SEED = 3655
# Use the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Input CSV files and output locations.
TRAIN_FILE = '../../../preprocess/preprocess_v4/preprocess_train.csv'
TEST_FILE = '../../../preprocess/preprocess_v4/preprocess_test.csv'
RESULT_DIR = './'
MODELS_DIR = os.path.join(RESULT_DIR, 'models')

# Create the output folders up front so later saves do not fail on a missing directory.
os.makedirs(RESULT_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)

In [4]:
def set_random_seed(random_seed):
    """Seed the Python, NumPy and PyTorch RNGs and ask cuDNN for deterministic behaviour.

    Parameters
    ----------
    random_seed : int
        Seed applied to `random`, `numpy.random`, `torch` (CPU and every CUDA
        device) and exported as ``PYTHONHASHSEED``.

    Notes
    -----
    ``PYTHONHASHSEED`` is only read at interpreter start-up, so setting it here
    affects child processes, not the hash seed of the running kernel.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    # manual_seed_all already covers the current device, so a separate
    # torch.cuda.manual_seed call is not needed. It is a no-op without CUDA.
    torch.cuda.manual_seed_all(random_seed)

    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run, which
    # undermines the determinism requested above; switch it off here.
    torch.backends.cudnn.benchmark = False

# Seed every RNG once, before any fold split or model is created.
set_random_seed(SEED)

## Dataset

In [5]:
# make cv 
def make_folded_df(csv_file, num_splits=5):
    '''
    Load a CSV and attach the cross-validation fold id and the scaled target.

    Parameters
    -----------------
    csv_file: str
        Path of the CSV file to load. It must contain a 'LOAN_AMOUNT' column.
    num_splits: int
        Number of cross-validation folds.

    Returns
    -----------------
    df: DataFrame
        The loaded frame with two extra columns:
        'kfold'  - id of the fold in which the row is held out (stored as float
                   because the column is initialised with NaN),
        'target' - LOAN_AMOUNT divided by 25.
    '''
    frame = pd.read_csv(csv_file)
    frame['kfold'] = np.nan
    frame['target'] = frame['LOAN_AMOUNT'] / 25

    splitter = KFold(n_splits=num_splits, shuffle=True, random_state=SEED)
    fold_id = 0
    for _, held_out_rows in splitter.split(frame):
        # KFold yields positional indices; they double as labels here because
        # read_csv was called without index_col and so produced a default index.
        frame.loc[held_out_rows, 'kfold'] = fold_id
        fold_id += 1
    return frame

In [6]:
class KivaDataset(Dataset):
    """Torch dataset pairing tokenised loan descriptions with two categorical features.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns 'clean_DESCRIPTION_TRANSLATED', 'SECTOR_NAME'
        and 'COUNTRY_CODE' (the latter two are presumably integer-encoded,
        since they are turned into tensors as-is — verify against preprocessing).
    MODEL_NAME : str
        Name or path handed to ``AutoTokenizer.from_pretrained``.
    target_columns : str or list, optional
        Column(s) of `df` used as regression target; read only when
        `inference_only` is False.
    inference_only : bool
        When True no target is stored and items are 4-tuples instead of 5-tuples.
    """

    def __init__(self, df, MODEL_NAME, target_columns=None, inference_only=False):
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.text = df['clean_DESCRIPTION_TRANSLATED'].tolist()
        self.sector_name = df['SECTOR_NAME'].tolist()
        self.contry_code = df['COUNTRY_CODE'].tolist()

        if not inference_only:
            raw_target = df[target_columns].values
            self.target = torch.tensor(raw_target, dtype=torch.float32)

        # Tokenise the whole corpus once; every sequence is padded / truncated to MAX_LEN.
        encode_options = dict(
            padding='max_length',
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True,
        )
        self.encoded = AutoTokenizer.from_pretrained(MODEL_NAME).batch_encode_plus(
            self.text, **encode_options
        )

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return (input_ids, attention_mask, sector_name, contry_code[, target]) for one row."""
        features = (
            torch.tensor(self.encoded['input_ids'][index]),
            torch.tensor(self.encoded['attention_mask'][index]),
            torch.tensor(self.sector_name[index]),
            torch.tensor(self.contry_code[index]),
        )
        if not self.inference_only:
            return features + (self.target[index],)
        return features

## Model

In [7]:
class ClsPoolingModel(nn.Module):
    """Regression head on top of a transformer CLS vector plus two categorical embeddings.

    The CLS hidden state is layer-normalised and projected to 512 features.
    The two category ids are embedded (tables of 11 and 32 rows), concatenated
    and projected to 30 features. Both parts are merged and reduced to a
    single scalar per sample.

    Parameters
    ----------
    MODEL_NAME : str
        Name or path handed to ``AutoConfig`` / ``AutoModel.from_pretrained``.
    """

    def __init__(self, MODEL_NAME):
        super().__init__()

        config = AutoConfig.from_pretrained(MODEL_NAME)
        config.update({"output_hidden_states": True,
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})

        # Transformer backbone
        self.model = AutoModel.from_pretrained(MODEL_NAME, config=config)
        self.layer_norm = nn.LayerNorm(HIDDEN_SIZE)
        self.linear1 = nn.Linear(HIDDEN_SIZE, 512)
        self.LeakyReLU = nn.LeakyReLU()

        # Categorical features: ids must lie in [0, 11) and [0, 32) respectively.
        self.embed1 = nn.Embedding(11, 10)
        self.embed2 = nn.Embedding(32, 30)
        self.embed_linear = nn.Linear(40, 30)

        self.merge_linear = nn.Linear(512 + 30, 256)
        self.last_linear = nn.Linear(256, 1)

    def forward(self, input_ids, attention_mask, sector_name, contry_code):
        """Return one prediction per sample as a 1-D tensor of length ``batch``.

        Parameters
        ----------
        input_ids, attention_mask : LongTensor
            Tokenised text, passed positionally to the backbone.
        sector_name, contry_code : LongTensor
            One category id per sample, looked up in ``embed1`` / ``embed2``.
        """
        # Transformer backbone: keep only the first (CLS) token of the last layer.
        outputs = self.model(input_ids, attention_mask)
        last_hidden_state = outputs[0]
        cls_embeddings = last_hidden_state[:, 0, :]
        cls_embeddings = self.layer_norm(cls_embeddings)
        text_features = self.LeakyReLU(self.linear1(cls_embeddings))

        # Categorical branch
        embed1 = self.embed1(sector_name)
        embed2 = self.embed2(contry_code)
        category_features = self.LeakyReLU(
            self.embed_linear(torch.cat((embed1, embed2), dim=1))
        )

        merged = torch.cat((text_features, category_features), dim=1)
        logits = self.last_linear(self.LeakyReLU(self.merge_linear(merged)))

        # Drop only the trailing feature axis: (batch, 1) -> (batch,).
        # A second squeeze would also remove the batch axis when batch == 1 and
        # hand callers a 0-dim tensor that has no shape[0].
        preds = logits.squeeze(-1)
        return preds

## predict

In [8]:
def predict(model, data_loader, device=None):
    """Return a 1-D np.array with the predictions of `model` on `data_loader`.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(input_ids, attention_mask, sector_name, contry_code)``.
        It is switched to eval mode and left in that mode.
    data_loader : iterable
        Yields 4-tuples of tensors and exposes ``.dataset`` with a length.
    device : str or torch.device, optional
        Device the batches are moved to. Defaults to the notebook-level DEVICE.

    Returns
    -------
    np.ndarray
        One value per dataset item, in loader order.
    """
    if device is None:
        device = DEVICE

    model.eval()

    result = np.zeros(len(data_loader.dataset))
    index = 0

    with torch.no_grad():
        for input_ids, attention_mask, sector_name, contry_code in data_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            sector_name = sector_name.to(device)
            contry_code = contry_code.to(device)

            pred = model(input_ids, attention_mask, sector_name, contry_code)

            # reshape(-1) also turns a 0-dim output (possible for a batch of one)
            # into a length-1 vector, so the slice bookkeeping below always works.
            pred = pred.detach().reshape(-1).cpu().numpy()
            result[index: index + pred.shape[0]] = pred
            index += pred.shape[0]

    return result

## train_function, eval_function

In [9]:
def train_fn(dataloader, model, criterion, optimizer, scheduler, device, epoch):
    '''
    Run one training epoch.

    Parameters
    -----------------
    dataloader: iterable with a length
        Yields (input_ids, attention_mask, sector_name, contry_code, target) batches.
    model: nn.Module
        Model to train; called with the first four tensors of each batch.
    criterion: callable
        Loss function, called as criterion(outputs, target).
    optimizer: torch optimizer
        Stepped once per batch.
    scheduler: learning-rate scheduler
        Stepped once per batch, right after the optimizer.
    device: str or torch.device
        Device every batch tensor is moved to.
    epoch: int
        Zero-based epoch number, used only for the progress-bar label.

    Returns
    -----------------
    train_loss: float
        Mean of the per-batch losses over the epoch.
    '''

    model.train()
    total_loss = 0

    progress = tqdm(dataloader, total=len(dataloader))

    for i, (input_ids, attention_mask, sector_name, contry_code, target) in enumerate(progress):
        progress.set_description(f"<Train> Epoch{epoch+1}")
        # Honour the `device` argument instead of the notebook-level constant.
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        sector_name = sector_name.to(device)
        contry_code = contry_code.to(device)
        target = target.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask, sector_name, contry_code)
        # Flatten to (batch,); unlike view(size()[0]) this also accepts a 0-dim output.
        outputs = outputs.reshape(-1)
        del input_ids, attention_mask, sector_name, contry_code
        loss = criterion(outputs, target)  # compute the loss

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        # Release the batch tensors before the next iteration allocates new ones.
        del loss, target, outputs

        progress.set_postfix(loss=total_loss/(i+1))

    train_loss = total_loss / len(dataloader)

    return train_loss

def eval_fn(dataloader, model, criterion, device, epoch):
    '''
    Evaluate the model on a validation loader without updating weights.

    Parameters
    -----------------
    dataloader: iterable with a length
        Yields (input_ids, attention_mask, sector_name, contry_code, target) batches.
    model: nn.Module
        Model to evaluate; it is switched to eval mode.
    criterion: callable
        Loss function, called as criterion(outputs, target).
    device: str or torch.device
        Device every batch tensor is moved to.
    epoch: int
        Zero-based epoch number, used only for the progress-bar label.

    Returns
    -----------------
    valid_loss: float
        Mean of the per-batch losses.
    all_preds: list of float
        Predictions for every sample, in loader order.
    '''
    model.eval()
    total_loss = 0
    all_preds = []

    with torch.no_grad():
        progress = tqdm(dataloader, total=len(dataloader))

        for i, (input_ids, attention_mask, sector_name, contry_code, target) in enumerate(progress):
            progress.set_description(f"<Valid> Epoch{epoch+1}")
            # Honour the `device` argument instead of the notebook-level constant.
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            sector_name = sector_name.to(device)
            contry_code = contry_code.to(device)
            target = target.to(device)

            outputs = model(input_ids, attention_mask, sector_name, contry_code)
            # Flatten to (batch,); this also accepts a 0-dim output from a
            # batch of one, which is possible because the last batch is kept.
            outputs = outputs.reshape(-1)
            del input_ids, attention_mask, sector_name, contry_code
            loss = criterion(outputs, target)
            all_preds.extend(outputs.tolist())
            del outputs

            total_loss += loss.item()
            del loss, target
            progress.set_postfix(loss=total_loss/(i+1))
    valid_loss = total_loss / len(dataloader)

    return valid_loss, all_preds

## trainer

In [10]:
def trainer(fold, df, oof):
    """Train and validate the model for a single cross-validation fold.

    Rows whose 'kfold' differs from `fold` are used for training, rows equal
    to `fold` for validation. Whenever the validation loss improves, the state
    dict of the DataParallel-wrapped model is written to MODELS_DIR and the
    epoch's validation predictions are remembered.

    Parameters
    ----------
    fold : int
        Fold id to hold out.
    df : DataFrame
        Frame carrying the 'kfold' and 'target' columns plus the dataset columns.
    oof : np.ndarray
        Out-of-fold buffer; the held-out rows' entries are overwritten in place.

    Returns
    -------
    (best_loss, oof) : tuple
        Lowest validation loss of the fold and the updated buffer.
    """
    holdout_mask = df['kfold'] == fold
    fit_frame = df[~holdout_mask].reset_index(drop=True)
    holdout_frame = df[holdout_mask].reset_index(drop=True)

    fit_dataset = KivaDataset(df=fit_frame, MODEL_NAME=MODEL_NAME, target_columns='target')
    holdout_dataset = KivaDataset(df=holdout_frame, MODEL_NAME=MODEL_NAME, target_columns='target')

    fit_loader = DataLoader(
        fit_dataset,
        batch_size=BATCH_SIZE,
        drop_last=True,
        shuffle=True,
        num_workers=0,
    )
    holdout_loader = DataLoader(
        holdout_dataset,
        batch_size=BATCH_SIZE,
        drop_last=False,
        shuffle=False,
        num_workers=0,
    )

    # Wrap for multi-GPU execution; the checkpoint below is therefore saved
    # from the DataParallel wrapper rather than from the bare model.
    model = torch.nn.DataParallel(ClsPoolingModel(MODEL_NAME).to(DEVICE))
    torch.backends.cudnn.benchmark = True

    criterion = nn.HuberLoss()
    optimizer = AdamW(model.parameters(), lr=2e-5)
    total_steps = NUM_EPOCHS * len(fit_loader)
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_training_steps=total_steps,
        num_warmup_steps=50,
    )

    checkpoint_path = os.path.join(MODELS_DIR, f"best_deberta-base_{fold}.pth")
    train_history, valid_history = [], []
    best_loss, best_preds = np.inf, []

    for epoch in range(NUM_EPOCHS):  # one pass of training + validation per epoch
        epoch_train_loss = train_fn(fit_loader, model, criterion, optimizer, scheduler, DEVICE, epoch)
        epoch_valid_loss, epoch_preds = eval_fn(holdout_loader, model, criterion, DEVICE, epoch)
        print(f"Loss: {epoch_valid_loss}", end="")

        train_history.append(epoch_train_loss)
        valid_history.append(epoch_valid_loss)

        improved = epoch_valid_loss < best_loss
        if improved:
            best_loss, best_preds = epoch_valid_loss, epoch_preds
            print("model saving!", end="")
            torch.save(model.state_dict(), checkpoint_path)
        print("\n")

    # The held-out rows keep their original order after reset_index, so the
    # predictions line up with the index labels of those rows in `df`.
    oof[list(df[holdout_mask].index)] = best_preds
    return best_loss, oof

## main

In [11]:
def _snap_to_grid(value, grid):
    """Return the element of `grid` closest to `value` (the first one wins on ties)."""
    return grid[np.abs(grid - value).argmin()]


def main():
    """Run cross-validated training, test inference and export of all results.

    Steps:
      1. Train one model per fold and collect out-of-fold (OOF) predictions.
      2. Reload each fold's best checkpoint and predict the test set.
      3. Take the median across folds, snap predictions to the 25-step loan
         grid, and report MAE before / after that post-processing.
      4. Write OOF predictions, both submission variants and a text log to RESULT_DIR.
    """
    # training
    df = make_folded_df(TRAIN_FILE, NUM_FOLDS)
    best_losses = []
    oof = np.full(df.shape[0], np.nan)
    for fold in range(NUM_FOLDS):
        print(f"fold {fold}", "="*80)
        best_loss, oof = trainer(fold, df, oof)
        best_losses.append(best_loss)
        print(f"<fold={fold}> best score: {best_loss}\n")

    cv = sum(best_losses) / len(best_losses)
    print(f"CV: {cv}")

    # predict
    print('start predict')
    test_df = pd.read_csv(TEST_FILE)
    test_dataset = KivaDataset(df=test_df, MODEL_NAME=MODEL_NAME, inference_only=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             drop_last=False, shuffle=False, num_workers=2)

    predictions = np.zeros((len(test_df), NUM_FOLDS))

    for fold in range(NUM_FOLDS):
        print(f"fold {fold}", "="*80)
        model = ClsPoolingModel(MODEL_NAME)
        # Checkpoints were saved from a DataParallel wrapper, so the same
        # wrapper is needed for the state-dict keys to match.
        model = torch.nn.DataParallel(model)  # make parallel
        torch.backends.cudnn.benchmark = True
        model_path = os.path.join(MODELS_DIR, 'best_deberta-base_'+str(fold)+'.pth')
        # map_location lets the checkpoint load even when the device it was
        # saved from is not available (e.g. inference on a CPU-only machine).
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))
        model.to(DEVICE)

        # The model is trained on LOAN_AMOUNT / 25, so scale back up.
        predictions[:, fold] = predict(model, test_loader)*25

    oof_df = pd.DataFrame()
    sub_df = pd.DataFrame()

    oof_df['LOAN_ID'] = df['LOAN_ID']
    sub_df['LOAN_ID'] = test_df['LOAN_ID']

    oof_df['pred'] = oof*25
    sub_df['LOAN_AMOUNT'] = np.median(predictions, axis=1)
    sub_df_not_postprocess = sub_df.copy(deep=True)

    # post-process: snap every prediction to the nearest multiple of 25 in [25, 10000]
    loan_amount_unique = np.arange(25, 10001, 25)
    oof_df['pred_post_process'] = oof_df['pred'].map(lambda x: _snap_to_grid(x, loan_amount_unique))
    sub_df['LOAN_AMOUNT'] = sub_df['LOAN_AMOUNT'].map(lambda x: _snap_to_grid(x, loan_amount_unique))

    oof_cv = mean_absolute_error(df['LOAN_AMOUNT'], oof_df['pred'])
    oof_cv_postprocess = mean_absolute_error(df['LOAN_AMOUNT'], oof_df['pred_post_process'])

    print(f'oof_cv:{oof_cv}')
    print(f'oof_cv_postprocess:{oof_cv_postprocess}')

    # save_pred_value
    file_template = '{score:.6f}_{model_key}_cv{fold}_{timestamp}'

    # save test submit
    file_stem = file_template.format(
            score=oof_cv_postprocess,
            model_key='deberta-base',
            fold=NUM_FOLDS,
            timestamp=dt.now().strftime('%Y-%m-%d-%H-%M'))

    oof_filename = 'oof_{}.csv'.format(file_stem)
    sub_filename = 'subm_{}.csv'.format(file_stem)

    oof_df.to_csv(os.path.join(RESULT_DIR, oof_filename), index=False)
    sub_df.to_csv(os.path.join(RESULT_DIR, sub_filename), index=False)
    sub_df_not_postprocess.to_csv(os.path.join(RESULT_DIR, 'not_postprocess_' + sub_filename), index=False)

    # save log: one line per fold, then the two OOF scores on their own lines
    lines = "".join(f"fold={i}: {loss}\n" for i, loss in enumerate(best_losses))
    lines += f"CV    : {oof_cv}\n"
    lines += f"CV_postprocess    : {oof_cv_postprocess}\n"
    with open(os.path.join(RESULT_DIR, "deberta_base_result.txt"), mode='w') as f:
        f.write(lines)

# Entry point: run the whole pipeline when this cell / script is executed directly.
if __name__ == '__main__':
    main()



Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 10.518209205199875model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 9.843547246502077model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 9.317338984400921model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 9.23530499029076model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 9.059055216049355model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 9.178465014372524



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 9.06313439509497



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 9.007764399991144model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 9.036790416244868



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 9.028802242045227

<fold=0> best score: 9.007764399991144



Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 10.617001102601167model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 9.753505977774251model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 9.276235575016242model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 9.135068614512107model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.915901050467417model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.900389214530717model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.936000631486055



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.898684825663391model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.946255911878863



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.940776688831283

<fold=1> best score: 8.898684825663391



Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 10.006811161174875model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 9.100818161789046model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.886458595245816model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.61284433939411model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.664112574164797



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.49009469219348model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.480551939712097model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.41195363129918model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.415222058154239



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.417690844126632

<fold=2> best score: 8.41195363129918



Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 10.110468228518858model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 9.127721662070414model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.717711983127895model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.565112737765872model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.517078541414959model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.405480384409115model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.352980754421388model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.363105322976455



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.384190304684347



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.369718311129434

<fold=3> best score: 8.352980754421388



Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 10.43759165837343model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 9.373277386516698model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.794031675976754model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.71677397679531model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.652304580457992model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.529808585990331model saving!



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.556668220175961



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.552444237126151



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.551626990012656



  0%|          | 0/2283 [00:00<?, ?it/s]

  0%|          | 0/571 [00:00<?, ?it/s]

Loss: 8.556952743730696

<fold=4> best score: 8.529808585990331

CV: 8.640238439473087
start predict


Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


oof_cv:225.9136319855326
oof_cv_postprocess:225.06569367041484


FileNotFoundError: [Errno 2] No such file or directory: './microsoft/deberta-base_result.txt'