# Команда megamen, код обучения модели с 1 головой
Данный код предназначен для обучения одной из нижеуказанных моделей в Kaggle. На тех же данных, что были выданы организаторами, без изменения конфигурации и других параметров, решение запускается на P100 и отрабатывает за ~4 минуты.

### Список допустимых моделей:
1. intfloat/multilingual-e5-large-instruct
2. intfloat/multilingual-e5-large
3. ai-forever/ru-en-RoSBERTa

In [1]:
import os

OUTPUT_DIR = './me5_instruct_39_classes'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

class CFG:
    # параметры тренировки модели
    apex=True
    print_freq=100
    num_workers=8
    model="intfloat/multilingual-e5-large-instruct"
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=10
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=32
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=["class"]
    seed=42
    n_fold=1
    trn_fold=[0]
    train=True

In [2]:
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel, AutoConfig
from sklearn.metrics import f1_score, accuracy_score
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
import torch.nn as nn
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
import pickle
import pandas as pd
import numpy as np
import gc
import re
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold


os.environ["TOKENIZERS_PARALLELISM"] = "false"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
def preprocess(df):
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
#     df["text"] = df["text"].apply(lambda x: re.sub(url_pattern, '', x)) не помогало улучшить качество
    df["text"] = df["text"].apply(lambda x: " ".join(
        re.findall(r"[а-яА-Я0-9 ёЁ\-\.,?!+a-zA-Z]+", x)))

    return df


def get_detailed_instruct(task_description: str, query: str) -> str:
    # функция преобразования промпта для instruct версий моделей
    return f'Instruct: {task_description}\nQuery: {query}'


def get_score(y_trues, class_predictions):
    # расчет метрик на валидации
    class_predictions = [np.argmax(el) for el in class_predictions]

    class_score = accuracy_score(y_trues[:, 0], class_predictions)
    return class_score


def get_logger(filename=os.path.join(OUTPUT_DIR, 'train')):
    # логгер данных
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

In [4]:
df_1 = pd.read_excel('/kaggle/input/rutube-db/01_bd_rutube.xlsx')
df_2 = pd.read_excel('/kaggle/input/rutube-db/02_real_cases_rutube.xlsx')

In [5]:
df_1.rename(
    columns={
        'Вопрос из БЗ': 'text',
        'Классификатор 2 уровня': 'class'
    },
    inplace=True
)

df_2.rename(
    columns={
        'Вопрос пользователя': 'text',
        'Классификатор 2 уровня': 'class'
    },
    inplace=True
)

In [6]:
# df1_text = pd.DataFrame(df_1[['text', 'class']])
# df2_text = pd.DataFrame(df_2[['text', 'class']])

# overall_df_text = pd.concat([df1_text, df2_text]).reset_index().drop(columns='index')
overall_df_text = pd.DataFrame(df_2[['text', 'class']])

In [7]:
train = preprocess(overall_df_text)

In [8]:
train.head(5)

Unnamed: 0,text,class
0,Здравствуйте! Можно уточнить причины Правилhtt...,Отклонение/блокировка видео
1,"Добрый вечер, какой топ причин блокировки виде...",Отклонение/блокировка видео
2,"Все пишут, что монетизация на рутубе отключает...",Отключение/подключение монетизации
3,Что запрещено в монетизации и что можно выклад...,Отключение/подключение монетизации
4,"Чтобы не отключали монетизацию, надо, чтобы я ...",Отключение/подключение монетизации


In [9]:
print(train.text[1])

Добрый вечер, какой топ причин блокировки видео на рутубе?


In [10]:
df_1.head()

Unnamed: 0,Тема,text,Ответ из БЗ,Классификатор 1 уровня,class
0,Что нельзя публиковать на RUTUBE,Что нельзя публиковать на RUTUBE?,Чужой контент без разрешения автора или правоо...,МОДЕРАЦИЯ,Отклонение/блокировка видео
1,Почему могут отключить монетизацию на видео и ...,Почему могут отключить монетизацию из-за автор...,"Монетизация может отключиться, если на вашем к...",МОНЕТИЗАЦИЯ,Отключение/подключение монетизации
2,Почему могут отключить монетизацию на видео и ...,Почему могут отключить монетизацию из-за искус...,Монетизация на RUTUBE зависит в том числе от к...,МОНЕТИЗАЦИЯ,Отключение/подключение монетизации
3,Почему могут отключить монетизацию на видео и ...,"Для каких статусов доступна монетизация, и поч...","Монетизацию на RUTUBE можно подключить, если в...",МОНЕТИЗАЦИЯ,Отключение/подключение монетизации
4,Авторское право,Какой контент можно использовать для монетизац...,"То, что вы создали сами: видео, которое вы сня...",МОНЕТИЗАЦИЯ,Отключение/подключение монетизации


In [11]:
class_le = LabelEncoder()
class_le.fit(train["class"].tolist())
train["class"] = class_le.transform(train["class"].tolist())
df_1["class"] = class_le.transform(df_1["class"].tolist())

with open(os.path.join(OUTPUT_DIR, "executor_le.pkl"), "wb") as f:
    pickle.dump(class_le, f)

In [12]:
# очень странное разделение данных  - можно было проще
Fold = StratifiedKFold(n_splits=5,
                       shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train["class"].tolist())):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
train['fold'] = train['fold'].apply(lambda x: 1 if x <= 3 else 0)
train.head()

Unnamed: 0,text,class,fold
0,Здравствуйте! Можно уточнить причины Правилhtt...,14,1
1,"Добрый вечер, какой топ причин блокировки виде...",14,0
2,"Все пишут, что монетизация на рутубе отключает...",15,1
3,Что запрещено в монетизации и что можно выклад...,15,1
4,"Чтобы не отключали монетизацию, надо, чтобы я ...",15,1


In [13]:
df_1_cut = df_1[['text', 'class']]
df_1_cut['fold'] = 1

In [14]:
train = pd.concat([train, df_1_cut]).reset_index().drop(columns='index')

In [15]:
# преобразование данных в неоходимый формат в зависимости от формата приема данных модели
if CFG.model == "intfloat/multilingual-e5-large":
    train["text"] = train["text"].apply(lambda x: "query: " + x)
    
if CFG.model == "ai-forever/ru-en-RoSBERTa":
    train["text"] = train["text"].apply(lambda x: "classification: " + x)
    
if CFG.model in ['intfloat/multilingual-e5-large-instruct']:
    task = 'Classify the detailed category of the given user request into one of thirty nine categories'
    train["text"] = train["text"].apply(lambda user_query: get_detailed_instruct(task, user_query))

In [16]:
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, 'tokenizer'))
CFG.tokenizer = tokenizer

lengths = []
tk0 = tqdm(train['text'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 2  # cls & sep
LOGGER.info(f"max_len: {CFG.max_len}")
CFG.max_len = 512

tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

  0%|          | 0/1127 [00:00<?, ?it/s]

max_len: 120


In [17]:
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['text'].values
        self.labels = df["class"].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.long)
        return inputs, label


def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:, :mask_len]
    return inputs

In [18]:
train['class'].nunique()

39

In [19]:
# пулинг для me5 моделей
def average_pool(last_hidden_states, attention_mask):
    last_hidden = last_hidden_states.masked_fill(
        ~attention_mask[..., None].bool(), 0.0)
    return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]


# это для ai-forever/ru-en-RoSBERTa
def pool(hidden_state, mask, pooling_method="cls"):
    if pooling_method == "mean":
        s = torch.sum(hidden_state * mask.unsqueeze(-1).float(), dim=1)
        d = mask.sum(axis=1, keepdim=True).float()
        return s / d
    elif pooling_method == "cls":
        return hidden_state[:, 0]


class CustomModel(nn.Module):
    """
    Реализаует класс для "навешивания" головы на модель
    Также настраивает некоторые параметры (такие как dropout) для лучших результатов
    """
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(
                cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(
                cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        self.fc = nn.Linear(self.config.hidden_size, 39)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(
                mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(
                mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        
        if self.cfg.model in [
            'intfloat/multilingual-e5-large',
            'intfloat/multilingual-e5-large-instruct'
        ]:
            feature = average_pool(
                outputs.last_hidden_state,
                inputs['attention_mask']
            )
        elif self.cfg.model == 'ai-forever/ru-en-RoSBERTa':
            feature = pool(
                outputs.last_hidden_state,
                inputs['attention_mask'],
                pooling_method="cls"
            )
            
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)

        return output

In [20]:
class AverageMeter(object):
    """Computes and stores the average and current value"""

    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

In [21]:
def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, label) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        label = label.to(device)

        batch_size = label.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            pred = model(inputs)
            loss = criterion(pred, label)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(
            model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader),
                          remain=timeSince(start, float(
                              step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, label) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        label = label.to(device)

        batch_size = label.size(0)
        with torch.no_grad():
            pred = model(inputs)
            loss = criterion(pred, label)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(pred.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [22]:
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)

    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, os.path.join(OUTPUT_DIR, 'config.pth'))
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr,
                      eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler

    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.CrossEntropyLoss()  # МБ добавить веса в лосс

    best_score = -1 * float('inf')

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model,
                            criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(
            valid_loader, model, criterion, device)

        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(
            f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        if best_score < score:
            best_score = score
            LOGGER.info(
                f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                       os.path.join(OUTPUT_DIR, f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"))

    predictions = torch.load(os.path.join(OUTPUT_DIR, f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"),
                             map_location=torch.device('cpu'))['predictions']

    valid_folds["pred"] = [np.argmax(el) for el in predictions]

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds

In [23]:
if __name__ == '__main__':

    def get_result(oof_df):
        labels = oof_df[CFG.target_cols].values
        predictions = oof_df["pred"].tolist()
        score = accuracy_score(labels[:, 0], predictions)
        LOGGER.info(f'Score: {score:.4f}')

    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(os.path.join(OUTPUT_DIR, 'oof_df.pkl'))



config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

XLMRobertaConfig {
  "_name_or_path": "intfloat/multilingual-e5-large-instruct",
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float16",
  "transformers_version": "4.44.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}



model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Epoch: [1][0/30] Elapsed 0m 2s (remain 1m 8s) Loss: 3.6126(3.6126) Grad: 660551.6875  LR: 0.00002000  
Epoch: [1][29/30] Elapsed 0m 30s (remain 0m 0s) Loss: 2.6493(2.9832) Grad: 299913.4062  LR: 0.00001952  
EVAL: [0/3] Elapsed 0m 0s (remain 0m 1s) Loss: 2.5174(2.5174) 


Epoch 1 - avg_train_loss: 2.9832  avg_val_loss: 2.5914  time: 33s
Epoch 1 - Score: 0.3875
Epoch 1 - Save Best Score: 0.3875 Model


EVAL: [2/3] Elapsed 0m 1s (remain 0m 0s) Loss: 2.7937(2.5914) 
Epoch: [2][0/30] Elapsed 0m 1s (remain 0m 34s) Loss: 2.4910(2.4910) Grad: 504169.2188  LR: 0.00001948  
Epoch: [2][29/30] Elapsed 0m 29s (remain 0m 0s) Loss: 1.8967(2.0543) Grad: 475999.6562  LR: 0.00001811  
EVAL: [0/3] Elapsed 0m 0s (remain 0m 1s) Loss: 1.6771(1.6771) 


Epoch 2 - avg_train_loss: 2.0543  avg_val_loss: 1.8725  time: 31s
Epoch 2 - Score: 0.5563
Epoch 2 - Save Best Score: 0.5563 Model


EVAL: [2/3] Elapsed 0m 1s (remain 0m 0s) Loss: 2.3231(1.8725) 
Epoch: [3][0/30] Elapsed 0m 1s (remain 0m 29s) Loss: 1.5626(1.5626) Grad: 849876.6875  LR: 0.00001805  
Epoch: [3][29/30] Elapsed 0m 29s (remain 0m 0s) Loss: 1.3003(1.2906) Grad: 991613.5625  LR: 0.00001593  
EVAL: [0/3] Elapsed 0m 0s (remain 0m 1s) Loss: 1.3137(1.3137) 


Epoch 3 - avg_train_loss: 1.2906  avg_val_loss: 1.4229  time: 32s
Epoch 3 - Score: 0.6438
Epoch 3 - Save Best Score: 0.6438 Model


EVAL: [2/3] Elapsed 0m 1s (remain 0m 0s) Loss: 1.7451(1.4229) 
Epoch: [4][0/30] Elapsed 0m 1s (remain 0m 35s) Loss: 0.7283(0.7283) Grad: 900739.6875  LR: 0.00001584  
Epoch: [4][29/30] Elapsed 0m 28s (remain 0m 0s) Loss: 0.6255(0.8147) Grad: 737308.0000  LR: 0.00001317  
EVAL: [0/3] Elapsed 0m 0s (remain 0m 1s) Loss: 1.0972(1.0972) 


Epoch 4 - avg_train_loss: 0.8147  avg_val_loss: 1.1807  time: 31s
Epoch 4 - Score: 0.7063
Epoch 4 - Save Best Score: 0.7063 Model


EVAL: [2/3] Elapsed 0m 1s (remain 0m 0s) Loss: 1.5559(1.1807) 
Epoch: [5][0/30] Elapsed 0m 1s (remain 0m 39s) Loss: 0.5464(0.5464) Grad: 812442.1875  LR: 0.00001307  
Epoch: [5][29/30] Elapsed 0m 29s (remain 0m 0s) Loss: 0.3923(0.5263) Grad: 731698.6875  LR: 0.00001010  
EVAL: [0/3] Elapsed 0m 0s (remain 0m 1s) Loss: 0.9768(0.9768) 


Epoch 5 - avg_train_loss: 0.5263  avg_val_loss: 1.0534  time: 32s
Epoch 5 - Score: 0.7063


EVAL: [2/3] Elapsed 0m 1s (remain 0m 0s) Loss: 1.3235(1.0534) 
Epoch: [6][0/30] Elapsed 0m 1s (remain 0m 32s) Loss: 0.3526(0.3526) Grad: 635584.9375  LR: 0.00001000  
Epoch: [6][29/30] Elapsed 0m 29s (remain 0m 0s) Loss: 0.1996(0.3478) Grad: 512892.7812  LR: 0.00000703  
EVAL: [0/3] Elapsed 0m 0s (remain 0m 1s) Loss: 0.9171(0.9171) 


Epoch 6 - avg_train_loss: 0.3478  avg_val_loss: 0.9690  time: 32s
Epoch 6 - Score: 0.7500
Epoch 6 - Save Best Score: 0.7500 Model


EVAL: [2/3] Elapsed 0m 1s (remain 0m 0s) Loss: 1.1639(0.9690) 
Epoch: [7][0/30] Elapsed 0m 1s (remain 0m 53s) Loss: 0.3042(0.3042) Grad: 582313.9375  LR: 0.00000693  
Epoch: [7][29/30] Elapsed 0m 30s (remain 0m 0s) Loss: 0.1367(0.2331) Grad: 301006.2500  LR: 0.00000424  
EVAL: [0/3] Elapsed 0m 0s (remain 0m 1s) Loss: 0.8927(0.8927) 


Epoch 7 - avg_train_loss: 0.2331  avg_val_loss: 0.9190  time: 32s
Epoch 7 - Score: 0.7812
Epoch 7 - Save Best Score: 0.7812 Model


EVAL: [2/3] Elapsed 0m 1s (remain 0m 0s) Loss: 1.1202(0.9190) 
Epoch: [8][0/30] Elapsed 0m 1s (remain 0m 47s) Loss: 0.1689(0.1689) Grad: 371633.0625  LR: 0.00000416  
Epoch: [8][29/30] Elapsed 0m 29s (remain 0m 0s) Loss: 0.2014(0.1768) Grad: 524100.7188  LR: 0.00000201  
EVAL: [0/3] Elapsed 0m 0s (remain 0m 1s) Loss: 0.8663(0.8663) 


Epoch 8 - avg_train_loss: 0.1768  avg_val_loss: 0.8961  time: 31s
Epoch 8 - Score: 0.7750


EVAL: [2/3] Elapsed 0m 1s (remain 0m 0s) Loss: 1.1006(0.8961) 
Epoch: [9][0/30] Elapsed 0m 1s (remain 0m 31s) Loss: 0.1788(0.1788) Grad: 409492.6250  LR: 0.00000195  
Epoch: [9][29/30] Elapsed 0m 28s (remain 0m 0s) Loss: 0.1386(0.1472) Grad: 328331.2500  LR: 0.00000055  
EVAL: [0/3] Elapsed 0m 0s (remain 0m 1s) Loss: 0.8735(0.8735) 


Epoch 9 - avg_train_loss: 0.1472  avg_val_loss: 0.8900  time: 31s
Epoch 9 - Score: 0.7812


EVAL: [2/3] Elapsed 0m 1s (remain 0m 0s) Loss: 1.0879(0.8900) 
Epoch: [10][0/30] Elapsed 0m 1s (remain 0m 31s) Loss: 0.1687(0.1687) Grad: 348084.0000  LR: 0.00000052  
Epoch: [10][29/30] Elapsed 0m 29s (remain 0m 0s) Loss: 0.1162(0.1379) Grad: 287593.2500  LR: 0.00000000  
EVAL: [0/3] Elapsed 0m 0s (remain 0m 1s) Loss: 0.8719(0.8719) 


Epoch 10 - avg_train_loss: 0.1379  avg_val_loss: 0.8909  time: 32s
Epoch 10 - Score: 0.7688


EVAL: [2/3] Elapsed 0m 1s (remain 0m 0s) Loss: 1.0925(0.8909) 


Score: 0.7812
Score: 0.7812
