# README

#Setup

In [1]:
!pip install torch
!pip install transformers
!pip install numpy
!pip install pandas
!pip install sentence-transformers
!pip install sklearn
!pip install datasets


[0m

In [2]:
import numpy as np
import pandas as pd
import math
import itertools
import random
import torch
import os
import gzip
import json
from tqdm import tqdm
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
from sentence_transformers import SentenceTransformer, util, losses, models
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoModelForMaskedLM, DataCollatorForWholeWordMask, DataCollatorForLanguageModeling, pipeline
from transformers import AdamW, get_linear_schedule_with_warmup, TrainerCallback
from sklearn.model_selection import StratifiedKFold
import shutil
from datasets import load_metric
import gc
gc.enable()
from sklearn.svm import SVR, LinearSVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso, BayesianRidge, Perceptron, SGDRegressor

In [18]:
torch.cuda.is_available()


False

In [3]:
#from google.colab import drive
#drive.mount('gdrive')

# Constants

In [4]:
BASE_PATH = '/home/masa1357/gitfile/kaggle_clrp_1st_place_solution/'

In [5]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

SEED = 28
seed_everything(seed=SEED)
MAX_LENGTH = 256

In [6]:
# fine-tuned model paths
# adjust path if you have saved the models in different directories
ALBERT_TRAINED_1 = os.path.join(BASE_PATH, 'models/albert-xxlarge-2-models')#OK
ALBERT_TRAINED_2 = os.path.join(BASE_PATH, 'models/albert-xxlarge-low-lr')#OK
ALBERT_TRAINED_3 = os.path.join(BASE_PATH, 'models/ALBERT_3/albert-xxlarge-all-data')#OK
DEBERTA_TRAINED_1 = os.path.join(BASE_PATH, 'models/deberta-large')#OK
DEBERTA_TRAINED_2 = os.path.join(BASE_PATH, 'models/deberta-large-low-lr')#OK
DEBERTA_TRAINED_3 = os.path.join(BASE_PATH, 'models/deberta-augmented-continued')#OK
ROBERTA_TRAINED_1 = os.path.join(BASE_PATH, 'models/roberta-large-two-models')#OK
ELECTRA_TRAINED_1 = os.path.join(BASE_PATH, 'models/electra-large')#OK

# Functions

In [7]:
def train_model(
    model_dir,
    out_dir,
    data,
    data_labels,
    test_data=None,
    test_labels=None,
    do_eval=False,
    do_epoch_eval=False,
    do_save_best=False,
    hyperparams={'bs': 16, 'lr': 1e-4, 'ep': 5, 'bias': False, 'init': None},
    cfg={'num_labels': 1, 'logging_steps': 500, 'is_multilabel': False, 'keep_layers': None}
    ):
  tokenizer = AutoTokenizer.from_pretrained(model_dir)
  
  train_encodings = tokenizer(data, truncation=True, padding=True, max_length=MAX_LENGTH)
  if test_data:
    test_encodings = tokenizer(test_data, truncation=True, padding=True, max_length=MAX_LENGTH)
  

  class LitDataset(torch.utils.data.Dataset):
      def __init__(self, encodings, labels):
          self.encodings = encodings
          self.labels = labels

      def __getitem__(self, idx):
          item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
          item['labels'] = torch.tensor(self.labels[idx])
          return item

      def __len__(self):
          return len(self.labels)

  train_dataset = LitDataset(train_encodings, data_labels)
  if test_data:
    test_dataset = LitDataset(test_encodings, test_labels)
  
  train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=hyperparams['bs'])
  training_steps = len(train_dataloader) * hyperparams['ep'] 
  warmup_steps = math.ceil(training_steps  * 0.06)

  training_args = TrainingArguments(
      output_dir=out_dir,          # output directory
      num_train_epochs=hyperparams['ep'],              # total number of training epochs
      per_device_train_batch_size=hyperparams['bs'],  # batch size per device during training
      per_device_eval_batch_size=1,   # batch size for evaluationing rate scheduler
      logging_dir='/tmp/logs',            # directory for storing logs
      logging_steps=cfg['logging_steps'],
      seed=SEED,
      weight_decay=hyperparams['weight_decay'],
      learning_rate=hyperparams['lr'],
      save_strategy='no'
  )
  config = AutoConfig.from_pretrained(
      model_dir,
      num_labels=cfg['num_labels'],
      hidden_dropout_prob=hyperparams['hidden_dropout'],
      attention_probs_dropout_prob=hyperparams['attention_probs_dropout'])
  model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=cfg['num_labels'])
  if hyperparams['init']:
    model = reinitialize_layers(model, hyperparams['init'])
  model.config = AutoConfig.from_pretrained(model_dir, num_labels=cfg['num_labels'])
  model.num_labels = cfg['num_labels']
  if cfg['keep_layers']:
    new_layers = torch.nn.ModuleList([layer_module for i, layer_module in enumerate(model.base_model.encoder.layer) if i in cfg['keep_layers']])
    model.base_model.encoder.layer = new_layers
    model.config.num_hidden_layers = len(cfg['keep_layers'])

  optimizer = AdamW(model.parameters(), correct_bias=hyperparams['bias'], lr=hyperparams['lr'])
  scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_training_steps=training_steps, num_warmup_steps=warmup_steps)
  device = "cuda:0"
  scores = []
  best_score = 1.0
  metric = load_metric("accuracy")

  class EvalCallback(TrainerCallback):
    def on_log(self, args, state, control, **kwargs):
      if do_save_best:
        model = kwargs['model']
        y_pred = predict_fast(init_model=model, tokenizer=tokenizer, data=test_data, num_labels=cfg['num_labels'], is_multilabel=cfg['is_multilabel'])
        model.train()
        curr_score = rms(test_labels, y_pred) if not cfg['is_multilabel'] else metric.compute(predictions=y_pred, references=test_labels)['accuracy']
        print('Score: ', curr_score)

        if len(scores) == 0 or min(scores) > curr_score:
          print(f'is min {curr_score} is smaller than {scores}')
          best_score = curr_score
          save_dir = os.path.join(out_dir, 'best')
          model.save_pretrained(save_dir)
          tokenizer.save_pretrained(save_dir)
          with open(os.path.join(save_dir, 'hyperparams.txt'), 'w') as f:
            hyperparams['score'] = curr_score
            hyperparams['step'] = state.global_step
            hyperparams['trainset_size'] = len(data_labels)
            f.write(json.dumps(hyperparams))
        scores.append(curr_score)

  trainer = Trainer(
      model=model,                         # the instantiated 🤗 Transformers model to be trained
      args=training_args,                  # training arguments, defined above
      train_dataset=train_dataset,         # training dataset
      optimizers=(optimizer, scheduler),
      callbacks=[EvalCallback]             # evaluation dataset
  )

  trainer.train()

  if not do_save_best:
    model.save_pretrained(out_dir)
    tokenizer.save_pretrained(out_dir)
  print('Training done')

  if do_save_best:
    del model
    gc.collect()
    return min(scores)

In [8]:
def train_cv_v2(model_dir, out_dir, fold_dir, hyperparams, cfg, kfolds=[0, 1, 2, 3, 4, 5], continue_training=False, deduplicate=False, soft_label_model=None):
  scores = []
  for fold in kfolds:
    train_df = pd.read_csv(fold_dir + '/train_fold_' + str(fold) + '.csv')
    val_df = pd.read_csv(fold_dir + '/val_fold_' + str(fold) + '.csv')
    if deduplicate:
      train_df = train_df.drop_duplicates(subset=['excerpt'])
    train_tx = [str(t) for t in train_df.excerpt.values]
    train_sc = [float(t) for t in train_df.target.values]
    val_tx = [str(t) for t in val_df.excerpt.values]
    val_sc = [float(t) for t in val_df.target.values]

    model_out_dir = out_dir + '/model_fold_' + str(fold)
    if continue_training:
      final_model_dir = model_dir + '/model_fold_' + str(fold) + '/best'
    else:
      final_model_dir = model_dir
    
    if cfg['soft_labels'] == 'add':
      preds = predict_fast(final_model_dir, train_tx)
      train_tx = train_tx + train_tx
      train_sc = train_sc + preds
    if cfg['soft_labels'] == 'only':
      preds = predict_fast(final_model_dir, train_tx)
      train_tx = train_tx
      train_sc = preds
    if soft_label_model and cfg['soft_labels'] == 'add':
      preds = predict_fast(soft_label_model + '/model_fold_' + str(fold) + '/best', train_tx)
      train_sc = train_sc + preds
      train_tx = train_tx + train_tx
    if soft_label_model and cfg['soft_labels'] == 'only':
      preds = predict_fast(soft_label_model + '/model_fold_' + str(fold) + '/best', train_tx)
      train_sc = preds
      train_tx = train_tx
      
    best_score = train_model(
        model_dir=final_model_dir,
        out_dir=model_out_dir,
        data=train_tx,
        data_labels=train_sc,
        test_data=val_tx,
        test_labels=val_sc,
        do_save_best=True,
        hyperparams=hyperparams,
        cfg=cfg
      )
    scores.append(best_score)
  cv_score = np.mean(scores)
  with open(out_dir + '/eval.txt', 'w') as f:
    f.write('CV score is ' + str(cv_score))

In [9]:
def predict_fast(model_name=None, data=None, init_model=None, tokenizer=None, num_labels=1, is_multilabel=False, output_logits=False, use_softmax=False):
  device = "cuda:0"
  tokenizer = AutoTokenizer.from_pretrained(model_name) if model_name else tokenizer
  config = AutoConfig.from_pretrained(model_name, num_labels=num_labels) if model_name else None
  model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config) if model_name else init_model
  model.to(device)
  model.eval()
  y_pred = []
  batches = chunks(data, 32)
  for batch in tqdm(batches):
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LENGTH)
    input_ids = inputs['input_ids'].to(device)
    attention = inputs['attention_mask'].to(device)
    inputs = {
        'input_ids': input_ids,
        'attention_mask': attention
    }
    with torch.no_grad():        
          outputs = model(**inputs)
    if not use_softmax:
      logits = outputs[0].detach().cpu().numpy().squeeze().tolist()
    else:
      logits = nn.functional.softmax(outputs.logits, dim=-1).detach().cpu().numpy().squeeze().tolist()
    if is_multilabel and not output_logits:
      logits = np.argmax(logits, axis=-1)
    y_pred.extend(logits)
  del model
  gc.collect()
  return y_pred

In [10]:
def get_oof_predictions(model_dirs, fold_dir, out_dir, kfolds=[0,1,2,3,4,5]):
  df = pd.DataFrame()
  
  for fold in kfolds:
    val_df = pd.read_csv(fold_dir + '/val_fold_' + str(fold) + '.csv')
    val_tx = [str(t) for t in val_df.excerpt.values]
    val_sc = [float(t) for t in val_df.target.values]
    fold_df = pd.DataFrame()
    fold_df['fold'] = [fold for v in val_sc]
    fold_df['excerpt'] = val_tx
    fold_df['target'] = val_sc
    fold_df['id'] = val_df['id']

    for model in model_dirs:
      final_model_dir = model + '/model_fold_' + str(fold) + '/best'
      model_name = model.split('/')[-1]
      preds = predict_fast(final_model_dir, val_tx)
      fold_df[model_name] = preds
    df = df.append(fold_df, ignore_index=True)
  
  df.to_csv(out_dir)  

In [11]:
def train_leaky_ensembler(oof_dir, model_names, out_dir, kfolds=[0,1,2,3,4,5], model_bins=[], clf='ridge', find_opt_avg=False, bin_avg_dir=None, use_postprocessing=False):
  df = pd.read_csv(oof_dir)

  if find_opt_avg:
    msk = np.random.rand(len(df)) < 0.2
    df_test = df[msk].reset_index(drop=True)
    df = df[~msk].reset_index(drop=True)
    
  get_bin_stratified(df, n_splits=6)

  results = []
  if find_opt_avg:
    avg_df = pd.DataFrame()
    avg_df['target'] = [float(f) for f in df_test['target']]
  for fold in kfolds:
    train_df = df.loc[df.fold!=fold].reset_index(drop=True)
    val_df = df.loc[df.fold==fold].reset_index(drop=True)
    
    train_tx = [str(t) for t in train_df.excerpt.values]
    val_tx = [str(t) for t in val_df.excerpt.values]
    val_sc = [float(f) for f in val_df.target.values]
    train_sc = [float(f) for f in train_df.target.values]

    train_predictions = []
    val_predictions = []
    avg_predictions = []

    if len(model_bins) > 0 and not use_postprocessing:
      for model_name in model_bins:
        preds = [json.loads(p) for p in train_df[model_name].values]
        preds_val = [json.loads(p) for p in val_df[model_name].values]
        if bin_avg_dir:
          with open(bin_avg_dir, 'r') as f:
            averages = json.loads(f.read())
          preds = [averages[np.argmax(p)] for p in preds]
          preds_val = [averages[np.argmax(p)] for p in preds_val]

        train_predictions.append(preds)
        val_predictions.append(preds_val)
    
    for model_name in model_names:
      preds = [float(f) for f in train_df[model_name].values]
      train_predictions.append(np.array(preds))
      preds_val = [float(f) for f in val_df[model_name].values]
      val_predictions.append(np.array(preds_val))
      if find_opt_avg:
        preds_avg = [float(f) for f in df_test[model_name].values]
        avg_predictions.append(np.array(preds_avg))
    
    X = np.column_stack(train_predictions)
    
    if clf == 'ridge':
      clf = Ridge(alpha=1.0)
    elif clf == 'linearsvr':
      clf = LinearSVR(max_iter=1000000)
    elif clf == 'svr':
      clf = SVR()
    elif clf == 'kernel':
      clf = KernelRidge()
    elif clf == 'gbr':
      clf = GradientBoostingRegressor()
    elif clf == 'linear':
      clf = LinearRegression()
    elif clf == 'lasso':
      clf = Lasso()
    elif clf == 'bayes':
      clf = BayesianRidge()
    elif clf == 'perceptron':
      clf = SGDRegressor()
    
    clf.fit(X, train_sc)

    final_out = out_dir + '/model_fold_' + str(fold) + '/'
    if not os.path.exists(os.path.dirname(final_out)):
      try:
          os.makedirs(os.path.dirname(final_out))
      except OSError as exc: # Guard against race condition
          if exc.errno != errno.EEXIST:
              raise
    dump(clf, final_out + 'ridge_model.joblib')

    Y = np.column_stack(val_predictions)

    y_preds = clf.predict(Y)
    if use_postprocessing:
      preds_val = [json.loads(p) for p in val_df[model_bins[0]].values]
      with open(bin_avg_dir, 'r') as f:
            averages = json.loads(f.read())
      preds_val_bins = [np.argmax(p) for p in preds_val]
      zipped = list(zip(preds_val_bins, preds_val))
      y_preds = postprocess_predictions(y_preds, zipped, averages)

    score = rms(val_sc, y_preds)
    print('Score is: ', score)
    results.append(score)

    if find_opt_avg:
      Y_test = np.column_stack(avg_predictions)
      y_preds_test = clf.predict(Y_test)
      avg_df['fold_' + str(fold)] = y_preds_test
  
  if find_opt_avg:
    ridge_names = ['fold_' + str(fold) for fold in range(kfolds)]
    print(find_best_stack(avg_df, ridge_names, drop_models=False))

  with open(out_dir + '/eval.txt', 'w') as f:
    mean = np.mean(results)
    print('CV ist: ', mean)
    f.write('CV is: ' + str(mean))

In [12]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [13]:
def rms(y_actual, y_predicted):
  return mean_squared_error(y_actual, y_predicted, squared=False)

# Pretraining models

In [14]:
# Load the pseudo-labeled training data for pretraining models
train_df = pd.read_csv(os.path.join(BASE_PATH, 'data/training/predicted/predicted.csv'))
train_tx = [str(t) for t in train_df.excerpt.values]
train_sc = [float(t) for t in train_df.target.values]

In [15]:
# Load the entire training set from the original competition for validation during pretraining
val_df = pd.read_csv(os.path.join(BASE_PATH, 'data/training/original/train.csv'))
val_tx = [str(t) for t in train_df.excerpt.values]
val_sc = [float(t) for t in train_df.target.values]

In [16]:
# Train an ALBERT model

model_name = 'albert-xxlarge-v2'
hyperparams = {
  'bs': 3,
  'lr': 9e-6,
  'weight_decay': 0.01,
  'ep': 5,
  'bias': True,
  'init': None,
  'hidden_dropout': 0.07,
  'attention_probs_dropout': 0.1
}
cfg = {
  'num_labels': 1,
  'is_multilabel': False,
  'logging_steps': 60,
  'keep_layers': None,
  'soft_labels': None
}

ALBERT_PRETRAINED = os.path.join(BASE_PATH, 'models/albert-xxlarge-no-cv')

In [17]:
train_model(
    model_dir=model_name,
    out_dir=ALBERT_PRETRAINED,
    data=train_tx,
    data_labels=train_sc,
    test_data=val_tx,
    test_labels=val_sc,
    do_save_best=True,
    hyperparams=hyperparams,
    cfg=cfg
)



Downloading model.safetensors:   0%|          | 0.00/893M [00:00<?, ?B/s]

Some weights of the model checkpoint at albert-xxlarge-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.decoder.bias', 'predictions.LayerNorm.bias', 'predictions.dense.bias', 'predictions.bias', 'predictions.LayerNorm.weight', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-xxlarge-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this 

Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Step,Training Loss


In [None]:
# Train a DEBERTA model
model_name = 'microsoft/deberta-large'
hyperparams = {
  'bs': 3,
  'lr': 9e-6,
  'weight_decay': 0.1,
  'ep': 4,
  'bias': True,
  'init': None,
  'hidden_dropout': 0.1,
  'attention_probs_dropout': 0.1
}
cfg = {
  'num_labels': 1,
  'is_multilabel': False,
  'logging_steps': 20,
  'keep_layers': None,
  'soft_labels': None
}

DEBERTA_PRETRAINED = os.path.join(BASE_PATH, 'models/deberta-large-augmented')

train_model(
    model_dir=model_name,
    out_dir=out_dir,
    data=train_tx,
    data_labels=train_sc,
    test_data=val_tx,
    test_labels=val_sc,
    do_save_best=True,
    hyperparams=hyperparams,
    cfg=cfg
)

https://huggingface.co/microsoft/deberta-large/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp16ymkjh4


Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

storing https://huggingface.co/microsoft/deberta-large/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/fa4e12e9e6e1a899fe94275a0e60bdc59474baa2cc8e6fa0c207c7d9caaa2598.a39abb1c6179fb264c2db685f9a056b7cb8d4bc48d729888d292a2280debf8e2
creating metadata file for /root/.cache/huggingface/transformers/fa4e12e9e6e1a899fe94275a0e60bdc59474baa2cc8e6fa0c207c7d9caaa2598.a39abb1c6179fb264c2db685f9a056b7cb8d4bc48d729888d292a2280debf8e2
https://huggingface.co/microsoft/deberta-large/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpq8pc4uxt


Downloading:   0%|          | 0.00/475 [00:00<?, ?B/s]

storing https://huggingface.co/microsoft/deberta-large/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/7c686202d9db9b0aee3e649d42a50257a76d278858dc7ad32b886f02cf8303e4.5286a902fea63d3276108ffa66a65e2b4355a7df6cfab5be091bf20f7eae85f8
creating metadata file for /root/.cache/huggingface/transformers/7c686202d9db9b0aee3e649d42a50257a76d278858dc7ad32b886f02cf8303e4.5286a902fea63d3276108ffa66a65e2b4355a7df6cfab5be091bf20f7eae85f8
loading configuration file https://huggingface.co/microsoft/deberta-large/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/7c686202d9db9b0aee3e649d42a50257a76d278858dc7ad32b886f02cf8303e4.5286a902fea63d3276108ffa66a65e2b4355a7df6cfab5be091bf20f7eae85f8
Model config DebertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,


Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

storing https://huggingface.co/microsoft/deberta-large/resolve/main/vocab.json in cache at /root/.cache/huggingface/transformers/4614a858d4552a0a399dc77bafbbeb75b20fe49259f690eb561898f8975626fa.e8ad27cc324bb0dc448d4d95f63e48f72688fb318a4c4c3f623485621b0b515c
creating metadata file for /root/.cache/huggingface/transformers/4614a858d4552a0a399dc77bafbbeb75b20fe49259f690eb561898f8975626fa.e8ad27cc324bb0dc448d4d95f63e48f72688fb318a4c4c3f623485621b0b515c
https://huggingface.co/microsoft/deberta-large/resolve/main/merges.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpx78gned3


Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

storing https://huggingface.co/microsoft/deberta-large/resolve/main/merges.txt in cache at /root/.cache/huggingface/transformers/7a87aa12b220b9a983b98dbd9ad35624b3fe2ce2e83d1ce621eddcdac1c04654.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
creating metadata file for /root/.cache/huggingface/transformers/7a87aa12b220b9a983b98dbd9ad35624b3fe2ce2e83d1ce621eddcdac1c04654.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/microsoft/deberta-large/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/4614a858d4552a0a399dc77bafbbeb75b20fe49259f690eb561898f8975626fa.e8ad27cc324bb0dc448d4d95f63e48f72688fb318a4c4c3f623485621b0b515c
loading file https://huggingface.co/microsoft/deberta-large/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/7a87aa12b220b9a983b98dbd9ad35624b3fe2ce2e83d1ce621eddcdac1c04654.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file h

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

storing https://huggingface.co/microsoft/deberta-large/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/236b63dfb5e690fb2e194403aebda39508d60877a8903da58f4fff7a147ec0dd.e8bb754209aab7decd8d3faee51cce4d572131b439d5360c168d43998e3ceb13
creating metadata file for /root/.cache/huggingface/transformers/236b63dfb5e690fb2e194403aebda39508d60877a8903da58f4fff7a147ec0dd.e8bb754209aab7decd8d3faee51cce4d572131b439d5360c168d43998e3ceb13
loading weights file https://huggingface.co/microsoft/deberta-large/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/236b63dfb5e690fb2e194403aebda39508d60877a8903da58f4fff7a147ec0dd.e8bb754209aab7decd8d3faee51cce4d572131b439d5360c168d43998e3ceb13
Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'config', 'l

Step,Training Loss
10,0.0101


1it [00:00, 27.96it/s]
Configuration saved in gdrive/MyDrive/Lit/Lit_Submission/models/deberta-large-augmented/best/config.json


Score:  0.03461174666881561
is min 0.03461174666881561 is smaller than []


Model weights saved in gdrive/MyDrive/Lit/Lit_Submission/models/deberta-large-augmented/best/pytorch_model.bin
tokenizer config file saved in gdrive/MyDrive/Lit/Lit_Submission/models/deberta-large-augmented/best/tokenizer_config.json
Special tokens file saved in gdrive/MyDrive/Lit/Lit_Submission/models/deberta-large-augmented/best/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


1it [00:00, 27.54it/s]
Configuration saved in gdrive/MyDrive/Lit/Lit_Submission/models/deberta-large-augmented/best/config.json


Score:  0.033117204904556274
is min 0.033117204904556274 is smaller than [0.03461174666881561]


Model weights saved in gdrive/MyDrive/Lit/Lit_Submission/models/deberta-large-augmented/best/pytorch_model.bin
tokenizer config file saved in gdrive/MyDrive/Lit/Lit_Submission/models/deberta-large-augmented/best/tokenizer_config.json
Special tokens file saved in gdrive/MyDrive/Lit/Lit_Submission/models/deberta-large-augmented/best/special_tokens_map.json


Training done


0.033117204904556274

In [None]:
# Train a RoBERTa model
model_name = 'roberta-large'
hyperparams = {
  'bs': 8,
  'lr': 1e-5,
  'weight_decay': 0.01,
  'ep': 4,
  'bias': True,
  'init': None,
  'hidden_dropout': 0.1,
  'attention_probs_dropout': 0.1
}
cfg = {
  'num_labels': 1,
  'is_multilabel': False,
  'logging_steps': 10,
  'keep_layers': None,
  'soft_labels': None
}

ROBERTA_PRETRAINED = os.path.join(BASE_PATH, 'models/roberta-large-augmented')

train_model(
    model_dir=model_name,
    out_dir=out_dir,
    data=train_tx,
    data_labels=train_sc,
    test_data=val_tx,
    test_labels=val_sc,
    do_save_best=True,
    hyperparams=hyperparams,
    cfg=cfg
)

Could not locate the tokenizer configuration file, will try to use the model config instead.
https://huggingface.co/roberta-large/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpa8qjr58p


Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

storing https://huggingface.co/roberta-large/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373
creating metadata file for /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373
loading configuration file https://huggingface.co/roberta-large/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/dea67b44b38d504f2523f3ddb6acb601b23d67bee52c942da336fa1283100990.94cae8b3a8dbab1d59b9d4827f7ce79e73124efa6bb970412cd503383a95f373
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initiali

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

storing https://huggingface.co/roberta-large/resolve/main/vocab.json in cache at /root/.cache/huggingface/transformers/7c1ba2435b05451bc3b4da073c8dec9630b22024a65f6c41053caccf2880eb8f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
creating metadata file for /root/.cache/huggingface/transformers/7c1ba2435b05451bc3b4da073c8dec9630b22024a65f6c41053caccf2880eb8f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
https://huggingface.co/roberta-large/resolve/main/merges.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpht4u0dqe


Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

storing https://huggingface.co/roberta-large/resolve/main/merges.txt in cache at /root/.cache/huggingface/transformers/20b5a00a80e27ae9accbe25672aba42ad2d4d4cb2c4b9359b50ca8e34e107d6d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
creating metadata file for /root/.cache/huggingface/transformers/20b5a00a80e27ae9accbe25672aba42ad2d4d4cb2c4b9359b50ca8e34e107d6d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
https://huggingface.co/roberta-large/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp6eu4fwib


Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

storing https://huggingface.co/roberta-large/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/e16a2590deb9e6d73711d6e05bf27d832fa8c1162d807222e043ca650a556964.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730
creating metadata file for /root/.cache/huggingface/transformers/e16a2590deb9e6d73711d6e05bf27d832fa8c1162d807222e043ca650a556964.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730
loading file https://huggingface.co/roberta-large/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/7c1ba2435b05451bc3b4da073c8dec9630b22024a65f6c41053caccf2880eb8f.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-large/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/20b5a00a80e27ae9accbe25672aba42ad2d4d4cb2c4b9359b50ca8e34e107d6d.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/robe

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

storing https://huggingface.co/roberta-large/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/8e36ec2f5052bec1e79e139b84c2c3089cb647694ba0f4f634fec7b8258f7c89.c43841d8c5cd23c435408295164cda9525270aa42cd0cc9200911570c0342352
creating metadata file for /root/.cache/huggingface/transformers/8e36ec2f5052bec1e79e139b84c2c3089cb647694ba0f4f634fec7b8258f7c89.c43841d8c5cd23c435408295164cda9525270aa42cd0cc9200911570c0342352
loading weights file https://huggingface.co/roberta-large/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/8e36ec2f5052bec1e79e139b84c2c3089cb647694ba0f4f634fec7b8258f7c89.c43841d8c5cd23c435408295164cda9525270aa42cd0cc9200911570c0342352
Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight'

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


1it [00:00, 48.35it/s]
Configuration saved in gdrive/MyDrive/Lit/Lit_Submission/models/roberta-large-augmented/best/config.json


Score:  0.028184428811073303
is min 0.028184428811073303 is smaller than []


Model weights saved in gdrive/MyDrive/Lit/Lit_Submission/models/roberta-large-augmented/best/pytorch_model.bin
tokenizer config file saved in gdrive/MyDrive/Lit/Lit_Submission/models/roberta-large-augmented/best/tokenizer_config.json
Special tokens file saved in gdrive/MyDrive/Lit/Lit_Submission/models/roberta-large-augmented/best/special_tokens_map.json


Training done


0.028184428811073303

In [None]:
# Train an ELECTRA model
model_name = 'google/electra-large-discriminator'
hyperparams = {
  'bs': 4,
  'lr': 8e-6,
  'weight_decay': 0.1,
  'ep': 7,
  'bias': True,
  'init': None,
  'hidden_dropout': 0.1,
  'attention_probs_dropout': 0.1
}
cfg = {
  'num_labels': 1,
  'is_multilabel': False,
  'logging_steps': 10,
  'keep_layers': None,
  'soft_labels': None
}

ELECTRA_PRETRAINED = os.path.join(BASE_PATH, 'models/electra-large-augmented')

train_model(
    model_dir=model_name,
    out_dir=ELECTRA_PRETRAINED,
    data=train_tx,
    data_labels=train_sc,
    test_data=val_tx,
    test_labels=val_sc,
    do_save_best=True,
    hyperparams=hyperparams,
    cfg=cfg
)

https://huggingface.co/google/electra-large-discriminator/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp3n2ytear


Downloading:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

storing https://huggingface.co/google/electra-large-discriminator/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/26ad81c46898598ce9aed0b02fd3c9175a28daa30317e4f1980b5e871d823b67.4f2213f5603276adf12967b32e4444c0f187f34ca4f8b22a65f03e13514589e9
creating metadata file for /root/.cache/huggingface/transformers/26ad81c46898598ce9aed0b02fd3c9175a28daa30317e4f1980b5e871d823b67.4f2213f5603276adf12967b32e4444c0f187f34ca4f8b22a65f03e13514589e9
https://huggingface.co/google/electra-large-discriminator/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpii3zb0or


Downloading:   0%|          | 0.00/668 [00:00<?, ?B/s]

storing https://huggingface.co/google/electra-large-discriminator/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/344f5be314c0b91e28096c6730a1a43d61ba11aee91fd8ff026aba39138181d1.c4309b08c8b9d0909e488ef6b4cefe6a11ebc271247617cbdbb73361b191cc33
creating metadata file for /root/.cache/huggingface/transformers/344f5be314c0b91e28096c6730a1a43d61ba11aee91fd8ff026aba39138181d1.c4309b08c8b9d0909e488ef6b4cefe6a11ebc271247617cbdbb73361b191cc33
loading configuration file https://huggingface.co/google/electra-large-discriminator/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/344f5be314c0b91e28096c6730a1a43d61ba11aee91fd8ff026aba39138181d1.c4309b08c8b9d0909e488ef6b4cefe6a11ebc271247617cbdbb73361b191cc33
Model config ElectraConfig {
  "architectures": [
    "ElectraForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "embedding_size": 1024,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_ra

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

storing https://huggingface.co/google/electra-large-discriminator/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/48a62a60c85c63546f3623e592c2ddfd0628ed7749e6d503a11eb80cb04fc19c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
creating metadata file for /root/.cache/huggingface/transformers/48a62a60c85c63546f3623e592c2ddfd0628ed7749e6d503a11eb80cb04fc19c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
https://huggingface.co/google/electra-large-discriminator/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpdo4t0sba


Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

storing https://huggingface.co/google/electra-large-discriminator/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/ed8095412008e8a8159d4bbdcecd02e5e72b79a1fc7dbfdc32e6aef638d4b9a9.65c74b3f0086fae55b99a8c9173a6739a53ae5ae0441c0811095141532f33ff8
creating metadata file for /root/.cache/huggingface/transformers/ed8095412008e8a8159d4bbdcecd02e5e72b79a1fc7dbfdc32e6aef638d4b9a9.65c74b3f0086fae55b99a8c9173a6739a53ae5ae0441c0811095141532f33ff8
loading file https://huggingface.co/google/electra-large-discriminator/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/48a62a60c85c63546f3623e592c2ddfd0628ed7749e6d503a11eb80cb04fc19c.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/google/electra-large-discriminator/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/ed8095412008e8a8159d4bbdcecd02e5e72b79a1fc7dbfdc32e6aef638d4b9a9.65c74b3f0086fae55b99a8c9173a6739a53ae5a

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

storing https://huggingface.co/google/electra-large-discriminator/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/6a9790a4cce0d5f0f7d5c78b57955d681fe9cb564edc75aab3733c5ba3a5550d.a2ee8c7426aca3bd41c92ad0b3e07d731d9bf61c950403e6a82b1d566b8923db
creating metadata file for /root/.cache/huggingface/transformers/6a9790a4cce0d5f0f7d5c78b57955d681fe9cb564edc75aab3733c5ba3a5550d.a2ee8c7426aca3bd41c92ad0b3e07d731d9bf61c950403e6a82b1d566b8923db
loading weights file https://huggingface.co/google/electra-large-discriminator/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/6a9790a4cce0d5f0f7d5c78b57955d681fe9cb564edc75aab3733c5ba3a5550d.a2ee8c7426aca3bd41c92ad0b3e07d731d9bf61c950403e6a82b1d566b8923db
Some weights of the model checkpoint at google/electra-large-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminat

Step,Training Loss
10,0.0059


1it [00:00, 43.78it/s]
Configuration saved in gdrive/MyDrive/Lit/Lit_Submission/models/electra-large-augmented/best/config.json


Score:  0.03945397585630417
is min 0.03945397585630417 is smaller than []


Model weights saved in gdrive/MyDrive/Lit/Lit_Submission/models/electra-large-augmented/best/pytorch_model.bin
tokenizer config file saved in gdrive/MyDrive/Lit/Lit_Submission/models/electra-large-augmented/best/tokenizer_config.json
Special tokens file saved in gdrive/MyDrive/Lit/Lit_Submission/models/electra-large-augmented/best/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


1it [00:00, 43.76it/s]

Score:  0.04779457300901413





Training done


0.03945397585630417

# Training models

In total, I trained 3 deberta-large, 1 roberta-large, 3 albert-xxlarge and 1 electra-large model for my winning submission.



In [None]:
# Training the ALBERT models

In [None]:
# albert 1
model_name = os.path.join(ALBERT_PRETRAINED, 'best')
hyperparams = {
  'bs': 3,
  'lr': 9e-6,
  'weight_decay': 0.01,
  'ep': 5,
  'bias': True,
  'init': None,
  'hidden_dropout': 0.07,
  'attention_probs_dropout': 0.1
}
cfg = {
  'num_labels': 1,
  'is_multilabel': False,
  'logging_steps': 10,
  'keep_layers': None,
  'soft_labels': None
}

fold_dir = os.path.join(BASE_PATH, 'data/training/cv')
out_dir = ALBERT_TRAINED_1

train_cv_v2(
    model_dir=model_name,
    out_dir=out_dir,
    fold_dir=fold_dir,
    hyperparams=hyperparams,
    cfg=cfg
)

In [None]:
# albert 2
model_name = os.path.join(ALBERT_PRETRAINED, 'best')
hyperparams = {
  'bs': 3,
  'lr': 7e-6,
  'weight_decay': 0.07,
  'ep': 5,
  'bias': True,
  'init': None,
  'hidden_dropout': 0.1,
  'attention_probs_dropout': 0.1
}
cfg = {
  'num_labels': 1,
  'is_multilabel': False,
  'logging_steps': 10,
  'keep_layers': None,
  'soft_labels': None
}

fold_dir = os.path.join(BASE_PATH, 'data/training/cv')
out_dir = ALBERT_TRAINED_2

train_cv_v2(
    model_dir=model_name,
    out_dir=out_dir,
    fold_dir=fold_dir,
    hyperparams=hyperparams,
    cfg=cfg
)

In [None]:
# albert 3
# albert 3 is special it is trained on all training data without evaluation.
model_name = os.path.join(ALBERT_PRETRAINED, 'best')
hyperparams = {
  'bs': 3,
  'lr': 9e-6,
  'weight_decay': 0.1,
  'ep': 4,
  'bias': True,
  'init': None,
  'hidden_dropout': 0.1,
  'attention_probs_dropout': 0.1
}
cfg = {
  'num_labels': 1,
  'is_multilabel': False,
  'logging_steps': 600,
  'keep_layers': None,
  'soft_labels': None
}

train_df = pd.read_csv(os.path.join(BASE_PATH, 'data/training/original/train.csv'))
train_tx = [str(t) for t in train_df.excerpt.values]
train_sc = [float(t) for t in train_df.target.values]

out_dir = ALBERT_TRAINED_3


train_model(
   model_dir=model_name,
   out_dir=out_dir,
   data=train_tx,
   data_labels=train_sc,
   hyperparams=hyperparams,
   cfg=cfg
)


Didn't find file gdrive/MyDrive/Lit/Lit_Submission/models/albert-xxlarge-no-cv/best/added_tokens.json. We won't load it.
loading file gdrive/MyDrive/Lit/Lit_Submission/models/albert-xxlarge-no-cv/best/spiece.model
loading file gdrive/MyDrive/Lit/Lit_Submission/models/albert-xxlarge-no-cv/best/tokenizer.json
loading file None
loading file gdrive/MyDrive/Lit/Lit_Submission/models/albert-xxlarge-no-cv/best/special_tokens_map.json
loading file gdrive/MyDrive/Lit/Lit_Submission/models/albert-xxlarge-no-cv/best/tokenizer_config.json
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file gdrive/MyDrive/Lit/Lit_Submission/models/albert-xxlarge-no-cv/best/config.json
Model config AlbertConfig {
  "architectures": [
    "AlbertForSe

Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Configuration saved in gdrive/MyDrive/Lit/Lit_Submission/models/albert-xxlarge-all-data/config.json
Model weights saved in gdrive/MyDrive/Lit/Lit_Submission/models/albert-xxlarge-all-data/pytorch_model.bin
tokenizer config file saved in gdrive/MyDrive/Lit/Lit_Submission/models/albert-xxlarge-all-data/tokenizer_config.json
Special tokens file saved in gdrive/MyDrive/Lit/Lit_Submission/models/albert-xxlarge-all-data/special_tokens_map.json


Training done


In [None]:
# Training the deberta models

In [None]:
# deberta 1
model_name = os.path.join(DEBERTA_PRETRAINED, 'best')
hyperparams = {
  'bs': 3,
  'lr': 9e-6,
  'weight_decay': 0.1,
  'ep': 4,
  'bias': True,
  'init': None,
  'hidden_dropout': 0.1,
  'attention_probs_dropout': 0.1
}
cfg = {
  'num_labels': 1,
  'is_multilabel': False,
  'logging_steps': 10,
  'keep_layers': None,
  'soft_labels': None
}

fold_dir = os.path.join(BASE_PATH, 'data/training/cv')
out_dir = DEBERTA_TRAINED_1

train_cv_v2(
    model_dir=model_name,
    out_dir=out_dir,
    fold_dir=fold_dir,
    hyperparams=hyperparams,
    cfg=cfg
)

In [None]:
# deberta 2
model_name = os.path.join(DEBERTA_PRETRAINED, 'best')
hyperparams = {
  'bs': 3,
  'lr': 7e-6,
  'weight_decay': 0.1,
  'ep': 4,
  'bias': True,
  'init': None,
  'hidden_dropout': 0.1,
  'attention_probs_dropout': 0.1
}
cfg = {
  'num_labels': 1,
  'is_multilabel': False,
  'logging_steps': 10,
  'keep_layers': None,
  'soft_labels': None
}

fold_dir = os.path.join(BASE_PATH, 'data/training/cv')
out_dir = DEBERTA_TRAINED_2

train_cv_v2(
    model_dir=model_name,
    out_dir=out_dir,
    fold_dir=fold_dir,
    hyperparams=hyperparams,
    cfg=cfg
)

In [None]:
# deberta 3
# This deberta model was trained on data sampled using bootstrapping instead of cross validation
# Only models trained on 2 folds/bags were used in the final submission
model_name = os.path.join(DEBERTA_PRETRAINED, 'best')
hyperparams = {
  'bs': 3,
  'lr': 9e-6,
  'weight_decay': 0.08,
  'ep': 4,
  'bias': True,
  'init': None,
  'hidden_dropout': 0.1,
  'attention_probs_dropout': 0.1
}
cfg = {
  'num_labels': 1,
  'is_multilabel': False,
  'logging_steps': 10,
  'keep_layers': None,
  'soft_labels': None
}

fold_dir = os.path.join(BASE_PATH, 'data/training/cv')
out_dir = DEBERTA_TRAINED_3

train_cv_v2(
    model_dir=model_name,
    out_dir=out_dir,
    fold_dir=fold_dir,
    hyperparams=hyperparams,
    cfg=cfg,
    kfolds=[0,1]
)

In [None]:
# Training the ELECTRA model

In [None]:
# electra 1
model_name = os.path.join(ELECTRA_PRETRAINED, 'best')
hyperparams = {
  'bs': 3,
  'lr': 8e-6,
  'weight_decay': 0.1,
  'ep': 5,
  'bias': True,
  'init': None,
  'hidden_dropout': 0.1,
  'attention_probs_dropout': 0.1
}
cfg = {
  'num_labels': 1,
  'is_multilabel': False,
  'logging_steps': 10,
  'keep_layers': None,
  'soft_labels': None
}

fold_dir = os.path.join(BASE_PATH, 'data/training/cv')
out_dir = ELECTRA_TRAINED_1

train_cv_v2(
    model_dir=model_name,
    out_dir=out_dir,
    fold_dir=fold_dir,
    hyperparams=hyperparams,
    cfg=cfg
)

In [None]:
# Training the RoBERTa model

In [None]:
# roberta 1
model_name = os.path.join(ROBERTA_PRETRAINED, 'best')
hyperparams = {
  'bs': 8,
  'lr': 1e-5,
  'weight_decay': 0.1,
  'ep': 4,
  'bias': True,
  'init': None,
  'hidden_dropout': 0.1,
  'attention_probs_dropout': 0.1
}
cfg = {
  'num_labels': 1,
  'is_multilabel': False,
  'logging_steps': 10,
  'keep_layers': None,
  'soft_labels': None
}

fold_dir = os.path.join(BASE_PATH, 'data/training/cv')
out_dir = ROBERTA_TRAINED_1

train_cv_v2(
    model_dir=model_name,
    out_dir=out_dir,
    fold_dir=fold_dir,
    hyperparams=hyperparams,
    cfg=cfg
)

# Stacking

In [None]:
model_dirs = [
    ALBERT_TRAINED_1,
    DEBERTA_TRAINED_1,
    ALBERT_TRAINED_2,
    DEBERTA_TRAINED_1,
    ROBERTA_TRAINED_1,
    ELECTRA_TRAINED_1
]

fold_dir = os.path.join(BASE_PATH, 'data/training/cv')
out_dir = os.path.join(BASE_PATH, 'data/training/oof')

In [None]:
get_oof_predictions(model_dirs=model_dirs, fold_dir=fold_dir, out_dir=out_dir)

In [None]:
model_names_ensemble_1 = [
    ALBERT_TRAINED_1.split('/')[-1],
    DEBERTA_TRAINED_1.split('/')[-1],
    ALBERT_TRAINED_2.split('/')[-1],
    DEBERTA_TRAINED_1.split('/')[-1],
    ROBERTA_TRAINED_1.split('/')[-1],
    ELECTRA_TRAINED_1.split('/')[-1],      
]

model_names_ensemble_2 = model_names_ensemble_1[:-1]

oof_dir = os.path.join(BASE_PATH, 'data/training/oof')

out_dir_ensemble_1 = os.path.join(BASE_PATH, 'models/electra-larger-ensemble')
out_dir_ensemble_2 = os.path.join(BASE_PATH, 'models/huge-ensemble')

In [None]:
# train ensemble 1
train_leaky_ensembler(oof_dir=oof_dir, model_names=model_names_ensemble_1, out_dir=out_dir_ensemble_1)

In [None]:
# train ensemble 2
train_leaky_ensembler(oof_dir=oof_dir, model_names=model_names_ensemble_2, out_dir=out_dir_ensemble_2)

You have finished training the models.