# README

# Setup

In [None]:
!pip install torch
!pip install transformers
!pip install numpy
!pip install pandas
!pip install sklearn
!pip install datasets


Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 12.9 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 49.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 53.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 63.2 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully 

In [None]:
import numpy as np
import pandas as pd
import math
import itertools
import random
import torch
import os
import gzip
import json
from tqdm import tqdm
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoModelForMaskedLM, DataCollatorForWholeWordMask, DataCollatorForLanguageModeling, pipeline
from transformers import AdamW, get_linear_schedule_with_warmup, TrainerCallback
from sklearn.model_selection import StratifiedKFold
import shutil
from datasets import load_metric
import gc
gc.enable()
from sklearn.svm import SVR, LinearSVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import Lasso, BayesianRidge, Perceptron, SGDRegressor

In [None]:
# Colab only: mount Google Drive at the relative path 'gdrive', which is the prefix BASE_PATH starts with.
from google.colab import drive
drive.mount('gdrive')

Mounted at gdrive


# Constants

In [None]:
# Root folder of the submission on the mounted Drive; the model, data and
# submission paths used in this notebook are all joined onto it.
BASE_PATH = 'gdrive/MyDrive/Lit/Lit_Submission'

In [None]:
def seed_everything(seed=1234):
    """Seed the random number generators used in this notebook.

    Calls ``random.seed``, ``np.random.seed``, ``torch.manual_seed`` and
    ``torch.cuda.manual_seed`` with ``seed``, stores ``str(seed)`` in the
    ``PYTHONHASHSEED`` environment variable and turns on cuDNN's
    deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True


# Seed used for the whole notebook run.
SEED = 28
seed_everything(seed=SEED)
# Maximum token length passed to the tokenizer (inputs are truncated to it).
MAX_LENGTH = 256

In [None]:
# Fine-tuned model paths, all relative to BASE_PATH.
# Adjust these if you saved the models in different directories.
# The prediction cell looks for a `model_fold_<i>/best` checkpoint inside the
# per-fold directories and for a plain `best` checkpoint inside ALBERT_TRAINED_3.
ALBERT_TRAINED_1 = os.path.join(BASE_PATH, 'models/albert-xxlarge-no-cv-continued')
ALBERT_TRAINED_2 = os.path.join(BASE_PATH, 'models/albert-xxlarge-no-cv-continued-low-lr')
ALBERT_TRAINED_3 = os.path.join(BASE_PATH, 'models/albert-xxlarge-all-data')
DEBERTA_TRAINED_1 = os.path.join(BASE_PATH, 'models/deberta-large-augmented-continued')
DEBERTA_TRAINED_2 = os.path.join(BASE_PATH, 'models/deberta-large-augmented-continued-low-lr')
DEBERTA_TRAINED_3 = os.path.join(BASE_PATH, 'models/deberta-augmented-continued')
ROBERTA_TRAINED_1 = os.path.join(BASE_PATH, 'models/roberta-large-2-models')
ELECTRA_TRAINED_1 = os.path.join(BASE_PATH, 'models/electra-large-continued')

In [None]:
# Ensemble model paths, relative to BASE_PATH.
# The ensembling cell builds one ensembler file path per fold from each of these directories.
RIDGE_ENSEMBLE_1 = os.path.join(BASE_PATH, 'models/electra-larger-ensemble')
RIDGE_ENSEMBLE_2 = os.path.join(BASE_PATH, 'models/huge-ensemble')

# Functions

In [None]:
def predict_fast(model_name=None, data=None, init_model=None, tokenizer=None, num_labels=1, is_multilabel=False, output_logits=False, use_softmax=False, batch_size=32):
  """Run batched inference with a sequence-classification model.

  Args:
    model_name: Path or identifier handed to ``from_pretrained``. When truthy,
      the tokenizer, config and model are loaded from it and the
      ``init_model`` / ``tokenizer`` arguments are ignored.
    data: Sliceable sequence of input texts; it is split into batches with
      ``chunks``.
    init_model: Already-instantiated model, used only when ``model_name`` is
      falsy.
    tokenizer: Already-instantiated tokenizer, used only when ``model_name``
      is falsy.
    num_labels: Number of labels passed to the config when loading from
      ``model_name``.
    is_multilabel: When true (and ``output_logits`` is false) the argmax over
      the last axis is returned instead of the raw scores.
    output_logits: When true, scores are returned even if ``is_multilabel``
      is set.
    use_softmax: Apply a softmax over the last axis before returning scores.
    batch_size: Number of texts per forward pass (default 32).

  Returns:
    A list with one entry per input text: a float when the model has a single
    output, a list of floats when it has several, or the argmax index when
    ``is_multilabel`` is set without ``output_logits``.

  Raises:
    ValueError: If neither ``model_name`` nor both ``init_model`` and
      ``tokenizer`` are provided.
  """
  if model_name:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)
  else:
    model = init_model
  if model is None or tokenizer is None:
    raise ValueError("Provide either model_name, or both init_model and tokenizer.")

  # Fall back to the CPU so the function still works on a runtime without a GPU.
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
  model.to(device)
  model.eval()

  y_pred = []
  for batch in tqdm(chunks(data, batch_size)):
    encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=MAX_LENGTH)
    model_inputs = {
        'input_ids': encoded['input_ids'].to(device),
        'attention_mask': encoded['attention_mask'].to(device),
    }
    with torch.no_grad():
      outputs = model(**model_inputs)

    # The first output element holds the logits; indexing works for both
    # tuple and ModelOutput return types.
    scores = outputs[0]
    if use_softmax:
      scores = nn.functional.softmax(scores, dim=-1)
    scores = scores.detach().cpu().numpy()

    if is_multilabel and not output_logits:
      y_pred.extend(np.argmax(scores, axis=-1))
    else:
      # Drop only the trailing label axis. A blanket squeeze() would also
      # remove the batch axis for a batch of one and turn the result into a
      # scalar, which cannot be passed to extend().
      if scores.ndim > 1 and scores.shape[-1] == 1:
        scores = scores.squeeze(-1)
      y_pred.extend(scores.tolist())

  # Many large models are loaded one after another; release this one eagerly.
  del model
  gc.collect()
  if torch.cuda.is_available():
    torch.cuda.empty_cache()
  return y_pred

In [None]:
def chunks(lst, n):
    """Lazily yield consecutive slices of ``lst`` holding at most ``n`` items each."""
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [None]:
def rms(y_actual, y_predicted):
  """Return the root-mean-squared error between targets and predictions.

  Computed with numpy rather than ``mean_squared_error(..., squared=False)``,
  because the ``squared`` argument was deprecated and then removed from
  scikit-learn, which makes that call fail on current versions.

  Args:
    y_actual: Array-like of true values, 1-D or (n_samples, n_outputs).
    y_predicted: Array-like of predicted values with a matching shape.

  Returns:
    The RMSE as a float. For 2-D inputs the RMSE is computed per output
    column and the columns are averaged.

  Raises:
    ValueError: If the inputs are empty or their shapes do not match.
  """
  actual = np.asarray(y_actual, dtype=float)
  predicted = np.asarray(y_predicted, dtype=float)
  # Treat 1-D input as a single output column so (n,) and (n, 1) are interchangeable.
  if actual.ndim <= 1:
    actual = actual.reshape(-1, 1)
  if predicted.ndim <= 1:
    predicted = predicted.reshape(-1, 1)
  if actual.shape != predicted.shape:
    raise ValueError(
        "y_actual and y_predicted must have the same shape, got %s and %s"
        % (actual.shape, predicted.shape))
  if actual.shape[0] == 0:
    raise ValueError("rms() needs at least one sample")
  per_output_rmse = np.sqrt(np.mean(np.square(actual - predicted), axis=0))
  return float(np.mean(per_output_rmse))

In [None]:
def make_ensembler_predictions(fold_predictions, ensembler_dirs, return_mean=True, loader=None):
  """Apply one saved ensembler per fold to that fold's base-model predictions.

  Args:
    fold_predictions: One entry per fold; each entry is a list of equally
      long 1-D prediction arrays (one per base model). The arrays are
      column-stacked to form the ensembler's feature matrix.
    ensembler_dirs: Path of the serialized ensembler for each fold, in the
      same order as ``fold_predictions``.
    return_mean: When true, return the element-wise mean over folds;
      otherwise return the list of per-fold prediction arrays.
    loader: Optional callable ``path -> fitted estimator``. Defaults to
      ``joblib.load``.

  Returns:
    A 1-D array (``return_mean=True``) or a list of per-fold arrays.

  Raises:
    ValueError: If there are fewer ensembler paths than folds, or if
      ``return_mean`` is requested with no folds.
  """
  if loader is None:
    # The original code called an undefined `load`; the ensembler files end
    # in `.joblib`, so joblib is the matching loader. Imported lazily so
    # callers that pass their own loader do not need joblib.
    # NOTE: joblib.load unpickles arbitrary objects - only load trusted files.
    import joblib
    loader = joblib.load

  fold_predictions = list(fold_predictions)
  ensembler_dirs = list(ensembler_dirs)
  if len(ensembler_dirs) < len(fold_predictions):
    raise ValueError(
        "Got %d ensembler paths for %d folds of predictions"
        % (len(ensembler_dirs), len(fold_predictions)))

  final_predictions = []
  for predictions, ensembler_path in zip(fold_predictions, ensembler_dirs):
    clf = loader(ensembler_path)
    features = np.column_stack(predictions)
    final_predictions.append(clf.predict(features))

  if not return_mean:
    return final_predictions
  if not final_predictions:
    raise ValueError("Cannot average ensembler predictions: no folds were given")
  return np.mean(np.vstack(final_predictions), axis=0)

# Load test data

In [None]:
# The test set is read from <BASE_PATH>/data/test/test.csv - place the file there first.
test_csv_path = os.path.join(BASE_PATH, 'data/test/test.csv')
test_df = pd.read_csv(test_csv_path)
# test_tx: the `excerpt` column as a plain list, each value converted with str().
test_tx = list(map(str, test_df.excerpt.values))

# Prediction

In [None]:
# Getting transformer predictions

# Directories of the fine-tuned transformers that feed the ensemblers.
# Order matters: the per-fold predictions are appended in this order and are
# later column-stacked as the ensembler's input features.
model_dirs = [
    ALBERT_TRAINED_1,
    DEBERTA_TRAINED_1,
    ALBERT_TRAINED_2,
    DEBERTA_TRAINED_2,
    ROBERTA_TRAINED_1,
    ELECTRA_TRAINED_1
]

# fold_predictions['fold_<i>'] holds one prediction array per entry of model_dirs.
fold_predictions = {'fold_' + str(i): [] for i in range(6)}

for i in range(6):
  for model_dir in model_dirs:
    checkpoint = os.path.join(model_dir, 'model_fold_' + str(i) + '/best')
    preds = predict_fast(model_name=checkpoint, data=test_tx)
    fold_predictions['fold_' + str(i)].append(np.array(preds))

# Getting predictions from special models
albert_single_preds = predict_fast(model_name=os.path.join(ALBERT_TRAINED_3, 'best'), data=test_tx)
# `os.path.join` here - the previous `os.path.joun` does not exist and raised AttributeError.
deberta_bs_0 = predict_fast(model_name=os.path.join(DEBERTA_TRAINED_3, 'model_fold_0/best'), data=test_tx)
deberta_bs_1 = predict_fast(model_name=os.path.join(DEBERTA_TRAINED_3, 'model_fold_1/best'), data=test_tx)

# Ensembling

In [None]:
# Ensemble 1 only uses folds 1, 2, 4 and 5; ensemble 2 uses all six folds.
ensemble_1_folds = [1, 2, 4, 5]
ensemble_2_folds = list(range(6))

# Path of the ensembler file for each fold, in the same order as the fold lists above.
# NOTE(review): the file name is appended to 'model_fold_<i>' without a separator,
# giving e.g. 'model_fold_1ridge_model.joblib', whereas the transformer checkpoints
# use 'model_fold_<i>/best' - confirm this matches how the ensemblers were saved.
ridge_dirs_1 = [
    os.path.join(RIDGE_ENSEMBLE_1, 'model_fold_' + str(fold) + 'ridge_model.joblib')
    for fold in ensemble_1_folds
]
ridge_dirs_2 = [
    os.path.join(RIDGE_ENSEMBLE_2, 'model_fold_' + str(fold) + 'ridge_model.joblib')
    for fold in ensemble_2_folds
]

# Each call returns the mean over its folds of the per-fold ensembler predictions.
ensemble_1_preds = make_ensembler_predictions(
    fold_predictions=[fold_predictions['fold_' + str(fold)] for fold in ensemble_1_folds],
    ensembler_dirs=ridge_dirs_1,
)

ensemble_2_preds = make_ensembler_predictions(
    fold_predictions=[fold_predictions['fold_' + str(fold)] for fold in ensemble_2_folds],
    ensembler_dirs=ridge_dirs_2,
)

In [None]:
# Equal-weight average of the two DEBERTA_TRAINED_3 fold models ...
deberta_fold_0 = np.array(deberta_bs_0)
deberta_fold_1 = np.array(deberta_bs_1)
bs_mean_preds = deberta_fold_0 * 0.5 + deberta_fold_1 * 0.5
# ... blended 65/35 with the single ALBERT_TRAINED_3 model.
albert_single = np.array(albert_single_preds)
bs_alb_mean_preds = albert_single * 0.65 + np.array(bs_mean_preds) * 0.35

In [None]:
# Final blend: 3/8 ensemble 1, 2/8 ensemble 2, 3/8 ALBERT/DeBERTa mix (weights sum to 1).
final_predictions = (
    np.array(ensemble_1_preds) * 3. / 8.
    + np.array(ensemble_2_preds) * 2. / 8.
    + np.array(bs_alb_mean_preds) * 3. / 8.
)

# Submission

In [None]:
# Write the final predictions as a two-column (id, target) CSV.
submission_path = os.path.join(BASE_PATH, 'data/submission/submission.csv')
# Create the output folder if it is missing; to_csv does not create directories.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission_df = pd.DataFrame({'id': test_df.id, 'target': final_predictions})
# index=False keeps the unnamed row-index column out of the file.
submission_df.to_csv(submission_path, index=False)