In [1]:
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from torch.cuda.amp import autocast, GradScaler
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoConfig, AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from glob import glob
import random
import warnings
import gc
import os
import seaborn as sns
from tqdm.auto import tqdm
import torch.nn as nn
import torch
plt.style.use('seaborn-pastel')
sns.set_palette("winter_r")
warnings.filterwarnings('ignore')
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Config:
    script = "roberta/feature_extraction"

    n_splits = 5
    seed = 42

    batch_size = 16
    n_classes = 4
    n_epochs = 10

    # bert
    model_name = "roberta-base"
    weight_decay = 2e-5
    beta = (0.9, 0.98)
    max_len = 128
    lr = 2e-5
    num_warmup_steps_rate = 0.01
    clip_grad_norm = None
    gradient_accumulation_steps = 1
    num_eval = 1

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Reka Env
    dir_path = "/home/abe/kaggle/signate-sc2022"


In [3]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


def path_setup(cfg):
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    cfg.INPUT = os.path.join(Config.dir_path, 'input')
    cfg.OUTPUT = os.path.join(Config.dir_path, 'output')
    cfg.SUBMISSION = os.path.join(Config.dir_path, 'submissions')
    cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, Config.script)
    cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, "model")
    cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, "preds")
    cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, "fig")
    cfg.NOTEBOOK = os.path.join(Config.dir_path, "Notebooks")
    cfg.SCRIPT = os.path.join(Config.dir_path, "scripts")

    # make dir
    for dir in [
            cfg.INPUT,
            cfg.OUTPUT,
            cfg.SUBMISSION,
            cfg.OUTPUT_EXP,
            cfg.EXP_MODEL,
            cfg.EXP_PREDS,
            cfg.EXP_FIG,
            cfg.NOTEBOOK,
            cfg.SCRIPT]:
        os.makedirs(dir, exist_ok=True)

    return cfg

In [4]:
seed_everything(Config.seed)
cfg = path_setup(Config)

In [9]:
class BERTModel(nn.Module):
    def __init__(self, model_name="roberta-base", criterion=None):
        super().__init__()
        self.criterion = criterion
        self.config = AutoConfig.from_pretrained(
            model_name,
            output_hidden_states=True
        )
        self.backbone = AutoModel.from_pretrained(
            model_name,
            config=self.config
        )
        self.fc = nn.Sequential(
            nn.Linear(self.config.hidden_size, 4),
        )

    def forward(self, inputs, labels=None):
        outputs = self.backbone(**inputs)
        
        if labels is None:
            
            logits = self.fc(outputs["last_hidden_state"][:, 0, :])
            return logits, outputs
        
        outputs = outputs["last_hidden_state"][:, 0, :]
        logits = self.fc(outputs)
        loss = self.criterion(logits, labels)
        return logits, loss


class BertSequenceVectorizer:
    def __init__(self, model, tokenizer, max_len):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.tokenizer = tokenizer
        self.bert_model = model
        self.bert_model = self.bert_model.to(self.device)
        self.max_len = max_len

    def vectorize(self, sentence: str) -> np.array:
        inp = self.tokenizer.encode(sentence)
        len_inp = len(inp)

        if len_inp >= self.max_len:
            inputs = inp[:self.max_len]
            masks = [1] * self.max_len
        else:
            inputs = inp + [0] * (self.max_len - len_inp)
            masks = [1] * len_inp + [0] * (self.max_len - len_inp)

        inputs_tensor = torch.tensor(
            [inputs], dtype=torch.long).to(
            self.device)
        masks_tensor = torch.tensor([masks], dtype=torch.long).to(self.device)
        _, bert_out = self.bert_model({"input_ids" : inputs_tensor, "attention_mask": masks_tensor})
        seq_out, pooled_out = bert_out['last_hidden_state'], bert_out['pooler_output']

        if torch.cuda.is_available():
            return seq_out[0][0].cpu().detach().numpy()
        else:
            return seq_out[0][0].detach().numpy()


def vectorize(df: pd.DataFrame, tokenizer, model_paths):
    assert "description" in df.columns
    df['feature'] = 0
    print('\n'.join(model_paths))
    for _, model_weight in enumerate(model_paths):
        model = BERTModel()
        model.load_state_dict(torch.load(model_weight))
        model = model.to(cfg.device)

        BSV = BertSequenceVectorizer(model, tokenizer, cfg.max_len)
        df['feature'] += df['description'].progress_apply(lambda x: BSV.vectorize(x))

        del model
        gc.collect()

    df['feature'] = df['feature'] / len(model_paths)
    return pd.DataFrame(np.stack(df['feature']))


In [6]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

In [7]:
model_paths = [p for p in sorted(glob(os.path.join(cfg.dir_path + "/output/roberta/baseline/model/", "fold*.pth")))]
train = pd.read_csv(os.path.join(cfg.INPUT, "train_cleaned.csv"))
print(train['description'].head(10))

0    develop cutting edge web applications perform ...
1    designs develops high quality scalable efficie...
2    functions point person network strategy work r...
3    work technical design development release depl...
4    quantify resources required task project relat...
5    participates standard business technical infor...
6    create project plans establish timelines estab...
7    facilitate pre sales initiatives live demonstr...
8    consolidate dashboards across team help drive ...
9    maintain improve existing predictive models ev...
Name: description, dtype: object


In [10]:
feat_train = vectorize(train, tokenizer, model_paths)
print(feat_train.head())
print(feat_train.shape)

/home/abe/kaggle/signate-sc2022/output/roberta/baseline/model/fold0.pth
/home/abe/kaggle/signate-sc2022/output/roberta/baseline/model/fold1.pth
/home/abe/kaggle/signate-sc2022/output/roberta/baseline/model/fold2.pth
/home/abe/kaggle/signate-sc2022/output/roberta/baseline/model/fold3.pth
/home/abe/kaggle/signate-sc2022/output/roberta/baseline/model/fold4.pth


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1516/1516 [00:13<00:00, 110.84it/s]


        0         1         2         3         4         5         6    \
0 -0.054320  0.163751 -0.316622  0.054401  0.186521 -0.138654 -0.080279   
1  0.007138  0.245675 -0.074354  0.119374 -0.006550 -0.112559 -0.089299   
2  0.175423  0.014750  0.098829 -0.109835 -0.144359  0.000833 -0.143365   
3 -0.181164  0.174353 -0.332650  0.115873  0.305775 -0.111456 -0.005911   
4  0.248364  0.051887  0.016210 -0.128455 -0.241048 -0.063836 -0.077739   

        7         8         9    ...       758       759       760       761  \
0 -0.165954 -0.291944 -0.159278  ... -0.076210 -0.080901 -0.324941 -0.050864   
1 -0.085308 -0.049521 -0.034661  ... -0.044947 -0.102488 -0.210471  0.045380   
2 -0.042509  0.093194  0.068069  ... -0.027346 -0.108323 -0.004558 -0.043174   
3 -0.079442 -0.189538 -0.163131  ... -0.114342 -0.014626 -0.263512 -0.006301   
4 -0.099214  0.078999  0.171830  ... -0.095246 -0.020634 -0.006083  0.022993   

        762       763       764       765       766       767  
0 -0