<h3>📌 Train Notebook:</h3> <h4><a href='https://www.kaggle.com/code/debarshichanda/pytorch-feedback-deberta-v3-baseline'>https://www.kaggle.com/code/debarshichanda/pytorch-feedback-deberta-v3-baseline</a></h4>

* CV: 0.6819
* LB: 652
* Inference code: https://www.kaggle.com/code/quincyqiang/feedback-meanpoolingv2-inference
* Thanks to: Debarshi Chanda
* Hints: Building on the baseline, this version changes how the input text is spliced together: it prepends `discourse_type` to the input and trains on 5 folds.

```
text = discourse_type+self.tokenizer.sep_token+discourse +self.tokenizer.sep_token + " " + essay
```

In [1]:
import os
import gc
import cv2
import copy
import time
import random
import joblib

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# For Transformer Models
from transformers import AutoTokenizer, AutoModel, AutoConfig

# Utils
from tqdm import tqdm

# For descriptive error messages
# NOTE(review): this is a debugging aid; presumably it also slows GPU inference —
# confirm it is still wanted for submission runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [2]:
# Dataset directory that holds the fine-tuned checkpoints.
MODEL_DIR = '../input/com-base-lr2-e5-textchange-4cls'

# One checkpoint per fold (folds 0-4); predictions are averaged over all of them.
MODEL_PATHS = [f'{MODEL_DIR}/Loss-Fold-{fold}.bin' for fold in range(5)]

In [3]:
# TEST_DIR holds the `<essay_id>.txt` files read by `get_essay`.
# TRAIN_DIR is not referenced by any visible cell.
TRAIN_DIR, TEST_DIR = (
    "../input/feedback-prize-effectiveness/train",
    "../input/feedback-prize-effectiveness/test",
)

In [4]:
# Inference settings shared by the cells below.
CONFIG = {
    "seed": 42,
    "model_name": '../input/deberta-v3-base/deberta-v3-base',
    "test_batch_size": 16,
    "max_length": 512,
    "num_classes": 3,
    # Use the first GPU when one is visible, otherwise fall back to CPU.
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    "gradient_checkpoint": False,
}

# The tokenizer is loaded from the same local path as the backbone weights.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG["model_name"])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
def get_essay(essay_id):
    """Return the full text of the essay stored at ``TEST_DIR/<essay_id>.txt``.

    Args:
        essay_id: Identifier used as the file stem inside ``TEST_DIR``.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the essay file cannot be opened (e.g. it does not exist).
    """
    essay_path = os.path.join(TEST_DIR, f"{essay_id}.txt")
    # The context manager closes the handle deterministically; the previous
    # `open(...).read()` left the file object to be closed by garbage collection.
    with open(essay_path, 'r') as essay_file:
        return essay_file.read()

def softmax(z):
    """Row-wise softmax of a 2-D array.

    The per-row maximum is subtracted before exponentiation so that large
    values do not overflow; the result has the same shape as ``z`` and each
    row sums to 1.
    """
    assert z.ndim == 2
    # keepdims retains the reduced axis so the result broadcasts against `z`.
    row_max = np.max(z, axis=1, keepdims=True)
    exponentials = np.exp(z - row_max)
    return exponentials / np.sum(exponentials, axis=1, keepdims=True)

def get_score(y_true, y_pred):
    """Log loss of raw scores ``y_pred`` against ``y_true``, rounded to 5 decimals.

    ``y_pred`` is passed through ``softmax`` first, so it must be a 2-D array.
    """
    # NOTE(review): `log_loss` is not imported in any visible cell, so calling this
    # presumably raises NameError — confirm where it is meant to come from.
    probabilities = softmax(y_pred)
    return round(log_loss(y_true, probabilities), 5)


In [6]:
# Load the test discourse rows and attach the full essay text to each of them.
df = pd.read_csv("../input/feedback-prize-effectiveness/test.csv")
# Several discourse rows share one essay, so read each essay file once and
# broadcast the text by id instead of re-opening the file for every row.
essay_text_by_id = {essay_id: get_essay(essay_id) for essay_id in df['essay_id'].unique()}
df['essay_text'] = df['essay_id'].map(essay_text_by_id)
df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,essay_text
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead,Making choices in life can be very difficult. ...
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position,Making choices in life can be very difficult. ...
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim,Making choices in life can be very difficult. ...
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim,Making choices in life can be very difficult. ...
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim,Making choices in life can be very difficult. ...


In [7]:
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: emit the offending slice as UTF-8 bytes.

    Returns the replacement bytes and the position at which encoding resumes
    (the end of the offending slice).
    """
    offending = error.object[error.start : error.end]
    return offending.encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the offending bytes as cp1252.

    Returns the replacement text and the position at which decoding resumes
    (the end of the offending slice).
    """
    offending = error.object[error.start : error.end]
    return offending.decode("cp1252"), error.end

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
# The names registered here are the ones passed as `errors=` inside
# `resolve_encodings_and_normalize`, so this must run before that function is called.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters.

    The text goes through an encode/decode round trip (raw_unicode_escape ->
    utf-8 -> cp1252 -> utf-8) in which the registered error handlers substitute
    the slices that fail, and the result is finally passed through ``unidecode``.
    """
    escaped_bytes = text.encode("raw_unicode_escape")
    first_decode = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_decode.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)
# Apply the same encoding repair to both text columns that the dataset reads.
df['discourse_text'] = df['discourse_text'].apply(resolve_encodings_and_normalize)
df['essay_text'] = df['essay_text'].apply(resolve_encodings_and_normalize)

# No label encoder is fitted in this notebook (the displayed test rows carry no
# `discourse_effectiveness` column); a pickled encoder is loaded in a later cell.

In [8]:
# MODEL_DIR already starts with "../input/", so build the path from it directly
# instead of prefixing "../input/" a second time.
# NOTE(review): joblib.load unpickles arbitrary objects — only load `le.pkl`
# from a dataset you trust.
with open(f"{MODEL_DIR}/le.pkl", "rb") as fp:
    encoder = joblib.load(fp)

# Display the label order stored in the encoder.
encoder.classes_

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


array(['Adequate', 'Effective', 'Ineffective'], dtype=object)

In [9]:
# class FeedBackDataset(Dataset):
#     def __init__(self, df, tokenizer, max_length):
#         self.df = df
#         self.max_len = max_length
#         self.tokenizer = tokenizer
#         self.discourse = df['discourse_text'].values
#         self.essay = df['essay_text'].values
#         #self.targets = df['discourse_effectiveness'].values
#         self.discourse_type = df['discourse_type'].values

#     def __len__(self):
#         return len(self.df)

#     def __getitem__(self, index):
#         discourse = self.discourse[index]
#         discourse_type = self.discourse_type[index]
#         essay = self.essay[index]
#         text = discourse_type+self.tokenizer.sep_token+discourse +self.tokenizer.sep_token + " " + essay
#         inputs = self.tokenizer.encode_plus(
#             text,
#             truncation=True,
#             add_special_tokens=True,
#             max_length=self.max_len,
#             padding='max_length'
#         )

#         return {
#             'input_ids': inputs['input_ids'],
#             'attention_mask': inputs['attention_mask'],
#             #'target': self.targets[index]
#         }

class FeedBackDataset(Dataset):
    """Tokenised test items, one per row of ``df``.

    Each item is built from the text
    ``discourse_type + sep + discourse_text + sep + " " + essay_text`` where
    ``sep`` is the tokenizer's ``sep_token``.

    Args:
        df: Frame with ``discourse_type``, ``discourse_text`` and ``essay_text`` columns.
        tokenizer: Callable tokenizer exposing ``sep_token``.
        max_length: Length every item is truncated/padded to.
    """

    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.discourse_type = df['discourse_type'].values
        self.discourse = df['discourse_text'].values
        self.essay = df['essay_text'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        """Return ``{'input_ids': [...], 'attention_mask': [...]}`` for row ``index``."""
        sep = self.tokenizer.sep_token
        text = (
            self.discourse_type[index] + sep
            + self.discourse[index] + sep + " "
            + self.essay[index]
        )
        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        # NOTE(review): padding='max_length' makes every item `max_len` long, so the
        # per-batch padding done in `Collate` has nothing left to pad.
        inputs = self.tokenizer(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
        )
        return {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }

In [10]:
class Collate:
    """Batch collator that pads every sample to the longest one in the batch.

    Padding goes on the side given by ``tokenizer.padding_side``; ids are padded
    with ``tokenizer.pad_token_id`` and attention masks with 0. When ``isTrain``
    is true each sample must also carry a ``"target"`` entry.
    """

    def __init__(self, tokenizer, isTrain=True):
        self.tokenizer = tokenizer
        self.isTrain = isTrain

    def _pad_rows(self, rows, filler, width):
        """Pad each list in ``rows`` with ``filler`` up to ``width`` entries."""
        pad_on_right = self.tokenizer.padding_side == "right"
        padded = []
        for row in rows:
            extra = (width - len(row)) * [filler]
            padded.append(row + extra if pad_on_right else extra + row)
        return padded

    def __call__(self, batch):
        id_rows = [sample["input_ids"] for sample in batch]
        mask_rows = [sample["attention_mask"] for sample in batch]
        target_rows = [sample["target"] for sample in batch] if self.isTrain else None

        # Longest token sequence in this batch decides the padded width.
        width = max(len(row) for row in id_rows)

        collated = {
            "input_ids": torch.tensor(
                self._pad_rows(id_rows, self.tokenizer.pad_token_id, width), dtype=torch.long
            ),
            "attention_mask": torch.tensor(
                self._pad_rows(mask_rows, 0, width), dtype=torch.long
            ),
        }
        if self.isTrain:
            collated["target"] = torch.tensor(target_rows, dtype=torch.long)
        return collated

# Inference collator: isTrain=False, so samples are not expected to carry a "target".
collate_fn = Collate(tokenizer=CONFIG["tokenizer"], isTrain=False)

In [11]:
# Test-time pipeline: rows are served in their original order (shuffle=False),
# so row i of the predictions corresponds to row i of `df`.
test_dataset = FeedBackDataset(
    df=df,
    tokenizer=CONFIG['tokenizer'],
    max_length=CONFIG['max_length'],
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=CONFIG['test_batch_size'],
    shuffle=False,
    num_workers=2,
    collate_fn=collate_fn,
    pin_memory=True,
)

In [12]:
class MeanPooling(nn.Module):
    """Average the token states along dim 1, counting only positions where the mask is 1."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the last dimension so it can weight every feature.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = torch.sum(last_hidden_state * mask, 1)
        # Clamp the count so a fully masked row divides by 1e-9 instead of 0.
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        return masked_sum / token_count


class MeanMaxPooling(nn.Module):
    """Concatenate the mean and the max of the token states along dim 1.

    For an input whose last dimension is ``H`` the output's last dimension is ``2 * H``
    (mean features first, max features second).

    NOTE(review): ``attention_mask`` is accepted but not applied, so padded
    positions take part in both the mean and the max — confirm this is intended.
    """

    def __init__(self):
        super(MeanMaxPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        mean_pooling_embeddings = torch.mean(last_hidden_state, 1)
        # torch.max with a dim returns (values, indices). The pooled features are
        # the values; the previous code unpacked the indices instead.
        max_pooling_embeddings = torch.max(last_hidden_state, 1).values
        mean_max_embeddings = torch.cat((mean_pooling_embeddings, max_pooling_embeddings), 1)
        return mean_max_embeddings


class LSTMPooling(nn.Module):
    """Run an LSTM over the per-layer first-token states and return its last output.

    Args:
        num_layers: Number of hidden layers to read (entries 1..num_layers of
            ``all_hidden_states``; entry 0 is skipped).
        hidden_size: Feature size of each layer's state (LSTM input size).
        hiddendim_lstm: LSTM hidden size, i.e. the size of the returned vector.

    NOTE(review): the previous implementation stacked the layers on the last axis
    and then used ``.view(-1, num_layers, hidden_size)``, which interleaves layer
    and feature values instead of transposing them. Weights trained with that
    layout would see different LSTM inputs after this fix.
    """

    def __init__(self, num_layers, hidden_size, hiddendim_lstm):
        super(LSTMPooling, self).__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hiddendim_lstm = hiddendim_lstm
        self.lstm = nn.LSTM(self.hidden_size, self.hiddendim_lstm, batch_first=True)
        self.dropout = nn.Dropout(0.1)

    def forward(self, all_hidden_states):
        # Assumes each entry is (batch, seq_len, hidden_size); `[:, 0]` keeps the
        # first token and, unlike `.squeeze()`, preserves a batch of size 1.
        first_token_per_layer = [
            all_hidden_states[layer_i][:, 0]
            for layer_i in range(1, self.num_hidden_layers + 1)
        ]
        # Stack on dim=1 so the layers form the sequence axis the batch_first LSTM expects.
        hidden_states = torch.stack(first_token_per_layer, dim=1)
        out, _ = self.lstm(hidden_states, None)
        out = self.dropout(out[:, -1, :])
        return out


class WeightedLayerPooling(nn.Module):
    """Weighted average over the layers from ``layer_start`` onwards.

    ``forward`` expects a 4-D tensor whose first axis indexes layers. When no
    ``layer_weights`` are supplied, one learnable weight per pooled layer is
    created, initialised to 1.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights=None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            # One weight for each layer index in [layer_start, num_hidden_layers].
            pooled_layer_count = len(range(layer_start, num_hidden_layers + 1))
            layer_weights = nn.Parameter(torch.ones(pooled_layer_count, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        pooled_layers = all_hidden_states[self.layer_start:, :, :, :]
        # Give the weight vector three trailing axes, then match the layer tensor's shape.
        weights = self.layer_weights[:, None, None, None].expand(pooled_layers.size())
        weighted_sum = (weights * pooled_layers).sum(dim=0)
        return weighted_sum / self.layer_weights.sum()

In [13]:
class FeedBackModel(nn.Module):
    """Transformer backbone with a linear head over first-token hidden states.

    ``forward`` concatenates the first-token state of the last four hidden
    layers (``4 * hidden_size`` features) and averages the head's logits over
    five dropout rates (0.1-0.5).

    Args:
        model_name: Path or name passed to ``AutoConfig`` / ``AutoModel``.
    """

    def __init__(self, model_name):
        super(FeedBackModel, self).__init__()
        self.cfg = CONFIG
        # Hidden states of every layer are needed by forward().
        self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
        self.model = AutoModel.from_pretrained(model_name, config=self.config)
        # Optional gradient checkpointing.
        if CONFIG['gradient_checkpoint']:
            self.model.gradient_checkpointing_enable()
            print(f"Gradient Checkpointing: {self.model.is_gradient_checkpointing}")
        # forward() never calls this layer. It is kept because removing it would
        # change the parameter set that load_state_dict expects — presumably the
        # saved checkpoints contain these weights.
        self.bilstm = nn.LSTM(self.config.hidden_size, (self.config.hidden_size) // 2, num_layers=2,
                              dropout=self.config.hidden_dropout_prob, batch_first=True,
                              bidirectional=True)

        self.dropout = nn.Dropout(0.2)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)

        # Head input is the concatenation of four layers' first-token states.
        self.output = nn.Sequential(
            nn.Linear(self.config.hidden_size * 4, CONFIG['num_classes'])
        )

    def loss(self, outputs, targets):
        """Cross-entropy between logits ``outputs`` and class indices ``targets``."""
        loss_fct = nn.CrossEntropyLoss()
        return loss_fct(outputs, targets)

    def monitor_metrics(self, outputs, targets):
        """Multi-class log loss of softmaxed ``outputs`` against ``targets`` (labels 0, 1, 2)."""
        # NOTE(review): `log_loss` is not imported in any visible cell — confirm
        # where it is meant to come from before calling this method.
        mll = log_loss(
            targets.cpu().detach().numpy(),
            softmax(outputs.cpu().detach().numpy()),
            labels=[0, 1, 2],
        )
        return mll

    def _init_weights(self, module):
        """Initialise Linear/Embedding/LayerNorm weights (not called by any visible code)."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, ids, mask, token_type_ids=None, targets=None):
        """Return class logits for a batch.

        Args:
            ids: Token ids.
            mask: Attention mask matching ``ids``.
            token_type_ids: Optional segment ids, forwarded to the backbone when given.
            targets: Accepted for interface compatibility; it does not affect the result.

        Returns:
            Logits with ``CONFIG['num_classes']`` columns.
        """
        # `is not None` rather than truthiness: bool() of a tensor with more than
        # one element raises.
        if token_type_ids is not None:
            transformer_out = self.model(ids, mask, token_type_ids)
        else:
            transformer_out = self.model(ids, mask)

        # Read only the four layers that are used instead of stacking every layer.
        hidden_states = transformer_out.hidden_states
        sequence_output = torch.cat(
            [layer[:, 0] for layer in (hidden_states[-1], hidden_states[-2],
                                       hidden_states[-3], hidden_states[-4])],
            dim=-1,
        )

        # Multi-sample dropout: same head, five dropout rates, averaged logits.
        logits1 = self.output(self.dropout1(sequence_output))
        logits2 = self.output(self.dropout2(sequence_output))
        logits3 = self.output(self.dropout3(sequence_output))
        logits4 = self.output(self.dropout4(sequence_output))
        logits5 = self.output(self.dropout5(sequence_output))
        logits = (logits1 + logits2 + logits3 + logits4 + logits5) / 5

        return logits

In [14]:
@torch.no_grad()
def valid_fn(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and return softmax probabilities.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` returning logits.
        dataloader: Yields dicts with ``'input_ids'`` and ``'attention_mask'`` tensors.
        device: Device the batch tensors are moved to.

    Returns:
        A NumPy array with the per-batch probabilities concatenated in loader order.
    """
    model.eval()

    batch_probabilities = []
    for batch in tqdm(dataloader, total=len(dataloader)):
        input_ids = batch['input_ids'].to(device, dtype=torch.long)
        attention_mask = batch['attention_mask'].to(device, dtype=torch.long)

        logits = model(input_ids, attention_mask)
        probabilities = F.softmax(logits, dim=1)
        batch_probabilities.append(probabilities.cpu().detach().numpy())

    predictions = np.concatenate(batch_probabilities)
    gc.collect()

    return predictions

In [15]:
def inference(model_paths, dataloader, device):
    """Average the predicted probabilities of several checkpoints.

    Args:
        model_paths: Iterable of checkpoint paths readable by ``torch.load``.
        dataloader: Loader passed unchanged to ``valid_fn`` for every checkpoint.
        device: Device the model is placed on and the checkpoint is mapped to.

    Returns:
        Array of per-row class probabilities, averaged over the checkpoints.

    Raises:
        ValueError: If ``model_paths`` yields no path (the mean would otherwise be NaN).
    """
    final_preds = []
    for i, path in enumerate(model_paths):
        model = FeedBackModel(CONFIG['model_name'])
        # Use the `device` argument (not CONFIG['device']) so the model and the
        # batches moved by valid_fn always end up on the same device.
        model.to(device)
        # map_location lets a checkpoint saved on a GPU load on whichever device is in use.
        model.load_state_dict(torch.load(path, map_location=device))

        print(f"Getting predictions for model {i+1}")
        preds = valid_fn(model, dataloader, device)
        final_preds.append(preds)

    if not final_preds:
        raise ValueError("inference() needs at least one checkpoint path")

    # Stack to (n_models, n_rows, n_classes) and average over the models.
    final_preds = np.array(final_preds)
    final_preds = np.mean(final_preds, axis=0)
    return final_preds

In [16]:
# Fold-averaged class probabilities for every row served by `test_loader`.
preds = inference(
    model_paths=MODEL_PATHS,
    dataloader=test_loader,
    device=CONFIG['device'],
)

Some weights of the model checkpoint at ../input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifer.bias', 'mask_predictions.classifer.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Getting predictions for model 1


100%|██████████| 1/1 [00:01<00:00,  1.46s/it]
Some weights of the model checkpoint at ../input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifer.bias', 'mask_predictions.classifer.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a 

Getting predictions for model 2


100%|██████████| 1/1 [00:00<00:00,  2.19it/s]
Some weights of the model checkpoint at ../input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifer.bias', 'mask_predictions.classifer.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a 

Getting predictions for model 3


100%|██████████| 1/1 [00:00<00:00,  2.26it/s]
Some weights of the model checkpoint at ../input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifer.bias', 'mask_predictions.classifer.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a 

Getting predictions for model 4


100%|██████████| 1/1 [00:00<00:00,  1.74it/s]
Some weights of the model checkpoint at ../input/deberta-v3-base/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifer.bias', 'mask_predictions.classifer.weight', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a 

Getting predictions for model 5


100%|██████████| 1/1 [00:00<00:00,  2.29it/s]


In [17]:
# Show the averaged probabilities; columns presumably follow `encoder.classes_` — verify.
preds

array([[0.62027276, 0.33876267, 0.04096456],
       [0.8803393 , 0.09604445, 0.02361617],
       [0.79070073, 0.17820826, 0.03109107],
       [0.8160946 , 0.12804925, 0.05585612],
       [0.7913507 , 0.15905914, 0.04959013],
       [0.58156335, 0.3754637 , 0.04297299],
       [0.47678494, 0.49285156, 0.03036351],
       [0.77574193, 0.18243572, 0.04182231],
       [0.52919805, 0.4350275 , 0.0357745 ],
       [0.76650774, 0.19876778, 0.03472452]], dtype=float32)

In [18]:
# Load the submission template; its probability columns are overwritten in the next cell.
sample = pd.read_csv("../input/feedback-prize-effectiveness/sample_submission.csv")
sample.head()

Unnamed: 0,discourse_id,Ineffective,Adequate,Effective
0,a261b6e14276,0.2,0.6,0.4
1,5a88900e7dc1,3.0,6.0,1.0
2,9790d835736b,1.0,2.0,3.0
3,75ce6d68b67b,0.33,0.34,0.33
4,93578d946723,0.01,0.24,0.47


In [19]:
# Take the column order from the label encoder instead of hard-coding it:
# column i of `preds` is written to the submission column named encoder.classes_[i].
# Assigning an array of the wrong length raises, which guards against a row-count mismatch.
# NOTE(review): rows are matched by position — this assumes the template lists
# discourse_ids in the same order as test.csv; confirm.
for class_index, class_name in enumerate(encoder.classes_):
    sample[class_name] = preds[:, class_index]

sample.head()

Unnamed: 0,discourse_id,Ineffective,Adequate,Effective
0,a261b6e14276,0.040965,0.620273,0.338763
1,5a88900e7dc1,0.023616,0.880339,0.096044
2,9790d835736b,0.031091,0.790701,0.178208
3,75ce6d68b67b,0.055856,0.816095,0.128049
4,93578d946723,0.04959,0.791351,0.159059


In [20]:
# Write the submission file without the DataFrame index column.
sample.to_csv('submission.csv', index=False)