# Overview

https://uku28motab.feishu.cn/docs/doccnUDbEhudHm2V440lcY87B1c

# Model 1

The model is inspired by the one from [Maunish](https://www.kaggle.com/maunish/clrp-roberta-svm).

In [1]:
# Import required libraries
import os
import math
import random
import time

import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig

from sklearn.model_selection import KFold
from sklearn.svm import SVR

import gc
gc.enable()

In [2]:
# Configuration for Model 1
BATCH_SIZE = 32  # batch size of the test DataLoader built below
MAX_LEN = 248  # LitDataset pads/truncates every excerpt to this many tokens
# NOTE(review): EVAL_SCHEDULE is not referenced anywhere in the visible code;
# presumably a leftover from the training notebook — confirm before relying on it.
EVAL_SCHEDULE = [(0.5, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1, 1)]
ROBERTA_PATH = "/kaggle/input/roberta-base"  # config/weights directory read by LitModel
TOKENIZER_PATH = "/kaggle/input/roberta-base"  # directory the tokenizer is loaded from
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # fall back to CPU when no GPU is visible

DEVICE

'cuda'

In [3]:
# Load the test excerpts and the sample submission (used at the end as the output template)
test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
submission_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")

In [4]:
# Load the tokenizer; LitDataset reads it as a module-level global
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

In [5]:
# Dataset wrapper for Model 1
class LitDataset(Dataset):
    """Pre-tokenised excerpts (and optionally targets) for Model 1.

    Every excerpt in ``df.excerpt`` is encoded once, in the constructor, with
    the module-level ``tokenizer`` and padded/truncated to ``MAX_LEN`` tokens.

    Args:
        df: data frame with an ``excerpt`` column, plus a ``target`` column
            unless ``inference_only`` is true.
        inference_only: when true, no targets are read and items are
            ``(input_ids, attention_mask)`` pairs; otherwise items are
            ``(input_ids, attention_mask, target)`` triples.
    """

    def __init__(self, df, inference_only=False):
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.text = df.excerpt.tolist()

        if not inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)

        # Encode the whole corpus up front so __getitem__ is a cheap lookup.
        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding='max_length',
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True,
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        ids = torch.tensor(self.encoded['input_ids'][index])
        mask = torch.tensor(self.encoded['attention_mask'][index])

        if self.inference_only:
            return (ids, mask)
        return (ids, mask, self.target[index])

In [6]:
# Model 1: RoBERTa encoder + attention pooling + linear regressor
class LitModel(nn.Module):
    """RoBERTa regressor that pools the last hidden layer with learned attention.

    The encoder is loaded from ``ROBERTA_PATH`` with hidden states exposed.
    A small MLP scores every token, the scores are soft-maxed over the
    sequence axis, and the weighted sum of token states is mapped to one
    value. The widths 768/512 are hard-coded in the layers below.
    """

    def __init__(self):
        super().__init__()

        roberta_config = AutoConfig.from_pretrained(ROBERTA_PATH)
        roberta_config.update({
            "output_hidden_states": True,
            "hidden_dropout_prob": 0.5,
            "layer_norm_eps": 1e-7,
        })
        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=roberta_config)

        # Token scorer; Softmax(dim=1) normalises over the sequence positions.
        self.attention = nn.Sequential(
            nn.Linear(768, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1),
        )

        self.regressor = nn.Sequential(nn.Linear(768, 1))

    def forward(self, input_ids, attention_mask):
        """Return the regressor output for a batch of token ids and masks."""
        encoder_out = self.roberta(input_ids=input_ids,
                                   attention_mask=attention_mask)

        # hidden_states[-1] is the output of the last encoder layer.
        token_states = encoder_out.hidden_states[-1]

        # One weight per token (trailing dim of size 1, broadcast on multiply).
        token_weights = self.attention(token_states)

        # Weighted sum over the sequence axis gives one vector per sample.
        pooled = (token_weights * token_states).sum(dim=1)

        # Map the pooled vector to the predicted score.
        return self.regressor(pooled)

In [7]:
# Model 1 inference helper
def predict(model, data_loader):
    """Returns an np.array with predictions of the |model| on |data_loader|"""
    model.eval()

    # One slot per dataset row; filled batch by batch in loader order.
    result = np.zeros(len(data_loader.dataset))
    offset = 0

    with torch.no_grad():
        for input_ids, attention_mask in data_loader:
            batch_pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))

            count = batch_pred.shape[0]
            result[offset:offset + count] = batch_pred.flatten().to("cpu")
            offset += count

    return result

In [8]:
# Number of fine-tuned checkpoints (model_1.pth ... model_5.pth) to ensemble
NUM_MODELS = 5

# One row of test-set predictions per checkpoint
all_predictions = np.zeros((NUM_MODELS, len(test_df)))

test_dataset = LitDataset(test_df, inference_only=True)

# shuffle=False and drop_last=False keep predictions aligned with the rows of test_df
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         drop_last=False, shuffle=False, num_workers=2)

for model_index in tqdm(range(NUM_MODELS)):
    model_path = f"../input/commonlit-roberta-0467/model_{model_index + 1}.pth"
    print(f"\nUsing {model_path}")

    model = LitModel()
    # map_location loads the checkpoint onto whichever device DEVICE resolved to
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)

    all_predictions[model_index] = predict(model, test_loader)

    # Drop the model and release cached GPU memory before loading the next checkpoint
    del model
    gc.collect()
    torch.cuda.empty_cache()

  0%|          | 0/5 [00:00<?, ?it/s]


Using ../input/commonlit-roberta-0467/model_1.pth


 20%|██        | 1/5 [00:20<01:21, 20.47s/it]


Using ../input/commonlit-roberta-0467/model_2.pth


 40%|████      | 2/5 [00:29<00:41, 13.99s/it]


Using ../input/commonlit-roberta-0467/model_3.pth


 60%|██████    | 3/5 [00:38<00:22, 11.47s/it]


Using ../input/commonlit-roberta-0467/model_4.pth


 80%|████████  | 4/5 [00:46<00:10, 10.16s/it]


Using ../input/commonlit-roberta-0467/model_5.pth


100%|██████████| 5/5 [00:54<00:00, 10.97s/it]


In [9]:
# Model 1 prediction = mean over the NUM_MODELS checkpoints (axis 0 indexes checkpoints)
model1_predictions = all_predictions.mean(axis=0)

# Model 2
Inspired by [https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-infer-3](https://www.kaggle.com/rhtsingh/commonlit-readability-prize-roberta-torch-infer-3)

In [10]:
# Model 2's config() reads the test set from the module-level name `test`
test = test_df
# Import required libraries
from glob import glob
import os
import matplotlib.pyplot as plt
import json
from collections import defaultdict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.optimizer import Optimizer
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import (
    Dataset, DataLoader, 
    SequentialSampler, RandomSampler
)
from transformers import RobertaConfig
from transformers import (
    get_cosine_schedule_with_warmup, 
    get_cosine_with_hard_restarts_schedule_with_warmup
)
from transformers import RobertaTokenizer
from transformers import RobertaModel
from IPython.display import clear_output
# Turn one excerpt into fixed-length model inputs
def convert_examples_to_features(data, tokenizer, max_len, is_test=False):
    """Tokenise one excerpt and right-pad every field with zeros to ``max_len``.

    Newlines are removed from ``data`` (replaced by the empty string) before
    encoding with ``tokenizer.encode_plus`` (truncation enabled).

    Args:
        data: the excerpt text.
        tokenizer: object exposing ``encode_plus``.
        max_len: target length of every returned list.
        is_test: accepted for interface compatibility; not used here.

    Returns:
        dict with the keys ``input_ids``, ``token_type_ids`` and
        ``attention_mask``, each a list padded with ``0``.
    """
    encoded = tokenizer.encode_plus(
        data.replace('\n', ''),
        max_length=max_len,
        truncation=True,
        return_attention_mask=True,
        return_token_type_ids=True,
    )
    # A non-positive count yields an empty pad, leaving the lists unchanged.
    pad = [0] * (max_len - len(encoded['input_ids']))
    return {
        field: encoded[field] + pad
        for field in ('input_ids', 'token_type_ids', 'attention_mask')
    }
# Dataset that tokenises excerpts lazily, one item at a time
class DatasetRetriever(Dataset):
    """Map-style dataset producing padded token tensors for each excerpt.

    Args:
        data: data frame with an ``excerpt`` column and, unless ``is_test``
            is true, a ``target`` column.
        tokenizer: tokenizer forwarded to ``convert_examples_to_features``.
        max_len: fixed sequence length of every returned tensor.
        is_test: when true, items carry no ``label`` entry.

    Each item is a dict of ``torch.long`` tensors under ``input_ids``,
    ``token_type_ids`` and ``attention_mask``; in training mode it also holds
    a ``torch.double`` scalar under ``label``.
    """

    def __init__(self, data, tokenizer, max_len, is_test=False):
        self.data = data
        self.excerpts = self.data.excerpt.values.tolist()
        # Fix: `targets` was never assigned, so any item access with
        # is_test=False raised AttributeError.
        self.targets = None if is_test else self.data.target.values.tolist()
        self.tokenizer = tokenizer
        self.is_test = is_test
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        features = convert_examples_to_features(
            self.excerpts[item], self.tokenizer,
            self.max_len, self.is_test
        )
        sample = {
            field: torch.tensor(features[field], dtype=torch.long)
            for field in ('input_ids', 'token_type_ids', 'attention_mask')
        }
        if not self.is_test:
            sample['label'] = torch.tensor(self.targets[item], dtype=torch.double)
        return sample
# Model 2: RoBERTa + LayerNorm + multi-sample dropout regression head
class CommonLitModel(nn.Module):
    """RoBERTa regressor built on the encoder's second output (``outputs[1]``).

    Args:
        model_name: directory or name passed to ``RobertaModel.from_pretrained``.
        config: configuration object; ``hidden_size`` sizes the head and
            ``initializer_range`` is the std used for weight initialisation.
        multisample_dropout: five ``Dropout(0.5)`` branches when true,
            a single ``Dropout(0.3)`` otherwise.
        output_hidden_states: forwarded to the encoder.
    """

    def __init__(
        self,
        model_name,
        config,
        multisample_dropout=True,
        output_hidden_states=False
    ):
        super(CommonLitModel, self).__init__()
        self.config = config
        self.roberta = RobertaModel.from_pretrained(
            model_name,
            output_hidden_states=output_hidden_states
        )

        self.layer_norm = nn.LayerNorm(config.hidden_size)

        dropout_rates = [0.5] * 5 if multisample_dropout else [0.3]
        self.dropouts = nn.ModuleList([nn.Dropout(rate) for rate in dropout_rates])

        self.regressor = nn.Linear(config.hidden_size, 1)
        self._init_weights(self.layer_norm)
        self._init_weights(self.regressor)

    def _init_weights(self, module):
        """Initialise ``module`` in place according to its layer type."""
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None
    ):
        """Run the model.

        Returns ``(logits, *encoder_outputs[1:])``; when ``labels`` is given
        the RMSE loss is prepended and ``logits`` is flattened and cast to the
        dtype of ``labels``.
        """
        encoder_out = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        pooled = self.layer_norm(encoder_out[1])

        # Multi-sample dropout: average the regressor over every dropout branch.
        branch_outputs = [self.regressor(drop(pooled)) for drop in self.dropouts]
        logits = branch_outputs[0]
        for extra in branch_outputs[1:]:
            logits = logits + extra
        logits = logits / len(self.dropouts)

        if labels is None:
            return (logits,) + encoder_out[1:]

        # Loss is the square root of the MSE, i.e. RMSE.
        logits = logits.view(-1).to(labels.dtype)
        loss = torch.sqrt(torch.nn.MSELoss()(logits, labels.view(-1)))
        return (loss, logits) + encoder_out[1:]
# Load the pretrained configuration and build the model and its tokenizer
def make_model(model_name, num_labels=1):
    """Return ``(model, tokenizer)`` built from the directory ``model_name``.

    The RoBERTa configuration is loaded, ``num_labels`` is written into it,
    and it is handed to ``CommonLitModel``.
    """
    roberta_tokenizer = RobertaTokenizer.from_pretrained(model_name)
    roberta_config = RobertaConfig.from_pretrained(model_name)
    roberta_config.update({'num_labels': num_labels})
    return CommonLitModel(model_name, config=roberta_config), roberta_tokenizer
# Wrap the texts in a sequential DataLoader that feeds tensors to the model
def make_loader(
    data,
    tokenizer,
    max_len,
    batch_size,
):
    """Return a test-mode DataLoader over ``data``.

    Samples are visited in order and the loader uses half of ``batch_size``
    (integer division) per batch.
    """
    dataset = DatasetRetriever(data, tokenizer, max_len, is_test=True)
    return DataLoader(
        dataset,
        batch_size=batch_size // 2,
        sampler=SequentialSampler(dataset),
        pin_memory=False,
        drop_last=False,
        num_workers=0,
    )
# Inference-only evaluator
class Evaluator:
    """Run a model over a data loader and collect its predictions.

    Args:
        model: module whose forward accepts ``input_ids``, ``attention_mask``
            and ``token_type_ids`` keywords and returns a sequence whose first
            element holds the predictions.
        scalar: when not ``None``, the forward pass runs under
            ``torch.cuda.amp.autocast``.
    """

    def __init__(self, model, scalar=None):
        self.model = model
        self.scalar = scalar

    def _model_device(self):
        """Return the device of the model's first parameter."""
        first_param = next(self.model.parameters(), None)
        if first_param is not None:
            return first_param.device
        # Parameter-free model: keep the previous behaviour of preferring CUDA.
        return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def evaluate(self, data_loader, tokenizer):
        """Return a flat list with one float prediction per sample.

        Args:
            data_loader: iterable of dict batches with the keys ``input_ids``,
                ``attention_mask`` and ``token_type_ids``.
            tokenizer: unused; kept for interface compatibility.
        """
        preds = []
        self.model.eval()
        # Inputs follow the model's device instead of an unconditional .cuda().
        device = self._model_device()
        with torch.no_grad():
            for batch_data in data_loader:
                model_inputs = {
                    field: batch_data[field].to(device)
                    for field in ('input_ids', 'attention_mask', 'token_type_ids')
                }

                if self.scalar is not None:
                    with torch.cuda.amp.autocast():
                        outputs = self.model(**model_inputs)
                else:
                    outputs = self.model(**model_inputs)

                # reshape(-1) keeps a 1-D array even for a single-sample batch;
                # squeeze() collapsed it to a scalar and `preds += float` raised.
                batch_preds = outputs[0].detach().cpu().numpy().reshape(-1)
                preds.extend(batch_preds.tolist())
        return preds
# Build everything needed to run one fold: model, tokenizer, loader, scaler
def config(fold, model_name, load_model_path):
    """Prepare one fine-tuned fold for inference on the module-level ``test``.

    Seeds torch with 2021, builds the model and tokenizer from ``model_name``,
    loads ``{load_model_path}/model{fold}.bin`` into the model, builds the
    test loader (max length 250, batch size 8) and moves the model to the GPU.

    Returns:
        ``(model, tokenizer, test_loader, scaler)`` where ``scaler`` is ``None``.

    Raises:
        ValueError: when no CUDA device is available.
    """
    for seed_fn in (torch.manual_seed,
                    torch.cuda.manual_seed,
                    torch.cuda.manual_seed_all):
        seed_fn(2021)

    max_len, batch_size = 250, 8

    model, tokenizer = make_model(model_name=model_name, num_labels=1)

    checkpoint = torch.load(f'{load_model_path}/model{fold}.bin')
    model.load_state_dict(checkpoint)

    test_loader = make_loader(
        test, tokenizer, max_len=max_len,
        batch_size=batch_size
    )

    if torch.cuda.device_count() < 1:
        raise ValueError('CPU training is not supported')

    print('Model pushed to {} GPU(s), type {}.'.format(
        torch.cuda.device_count(),
        torch.cuda.get_device_name(0))
    )
    model = model.cuda()

    # Mixed precision is disabled: no GradScaler is created.
    scaler = None
    return (model, tokenizer, test_loader, scaler)
# Run inference for one fold and return its test-set predictions
def run(fold=0, model_name=None, load_model_path=None):
    """Predict the test set with one fine-tuned fold checkpoint.

    Args:
        fold: index of the checkpoint file (``model{fold}.bin``).
        model_name: directory of the pretrained backbone.
        load_model_path: directory containing the fold checkpoints.

    Returns:
        The list of predictions produced by ``Evaluator.evaluate``.
    """
    model, tokenizer, \
        test_loader, scaler = config(fold, model_name, load_model_path)

    evaluator = Evaluator(model, scaler)
    preds = evaluator.evaluate(test_loader, tokenizer)

    # The evaluator holds a reference to the model, so it must be dropped as
    # well; otherwise the model is still alive when empty_cache() runs and no
    # GPU memory is returned. (The previous timing list was never read and
    # has been removed.)
    del evaluator, model, tokenizer, test_loader, scaler
    gc.collect()
    torch.cuda.empty_cache()

    return preds

In [11]:
# Uses one roberta-base and two roberta-large model families, five folds each.
# Each DataFrame gets one column of test predictions per fold.
pred_df1 = pd.DataFrame()
pred_df2 = pd.DataFrame()
pred_df3 = pd.DataFrame()

# `fold%5` equals `fold` here because fold already ranges over 0..4.
for fold in tqdm(range(5)):
    pred_df1[f'fold{fold}'] = run(fold%5, '../input/roberta-base/', '../input/commonlit-roberta-base-i/')
    pred_df2[f'fold{fold+5}'] = run(fold%5, '../input/robertalarge/', '../input/roberta-large-itptfit/')
    pred_df3[f'fold{fold+10}'] = run(fold%5, '../input/robertalarge/', '../input/commonlit-roberta-large-ii/')

  0%|          | 0/5 [00:00<?, ?it/s]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


 20%|██        | 1/5 [01:06<04:27, 66.89s/it]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


 40%|████      | 2/5 [02:00<02:57, 59.04s/it]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


 60%|██████    | 3/5 [02:53<01:52, 56.15s/it]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


 80%|████████  | 4/5 [03:46<00:55, 55.07s/it]

Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.
Model pushed to 1 GPU(s), type Tesla P100-PCIE-16GB.


100%|██████████| 5/5 [04:38<00:00, 55.69s/it]


In [12]:
# Convert each model family's per-fold predictions to an array (one column per fold)
pred_df1 = np.array(pred_df1)
pred_df2 = np.array(pred_df2)
pred_df3 = np.array(pred_df3)
# Average the folds of each family (axis=1), then blend the families with weights
# chosen from their scores and their estimated risk of overfitting
model2_predictions = (pred_df2.mean(axis=1) * 0.5) + (pred_df1.mean(axis=1) * 0.3) + (pred_df3.mean(axis=1) * 0.2)

# Model 3

Inspired by: https://www.kaggle.com/ragnar123/commonlit-readability-roberta-tf-inference

In [13]:
import re
import os
import numpy as np
import pandas as pd
import random
import math
import tensorflow as tf
import logging
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from tensorflow.keras import backend as K
from transformers import RobertaTokenizer, TFRobertaModel
from kaggle_datasets import KaggleDatasets
tf.get_logger().setLevel(logging.ERROR)
from kaggle_datasets import KaggleDatasets

In [14]:
# Configuration for Model 3
# Number of fold checkpoints to ensemble
FOLDS = 5

# Maximum token length fed to the model.
# NOTE: this rebinds the module-level MAX_LEN that was set to 248 for Model 1.
MAX_LEN = 250

# Directory of the pretrained TF RoBERTa model to use
MODEL = '../input/tfroberta-base'

# Load the model's tokenizer (rebinds the module-level `tokenizer` created for Model 1)
tokenizer = RobertaTokenizer.from_pretrained(MODEL)

In [15]:
# Tokenise texts with the transformer tokenizer (the tokenizer maps text to id sequences)
def regular_encode(texts, tokenizer, maxlen = MAX_LEN):
    """Return the ``input_ids`` of ``texts`` as a NumPy array.

    Every text is padded/truncated to ``maxlen`` tokens by
    ``tokenizer.batch_encode_plus``.
    """
    encoded = tokenizer.batch_encode_plus(
        texts,
        padding='max_length',
        truncation=True,
        max_length=maxlen,
    )
    token_ids = encoded['input_ids']
    return np.array(token_ids)

# Encode the input texts with the module-level tokenizer
def encode_texts(x_test, MAX_LEN):
    """Convert ``x_test`` to a list and return its token-id array of width ``MAX_LEN``."""
    return regular_encode(x_test.tolist(), tokenizer, maxlen=MAX_LEN)

# Build the Keras regression model
def build_roberta_base_model(max_len = MAX_LEN):
    """Return a Keras model mapping ``max_len`` token ids to one linear output.

    The pretrained TF RoBERTa at ``MODEL`` produces the sequence output; the
    vector at position 0 feeds a single-unit linear Dense layer.
    """
    roberta = TFRobertaModel.from_pretrained(MODEL)
    token_ids = tf.keras.layers.Input(shape = (max_len, ), dtype = tf.int32, name = 'input_word_ids')
    hidden_states = roberta(token_ids)[0]
    # Keep only the first position of each sequence, giving a 2-D tensor.
    first_token = hidden_states[:, 0, :]
    score = tf.keras.layers.Dense(1, activation = 'linear', dtype = 'float32')(first_token)
    return tf.keras.models.Model(inputs = [token_ids], outputs = [score])

# Inference function for Model 3
def roberta_base_inference1():
    """Return the mean test prediction of the ``FOLDS`` TF RoBERTa checkpoints.

    The test CSV is read from disk, its excerpts are tokenised to ``MAX_LEN``
    ids, and each fold's model is built, loaded and run in turn. Each fold
    contributes ``1 / FOLDS`` of the returned array.
    """
    # Read the test data
    df = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
    # Encode the excerpt column with the RoBERTa tokenizer
    x_test = encode_texts(df['excerpt'], MAX_LEN)
    # Accumulator for the fold-averaged predictions
    predictions = np.zeros(len(df))
    for i in range(FOLDS):
        print('\n')
        print('-'*50)
        print(f'Predicting with model {i + 1}')
        # Build the model and load this fold's fine-tuned weights
        model = build_roberta_base_model(max_len = MAX_LEN)
        model.load_weights(f'../input/epochs-100-lr-4e5-seed-123/Roberta_Base_123_{i + 1}.h5')
        # Add this fold's share of the average
        fold_predictions = model.predict(x_test).reshape(-1)
        predictions += fold_predictions / FOLDS

        del model
        # Fix: building models in a loop accumulates Keras global state;
        # clear it explicitly. torch.cuda.empty_cache() only affects
        # PyTorch's allocator and is kept for the models run earlier.
        tf.keras.backend.clear_session()
        gc.collect()
        torch.cuda.empty_cache()

    return predictions

# Run the fold ensemble for Model 3 and keep its averaged test predictions
model3_predictions = roberta_base_inference1()



--------------------------------------------------
Predicting with model 1


All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/tfroberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.




--------------------------------------------------
Predicting with model 2


All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/tfroberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.




--------------------------------------------------
Predicting with model 3


All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/tfroberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.




--------------------------------------------------
Predicting with model 4


All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/tfroberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.




--------------------------------------------------
Predicting with model 5


All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/tfroberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


## Model 4 

Inspired by: https://www.kaggle.com/jcesquiveld/best-transformer-representations

In [16]:
import os
import numpy as np
import pandas as pd
import random

from transformers import AutoConfig, AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, logging

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset, SequentialSampler, RandomSampler, DataLoader

from tqdm.notebook import tqdm

from IPython.display import clear_output

from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')
# `logging` here is the module imported from transformers in this cell; it shadows
# the standard-library `logging` imported earlier in the notebook.
logging.set_verbosity_error()

In [17]:
# Paths of the input data, the pretrained backbone and the fine-tuned checkpoints
INPUT_DIR = '../input/commonlitreadabilityprize'
MODEL_DIR = '../input/roberta-transformers-pytorch/roberta-large'
CHECKPOINT_DIR1 = '../input/clrp-mean-pooling/'
CHECKPOINT_DIR2 = '../input/clrp-mean-pooling-seeds-17-43/'

# Rebinds DEVICE (a plain string in the Model 1 section) to a torch.device
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

MAX_LENGTH = 300  # token length used by get_test_loader
TEST_BATCH_SIZE = 1  # the inference loop calls .item() on each batch's output, which needs one sample per batch
HIDDEN_SIZE = 1024  # input width of MeanPoolingModel.linear

NUM_FOLDS = 5
SEEDS = [113, 71]

# Rebinds the module-level `test` to a fresh read of test.csv
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))

In [18]:
# Model 4 definition (mean-pooling model)
class MeanPoolingModel(nn.Module):
    """Transformer regressor that mean-pools the unmasked token states.

    Args:
        model_name: directory or name of the pretrained backbone; its hidden
            width must equal the module-level ``HIDDEN_SIZE``.
    """

    def __init__(self, model_name):
        super().__init__()

        backbone_config = AutoConfig.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name, config=backbone_config)
        self.linear = nn.Linear(HIDDEN_SIZE, 1)
        self.loss = nn.MSELoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return predictions, or the MSE loss when ``labels`` is given."""
        token_states = self.model(input_ids, attention_mask)[0]

        # Broadcast the mask over the hidden axis so padded tokens contribute zero.
        mask = attention_mask.unsqueeze(-1).expand(token_states.size()).float()
        masked_sum = torch.sum(token_states * mask, 1)
        # Clamp the token count so a fully masked row cannot divide by zero.
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        pooled = masked_sum / token_count

        # The second squeeze turns a single-sample batch into a 0-d tensor.
        preds = self.linear(pooled).squeeze(-1).squeeze(-1)

        if labels is None:
            return preds
        return self.loss(preds.view(-1).float(), labels.view(-1).float())

In [19]:
def get_test_loader(data):
    """Return a sequential DataLoader over the tokenised excerpts of ``data``.

    Excerpts are encoded with the tokenizer at ``MODEL_DIR``, padded/truncated
    to ``MAX_LENGTH`` tokens; each batch is an ``(input_ids, attention_mask)``
    pair of ``TEST_BATCH_SIZE`` rows.
    """
    excerpts = data.excerpt.tolist()

    roberta_tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
    encoded = roberta_tokenizer.batch_encode_plus(
        excerpts,
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors='pt',
    )

    dataset = TensorDataset(encoded['input_ids'], encoded['attention_mask'])

    # SequentialSampler keeps batches in the row order of `data`.
    return DataLoader(
        dataset,
        sampler=SequentialSampler(dataset),
        batch_size=TEST_BATCH_SIZE,
    )

# Tokenise the test set once; the loader is reused for every seed/fold checkpoint below
test_dataloader = get_test_loader(test)

In [20]:
# Collect Model 4 predictions: average over folds within a seed, then over seeds
all_predictions = []
for seed in SEEDS:

    fold_predictions = []

    for fold in tqdm(range(NUM_FOLDS)):
        model_path = f"model_{seed + 1}_{fold + 1}.pth"

        # Only the bare file name is logged; the directory is prepended below.
        print(f"\nUsing {model_path}")

        # Seeds 113 and 71 live in CHECKPOINT_DIR1; any other seed in CHECKPOINT_DIR2.
        if seed in [113, 71]:
            model_path = CHECKPOINT_DIR1 + model_path
        else:
            model_path = CHECKPOINT_DIR2 + model_path

        model = MeanPoolingModel(MODEL_DIR)
        # map_location lets the checkpoint load when DEVICE resolved to CPU.
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))
        model.to(DEVICE)
        model.eval()

        predictions = []
        # Fix: inference previously ran with autograd enabled, keeping
        # activations for a backward pass that never happens.
        with torch.no_grad():
            for batch in test_dataloader:

                batch = tuple(b.to(DEVICE) for b in batch)

                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'labels':         None,
                         }

                # .item() relies on TEST_BATCH_SIZE == 1 (one scalar per batch).
                preds = model(**inputs).item()
                predictions.append(preds)

        del model
        gc.collect()
        torch.cuda.empty_cache()

        fold_predictions.append(predictions)
    all_predictions.append(np.mean(fold_predictions, axis=0).tolist())

model4_predictions = np.mean(all_predictions,axis=0)

  0%|          | 0/5 [00:00<?, ?it/s]


Using model_114_1.pth

Using model_114_2.pth

Using model_114_3.pth

Using model_114_4.pth

Using model_114_5.pth


  0%|          | 0/5 [00:00<?, ?it/s]


Using model_72_1.pth

Using model_72_2.pth

Using model_72_3.pth

Using model_72_4.pth

Using model_72_5.pth


# Model 5

In [21]:
# Import required libraries
import os
import gc
import sys
import cv2
import math
import time
import tqdm
import random
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim import Adam, lr_scheduler
from torch.utils.data import Dataset, DataLoader

from transformers import (AutoModel, AutoTokenizer, 
                          AutoModelForSequenceClassification)

import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff


from colorama import Fore, Back, Style
# Short aliases for colorama colour codes.
# NOTE(review): none of these aliases is referenced in the visible code — confirm before removing.
y_ = Fore.YELLOW
r_ = Fore.RED
g_ = Fore.GREEN
b_ = Fore.BLUE
m_ = Fore.MAGENTA
c_ = Fore.CYAN
sr_ = Style.RESET_ALL

# Load the competition data; train_data is needed here because the SVR heads are fitted in this notebook
train_data = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
test_data = pd.read_csv('../input/commonlitreadabilityprize/test.csv')
sample = pd.read_csv('../input/commonlitreadabilityprize/sample_submission.csv')

# Number of bins = floor(1 + log2(n_samples))
num_bins = int(np.floor(1 + np.log2(len(train_data))))
# Cut the continuous target into integer-labelled bins; get_preds_svm stratifies its folds on them
train_data.loc[:,'bins'] = pd.cut(train_data['target'],bins=num_bins,labels=False)

target = train_data['target'].to_numpy()
bins = train_data.bins.to_numpy()
# Compute the RMSE score
def rmse_score(y_true,y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

# Settings for Model 5.
# NOTE: this rebinds the module-level name `config`, which earlier in the notebook
# referred to the Model 2 function config(); run() must not be called after this point.
config = {
    'batch_size':128,  # DataLoader batch size in get_embeddings
    'max_len':256,  # token length used by CLRPDataset
    'nfolds':5,  # number of StratifiedKFold splits in get_preds_svm
    'seed':42,  # seed for seed_everything and the fold split
}
# Set the random seeds
def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: integer seed applied to every generator.
    """
    random.seed(seed)
    # Fix: the variable is PYTHONHASHSEED (it was misspelled 'PYTHONASSEED',
    # which nothing reads). Presumably it only affects interpreters started
    # after this point — verify if hash reproducibility matters here.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Fix: benchmark mode auto-tunes the convolution algorithm and may pick a
    # different one per run, which defeats the deterministic setting above.
    torch.backends.cudnn.benchmark = False

# Apply the configured seed before any Model 5 computation
seed_everything(seed=config['seed'])
# Dataset for Model 5
class CLRPDataset(Dataset):
    """Tokenise excerpts one at a time, on access.

    Args:
        df: data frame with an ``excerpt`` column.
        tokenizer: callable tokenizer applied to each excerpt.
    """

    def __init__(self,df,tokenizer):
        self.tokenizer = tokenizer
        self.excerpt = df['excerpt'].to_numpy()

    def __len__(self):
        return len(self.excerpt)

    def __getitem__(self,idx):
        # Pad/truncate to config['max_len'] tokens and return PyTorch tensors.
        return self.tokenizer(
            self.excerpt[idx],
            return_tensors='pt',
            max_length=config['max_len'],
            padding='max_length',
            truncation=True,
        )
# Attention pooling head
class AttentionHead(nn.Module):
    """Pool a sequence of feature vectors with learned attention weights.

    Args:
        in_features: width of each input feature vector.
        hidden_dim: width of the hidden scoring layer; also stored as
            ``out_features``.
        num_targets: accepted for interface compatibility; not used.

    The pooled vector returned by ``forward`` has the width of the input
    features, since it is a weighted sum of them.
    """

    def __init__(self, in_features, hidden_dim, num_targets):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.out_features = hidden_dim

        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        """Return the attention-weighted sum of ``features`` over dim 1."""
        scores = self.V(torch.tanh(self.W(features)))
        # Normalise the scores over dim 1 so the weights sum to one.
        weights = torch.softmax(scores, dim=1)
        return torch.sum(weights * features, dim=1)
# Model 5 embedding extractor
class Model(nn.Module):
    """RoBERTa encoder followed by an attention pooling head.

    ``forward`` returns the pooled vector from the head; the ``dropout`` and
    ``linear`` layers are created but not applied in ``forward``.
    """

    def __init__(self):
        super(Model,self).__init__()
        self.roberta = AutoModel.from_pretrained('../input/roberta-base')
        self.head = AttentionHead(768,768,1)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(self.head.out_features,1)

    def forward(self,**xb):
        """Encode the keyword inputs and return the attention-pooled states."""
        token_states = self.roberta(**xb)[0]
        return self.head(token_states)
# Extract embeddings from a fine-tuned checkpoint
def get_embeddings(df,path,plot_losses=True, verbose=True):
    """Return the pooled embedding of every excerpt in ``df``.

    Args:
        df: data frame with an ``excerpt`` column.
        path: file path of the fine-tuned ``Model`` state dict.
        plot_losses: unused; kept for interface compatibility.
        verbose: unused; kept for interface compatibility.

    Returns:
        ``np.ndarray`` with one row per excerpt, in the row order of ``df``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"{device} is used")

    model = Model()
    # Fix: map_location makes the CPU fallback above usable for checkpoints
    # that were saved from a GPU.
    model.load_state_dict(torch.load(path, map_location=device))
    model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained('../input/roberta-base')

    ds = CLRPDataset(df,tokenizer)
    dl = DataLoader(ds,
                  batch_size = config["batch_size"],
                  shuffle=False,
                  num_workers = 4,
                  pin_memory=True,
                  drop_last=False
                 )

    embeddings = []
    with torch.no_grad():
        for inputs in tqdm(dl):
            # Collapse each tensor to (batch, -1); presumably this removes the
            # extra per-item dimension added by return_tensors='pt' — confirm.
            inputs = {key:val.reshape(val.shape[0],-1).to(device) for key,val in inputs.items()}
            outputs = model(**inputs).detach().cpu().numpy()
            embeddings.extend(outputs)

    del model
    gc.collect()
    torch.cuda.empty_cache()

    return np.array(embeddings)

# Compute train and test embeddings (model outputs) with each of the five fine-tuned checkpoints
train_embeddings1 =  get_embeddings(train_data,'../input/roberta-svm-finetune/model0/model0.bin')
test_embeddings1 = get_embeddings(test_data,'../input/roberta-svm-finetune/model0/model0.bin')

train_embeddings2 =  get_embeddings(train_data,'../input/roberta-svm-finetune/model1/model1.bin')
test_embeddings2 = get_embeddings(test_data,'../input/roberta-svm-finetune/model1/model1.bin')

train_embeddings3 =  get_embeddings(train_data,'../input/roberta-svm-finetune/model2/model2.bin')
test_embeddings3 = get_embeddings(test_data,'../input/roberta-svm-finetune/model2/model2.bin')

train_embeddings4 =  get_embeddings(train_data,'../input/roberta-svm-finetune/model3/model3.bin')
test_embeddings4 = get_embeddings(test_data,'../input/roberta-svm-finetune/model3/model3.bin')

train_embeddings5 =  get_embeddings(train_data,'../input/roberta-svm-finetune/model4/model4.bin')
test_embeddings5 = get_embeddings(test_data,'../input/roberta-svm-finetune/model4/model4.bin')
# Fit SVR heads on the embeddings and return their averaged test predictions
def get_preds_svm(X,y,X_test,bins=bins,nfolds=5,C=10,kernel='rbf'):
    """Cross-validate an SVR on ``X`` and average its predictions on ``X_test``.

    Args:
        X: training feature matrix, indexable by integer index arrays.
        y: training targets aligned with ``X``.
        X_test: test feature matrix.
        bins: stratification labels aligned with ``X``.
        nfolds: number of stratified folds; also the averaging divisor.
        C: SVR regularisation parameter.
        kernel: SVR kernel name.

    Returns:
        ``np.ndarray`` with the mean of the per-fold test predictions.
    """
    scores = []
    preds = np.zeros(X_test.shape[0])

    # Fix: the split used config['nfolds'] while the average divided by
    # `nfolds`, so a non-default `nfolds` produced a wrongly scaled result.
    kfold = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=config['seed'])
    for k, (train_idx,valid_idx) in enumerate(kfold.split(X,bins)):
        model = SVR(C=C,kernel=kernel,gamma='auto')
        model.fit(X[train_idx],y[train_idx])

        # Score the held-out fold for the progress log.
        score = rmse_score(model.predict(X[valid_idx]),y[valid_idx])
        print(f'Fold {k} , rmse score: {score}')
        scores.append(score)

        preds += model.predict(X_test)

    print("mean rmse",np.mean(scores))
    return preds/nfolds


# One SVR ensemble per checkpoint's embeddings
svm_preds1 = get_preds_svm(train_embeddings1,target,test_embeddings1)
svm_preds2 = get_preds_svm(train_embeddings2,target,test_embeddings2)
svm_preds3 = get_preds_svm(train_embeddings3,target,test_embeddings3)
svm_preds4 = get_preds_svm(train_embeddings4,target,test_embeddings4)
svm_preds5 = get_preds_svm(train_embeddings5,target,test_embeddings5)
# Release the embeddings and clear the GPU cache; only the predictions are kept
del train_embeddings1, test_embeddings1
gc.collect() 
del train_embeddings2, test_embeddings2
gc.collect() 
del train_embeddings3, test_embeddings3
gc.collect() 
del train_embeddings4, test_embeddings4
gc.collect() 
del train_embeddings5, test_embeddings5
gc.collect() 
torch.cuda.empty_cache()

# Model 5 prediction = unweighted mean of the five SVR ensembles
model5_predictions = (svm_preds1 + svm_preds2 + svm_preds3 + svm_preds4 + svm_preds5)/5

cuda is used


23it [00:22,  1.04it/s]


cuda is used


1it [00:00,  3.83it/s]


cuda is used


23it [00:22,  1.04it/s]


cuda is used


1it [00:00,  4.11it/s]


cuda is used


23it [00:22,  1.04it/s]


cuda is used


1it [00:00,  3.79it/s]


cuda is used


23it [00:22,  1.04it/s]


cuda is used


1it [00:00,  4.03it/s]


cuda is used


23it [00:22,  1.04it/s]


cuda is used


1it [00:00,  4.18it/s]


Fold 0 , rmse score: 0.47371087181221916
Fold 1 , rmse score: 0.2759399586888456
Fold 2 , rmse score: 0.2757889178617948
Fold 3 , rmse score: 0.26363757173587454
Fold 4 , rmse score: 0.27470609978911387
mean rmse 0.3127566839775696
Fold 0 , rmse score: 0.24790297319827587
Fold 1 , rmse score: 0.5029747594364647
Fold 2 , rmse score: 0.23706084685476164
Fold 3 , rmse score: 0.23096696063760855
Fold 4 , rmse score: 0.2436154995918405
mean rmse 0.2925042079437903
Fold 0 , rmse score: 0.3776516226841351
Fold 1 , rmse score: 0.4078474456611516
Fold 2 , rmse score: 0.48660077143868474
Fold 3 , rmse score: 0.3653338741725369
Fold 4 , rmse score: 0.3915142785400578
mean rmse 0.4057895984993133
Fold 0 , rmse score: 0.2921394718315368
Fold 1 , rmse score: 0.27957332343442026
Fold 2 , rmse score: 0.28540387014746454
Fold 3 , rmse score: 0.45252763248661115
Fold 4 , rmse score: 0.29332989341794413
mean rmse 0.3205948382635954
Fold 0 , rmse score: 0.39438701377721463
Fold 1 , rmse score: 0.422104866

In [23]:
# Blend the five model families with equal weights
predictions = (model1_predictions + model2_predictions + model3_predictions + model4_predictions + model5_predictions) / 5
predictions

array([-0.41964708, -0.49231511, -0.41502767, -2.49319146, -1.84745904,
       -1.2765338 ,  0.14622188])

In [24]:
# Write the blended predictions into the target column of the submission template
submission_df.target = predictions
submission_df

Unnamed: 0,id,target
0,c0f722661,-0.419647
1,f0953f0a5,-0.492315
2,0df072751,-0.415028
3,04caf4e0c,-2.493191
4,0e63f8bea,-1.847459
5,12537fe78,-1.276534
6,965e592c0,0.146222


In [25]:
# Write the results to the submission file
submission_df.to_csv("submission.csv", index=False)