### Log
* lgbm

In [1]:
from typing import List
import argparse
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold
import shutil
import time
import gc
import random
import math
import torch
from torch.utils.data import DataLoader, Dataset
import logging
import transformers
from transformers import TrainingArguments, Trainer, DataCollatorForWholeWordMask
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel
from transformers.modeling_outputs  import BaseModelOutput,SequenceClassifierOutput
from torch import nn
from torch.optim import Adam, SGD, AdamW
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup, DataCollatorWithPadding
from datasets import load_metric, disable_progress_bar
import datasets
# imports the torch_xla package
import wandb
from torch.nn.parameter import Parameter
#os.environ["WANDB_DISABLED"] = "true"
from tqdm import tqdm
from spellchecker import SpellChecker
import re
from autocorrect import Speller
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
import spacy
import xgboost
import warnings
from dataclasses import dataclass, field
from typing import Optional
import lightgbm as lgb

warnings.simplefilter('ignore')
logging.disable(logging.ERROR);
os.environ['TOKENIZER_PARALLELISM'] = 'false'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
disable_progress_bar()
tqdm.pandas()

In [2]:
import spellchecker
spellchecker.__version__

'0.7.2'

In [3]:
# set random seed
def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
    
seed_everything(seed=42)

## Config

In [4]:
class CFG:
    pretraining = False
    load_pretrained = False
    input_path = './input'
    input_type = '2'
    model_path = 'microsoft/deberta-v3-large' #  nghuyong/ernie-2.0-large-en studio-ousia/luke-large
    model_type = 'custom'
    scheduler = 'cosine'  # ['linear', 'cosine']
    batch_scheduler = True
    num_cycles = 0.5  # 1.5
    num_warmup_steps = 0
    max_input_length = 1600
    max_position_embeddings = 1600
    n_folds = 4
    folds = [1]
    epochs = 4  # 5
    # layer - wise larning rate 
    discriminative_learning_rate = False
    discriminative_learning_rate_num_groups = 1
    discriminative_learning_rate_decay_rate = 0.99
    # reinint layer
    reinit_layers = 0
    
#     encoder_lr = 5e-6
#     head_lr = 5e-6
    encoder_lr = 20e-6
    head_lr = 10e-5
    
    min_lr = 1e-7
    eps = 1e-7
    betas = (0.9, 0.999)
    weight_decay = 0
    dropout = 0
    num_fold = 5
    batch_size = 8
    seed = 42
    OUTPUT_DIR = './pretrain/'
    num_workers = 2
    device='cuda'
    print_freq = 100
    run_deberta_model = True;

    
@dataclass
class Config:
    model_name_or_path: Optional[str] = field(
        default="microsoft/deberta-v3-base",
        metadata={"help": "Model name or path"},
    )

    data_dir: Optional[str] = field(
        default="/kaggle/input/commonlit-evaluate-student-summaries",
        metadata={"help": "Data directory"},
    )

    max_seq_length: Optional[int] = field(
        default=CFG.max_input_length,
        metadata={"help": "Max sequence length"},
    )

    add_prompt_question: Optional[bool] = field(
        default=False,
        metadata={"help": "Add prompt question into input"},
    )

    add_prompt_text: Optional[bool] = field(
        default=False,
        metadata={"help": "Add prompt text into input"},
    )

    fold: Optional[int] = field(
        default=0,
        metadata={"help": "Fold"},
    )

    num_proc: Optional[int] = field(
        default=4,
        metadata={"help": "Number of processes"},
    )

    dropout: Optional[float] = field(
        default=0.,
        metadata={"help": "Amount of dropout to apply"},
    )
    max_position_embeddings: Optional[int] = field(
        default=CFG.max_input_length,
        metadata={"help": "Amount of dropout to apply"},
    )


def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }

In [5]:
import nltk
nltk.data.path.append('/root/workspace/commonlit/nltk')

In [6]:
class Preprocessor:
    def __init__(self):
        self.tokenizer = AutoTokenizer.from_pretrained(CFG.model_path);
        self.twd = TreebankWordDetokenizer()
        self.STOP_WORDS = set(stopwords.words('english'))
        
        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = Speller(lang='en')
        self.spellchecker = SpellChecker() 
        
    def word_overlap_count(self, row):
        """ intersection(prompt_text, text) """        
        def check_is_stop_word(word):
            return word in self.STOP_WORDS
        
        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            prompt_words = list(filter(check_is_stop_word, prompt_words))
            summary_words = list(filter(check_is_stop_word, summary_words))
        return len(set(prompt_words).intersection(set(summary_words)))
            
    def ngrams(self, token, n):
        # Use the zip function to help us generate n-grams
        # Concatentate the tokens into ngrams and return
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]

    def ngram_co_occurrence(self, row, n: int) -> int:
        # Tokenize the original text and summary into words
        original_tokens = row['prompt_tokens']
        summary_tokens = row['summary_tokens']

        # Generate n-grams for the original text and summary
        original_ngrams = set(self.ngrams(original_tokens, n))
        summary_ngrams = set(self.ngrams(summary_tokens, n))

        # Calculate the number of common n-grams
        common_ngrams = original_ngrams.intersection(summary_ngrams)
        return len(common_ngrams)
    
    def ner_overlap_count(self, row, mode:str):
        model = self.spacy_ner_model
        def clean_ners(ner_list):
            return set([(ner[0].lower(), ner[1]) for ner in ner_list])
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
            summary_ner = set([(token.text, token.label_) for token in summary.ents])
        elif "stanza" in str(model):
            prompt_ner = set([(token.text, token.type) for token in prompt.ents])
            summary_ner = set([(token.text, token.type) for token in summary.ents])
        else:
            raise Exception("Model not supported")

        prompt_ner = clean_ners(prompt_ner)
        summary_ner = clean_ners(summary_ner)

        intersecting_ners = prompt_ner.intersection(summary_ner)
        
        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))
        
        if mode == "train":
            return ner_dict
        elif mode == "test":
            return {key: ner_dict.get(key) for key in self.ner_keys}

    
    def quotes_count(self, row):
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        if len(quotes_from_summary)>0:
            return [quote in text for quote in quotes_from_summary].count(True)
        else:
            return 0

    def spelling(self, text):
        
        wordlist=text.split()
        amount_miss = len(list(self.spellchecker.unknown(wordlist)))

        return amount_miss
    
    def add_spelling_dictionary(self, tokens: List[str]) -> List[str]:
        """dictionary update for pyspell checker and autocorrect"""
        self.spellchecker.word_frequency.load_words(tokens)
        self.speller.nlp_data.update({token:1000 for token in tokens})
    
    def run(self, 
            prompts: pd.DataFrame,
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:
        
        # before merge preprocess
        prompts["prompt_length"] = prompts["prompt_text"].apply(
            lambda x: len(word_tokenize(x))
        )
        prompts["prompt_tokens"] = prompts["prompt_text"].apply(
            lambda x: word_tokenize(x)
        )

        summaries["summary_length"] = summaries["text"].apply(
            lambda x: len(word_tokenize(x))
        )
        summaries["summary_tokens"] = summaries["text"].apply(
            lambda x: word_tokenize(x)
        )
        
        # Add prompt tokens into spelling checker dictionary
        prompts["prompt_tokens"].apply(
            lambda x: self.add_spelling_dictionary(x)
        )
        
#         from IPython.core.debugger import Pdb; Pdb().set_trace()
        # fix misspelling
        summaries["fixed_summary_text"] = summaries["text"].progress_apply(
            lambda x: self.speller(x)
        )
        
        # count misspelling
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)
        
        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")

        # after merge preprocess
        # input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']
        
        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence,args=(2,), axis=1 
        )
        input_df['bigram_overlap_ratio'] = input_df['bigram_overlap_count'] / (input_df['summary_length'] - 1)
        
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )
        input_df['trigram_overlap_ratio'] = input_df['trigram_overlap_count'] / (input_df['summary_length'] - 2)
        
        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        
        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    
    
preprocessor = Preprocessor()   

### Read data and Create Folds

In [7]:
pdf = pd.read_csv(f"{CFG.input_path}/prompts_train.csv")
sdf = pd.read_csv(f"{CFG.input_path}/summaries_train.csv")

#df = pdf.merge(sdf, on="prompt_id")

#train = preprocessor.run(pdf, sdf, mode="train")
train = preprocessor.run(pdf, sdf, mode='test')


# 4 prompt ids, 4 folds
id2fold = {
    "814d6b": 0,
    "39c16e": 1,
    "3b9047": 2,
    "ebad26": 3,
}

train["fold"] = train["prompt_id"].map(id2fold)
#test = preprocessor.run(prompts_test, summaries_test, mode="test")

#test.head()

100%|██████████| 7165/7165 [09:03<00:00, 13.18it/s]
100%|██████████| 7165/7165 [00:01<00:00, 5775.34it/s]
100%|██████████| 7165/7165 [00:01<00:00, 5421.21it/s]
100%|██████████| 7165/7165 [00:02<00:00, 3476.68it/s]
100%|██████████| 7165/7165 [00:02<00:00, 2975.87it/s]
100%|██████████| 7165/7165 [00:00<00:00, 56963.09it/s]


## Deberta Model

In [8]:
def tokenize_inf(example, tokenizer, config):
    sep = tokenizer.sep_token;
    prompt = sep.join([example["prompt_title"], example["prompt_text"], example["prompt_question"]])
    tokenized = tokenizer(
        example["text"],
        prompt,
        padding=False,
        truncation=True,
        max_length=config.max_seq_length,
    )

    return {
        **tokenized
    }

model_paths = [
    '/root/autodl-tmp/output_fold0_seed42_0810',
    '/root/autodl-tmp/output_fold1_seed42_0810',
    '/root/autodl-tmp/output_fold2_seed42_0810',
    '/root/autodl-tmp/output_fold3_seed42_0810'
]
tokenizer = AutoTokenizer.from_pretrained(model_paths[0]);
model_config = AutoConfig.from_pretrained(model_paths[0]);
model_config.update({
    'hidden_dropout_prob': 0,
    'attention_probs_dropout_prob': 0,
    'num_labels':2,
    'problem_type': 'regression',
    'max_position_embeddings': 1600,
});

data_collator = DataCollatorWithPadding(
    tokenizer = tokenizer,
    pad_to_multiple_of=16,
)

# Do not use pretrained model
deberta_models = []
for model_path in model_paths:
    model = AutoModelForSequenceClassification.from_pretrained(model_path, config = model_config);
    deberta_models.append(model)
config = Config()

### Training

In [9]:
from transformers import TrainingArguments 


targets = ["content", "wording"]
drop_columns = ["fold", "student_id", "prompt_id", "text", 
                "prompt_question", "prompt_title", 
                "prompt_text", 'fixed_summary_text'
               ] + targets

lgb_models = []

for fold in range(0, CFG.n_folds):
    ## Run deberta model
    if CFG.run_deberta_model:
            
        X_train_cv = train[train["fold"] != fold]
        X_eval_cv = train[train["fold"] == fold]

        ## Create dataset for deberta
        X_train_ds = datasets.Dataset.from_pandas(X_train_cv);
        tokenized_X_train_ds = X_train_ds.map(tokenize_inf, 
                                              batched = False, 
                                              num_proc = 4,
                                              fn_kwargs= {'tokenizer': tokenizer, 'config': config}
                                             );
        X_eval_ds = datasets.Dataset.from_pandas(X_eval_cv);
        tokenized_X_eval_ds = X_eval_ds.map(tokenize_inf,
                                           batched = False,
                                           num_proc = 4,
                                           fn_kwargs = {'tokenizer': tokenizer, 'config': config}
                                           );
        deberta = deberta_models[fold]

        infer_args = TrainingArguments(
            output_dir = './',
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = 16,   
            dataloader_drop_last = False,
            eval_accumulation_steps=1,
        );

        # Init deberta predictor
        trainer = Trainer(
            model = deberta,
            args = infer_args,
            data_collator = data_collator,
            tokenizer = tokenizer
        )
        print(f'Run deberta model for fold {fold}')
        deberta_res_train = trainer.predict(tokenized_X_train_ds)[0]
        deberta_res_eval = trainer.predict(tokenized_X_eval_ds)[0]

        X_train_cv['pred_content_score'] = deberta_res_train[:,0]
        X_train_cv['pred_wording_score'] = deberta_res_train[:,1]
    
        X_eval_cv['pred_content_score'] = deberta_res_eval[:, 0]
        X_eval_cv['pred_wording_score'] = deberta_res_eval[:, 1]
        
        ## Output the files
        X_train_cv.to_csv(f'./boost_input/X_train_cv_{fold}.csv', index = False);
        X_eval_cv.to_csv(f'./boost_input/X_eval_cv_{fold}.csv', index = False);
    else:
        X_train_cv = pd.read_csv(f'./boost_input/X_train_cv_{fold}.csv')
        X_eval_cv = pd.read_csv(f'./boost_input/X_eval_cv_{fold}.csv');
    
    
    ## Train xgboost
    evaluation_results = {}
    eval_labels = {}
    eval_preds = {}
    lgb_model_pair = {}
    # Train lightgbm
#     params = {
#         'boosting_type': 'gbdt',
#         'random_state': 42,
#         'objective': 'regression',
#         'metric': 'rmse',
#         'learning_rate': 0.05,
#     }
    
    params = {
            'boosting_type': 'gbdt',
            'random_state': 42,
            'objective': 'regression',
            'metric': 'rmse',
            'learning_rate': 0.040,
            'max_depth': 4,  # 3
            'lambda_l1': 0.0,
            'lambda_l2': 0.011
        }
    for target in targets:
        y_train_cv = train[train["fold"] != fold][target]
        y_eval_cv = train[train["fold"] == fold][target]
        
        #Create lgb dataset
        dtrain = lgb.Dataset(X_train_cv.drop(columns=drop_columns), label=y_train_cv)
        dval = lgb.Dataset(X_eval_cv.drop(columns=drop_columns), label=y_eval_cv)
        
        lgb_model = lgb.train(
            params,
            num_boost_round=10000,
                            #categorical_feature = categorical_features,
                          valid_names=['train', 'valid'],
                          train_set=dtrain,
                          valid_sets=dval,
                          callbacks=[
                              lgb.early_stopping(stopping_rounds=30, verbose=True),
                               lgb.log_evaluation(100),
                              lgb.callback.record_evaluation(evaluation_results)
                            ],
                          )
        lgb_model_pair[target] = lgb_model
        ## save model
        lgb_model.save_model(f'./lgbt_{target}_{fold}.txt')
        eval_preds[target] = lgb_model.predict(X_eval_cv.drop(columns=drop_columns).values)
        eval_labels[target] = y_eval_cv
    ## Get compute_mcrmse
    preds_arr = np.concatenate([
        eval_preds['content'].reshape(-1, 1),
        eval_preds['wording'].reshape(-1, 1)],
        axis = 1
    )
    preds_arr_deberta = np.concatenate([
        X_eval_cv.loc[:, 'pred_content_score'].values.reshape(-1, 1),
        X_eval_cv.loc[:, 'pred_wording_score'].values.reshape(-1, 1)
    ], axis = 1)
    
    labels_arr = np.concatenate([
        train.loc[train["fold"] == fold, 'content'].values.reshape(-1, 1),
        train.loc[train['fold'] == fold, 'wording'].values.reshape(-1, 1)],
        axis = 1
    )
    
    print(f'deberta compute_mcrmse: ')
    print(compute_mcrmse([preds_arr_deberta, labels_arr]))
    print(f'Print compute_mcrmse for fold {fold}')
    print(compute_mcrmse([preds_arr, labels_arr]))
    print()

Run deberta model for fold 0


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000401 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1684
[LightGBM] [Info] Number of data points in the train set: 6062, number of used features: 11
[LightGBM] [Info] Start training from score -0.044904
Training until validation scores don't improve for 30 rounds
[100]	train's rmse: 0.53324
Early stopping, best iteration is:
[131]	train's rmse: 0.530962
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000326 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1684
[LightGBM] [Info] Number of data points in the train set: 6062, number of used features: 11
[LightGBM] [Info] Start training from score -0.168933
Training until validation scores don't improve for 30 rounds
[100]	train's rmse: 0.701341
Early stopping, best iteration is:
[121]	train's rmse: 0.699898
deberta c

Run deberta model for fold 1


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000309 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1682
[LightGBM] [Info] Number of data points in the train set: 5108, number of used features: 11
[LightGBM] [Info] Start training from score 0.017606
Training until validation scores don't improve for 30 rounds
[100]	train's rmse: 0.376967
Early stopping, best iteration is:
[127]	train's rmse: 0.37637
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000320 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1682
[LightGBM] [Info] Number of data points in the train set: 5108, number of used features: 11
[LightGBM] [Info] Start training from score -0.031791
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[57]	train's rmse: 0.562775
deberta compute_mcrmse: 
{'content_rmse'

Run deberta model for fold 2


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000338 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1644
[LightGBM] [Info] Number of data points in the train set: 5156, number of used features: 11
[LightGBM] [Info] Start training from score -0.039959
Training until validation scores don't improve for 30 rounds
[100]	train's rmse: 0.423289
Early stopping, best iteration is:
[72]	train's rmse: 0.42071
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000282 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1644
[LightGBM] [Info] Number of data points in the train set: 5156, number of used features: 11
[LightGBM] [Info] Start training from score -0.060941
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[55]	train's rmse: 0.577233
deberta compute_mcrmse: 
{'content_rmse'

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000281 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1634
[LightGBM] [Info] Number of data points in the train set: 5169, number of used features: 11
[LightGBM] [Info] Start training from score 0.013356
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[59]	train's rmse: 0.393759
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000277 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1634
[LightGBM] [Info] Number of data points in the train set: 5169, number of used features: 11
[LightGBM] [Info] Start training from score 0.028040
Training until validation scores don't improve for 30 rounds
Early stopping, best iteration is:
[58]	train's rmse: 0.470119
deberta compute_mcrmse: 
{'content_rmse': 0.3997122942162671, 'wording

In [None]:
X_train_cv.drop(columns=drop_columns).head(3)

In [None]:
X_train_cv.head(3)

In [None]:
eval_labels = {}
eval_preds = {}
xgboost_pair = {}
for target in targets:
    y_train_cv = train[train["fold"] != fold][target]
    y_eval_cv = train[train["fold"] == fold][target]

    xgboost_model = XGBRegressor(
            n_estimators=1000, 
            max_depth=7, 
            eta=0.001, 
            subsample=0.7, 
            colsample_bytree=0.8)
    xgboost_model.fit(
        X_train_cv.drop(columns=drop_columns).values,
        y_train_cv.values.reshape((-1, 1)) 
    )

    xgboost_pair[target] = xgboost_model
    ## save model
    xgboost_model.save_model(f'./boost_input/xgboost_{target}_{fold}.txt')
    eval_preds[target] = xgboost_model.predict(X_eval_cv.drop(columns=drop_columns).values)
    eval_labels[target] = y_eval_cv
## Get compute_mcrmse
preds_arr = np.concatenate([
    eval_preds['content'].reshape(-1, 1),
    eval_preds['wording'].reshape(-1, 1)],
    axis = 1
)
preds_arr_deberta = np.concatenate([
    X_eval_cv.loc[:, 'pred_content_score'].values.reshape(-1, 1),
    X_eval_cv.loc[:, 'pred_wording_score'].values.reshape(-1, 1)
], axis = 1)

labels_arr = np.concatenate([
    train.loc[train["fold"] == fold, 'content'].values.reshape(-1, 1),
    train.loc[train['fold'] == fold, 'wording'].values.reshape(-1, 1)],
    axis = 1
)

print(f'deberta compute_mcrmse: ')
print(compute_mcrmse([preds_arr_deberta, labels_arr]))
print(f'Print compute_mcrmse for fold {fold}')
print(compute_mcrmse([preds_arr, labels_arr]))
print()

## Obsolete

In [None]:
eval_labels_dict = {}
for target in targets:
    evaluation_results[target] = lgb_model_pair[target].predict(X_eval_cv.drop(columns=drop_columns))
    eval_labels_dict[target] = train[train["fold"] == fold][target]

In [None]:
eval_preds = np.concatenate([
    evaluation_results['content'].reshape(-1, 1),
    evaluation_results['wording'].reshape(-1, 1)],
    axis = 1)
eval_labels = np.concatenate([
    train.loc[train["fold"] == fold, 'content'].values.reshape(-1, 1),
    train.loc[train['fold'] == fold, 'wording'].values.reshape(-1, 1)],
    axis = 1
)

In [None]:
compute_mcrmse([eval_preds, eval_labels])


In [None]:
# train_ds = datasets.Dataset.from_pandas(train)
# tokenized_train_ds = train_ds.map(tokenize_inf, batched = False, num_proc = 4, fn_kwargs= {'tokenizer': tokenizer, 'config': config})

In [None]:
from transformers import TrainingArguments
infer_args = TrainingArguments(
    output_dir = './',
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = 16,   
    dataloader_drop_last = False,
    eval_accumulation_steps=1,
)

# init trainer
trainer = Trainer(
    model = model, 
    args = infer_args,
    data_collator = data_collator,
    tokenizer = tokenizer
)

deberta_res = []
for model in models:
    deberta_res.append(trainer.predict(tokenized_train_ds)[0])

In [None]:
deberta_res_mean = np.mean(np.array(deberta_res), axis = 0)

In [None]:
train['pred_content_score'] = deberta_res_mean[:,0]
train['pred_wording_score'] = deberta_res_mean[:,1]
#train.to_csv('./train_2.csv', index = False)

In [None]:
#train = pd.read_csv('./train_2.csv')

In [None]:
def compute_mcrmse(eval_pred):
    """
    Calculates mean columnwise root mean squared error
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse,
    }


mcrmse = compute_mcrmse((train.loc[:, ['pred_content_score', 'pred_wording_score']].values, 
               train.loc[:, ['content', 'wording']].values))
mcrmse

In [None]:
train.loc[:, ['pred_content_score', 'pred_wording_score', 'content', 'wording']].sample(5)

### Lightgbm Model

In [None]:
targets = ["content", "wording"]

drop_columns = ["fold", "student_id", "prompt_id", "text", 
                "prompt_question", "prompt_title", 
                "prompt_text"
               ] + targets

In [None]:
lgb_model_dict = {}

for target in targets:
    lgb_models = []
    
    #for fold in range(CFG.n_folds):
    for fold in range(3,4):

        X_train_cv = train[train["fold"] != fold].drop(columns=drop_columns)
        y_train_cv = train[train["fold"] != fold][target]

        X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train[train["fold"] == fold][target]

        dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
        dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)

        params = {
                  'boosting_type': 'gbdt',
                  'random_state': 42,
                  'objective': 'regression',
                  'metric': 'rmse',
                  'learning_rate': 0.05,
                  }

        evaluation_results = {}
        lgb_model = lgb.train(params,
                          num_boost_round=10000,
                            #categorical_feature = categorical_features,
                          valid_names=['train', 'valid'],
                          train_set=dtrain,
                          valid_sets=dval,
                          callbacks=[
                              lgb.early_stopping(stopping_rounds=30, verbose=True),
                               lgb.log_evaluation(100),
                              lgb.callback.record_evaluation(evaluation_results)
                            ],
                          )
        lgb_model.save_model(f'./lgbt_{target}_{fold}.txt')
        lgb_models.append(lgb_model)
    
    lgb_model_dict[target] = lgb_models

In [None]:
for target in targets:
    lgb_model_dict[target].save_model(f'./lgbt_{target}_{idx}.txt')

## CV score

In [None]:
### test load
model_targets = ['content', 'wording']

lgb_model_dict = {}
for model_target in model_targets:
    lgb_model_dict[model_target] = [];
    for idx in range(4):
        lgb_model_dict[model_target].append(lgb.Booster(model_file = f'./lgbt_{model_target}_{idx}.txt'))

In [None]:
preds_content = lgb_model_dict['content'][0].predict(train[train['fold'] == 3].drop(columns = drop_columns))
wording_content = lgb_model_dict['wording'][0].predict(train[train['fold'] == 3].drop(columns = drop_columns))

In [None]:
compute_mcrmse(
    (np.concatenate([preds_content.reshape(-1, 1), wording_content.reshape(-1, 1)], axis = 1),
    train.loc[train['fold'] == 3, ['content', 'wording']])
)

In [None]:
from sklearn.metrics import mean_squared_error
# cv
rmses = []

for target in targets:
    lgb_models = lgb_model_dict[target]

    preds = []
    trues = []
    
    for fold, lgb_model in enumerate(lgb_models):
        X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
        y_eval_cv = train[train["fold"] == fold][target]

        pred = lgb_model.predict(X_eval_cv)

        trues.extend(y_eval_cv)
        preds.extend(pred)
        
    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses = rmses + [rmse]

print(f"mcrmse : {sum(rmses) / len(rmses)}")

### Check mcrmse on training data

In [None]:
lgb_model_content_0 = lgb_model_dict['content'][0]
lgb_model_wording_0 = lgb_model_dict['wording'][0]

In [None]:
train_content_res = lgb_model_content_0.predict(train.drop(columns = drop_columns)).reshape(-1,1)
train_wording_res = lgb_model_wording_0.predict(train.drop(columns = drop_columns)).reshape(-1,1)
preds = np.concatenate([train_content_res, train_wording_res], axis = 1)
preds

In [None]:
labels = train.loc[:, ['content', 'wording']]
compute_mcrmse([preds, labels])