In [None]:
!touch submission.csv

In [None]:
import sys
sys.path.append("/kaggle/input/pip-install-nlp-mit")

In [None]:
!pip install "/kaggle/input/worddifficulty/py_readability_metrics-1.4.5-py3-none-any.whl"

In [None]:
# !pip install "/kaggle/input/autocorrect/autocorrect-2.6.1.tar"
!pip install "/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl"

In [None]:
from typing import List
import numpy as np
import pandas as pd
import warnings
import logging
import os
import shutil
import json
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset,load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
import spacy
import re
from autocorrect import Speller
from spellchecker import SpellChecker
import lightgbm as lgb

warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
disable_progress_bar()
tqdm.pandas()

In [None]:
def seed_everything(seed: int):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Value applied to every random number generator.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may pick different kernels from run to run, which defeats
    # the deterministic flag above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False
    
seed_everything(seed=42)

In [None]:
class CFG:
    """Settings shared by the transformer fine-tuning and inference helpers."""

    # Sub-directory name of the checkpoints (used when building model paths).
    model_name = "another-bert"

    # Cross-validation / data settings.
    n_splits = 4
    max_length = 512
    random_seed = 42

    # Optimisation settings.
    num_train_epochs = 5
    batch_size = 12
    learning_rate = 1.5e-5
    weight_decay = 0.02
    save_steps = 20

    # Regularisation.
    hidden_dropout_prob = 0.007
    attention_probs_dropout_prob = 0.007

In [None]:
MODEL_DIR = '/kaggle/input/commitlit-deberta-v3-large-trained3'
INIT_MODEL = f"{MODEL_DIR}/content/{CFG.model_name}/fold_0"

## Dataload

In [None]:
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"

prompts_train = pd.read_csv(DATA_DIR + "prompts_train.csv")
prompts_test = pd.read_csv(DATA_DIR + "prompts_test.csv")
summaries_train = pd.read_csv(DATA_DIR + "summaries_train.csv")
summaries_test = pd.read_csv(DATA_DIR + "summaries_test.csv")
sample_submission = pd.read_csv(DATA_DIR + "sample_submission.csv")

## Preprocess

Features used:

- Text Length
- Length Ratio
- Word Overlap
- N-grams Co-occurrence
  - count
  - ratio
- Quotes Overlap
- Grammar Check
  - spelling: pyspellchecker

In [None]:
class Preprocessor:
    """Builds hand-crafted prompt/summary features.

    ``run`` adds token counts, token and n-gram overlap, quote overlap, a
    spelling-error count and a spell-corrected copy of every summary.
    """
    def __init__(self, 
                model_name: str,
                ) -> None:
        """Load the tokenizer, stop words, spaCy pipeline and spell checkers.

        Args:
            model_name: Accepted for interface compatibility; the tokenizer is
                loaded from the module-level ``INIT_MODEL`` path instead.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(INIT_MODEL)
        self.twd = TreebankWordDetokenizer()
        self.STOP_WORDS = set(stopwords.words('english'))
        
        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = Speller(lang='en')
        self.spellchecker = SpellChecker() 
        # Entity labels seen by ner_overlap_count(mode="train"); test mode
        # aligns its output to these keys.
        self.ner_keys = []
        
    def word_overlap_count(self, row):
        """Count distinct tokens that occur in both the prompt and the summary.

        NOTE(review): when ``STOP_WORDS`` is non-empty the filter below KEEPS
        only stop words, which looks inverted. It is left unchanged because
        the training features are loaded from a pickle that was presumably
        produced by the same logic; changing it here alone would make test
        features inconsistent with the training features. Confirm and
        regenerate both before fixing.
        """
        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            prompt_words = [word for word in prompt_words if word in self.STOP_WORDS]
            summary_words = [word for word in summary_words if word in self.STOP_WORDS]
        return len(set(prompt_words).intersection(set(summary_words)))
            
    def ngrams(self, token, n):
        """Return the space-joined n-grams of the token sequence ``token``."""
        # zip over n shifted views of the sequence yields consecutive n-tuples.
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]

    def ngram_co_occurrence(self, row, n: int) -> int:
        """Count distinct n-grams shared by the prompt and summary tokens."""
        original_ngrams = set(self.ngrams(row['prompt_tokens'], n))
        summary_ngrams = set(self.ngrams(row['summary_tokens'], n))
        return len(original_ngrams.intersection(summary_ngrams))
    
    def ner_overlap_count(self, row, mode:str):
        """Count named entities shared by prompt and summary, per entity label.

        Entities are compared as (lower-cased text, label) pairs.

        Args:
            row: Mapping with ``prompt_text`` and ``text``.
            mode: ``"train"`` returns the counts as-is and records every label
                seen in ``self.ner_keys``; ``"test"`` returns one entry per
                recorded label (``None`` where the label is absent).

        Returns:
            dict mapping entity label to the number of shared entities.

        Raises:
            Exception: if the NER model is neither spaCy nor stanza.
            ValueError: if ``mode`` is not ``"train"`` or ``"test"``.
        """
        model = self.spacy_ner_model
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        model_repr = str(model)
        if "spacy" in model_repr:
            label_attr = "label_"
        elif "stanza" in model_repr:
            label_attr = "type"
        else:
            raise Exception("Model not supported")

        def entity_set(doc):
            return {(ent.text.lower(), getattr(ent, label_attr)) for ent in doc.ents}

        intersecting_ners = entity_set(prompt).intersection(entity_set(summary))
        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))
        
        if mode == "train":
            # Remember the labels so that test-mode output can be aligned.
            known = list(self.ner_keys)
            known.extend(label for label in ner_dict if label not in known)
            self.ner_keys = known
            return ner_dict
        elif mode == "test":
            return {key: ner_dict.get(key) for key in self.ner_keys}
        raise ValueError(f"mode must be 'train' or 'test', got {mode!r}")

    
    def quotes_count(self, row):
        """Count double-quoted spans of the summary found verbatim in the prompt."""
        quotes_from_summary = re.findall(r'"([^"]*)"', row['text'])
        prompt_text = row['prompt_text']
        return [quote in prompt_text for quote in quotes_from_summary].count(True)

    def spelling(self, text):
        """Return how many distinct whitespace-split words the spell checker does not know."""
        wordlist=text.split()
        amount_miss = len(list(self.spellchecker.unknown(wordlist)))

        return amount_miss
    
    def add_spelling_dictionary(self, tokens: List[str]) -> None:
        """Register ``tokens`` as known words in both pyspellchecker and autocorrect."""
        self.spellchecker.word_frequency.load_words(tokens)
        self.speller.nlp_data.update({token:1000 for token in tokens})
    
    def run(self, 
            prompts: pd.DataFrame,
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:
        """Compute all features and return summaries merged with their prompts.

        Both input frames are modified in place: length/token columns are
        added to ``prompts``; length, token, ``fixed_summary_text`` and
        ``splling_err_num`` columns are added to ``summaries``.

        Args:
            prompts: Frame with ``prompt_id`` and ``prompt_text``.
            summaries: Frame with ``prompt_id`` and ``text``.
            mode: Not used by this method; kept for interface compatibility.

        Returns:
            The merged frame without the intermediate token columns.
        """
        # Tokenize each text once and derive the length from the tokens.
        prompt_tokens = prompts["prompt_text"].apply(word_tokenize)
        prompts["prompt_length"] = prompt_tokens.apply(len)
        prompts["prompt_tokens"] = prompt_tokens

        summary_tokens = summaries["text"].apply(word_tokenize)
        summaries["summary_length"] = summary_tokens.apply(len)
        summaries["summary_tokens"] = summary_tokens
        
        # Prompt vocabulary must not be reported (or "corrected") as misspelled.
        prompts["prompt_tokens"].apply(self.add_spelling_dictionary)
        
        # Spell-corrected copy of the summary.
        summaries["fixed_summary_text"] = summaries["text"].progress_apply(
            lambda x: self.speller(x)
        )
        
        # Misspelling count on the raw (uncorrected) summary.
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)
        
        input_df = summaries.merge(prompts, how="left", on="prompt_id")

        input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']

        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence,args=(2,), axis=1 
        )
        # A one-token summary makes this denominator zero; pandas then yields
        # inf/NaN instead of raising.
        input_df['bigram_overlap_ratio'] = input_df['bigram_overlap_count'] / (input_df['summary_length'] - 1)
        
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )
        # Same caveat for two-token summaries.
        input_df['trigram_overlap_ratio'] = input_df['trigram_overlap_count'] / (input_df['summary_length'] - 2)
        
        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        
        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    
preprocessor = Preprocessor(model_name=CFG.model_name)

In [None]:
summaries_train.head()

In [None]:
import pickle

# The pickle's first object is the pre-computed training frame. The second
# object was previously loaded into `test` and immediately overwritten by the
# freshly computed test features below, so it is no longer read.
# NOTE: pickle can execute arbitrary code on load; only use trusted files.
with open(MODEL_DIR+'/pickled.pkl', 'rb') as f:
    train = pickle.load(f)

# train = preprocessor.run(prompts_train, summaries_train, mode="train")
test = preprocessor.run(prompts_test, summaries_test, mode="test")

# train.head()

In [None]:
# train

In [None]:
train = train.drop(columns = ['length_ratio'])
test = test.drop(columns = ['length_ratio'])

In [None]:
gkf = GroupKFold(n_splits=CFG.n_splits)

# GroupKFold.split yields POSITIONAL row indices. Filling a positional array
# (instead of `train.loc[val_index, ...]`, which treats them as index labels)
# stays correct even when `train` does not carry a default 0..n-1 index.
# The array is float so the column dtype matches the previous behaviour.
fold_ids = np.full(len(train), np.nan)
for fold_id, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
    fold_ids[val_index] = fold_id
train["fold"] = fold_ids

train.head()

## Model Function Definition

In [None]:
import shutil

def compute_metrics(eval_pred):
    """Return ``{"rmse": ...}`` for a ``(predictions, labels)`` pair.

    The RMSE is computed per output column and then averaged. It is computed
    with NumPy so the result does not depend on the ``squared=False`` argument
    of ``mean_squared_error``, which newer scikit-learn releases deprecate and
    later remove.

    Raises:
        ValueError: if predictions and labels do not have the same shape once
            1-D inputs are treated as a single column.
    """
    predictions, labels = eval_pred
    predictions = np.asarray(predictions, dtype=float)
    labels = np.asarray(labels, dtype=float)
    # Treat 1-D inputs as one column so that (n,) and (n, 1) inputs line up
    # instead of broadcasting to an (n, n) matrix.
    if predictions.ndim == 1:
        predictions = predictions[:, None]
    if labels.ndim == 1:
        labels = labels[:, None]
    if predictions.shape != labels.shape:
        raise ValueError(
            f"shape mismatch: predictions {predictions.shape} vs labels {labels.shape}"
        )
    rmse = np.sqrt(np.mean((predictions - labels) ** 2, axis=0)).mean()
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Mean column-wise root mean squared error for a ``(preds, labels)`` pair.
    Column 0 is reported as the content RMSE and column 1 as the wording RMSE.
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation
    """
    preds, labels = eval_pred

    squared_error = (preds - labels) ** 2
    per_column_rmse = np.sqrt(np.mean(squared_error, axis=0))

    metrics = {
        "content_rmse": per_column_rmse[0],
        "wording_rmse": per_column_rmse[1],
    }
    metrics["mcrmse"] = np.mean(per_column_rmse)
    return metrics

def compt_score(content_true, content_pred, wording_true, wording_pred):
    """Average of the content RMSE and the wording RMSE."""
    content_rmse = mean_squared_error(content_true, content_pred) ** (1/2)
    wording_rmse = mean_squared_error(wording_true, wording_pred) ** (1/2)
    total = content_rmse + wording_rmse
    return total / 2

## Deberta Regressor

In [None]:
class ContentScoreRegressor:
    """Single-target regression wrapper around a sequence-classification model.

    One instance handles one target column and one model directory. The
    tokenizer and the base config are always read from the module-level
    ``INIT_MODEL`` path; ``model_dir`` is where ``train`` saves the fine-tuned
    weights and where ``predict`` loads them from.
    """
    def __init__(self, 
                model_name: str,
                model_dir: str,
                target: str,
                hidden_dropout_prob: float,
                attention_probs_dropout_prob: float,
                max_length: int,
                ):
        """Load tokenizer/config and configure a one-output regression head.

        Args:
            model_name: Name of the base model; ``train`` loads it from
                ``/kaggle/input/{model_name}``.
            model_dir: Directory of the fine-tuned model for this fold.
            target: Name of the target column.
            hidden_dropout_prob: Written into the model config.
            attention_probs_dropout_prob: Written into the model config.
            max_length: Maximum token length used when tokenizing.
        """
        self.inputs = ["prompt_text", "prompt_title", "prompt_question", "fixed_summary_text"]
        self.input_col = "input"
        
        self.text_cols = [self.input_col] 
        self.target = target
        self.target_cols = [target]

        self.model_name = model_name
        self.model_dir = model_dir
        self.max_length = max_length
        
        self.tokenizer = AutoTokenizer.from_pretrained(INIT_MODEL)
        self.model_config = AutoConfig.from_pretrained(INIT_MODEL)
        
        self.model_config.update({
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels": 1,
            "problem_type": "regression",
        })
        
        seed_everything(seed=42)

        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer
        )

    def _build_input(self, df: pd.DataFrame) -> pd.Series:
        """Join title, question and corrected summary with the tokenizer's separator token."""
        sep = self.tokenizer.sep_token
        return (
            df["prompt_title"] + sep
            + df["prompt_question"] + sep
            + df["fixed_summary_text"]
        )

    def tokenize_function(self, examples: pd.DataFrame):
        """Tokenize one example (datasets are mapped with ``batched=False``) and attach its label."""
        labels = [examples[self.target]]
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return {
            **tokenized,
            "labels": labels,
        }
    
    def tokenize_function_test(self, examples: pd.DataFrame):
        """Tokenize one example without labels (inference)."""
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return tokenized
        
    def train(self, 
            fold: int,
            train_df: pd.DataFrame,
            valid_df: pd.DataFrame,
            batch_size: int,
            learning_rate: float,
            weight_decay: float,
            num_train_epochs: float,
            save_steps: int,
        ) -> None:
        """Fine-tune on ``train_df`` and save the resulting model to ``self.model_dir``.

        The ``input`` column is written onto the frames that are passed in.
        Checkpoints go to ``{model_dir}/{fold}``; after training the whole
        ``model_dir`` is deleted and re-created with only the final model
        and tokenizer.
        """
        train_df[self.input_col] = self._build_input(train_df)
        valid_df[self.input_col] = self._build_input(valid_df)
        
        train_df = train_df[[self.input_col] + self.target_cols]
        valid_df = valid_df[[self.input_col] + self.target_cols]
        
        model_content = AutoModelForSequenceClassification.from_pretrained(
            f"/kaggle/input/{self.model_name}", 
            config=self.model_config,
            ignore_mismatched_sizes=True
        )

        train_dataset = Dataset.from_pandas(train_df, preserve_index=False) 
        val_dataset = Dataset.from_pandas(valid_df, preserve_index=False) 
    
        train_tokenized_datasets = train_dataset.map(self.tokenize_function, batched=False)
        val_tokenized_datasets = val_dataset.map(self.tokenize_function, batched=False)

        # Checkpoint directory for this fold, nested inside model_dir.
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 
        
        training_args = TrainingArguments(
            output_dir=model_fold_dir,
            load_best_model_at_end=True, # select best model
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=8,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            report_to='none',
            greater_is_better=False,
            save_strategy="steps",
            evaluation_strategy="steps",
            eval_steps=save_steps,
            save_steps=save_steps,
            metric_for_best_model="rmse",
            save_total_limit=1
        )

        trainer = Trainer(
            model=model_content,
            args=training_args,
            train_dataset=train_tokenized_datasets,
            eval_dataset=val_tokenized_datasets,
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator
        )

        trainer.train()
        
        # Drop the intermediate checkpoints, then persist only the final model.
        shutil.rmtree(self.model_dir)
        
        model_content.save_pretrained(self.model_dir)
        self.tokenizer.save_pretrained(self.model_dir)

        
    def predict(self, 
                test_df: pd.DataFrame,
                fold: int,
               ):
        """Predict the target for every row of ``test_df``.

        Side effect: the ``input`` column is written onto ``test_df``; the
        LightGBM inference cell later lists ``input`` among the columns it
        drops, so that column must exist on the frame.

        Args:
            test_df: Frame with ``prompt_title``, ``prompt_question`` and
                ``fixed_summary_text``.
            fold: Not used; kept for interface compatibility.

        Returns:
            The first element of ``Trainer.predict``'s result (the predictions).
        """
        test_df[self.input_col] = self._build_input(test_df)

        test_ = test_df[[self.input_col]]
    
        test_dataset = Dataset.from_pandas(test_, preserve_index=False) 
        test_tokenized_dataset = test_dataset.map(self.tokenize_function_test, batched=False)

        model_content = AutoModelForSequenceClassification.from_pretrained(f"{self.model_dir}")
        model_content.eval()
        
        test_args = TrainingArguments(
            output_dir="valid_log",
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = 4,   
            dataloader_drop_last = False,
        )

        infer_content = Trainer(
                      model = model_content, 
                      tokenizer=self.tokenizer,
                      data_collator=self.data_collator,
                      args = test_args)

        preds = infer_content.predict(test_tokenized_dataset)[0]

        return preds

In [None]:
def train_by_fold(
        train_df: pd.DataFrame,
        model_name: str,
        target:str,
        save_each_model: bool,
        n_splits: int,
        batch_size: int,
        learning_rate: float,
        hidden_dropout_prob: float,
        attention_probs_dropout_prob: float,
        weight_decay: float,
        num_train_epochs: int,
        save_steps: int,
        max_length:int
    ):
    """Fine-tune one model per fold for ``target``.

    The ``model_name`` directory is deleted if present and re-created. For
    each fold in ``range(n_splits)`` the rows whose ``fold`` differs are used
    for training and the rows whose ``fold`` matches for validation.

    Args:
        train_df: Training frame with a ``fold`` column.
        model_name: Base model name; also used as the output directory name.
        target: Target column to fit.
        save_each_model: When true, models are saved under
            ``{target}/{model_name}/fold_{fold}``, otherwise under
            ``{model_name}/fold_{fold}``.
        n_splits: Number of folds to train (previously ignored in favour of
            the global ``CFG.n_splits``).
    """
    # Start from a clean output directory.
    if os.path.exists(model_name):
        shutil.rmtree(model_name)
    
    os.mkdir(model_name)
        
    for fold in range(n_splits):
        print(f"fold {fold}:")
        
        in_valid_fold = train_df["fold"] == fold
        train_data = train_df[~in_valid_fold]
        valid_data = train_df[in_valid_fold]
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/fold_{fold}"

        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir, 
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        csr.train(
            fold=fold,
            train_df=train_data,
            valid_df=valid_data, 
            batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            num_train_epochs=num_train_epochs,
            save_steps=save_steps,
        )

def validate(
    train_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int,
    n_splits: "int | None" = None,
    ) -> pd.DataFrame:
    """Write out-of-fold predictions into ``train_df[f"{target}_pred"]``.

    For every fold, the model stored under ``MODEL_DIR`` for that fold
    predicts the rows whose ``fold`` value equals the fold number.

    Args:
        train_df: Frame with a ``fold`` column; modified in place.
        target: Target name; selects the model directory and the output column.
        save_each_model: When true, models are read from
            ``{MODEL_DIR}/{target}/{model_name}/fold_{fold}``, otherwise from
            ``{MODEL_DIR}/{model_name}/fold_{fold}``.
        n_splits: Number of folds; defaults to ``CFG.n_splits``.

    Returns:
        The same ``train_df`` object.
    """
    if n_splits is None:
        n_splits = CFG.n_splits

    for fold in range(n_splits):
        print(f"fold {fold}:")
        
        valid_data = train_df[train_df["fold"] == fold]
        
        if save_each_model == True:
            model_dir =  f"{MODEL_DIR}/{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{MODEL_DIR}/{model_name}/fold_{fold}"
        
        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        pred = csr.predict(
            test_df=valid_data, 
            fold=fold
        )
        
        train_df.loc[valid_data.index, f"{target}_pred"] = pred

    return train_df
    
def predict(
    test_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int,
    n_splits: "int | None" = None,
    ):
    """Predict ``target`` with every fold model and average the results.

    ``test_df`` is modified in place: one ``{target}_pred_{fold}`` column per
    fold plus their row-wise mean in ``test_df[target]``.

    Args:
        test_df: Frame to predict on.
        target: Target name; selects the model directory and column names.
        save_each_model: When true, models are read from
            ``{MODEL_DIR}/{target}/{model_name}/fold_{fold}``, otherwise from
            ``{MODEL_DIR}/{model_name}/fold_{fold}``.
        n_splits: Number of fold models; defaults to ``CFG.n_splits``.

    Returns:
        The same ``test_df`` object.
    """
    if n_splits is None:
        n_splits = CFG.n_splits

    fold_columns = []
    for fold in range(n_splits):
        print(f"fold {fold}:")
        
        if save_each_model == True:
            model_dir =  f"{MODEL_DIR}/{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{MODEL_DIR}/{model_name}/fold_{fold}"

        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir, 
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        pred = csr.predict(
            test_df=test_df, 
            fold=fold
        )
        
        fold_column = f"{target}_pred_{fold}"
        test_df[fold_column] = pred
        fold_columns.append(fold_column)
    
    test_df[f"{target}"] = test_df[fold_columns].mean(axis=1)

    return test_df

In [None]:
for target in ["content", "wording"]:
    # The training call is disabled here; validate()/predict() read the
    # fold checkpoints from MODEL_DIR. To retrain, call
    #   train_by_fold(train, model_name=CFG.model_name, save_each_model=True,
    #                 target=target, learning_rate=CFG.learning_rate,
    #                 hidden_dropout_prob=CFG.hidden_dropout_prob,
    #                 attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
    #                 weight_decay=CFG.weight_decay,
    #                 num_train_epochs=CFG.num_train_epochs,
    #                 n_splits=CFG.n_splits, batch_size=CFG.batch_size,
    #                 save_steps=CFG.save_steps, max_length=CFG.max_length)
    
    print("[validate]")
    train = validate(
        train,
        target=target,
        save_each_model=True,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length
    )

    # np.sqrt(MSE) instead of `squared=False`, which newer scikit-learn
    # releases deprecate and later remove.
    rmse = np.sqrt(mean_squared_error(train[target], train[f"{target}_pred"]))
    print(f"cv {target} rmse: {rmse}")

    print("[test]")
    test = predict(
        test,
        target=target,
        save_each_model=True,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length
    )

In [None]:
# !rm -r wording content

In [None]:
train.head()

# Add Features

In [None]:
wd = pd.read_csv('/kaggle/input/worddifficulty/WordDifficulty.csv')
dic = dict(zip(wd['Word'], wd['I_Zscore']))

In [None]:
def difficulty(data) :
    """Word-difficulty statistics for ``data['text']``.

    Tokens are looked up in the module-level ``dic`` mapping. Returns a
    4-tuple: summed score / known tokens, summed score / all tokens,
    all tokens / separators ('.' or ',' tokens not found in ``dic``), and
    (all tokens - known tokens) / all tokens. Every count used in these
    ratios is floored at 1.
    """
    tokens = word_tokenize(data['text'])

    difficulty_total = 0
    known_count = 0
    separator_count = 0
    for token in tokens:
        if token in dic:
            difficulty_total += dic[token]
            known_count += 1
            continue
        if token in ('.', ','):
            separator_count += 1

    # Floor every count at 1 so none of the ratios divides by zero.
    token_count = max(1, len(tokens))
    separator_count = max(1, separator_count)
    known_count = max(1, known_count)
    return (
        difficulty_total / known_count,
        difficulty_total / token_count,
        token_count / separator_count,
        (token_count - known_count) / token_count,
    )



#         prompts["prompt_tokens"] = prompts["prompt_text"].apply(
#             lambda x: word_tokenize(x)
#         )
labels = ['difficulty0', 'difficulty1', 'ave_text_len', 'unknown_words']
train[labels]=train.apply(lambda x:difficulty(x),axis=1, result_type='expand')
test[labels]=test.apply(lambda x:difficulty(x),axis=1, result_type='expand')

In [None]:
from readability import Readability

In [None]:
no_score = -10000
def rscore(data) :
    """Eight readability scores for ``data['text']``.

    The text is repeated until the running token count reaches 200
    (presumably so the readability formulas have enough words to work with).

    Returns:
        Tuple of (flesch_kincaid, flesch, gunning_fog, coleman_liau,
        dale_chall, ari, linsear_write, spache) scores, or eight ``no_score``
        sentinels when the text has no tokens or a metric cannot be computed.
    """
    sentinel = (no_score,) * 8
    txt = data['text']
    words = word_tokenize(txt)
    # The former `n == 0` guard could never fire because n is len(words) + 1;
    # test for an empty token list directly.
    if not words:
        return sentinel
    n = len(words) + 1
    tot = n
    new = txt
    while tot < 200 :
        new += " " + txt
        tot += n
    r = Readability(new)
    try :
        ret = (r.flesch_kincaid().score, r.flesch().score, r.gunning_fog().score,
               r.coleman_liau().score,r.dale_chall().score, r.ari().score,
               r.linsear_write().score, r.spache().score)
    except Exception:
        # Best effort: any metric failure marks the whole row as unscored,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        return sentinel
    return ret

labels = ['flesch_kincaid', 'flesch', 'gunning_fog', 'coleman_liau',
         'dale_chall','ari','linsear_write','spache']

train[labels]=train.progress_apply(lambda x:rscore(x),axis=1, result_type='expand')
test[labels]=test.progress_apply(lambda x:rscore(x),axis=1, result_type='expand')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

# prompt_id -> prompt text re-joined from its NLTK tokens, for the train and
# the test prompts. It has its own name so that it no longer overwrites the
# module-level `dic` word-difficulty lookup that `difficulty` reads.
prompt_text_by_id = {}
for prompts_df in (prompts_train, prompts_test):
    for pid, txt in zip(prompts_df['prompt_id'], prompts_df['prompt_text']):
        prompt_text_by_id[pid] = ' '.join(nltk.word_tokenize(txt))

def cosine_sim(data):
    """Bag-of-words cosine similarity between a summary and its prompt text.

    Args:
        data: Row with ``prompt_id`` and ``fixed_summary_text``.

    Returns:
        Cosine similarity of the two count vectors.
    """
    original_text = prompt_text_by_id[data['prompt_id']]

    # Tokenize the summary and join the tokens back into one string.
    summary_tokens = nltk.word_tokenize(data['fixed_summary_text'])
    summary = ' '.join(summary_tokens)

    # Count vectors over a vocabulary fitted on just these two documents.
    count_matrix = CountVectorizer().fit_transform([original_text, summary])

    # Pairwise cosine similarities; entry [0][1] is prompt vs. summary.
    cosine_scores = cosine_similarity(count_matrix)
    similarity_score = cosine_scores[0][1]
    
    return similarity_score

In [None]:
# cosine_sim(train.iloc[0])
train['cos_sim']=train.progress_apply(lambda x:cosine_sim(x),axis=1, result_type='expand')
test['cos_sim']=test.progress_apply(lambda x:cosine_sim(x),axis=1, result_type='expand')

In [None]:
import textstat
def txts(data):
    """Return 16 textstat measurements for ``data['text']`` as one tuple.

    The first nine entries are readability indices, the remaining seven are
    reading time and syllable/word/sentence/character/letter counts.
    """
    text = data['text']
    readability_indices = (
        textstat.flesch_reading_ease(text),
        textstat.flesch_kincaid_grade(text),
        textstat.gunning_fog(text),
        textstat.smog_index(text),
        textstat.automated_readability_index(text),
        textstat.coleman_liau_index(text),
        textstat.linsear_write_formula(text),
        textstat.dale_chall_readability_score(text),
        textstat.text_standard(text, float_output=True),
    )
    time_and_counts = (
        textstat.reading_time(text, ms_per_char=14.69),
        textstat.syllable_count(text),
        textstat.lexicon_count(text, removepunct=True),
        textstat.sentence_count(text),
        textstat.char_count(text, ignore_spaces=True),
        textstat.letter_count(text, ignore_spaces=True),
        textstat.monosyllabcount(text),
    )
    return readability_indices + time_and_counts

sample = txts(train.iloc[1])
print(sample)
labels = [f'f{i}' for i in range(len(sample))]
train[labels]=train.progress_apply(lambda x:txts(x),axis=1, result_type='expand')
test[labels]=test.progress_apply(lambda x:txts(x),axis=1, result_type='expand')

In [None]:
train

In [None]:
train.to_csv("train0.csv", index=False)

## LGBM model

In [None]:
targets = ["content", "wording"]

drop_columns = ["fold", "student_id", "prompt_id", "text", "fixed_summary_text",
                "prompt_question", "prompt_title", 
                "prompt_text"
               ] + targets

In [None]:
train[train["fold"] != 0].drop(columns=drop_columns)

In [None]:
# import optuna
# def objective(trial):
    
#     model_dict = {}

#     for target in targets:
#         models = []

#         for fold in range(CFG.n_splits):

#             X_train_cv = train[train["fold"] != fold].drop(columns=drop_columns)
#             y_train_cv = train[train["fold"] != fold][target]

#             X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
#             y_eval_cv = train[train["fold"] == fold][target]

#             dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
#             dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)

#             params = {
#                 'boosting_type': 'gbdt',
#                 'random_state': 42,
#                 'objective': 'regression',
#                 'metric': 'rmse',
#                 'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.2),
#                 'max_depth': trial.suggest_int('max_depth', 3, 10),
#                 'lambda_l1': trial.suggest_uniform('lambda_l1', 0, 0.01),
#                 'lambda_l2': trial.suggest_uniform('lambda_l2', 0, 0.10),
#                 'num_leaves': trial.suggest_int('num_leaves', 16, 64),
#                 'verbosity': -1
#             }

#             evaluation_results = {}
#             model = lgb.train(params,
#                               num_boost_round=10000,
#                                 #categorical_feature = categorical_features,
#                               valid_names=['train', 'valid'],
#                               train_set=dtrain,
#                               valid_sets=dval,
#                               callbacks=[
#                                   lgb.early_stopping(stopping_rounds=30, verbose=False),
# #                                   lgb.log_evaluation(100),
# #                                   lgb.callback.record_evaluation(evaluation_results)
#                                 ],
#                               )
#             models.append(model)

#         model_dict[target] = models

#         rmses = []

#     for target in targets:
#         models = model_dict[target]

#         preds = []
#         trues = []

#         for fold, model in enumerate(models):
#             X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
#             y_eval_cv = train[train["fold"] == fold][target]

#             pred = model.predict(X_eval_cv)

#             trues.extend(y_eval_cv)
#             preds.extend(pred)

#         rmse = np.sqrt(mean_squared_error(trues, preds))
# #         print(f"{target}_rmse : {rmse}")
#         rmses = rmses + [rmse]

#     print(f"mcrmse : {sum(rmses) / len(rmses)}")
#     return sum(rmses) / len(rmses)
        

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=30)

# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

In [None]:
model_dict = {}

# Identical for every target and fold, so it is built once.
params = {
    'boosting_type': 'gbdt',
    'random_state': 42,
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.048,
    'max_depth': 4,
    'lambda_l1': 0.001,
    'lambda_l2': 0.011
}

for target in targets:
    models = []
    
    for fold in range(CFG.n_splits):
        in_valid_fold = train["fold"] == fold

        X_train_cv = train[~in_valid_fold].drop(columns=drop_columns)
        y_train_cv = train[~in_valid_fold][target]

        X_eval_cv = train[in_valid_fold].drop(columns=drop_columns)
        y_eval_cv = train[in_valid_fold][target]

        dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
        dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)

        evaluation_results = {}
        model = lgb.train(params,
                          num_boost_round=10000,
                          train_set=dtrain,
                          # One validation set with one matching name. The
                          # earlier ['train', 'valid'] list labelled this
                          # held-out fold as "train" in the logs and in
                          # evaluation_results.
                          valid_sets=[dval],
                          valid_names=['valid'],
                          callbacks=[
                              lgb.early_stopping(stopping_rounds=30, verbose=True),
                              lgb.log_evaluation(100),
                              lgb.callback.record_evaluation(evaluation_results)
                            ],
                          )
        models.append(model)
    
    model_dict[target] = models

## CV Score

In [None]:
# cv
# Out-of-fold RMSE per target, then their mean (MCRMSE).
rmses = []

for target in targets:
    trues, preds = [], []

    for fold, model in enumerate(model_dict[target]):
        fold_rows = train[train["fold"] == fold]
        X_eval_cv = fold_rows.drop(columns=drop_columns)
        y_eval_cv = fold_rows[target]

        trues.extend(y_eval_cv)
        preds.extend(model.predict(X_eval_cv))

    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses = rmses + [rmse]

print(f"mcrmse : {sum(rmses) / len(rmses)}")

## Predict

In [None]:
# Columns removed from `test` before LightGBM inference. This REBINDS the
# `drop_columns` name used by the training and CV cells above, so those cells
# must not be re-run after this one without redefining the list.
# Unlike the training-time list, `content` / `wording` are kept here (on
# `test` they hold the fold-averaged transformer predictions written by
# `predict`), while `input` and the per-fold `*_pred_{i}` columns are dropped.
# NOTE(review): the boosters were fitted on columns named `content_pred` /
# `wording_pred`; this presumably relies on LightGBM matching features by
# position rather than by name - confirm.
drop_columns = [
                #"fold", 
                "student_id", "prompt_id", "text", "fixed_summary_text",
                "prompt_question", "prompt_title", 
                "prompt_text",
                "input"
               ] + [
                f"content_pred_{i}" for i in range(CFG.n_splits)
                ] + [
                f"wording_pred_{i}" for i in range(CFG.n_splits)
                ]

In [None]:
# Every fold model of every target scores the same test feature matrix.
X_eval_cv = test.drop(columns=drop_columns)

pred_dict = {}
for target in targets:
    pred_dict[target] = [model.predict(X_eval_cv) for model in model_dict[target]]

In [None]:
# Store each fold's LightGBM prediction on `test`, then average them into the
# final `content` / `wording` columns.
for target in targets:
    for fold_idx, fold_pred in enumerate(pred_dict[target]):
        test[f"{target}_pred_{fold_idx}"] = fold_pred

    fold_columns = [f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]
    test[target] = test[fold_columns].mean(axis=1)

In [None]:
test

## Create Submission file

In [None]:
sample_submission

In [None]:
test[["student_id", "content", "wording"]]

In [None]:
test[["student_id", "content", "wording"]].to_csv("submission.csv", index=False)

## Summary

The cross-validation (CV) results are summarized below.

| | content rmse |wording rmse | mcrmse | LB| |
| -- | -- | -- | -- | -- | -- |
|baseline| 0.494 | 0.630 | 0.562 | 0.509 | [link](https://www.kaggle.com/code/tsunotsuno/debertav3-baseline-content-and-wording-models)|
| use title and question field | 0.476| 0.619 | 0.548 | 0.508 | [link](https://www.kaggle.com/code/tsunotsuno/debertav3-w-prompt-title-question-fields) |
| Debertav3 + LGBM | 0.451 | 0.591 | 0.521 | 0.461 | [link](https://www.kaggle.com/code/tsunotsuno/debertav3-lgbm-with-feature-engineering) |
| Debertav3 + LGBM with spell autocorrect | 0.448 | 0.581 | 0.514 | 0.459 | nogawanogawa's original code |
| Debertav3 + LGBM with spell autocorrect and tuning | 0.442 | 0.566 | 0.504 | 0.453 | this notebook |

The CV scores improved slightly, and the LB score improved as well.

In [None]:
test_zero = test

In [None]:
#  [code] {"jupyter":{"outputs_hidden":false}}

In [None]:
# import sys
# sys.path.append("/kaggle/input/pip-install-nlp-mit")

In [None]:
# !pip install "/kaggle/input/worddifficulty/py_readability_metrics-1.4.5-py3-none-any.whl"

# 1st MODEL

In [None]:
# !pip install "/kaggle/input/autocorrect/autocorrect-2.6.1.tar"
# !pip install "/kaggle/input/pyspellchecker/pyspellchecker-0.7.2-py3-none-any.whl"

In [None]:
from typing import List
import numpy as np
import pandas as pd
import warnings
import logging
import os
import shutil
import json
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset,load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
import spacy
import re
from autocorrect import Speller
from spellchecker import SpellChecker
import lightgbm as lgb

warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
disable_progress_bar()
tqdm.pandas()

In [None]:
def seed_everything(seed: int):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and configures cuDNN for
    reproducible runs.

    Args:
        seed: Seed value applied to every generator.
    """
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # subprocesses); the hash seed of the running process is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed every additional GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick possibly different kernels on
    # each run, which defeats the deterministic setting above; it must be off.
    torch.backends.cudnn.benchmark = False
    
seed_everything(seed=42)

In [None]:
class CFG:
    # Run configuration, read everywhere as class attributes (e.g. CFG.n_splits).
    # Name of the base model; ContentScoreRegressor.train loads it from
    # /kaggle/input/<model_name>, and it is part of the INIT_MODEL checkpoint path.
    model_name = "another-bert"
    # Optimizer settings; presumably forwarded to train_by_fold — confirm at the call site.
    learning_rate=1.5e-5
    weight_decay=0.02
    # Dropout probabilities written into the model config by ContentScoreRegressor.
    hidden_dropout_prob=0.007
    attention_probs_dropout_prob=0.007
    num_train_epochs=5
    # Number of GroupKFold splits, and therefore of per-fold models.
    n_splits=4
    batch_size=12
    
    # NOTE(review): not referenced in the visible code; seed_everything is called with a literal 42.
    random_seed=42
    # Presumably the save_steps argument of train_by_fold, which the trainer uses
    # for both eval_steps and save_steps — confirm at the call site.
    save_steps=20
    # Presumably the max_length (tokenizer truncation length) passed to the regressor — confirm.
    max_length=512

In [None]:
# Kaggle input dataset with the saved checkpoints, laid out as
# <MODEL_DIR>/<target>/<model_name>/fold_<k> (presumably already fine-tuned — confirm).
MODEL_DIR = '/kaggle/input/commitlit-deberta-v3-large-trained'
# Checkpoint used to load the tokenizer and the model config (content target, fold 0).
INIT_MODEL = f"{MODEL_DIR}/content/{CFG.model_name}/fold_0"

## Dataload

In [None]:
# Root folder of the competition data; the trailing slash matters because the
# file names below are appended by plain string concatenation.
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"

# Prompt tables and student-summary tables for train and test, plus the sample submission.
prompts_train = pd.read_csv(DATA_DIR + "prompts_train.csv")
prompts_test = pd.read_csv(DATA_DIR + "prompts_test.csv")
summaries_train = pd.read_csv(DATA_DIR + "summaries_train.csv")
summaries_test = pd.read_csv(DATA_DIR + "summaries_test.csv")
sample_submission = pd.read_csv(DATA_DIR + "sample_submission.csv")

## Meta Data Cleansing

In [None]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
import logging

# Initialize logging
logging.basicConfig(level=logging.INFO)

# Load Spacy model
nlp = spacy.load('en_core_web_sm')

class FeatureEngineering:
    """Derive simple numeric features from the text-metadata table.

    The frame passed to the constructor is modified in place (new columns are
    added to it) and is also the object returned by ``transform``. Columns
    read: ``grade``, ``author``, ``lexile`` and, for the optional one-hot
    step, ``genre``. Uses the module-level spaCy pipeline ``nlp``.
    """

    def __init__(self, df):
        self.df = df
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on the column selection: the chained in-place form is deprecated and
        # silently does nothing once pandas Copy-on-Write is enabled.
        self.df['grade'] = self.df['grade'].fillna(0)  # Fill NA values in 'grade' with 0

    def classify_author(self, author):
        """Return 'person' if spaCy finds a PERSON entity in ``author``, else 'org'."""
        # A missing author is read by pandas as NaN (a float), which the spaCy
        # pipeline cannot process; treat anything that is not text as 'org'.
        if not isinstance(author, str):
            return 'org'
        doc = nlp(author)
        if any(ent.label_ == 'PERSON' for ent in doc.ents):
            return 'person'
        return 'org'

    def encode_author_type(self):
        """Add ``author_type``: the person/org class of each author, label-encoded to integers."""
        self.df['author_type'] = self.df['author'].apply(self.classify_author)
        le = LabelEncoder()
        self.df['author_type'] = le.fit_transform(self.df['author_type'])

    def frequency_encoding(self):
        """Add ``author_frequency``: how many rows share the row's author."""
        logging.info("Applying Frequency Encoding on 'author'")
        self.df['author_frequency'] = self.df['author'].map(self.df['author'].value_counts())

    def one_hot_encoding(self):
        """Append one-hot columns for ``genre``.

        Unlike the other steps this rebinds ``self.df`` to a new frame, so the
        frame originally passed in does not receive the new columns.
        """
        logging.info("Applying One-Hot Encoding on 'genre'")
        # NOTE(review): newer scikit-learn releases renamed `sparse` to
        # `sparse_output`; confirm against the installed version before enabling
        # this step in transform().
        onehot_encoder = OneHotEncoder(sparse=False)
        genre_onehot = onehot_encoder.fit_transform(self.df[['genre']])
        # Reuse the source index so concat pairs rows positionally even when the
        # frame does not carry a default 0..n-1 index.
        df_onehot = pd.DataFrame(
            genre_onehot,
            columns=onehot_encoder.get_feature_names_out(['genre']),
            index=self.df.index,
        )
        self.df = pd.concat([self.df, df_onehot], axis=1)

    def feature_scaling(self):
        """Add ``lexile_scaled``: ``lexile`` standardized with StandardScaler."""
        logging.info("Applying Feature Scaling on 'lexile'")
        scaler = StandardScaler()
        self.df['lexile_scaled'] = scaler.fit_transform(self.df[['lexile']])

    def transform(self):
        """Run the enabled feature steps in order and return the resulting frame."""
        self.encode_author_type()
        self.frequency_encoding()
        # One-hot encoding of 'genre' is intentionally left disabled.
#         self.one_hot_encoding()
        self.feature_scaling()
        return self.df

# Read the text-metadata table from a separate Kaggle dataset, then initialize
# the FeatureEngineering class and apply its transformations.
prompt_grade = pd.read_csv(r'/kaggle/input/commonlit-texts/commonlit_texts.csv')
feature_engineer = FeatureEngineering(prompt_grade)
transformed_df = feature_engineer.transform()

# Keep the transformed frame under the original name (this rebinds the name; nothing is displayed).
prompt_grade = transformed_df

In [None]:
# Metadata columns carried over into the prompt tables; everything else is dropped.
keep_columns = ['title','author','description','grade','genre','lexile','lexile_scaled','is_prose','author_type','author_frequency']
prompt_grade = prompt_grade[keep_columns]

def preprocess_and_join(df1, df2, df1_title_col, df2_title_col, grade_col):
    """Left-join ``df2`` onto ``df1`` by normalized title.

    Titles on both sides have double quotes removed and surrounding
    whitespace stripped before matching. When ``df2`` contains a title more
    than once only its first row is used. Rows of ``df1`` without a match get
    grade 0. Neither input frame is modified.

    Args:
        df1: Left frame (every row is kept).
        df2: Right frame holding the grade column.
        df1_title_col: Title column name in ``df1``.
        df2_title_col: Title column name in ``df2``.
        grade_col: Grade column name in the merged result.

    Returns:
        The merged frame with ``grade_col`` as an integer-valued categorical.
    """
    def _normalized(titles):
        # Drop double quotes, then trim leading/trailing whitespace.
        return titles.str.replace('"', '').str.strip()

    left = df1.copy()
    left[df1_title_col] = _normalized(left[df1_title_col])

    right = df2.copy()
    right[df2_title_col] = _normalized(right[df2_title_col])
    # One metadata row per title, so the join cannot multiply rows of the left frame.
    right = right.drop_duplicates(subset=df2_title_col, keep='first')

    joined = left.merge(right, how='left', left_on=df1_title_col, right_on=df2_title_col)

    # Unmatched titles have no grade: use 0, then store as int-valued category.
    grades = joined[grade_col].fillna(0)
    joined[grade_col] = grades.astype(int).astype('category')
    return joined

# Attach the text metadata to both prompt tables, matching prompt_title against title.
prompts_train = preprocess_and_join(prompts_train, prompt_grade, 'prompt_title', 'title', 'grade')
prompts_test = preprocess_and_join(prompts_test, prompt_grade, 'prompt_title', 'title', 'grade')

In [None]:
!pip install "/kaggle/input/pyphen-0100/Pyphen-0.10.0-py3-none-any.whl"

In [None]:
import pyphen
from nltk.sentiment import SentimentIntensityAnalyzer

dic = pyphen.Pyphen(lang='en')
sid = SentimentIntensityAnalyzer()

class Preprocessor:
    """Build the hand-crafted feature table from prompts and student summaries.

    ``run`` is the entry point; every other public method computes one
    feature, either from a text or from a merged prompt/summary row.

    Several names are looked up in the notebook's global namespace when the
    methods are called: ``INIT_MODEL``, ``dic`` (pyphen), ``sid`` (VADER),
    ``TextBlob``, ``pos_tag``, ``cosine_similarity`` and ``TfidfVectorizer``.
    Some of them are imported in cells other than the one defining this
    class, so those cells must have been executed before ``run`` is called.
    """

    def __init__(self, 
                model_name: str,
                ) -> None:
        # NOTE(review): model_name is accepted but not used; the tokenizer is
        # always loaded from the global INIT_MODEL checkpoint.
        self.tokenizer = AutoTokenizer.from_pretrained(INIT_MODEL)
        self.twd = TreebankWordDetokenizer()
        self.STOP_WORDS = set(stopwords.words('english'))
        
        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = Speller(lang='en')
        self.spellchecker = SpellChecker() 
        
    def calculate_text_similarity(self, row):
        """Cosine similarity of the TF-IDF vectors of the prompt text and the summary.

        The vectorizer is fitted on just these two documents.
        """
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform([row['prompt_text'], row['text']])
        return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2]).flatten()[0]
    
    def sentiment_analysis(self, text):
        """Return TextBlob's ``(polarity, subjectivity)`` for ``text``."""
        sentiment = TextBlob(text).sentiment
        return sentiment.polarity, sentiment.subjectivity
    
    def word_overlap_count(self, row):
        """Number of distinct stop words shared by the prompt and the summary.

        NOTE(review): the filter KEEPS the stop words instead of removing
        them, which looks inverted relative to the usual "content word
        overlap" feature. The behavior is kept as is because changing it
        would change a model input; confirm the intent before touching it.
        """
        def check_is_stop_word(word):
            return word in self.STOP_WORDS
        
        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            prompt_words = list(filter(check_is_stop_word, prompt_words))
            summary_words = list(filter(check_is_stop_word, summary_words))
        return len(set(prompt_words).intersection(set(summary_words)))
            
    def ngrams(self, token, n):
        """Return the space-joined n-grams of a token list, in order."""
        # Zipping the list with its n-1 shifted copies yields consecutive n-tuples.
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]

    def ngram_co_occurrence(self, row, n: int) -> int:
        """Number of distinct n-grams that occur in both the prompt and the summary."""
        original_tokens = row['prompt_tokens']
        summary_tokens = row['summary_tokens']

        # Generate n-grams for the original text and summary
        original_ngrams = set(self.ngrams(original_tokens, n))
        summary_ngrams = set(self.ngrams(summary_tokens, n))

        # Calculate the number of common n-grams
        common_ngrams = original_ngrams.intersection(summary_ngrams)
        return len(common_ngrams)
    
    def ner_overlap_count(self, row, mode:str):
        """Count named entities shared by prompt and summary, grouped by entity label.

        NOTE(review): ``mode == "test"`` reads ``self.ner_keys``, which is not
        set anywhere in this class, so that branch raises AttributeError unless
        the attribute is assigned externally. ``run`` does not call this method.
        Any other ``mode`` value returns None.
        """
        model = self.spacy_ner_model
        def clean_ners(ner_list):
            return set([(ner[0].lower(), ner[1]) for ner in ner_list])
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
            summary_ner = set([(token.text, token.label_) for token in summary.ents])
        elif "stanza" in str(model):
            prompt_ner = set([(token.text, token.type) for token in prompt.ents])
            summary_ner = set([(token.text, token.type) for token in summary.ents])
        else:
            raise Exception("Model not supported")

        prompt_ner = clean_ners(prompt_ner)
        summary_ner = clean_ners(summary_ner)

        intersecting_ners = prompt_ner.intersection(summary_ner)
        
        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))
        
        if mode == "train":
            return ner_dict
        elif mode == "test":
            return {key: ner_dict.get(key) for key in self.ner_keys}

    
    def quotes_count(self, row):
        """Number of double-quoted passages in the summary that occur verbatim in the prompt text."""
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        # No quoted passage in the summary simply sums to 0.
        return sum(1 for quote in quotes_from_summary if quote in text)

    def spelling(self, text):
        """Number of distinct whitespace-separated words unknown to the spell checker."""
        wordlist=text.split()
        amount_miss = len(list(self.spellchecker.unknown(wordlist)))

        return amount_miss
    
    def calculate_unique_words(self,text):
        """Number of distinct whitespace-separated words (case-sensitive)."""
        unique_words = set(text.split())
        return len(unique_words)
    
    def add_spelling_dictionary(self, tokens: List[str]) -> None:
        """dictionary update for pyspell checker and autocorrect"""
        self.spellchecker.word_frequency.load_words(tokens)
        self.speller.nlp_data.update({token:1000 for token in tokens})
        
    def calculate_pos_ratios(self , text):
        """Map each POS tag found in ``text`` to its share of all tagged tokens."""
        pos_tags = pos_tag(nltk.word_tokenize(text))
        pos_counts = Counter(tag for word, tag in pos_tags)
        total_words = len(pos_tags)
        # An empty text yields an empty Counter, so no division happens.
        ratios = {tag: count / total_words for tag, count in pos_counts.items()}
        return ratios
    
    def calculate_punctuation_ratios(self,text):
        """Map each punctuation character found in ``text`` to its share of all characters."""
        total_chars = len(text)
        punctuation_counts = Counter(char for char in text if char in '.,!?;:"()[]{}')
        ratios = {char: count / total_chars for char, count in punctuation_counts.items()}
        return ratios
    
    def calculate_keyword_density(self,row):
        """Share of summary words that also appear (exact, whitespace-split) in the prompt text.

        Returns 0.0 for a summary without any word instead of dividing by zero.
        """
        keywords = set(row['prompt_text'].split())
        text_words = row['text'].split()
        if not text_words:
            return 0.0
        keyword_count = sum(1 for word in text_words if word in keywords)
        return keyword_count / len(text_words)
    
    def count_syllables(self,word):
        """Syllable estimate: number of parts after pyphen hyphenation of ``word``."""
        hyphenated_word = dic.inserted(word)
        return len(hyphenated_word.split('-'))

    def _sentences_and_words(self, text):
        """Parse ``text`` once with TextBlob and return (sentence count, word list)."""
        blob = TextBlob(text)
        return len(blob.sentences), blob.words

    def flesch_reading_ease_manual(self,text):
        """Flesch reading-ease score; 0 when the text has no sentence or no word."""
        total_sentences, words = self._sentences_and_words(text)
        total_words = len(words)

        if total_sentences == 0 or total_words == 0:
            return 0

        total_syllables = sum(self.count_syllables(word) for word in words)
        flesch_score = 206.835 - 1.015 * (total_words / total_sentences) - 84.6 * (total_syllables / total_words)
        return flesch_score
    
    def flesch_kincaid_grade_level(self, text):
        """Flesch-Kincaid grade level; 0 when the text has no sentence or no word."""
        total_sentences, words = self._sentences_and_words(text)
        total_words = len(words)

        if total_sentences == 0 or total_words == 0:
            return 0

        total_syllables = sum(self.count_syllables(word) for word in words)
        fk_grade = 0.39 * (total_words / total_sentences) + 11.8 * (total_syllables / total_words) - 15.59
        return fk_grade
    
    def gunning_fog(self, text):
        """Gunning fog index; words with more than two syllables count as complex.

        Returns 0 when the text has no sentence or no word.
        """
        total_sentences, words = self._sentences_and_words(text)
        total_words = len(words)

        if total_sentences == 0 or total_words == 0:
            return 0

        complex_words = sum(1 for word in words if self.count_syllables(word) > 2)
        fog_index = 0.4 * ((total_words / total_sentences) + 100 * (complex_words / total_words))
        return fog_index
    
    def calculate_sentiment_scores(self,text):
        """Return the VADER polarity-score dict of ``text``."""
        sentiment_scores = sid.polarity_scores(text)
        return sentiment_scores
    
    def count_difficult_words(self, text, syllable_threshold=3):
        """Number of words with at least ``syllable_threshold`` syllables."""
        words = TextBlob(text).words
        difficult_words_count = sum(1 for word in words if self.count_syllables(word) >= syllable_threshold)
        return difficult_words_count

    def _jaccard_similarity(self, row):
        """Jaccard similarity of the prompt and summary token sets.

        Uses the token columns created at the start of ``run`` (the same
        ``word_tokenize`` output as tokenizing the texts again) and returns
        0.0 when both token sets are empty.
        """
        prompt_vocab = set(row['prompt_tokens'])
        summary_vocab = set(row['summary_tokens'])
        union = prompt_vocab | summary_vocab
        if not union:
            return 0.0
        return len(prompt_vocab & summary_vocab) / len(union)

    def run(self, 
            prompts: pd.DataFrame,
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:
        """Compute all features and return one row per summary.

        Side effects: feature columns are added to ``prompts`` and
        ``summaries`` in place, every prompt token is added to both spelling
        dictionaries, the tqdm description for pandas is changed, and the
        list of dropped helper columns is printed.

        Args:
            prompts: Prompt table with at least ``prompt_id`` and ``prompt_text``.
            summaries: Summary table with at least ``prompt_id`` and ``text``.
            mode: Not used by this method.

        Returns:
            ``summaries`` left-joined with ``prompts`` on ``prompt_id`` plus the
            feature columns, without the token-list and dict-valued helper columns.
        """
        # before merge preprocess: tokenize each text once and derive the length from it
        prompt_tokens = prompts["prompt_text"].apply(lambda x: word_tokenize(x))
        prompts["prompt_length"] = prompt_tokens.apply(len)
        prompts["prompt_tokens"] = prompt_tokens

        summary_tokens = summaries["text"].apply(lambda x: word_tokenize(x))
        summaries["summary_length"] = summary_tokens.apply(len)
        summaries["summary_tokens"] = summary_tokens
        
        # Add prompt tokens into spelling checker dictionary so that words taken
        # from the prompt are not counted (or "corrected") as misspellings.
        for tokens in prompts["prompt_tokens"]:
            self.add_spelling_dictionary(tokens)
        
        prompts['gunning_fog_prompt'] = prompts['prompt_text'].apply(self.gunning_fog)
        prompts['flesch_kincaid_grade_level_prompt'] = prompts['prompt_text'].apply(self.flesch_kincaid_grade_level)
        prompts['flesch_reading_ease_prompt'] = prompts['prompt_text'].apply(self.flesch_reading_ease_manual)

        # fix misspelling
        summaries["fixed_summary_text"] = summaries["text"].progress_apply(
            lambda x: self.speller(x)
        )
        
        # count misspelling (on the original text, not the corrected one)
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)
        
        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")
        input_df['flesch_reading_ease'] = input_df['text'].apply(self.flesch_reading_ease_manual)
        input_df['word_count'] = input_df['text'].apply(lambda x: len(x.split()))
        # Number of '.'-separated pieces, used as a rough sentence count.
        input_df['sentence_length'] = input_df['text'].apply(lambda x: len(x.split('.')))
        input_df['vocabulary_richness'] = input_df['text'].apply(lambda x: len(set(x.split())))

        # Variants split on single spaces only (word_count/vocabulary_richness split on any whitespace).
        input_df['word_count2'] = [len(t.split(' ')) for t in input_df.text]
        input_df['num_unq_words']=[len(list(set(x.lower().split(' ')))) for x in input_df.text]
        input_df['num_chars']= [len(x) for x in input_df.text]

        # Additional features
        input_df['avg_word_length'] = input_df['text'].apply(lambda x: np.mean([len(word) for word in x.split()]))
        input_df['comma_count'] = input_df['text'].apply(lambda x: x.count(','))
        input_df['semicolon_count'] = input_df['text'].apply(lambda x: x.count(';'))

        # after merge preprocess
        input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']
        
        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence,args=(2,), axis=1 
        )
        input_df['bigram_overlap_ratio'] = input_df['bigram_overlap_count'] / (input_df['summary_length'] - 1)
        
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )
        input_df['trigram_overlap_ratio'] = input_df['trigram_overlap_count'] / (input_df['summary_length'] - 2)
        
        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        
        input_df['exclamation_count'] = input_df['text'].apply(lambda x: x.count('!'))
        input_df['question_count'] = input_df['text'].apply(lambda x: x.count('?'))
        input_df['pos_ratios'] = input_df['text'].apply(self.calculate_pos_ratios)

        # Convert the dictionary of POS ratios into a single value (mean)
        input_df['pos_mean'] = input_df['pos_ratios'].apply(lambda x: np.mean(list(x.values())))
        input_df['punctuation_ratios'] = input_df['text'].apply(self.calculate_punctuation_ratios)

        # Convert the dictionary of punctuation ratios into a single value (sum)
        input_df['punctuation_sum'] = input_df['punctuation_ratios'].apply(lambda x: np.sum(list(x.values())))
        input_df['keyword_density'] = input_df.apply(self.calculate_keyword_density, axis=1)
        # Reuses the token columns instead of tokenizing both texts four times per row.
        input_df['jaccard_similarity'] = input_df.apply(self._jaccard_similarity, axis=1)
        tqdm.pandas(desc="Performing Sentiment Analysis")
        input_df[['sentiment_polarity', 'sentiment_subjectivity']] = input_df['text'].progress_apply(
            lambda x: pd.Series(self.sentiment_analysis(x))
        )
        tqdm.pandas(desc="Calculating Text Similarity")
        input_df['text_similarity'] = input_df.progress_apply(self.calculate_text_similarity, axis=1)
        #Calculate sentiment scores for each row
        input_df['sentiment_scores'] = input_df['text'].apply(self.calculate_sentiment_scores)
        
        input_df['gunning_fog'] = input_df['text'].apply(self.gunning_fog)
        input_df['flesch_kincaid_grade_level'] = input_df['text'].apply(self.flesch_kincaid_grade_level)
        input_df['count_difficult_words'] = input_df['text'].apply(self.count_difficult_words)

        # Convert sentiment_scores into individual columns. The new frames get a
        # default 0..n-1 index, the same as the merge result, so concat pairs rows correctly.
        sentiment_columns = pd.DataFrame(list(input_df['sentiment_scores']))
        input_df = pd.concat([input_df, sentiment_columns], axis=1)
        input_df['sentiment_scores_prompt'] = input_df['prompt_text'].apply(self.calculate_sentiment_scores)
        # Convert sentiment_scores_prompt into individual columns
        sentiment_columns_prompt = pd.DataFrame(list(input_df['sentiment_scores_prompt']))
        sentiment_columns_prompt.columns = [col +'_prompt' for col in sentiment_columns_prompt.columns]
        input_df = pd.concat([input_df, sentiment_columns_prompt], axis=1)
        # Dict-valued helper columns cannot be used as model features; drop them.
        columns =  ['pos_ratios', 'sentiment_scores', 'punctuation_ratios', 'sentiment_scores_prompt']
        cols_to_drop = [col for col in columns if col in input_df.columns]
        if cols_to_drop:
            input_df = input_df.drop(columns=cols_to_drop)
        
        print(cols_to_drop)
        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    
preprocessor = Preprocessor(model_name=CFG.model_name)

## Preprocess

**Features used**

- Text Length
- Length Ratio
- Word Overlap
- N-grams Co-occurrence
  - count
  - ratio
- Quotes Overlap
- Grammar Check
  - spelling: pyspellchecker

In [None]:
# class Preprocessor:
#     def __init__(self, 
#                 model_name: str,
#                 ) -> None:
#         self.tokenizer = AutoTokenizer.from_pretrained(INIT_MODEL)
#         self.twd = TreebankWordDetokenizer()
#         self.STOP_WORDS = set(stopwords.words('english'))
        
#         self.spacy_ner_model = spacy.load('en_core_web_sm',)
#         self.speller = Speller(lang='en')
#         self.spellchecker = SpellChecker() 
        
#     def word_overlap_count(self, row):
#         """ intersection(prompt_text, text) """        
#         def check_is_stop_word(word):
#             return word in self.STOP_WORDS
        
#         prompt_words = row['prompt_tokens']
#         summary_words = row['summary_tokens']
#         if self.STOP_WORDS:
#             prompt_words = list(filter(check_is_stop_word, prompt_words))
#             summary_words = list(filter(check_is_stop_word, summary_words))
#         return len(set(prompt_words).intersection(set(summary_words)))
            
#     def ngrams(self, token, n):
#         # Use the zip function to help us generate n-grams
#         # Concatentate the tokens into ngrams and return
#         ngrams = zip(*[token[i:] for i in range(n)])
#         return [" ".join(ngram) for ngram in ngrams]

#     def ngram_co_occurrence(self, row, n: int) -> int:
#         # Tokenize the original text and summary into words
#         original_tokens = row['prompt_tokens']
#         summary_tokens = row['summary_tokens']

#         # Generate n-grams for the original text and summary
#         original_ngrams = set(self.ngrams(original_tokens, n))
#         summary_ngrams = set(self.ngrams(summary_tokens, n))

#         # Calculate the number of common n-grams
#         common_ngrams = original_ngrams.intersection(summary_ngrams)
#         return len(common_ngrams)
    
#     def ner_overlap_count(self, row, mode:str):
#         model = self.spacy_ner_model
#         def clean_ners(ner_list):
#             return set([(ner[0].lower(), ner[1]) for ner in ner_list])
#         prompt = model(row['prompt_text'])
#         summary = model(row['text'])

#         if "spacy" in str(model):
#             prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
#             summary_ner = set([(token.text, token.label_) for token in summary.ents])
#         elif "stanza" in str(model):
#             prompt_ner = set([(token.text, token.type) for token in prompt.ents])
#             summary_ner = set([(token.text, token.type) for token in summary.ents])
#         else:
#             raise Exception("Model not supported")

#         prompt_ner = clean_ners(prompt_ner)
#         summary_ner = clean_ners(summary_ner)

#         intersecting_ners = prompt_ner.intersection(summary_ner)
        
#         ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))
        
#         if mode == "train":
#             return ner_dict
#         elif mode == "test":
#             return {key: ner_dict.get(key) for key in self.ner_keys}

    
#     def quotes_count(self, row):
#         summary = row['text']
#         text = row['prompt_text']
#         quotes_from_summary = re.findall(r'"([^"]*)"', summary)
#         if len(quotes_from_summary)>0:
#             return [quote in text for quote in quotes_from_summary].count(True)
#         else:
#             return 0

#     def spelling(self, text):
        
#         wordlist=text.split()
#         amount_miss = len(list(self.spellchecker.unknown(wordlist)))

#         return amount_miss
    
#     def add_spelling_dictionary(self, tokens: List[str]) -> List[str]:
#         """dictionary update for pyspell checker and autocorrect"""
#         self.spellchecker.word_frequency.load_words(tokens)
#         self.speller.nlp_data.update({token:1000 for token in tokens})
    
#     def run(self, 
#             prompts: pd.DataFrame,
#             summaries:pd.DataFrame,
#             mode:str
#         ) -> pd.DataFrame:
        
#         # before merge preprocess
#         prompts["prompt_length"] = prompts["prompt_text"].apply(
#             lambda x: len(word_tokenize(x))
#         )
#         prompts["prompt_tokens"] = prompts["prompt_text"].apply(
#             lambda x: word_tokenize(x)
#         )

#         summaries["summary_length"] = summaries["text"].apply(
#             lambda x: len(word_tokenize(x))
#         )
#         summaries["summary_tokens"] = summaries["text"].apply(
#             lambda x: word_tokenize(x)
#         )
        
#         # Add prompt tokens into spelling checker dictionary
#         prompts["prompt_tokens"].apply(
#             lambda x: self.add_spelling_dictionary(x)
#         )
        
# #         from IPython.core.debugger import Pdb; Pdb().set_trace()
#         # fix misspelling
#         summaries["fixed_summary_text"] = summaries["text"].progress_apply(
#             lambda x: self.speller(x)
#         )
        
#         # count misspelling
#         summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)
        
#         # merge prompts and summaries
#         input_df = summaries.merge(prompts, how="left", on="prompt_id")

#         # after merge preprocess
#         input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']

#         input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
#         input_df['bigram_overlap_count'] = input_df.progress_apply(
#             self.ngram_co_occurrence,args=(2,), axis=1 
#         )
#         input_df['bigram_overlap_ratio'] = input_df['bigram_overlap_count'] / (input_df['summary_length'] - 1)
        
#         input_df['trigram_overlap_count'] = input_df.progress_apply(
#             self.ngram_co_occurrence, args=(3,), axis=1
#         )
#         input_df['trigram_overlap_ratio'] = input_df['trigram_overlap_count'] / (input_df['summary_length'] - 2)
        
#         input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        
#         return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    
# preprocessor = Preprocessor(model_name=CFG.model_name)

In [None]:
summaries_train.head()

In [None]:
import pickle
# Read two objects, stored back to back in one pickle file, into train and test.
# NOTE(review): both names are rebound by the preprocessor.run(...) calls in the
# next code cell, so this load looks redundant — confirm before removing it.
# NOTE(review): pickle.load can execute code embedded in the file; this is only
# safe as long as pickled.pkl comes from a trusted source.
with open(MODEL_DIR+'/pickled.pkl', 'rb') as f:
    train = pickle.load(f)
    test = pickle.load(f)

In [None]:
from textblob import TextBlob
from nltk import ne_chunk, word_tokenize, pos_tag
from sklearn.metrics.pairwise import cosine_similarity
# Build the feature tables. run() also adds columns to the prompt and summary
# frames passed in, and the spelling dictionaries of the shared preprocessor
# accumulate the prompt tokens of both calls.
train = preprocessor.run(prompts_train, summaries_train, mode="train")
test = preprocessor.run(prompts_test, summaries_test, mode="test")

train.head()

In [None]:
# Independent copies of the freshly built feature tables, so that later cells
# can modify train/test without losing the preprocessing result.
train_keep = train.copy()
test_keep = test.copy()

In [None]:
# train = train[:256]

In [None]:
# Grouped K-fold: all summaries of one prompt land in the same validation fold.
gkf = GroupKFold(n_splits=CFG.n_splits)

for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
    # split() yields row POSITIONS while .loc selects by LABEL; translate through
    # the index so the assignment is also correct when train does not carry a
    # default 0..n-1 index (e.g. after filtering rows).
    train.loc[train.index[val_index], "fold"] = i

train.head()

## Model Function Definition

In [None]:
import shutil

def compute_metrics(eval_pred):
    """Return the RMSE between predictions and labels as ``{"rmse": value}``.

    Args:
        eval_pred: Pair ``(predictions, labels)`` as passed by the Trainer.

    Returns:
        Dict with the single key ``"rmse"``. With several output columns the
        value is the mean of the per-column RMSEs.
    """
    predictions, labels = eval_pred
    # The `squared=False` keyword was deprecated and later removed from
    # mean_squared_error; taking the root of the per-output MSE ourselves
    # gives the same number and works across scikit-learn versions.
    per_output_mse = mean_squared_error(labels, predictions, multioutput="raw_values")
    rmse = np.mean(np.sqrt(per_output_mse))
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Mean columnwise root mean squared error (the competition metric).
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    ``eval_pred`` is a ``(predictions, labels)`` pair of equally shaped arrays
    whose column 0 is the content score and column 1 the wording score.
    Returns both per-column RMSEs and their mean.
    """
    predictions, targets = eval_pred

    squared_error = np.square(predictions - targets)
    # One RMSE per column: average over the rows first, then take the root.
    rmse_per_column = np.sqrt(np.mean(squared_error, axis=0))

    return dict(
        content_rmse=rmse_per_column[0],
        wording_rmse=rmse_per_column[1],
        mcrmse=np.mean(rmse_per_column),
    )

def compt_score(content_true, content_pred, wording_true, wording_pred):
    """Average of the content RMSE and the wording RMSE."""
    target_pairs = (
        (content_true, content_pred),
        (wording_true, wording_pred),
    )
    # RMSE per target: square root of the mean squared error.
    content_score, wording_score = [
        mean_squared_error(y_true, y_pred) ** (1/2)
        for y_true, y_pred in target_pairs
    ]
    return (content_score + wording_score) / 2

## Deberta Regressor

In [None]:
class ContentScoreRegressor:
    """Fine-tune and run a single-output regression transformer for one target.

    The model input is ``prompt_title + SEP + prompt_question + SEP +
    fixed_summary_text``, truncated to ``max_length`` tokens. Tokenizer and
    config are always loaded from the global ``INIT_MODEL`` checkpoint.
    """

    def __init__(self, 
                model_name: str,
                model_dir: str,
                target: str,
                hidden_dropout_prob: float,
                attention_probs_dropout_prob: float,
                max_length: int,
                ):
        """
        Args:
            model_name: Base model name; ``train`` loads weights from
                ``/kaggle/input/<model_name>``.
            model_dir: Directory the fine-tuned model is saved to and loaded from.
            target: Name of the label column to regress.
            hidden_dropout_prob: Written into the model config.
            attention_probs_dropout_prob: Written into the model config.
            max_length: Maximum number of tokens per input (longer inputs are truncated).
        """
        # NOTE(review): self.inputs and self.text_cols are not read by any method of this class.
        self.inputs = ["prompt_text", "prompt_title", "prompt_question", "fixed_summary_text"]
        self.input_col = "input"
        
        self.text_cols = [self.input_col] 
        self.target = target
        self.target_cols = [target]

        self.model_name = model_name
        self.model_dir = model_dir
        self.max_length = max_length
        
        self.tokenizer = AutoTokenizer.from_pretrained(INIT_MODEL)
        self.model_config = AutoConfig.from_pretrained(INIT_MODEL)
        
        self.model_config.update({
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels": 1,
            "problem_type": "regression",
        })
        
        # Re-seed on construction so every fold starts from the same RNG state.
        seed_everything(seed=42)

        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer
        )

    def _compose_input(self, df: pd.DataFrame) -> pd.Series:
        """Build the model input text: title, question and corrected summary joined by SEP."""
        sep = self.tokenizer.sep_token
        return (
            df["prompt_title"] + sep
            + df["prompt_question"] + sep
            + df["fixed_summary_text"]
        )

    def _tokenize(self, text):
        """Tokenize one input text without padding, truncated to ``max_length``."""
        return self.tokenizer(text,
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)

    def _training_frame(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a NEW frame holding only the input column followed by the target column(s).

        Building a new frame avoids writing into the filtered slice passed by
        the caller (pandas' chained-assignment hazard).
        """
        frame = df[self.target_cols].copy()
        frame.insert(0, self.input_col, self._compose_input(df))
        return frame

    def tokenize_function(self, examples):
        """Tokenize one training example and attach its label.

        Called through ``Dataset.map(..., batched=False)``, so ``examples`` is a
        single row (mapping of column name to value); the label is wrapped in
        a one-element list.
        """
        labels = [examples[self.target]]
        tokenized = self._tokenize(examples[self.input_col])
        return {
            **tokenized,
            "labels": labels,
        }
    
    def tokenize_function_test(self, examples):
        """Tokenize one inference example (single row, no label)."""
        return self._tokenize(examples[self.input_col])
        
    def train(self, 
            fold: int,
            train_df: pd.DataFrame,
            valid_df: pd.DataFrame,
            batch_size: int,
            learning_rate: float,
            weight_decay: float,
            num_train_epochs: float,
            save_steps: int,
        ) -> None:
        """fine-tuning

        Trains on ``train_df``, evaluates on ``valid_df`` every ``save_steps``
        steps, reloads the checkpoint with the lowest validation RMSE, and
        saves that model and the tokenizer to ``self.model_dir``. The frames
        passed in are not modified.
        """
        train_df = self._training_frame(train_df)
        valid_df = self._training_frame(valid_df)
        
        model_content = AutoModelForSequenceClassification.from_pretrained(
            f"/kaggle/input/{self.model_name}", 
            config=self.model_config,
            ignore_mismatched_sizes=True
        )

        train_dataset = Dataset.from_pandas(train_df, preserve_index=False) 
        val_dataset = Dataset.from_pandas(valid_df, preserve_index=False) 
    
        train_tokenized_datasets = train_dataset.map(self.tokenize_function, batched=False)
        val_tokenized_datasets = val_dataset.map(self.tokenize_function, batched=False)

        # Checkpoints go to a sub-directory of model_dir named after the fold number.
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 
        
        training_args = TrainingArguments(
            output_dir=model_fold_dir,
            load_best_model_at_end=True, # select best model
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=8,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            report_to='none',
            greater_is_better=False,
            save_strategy="steps",
            evaluation_strategy="steps",
            eval_steps=save_steps,
            save_steps=save_steps,
            metric_for_best_model="rmse",
            save_total_limit=1
        )

        trainer = Trainer(
            model=model_content,
            args=training_args,
            train_dataset=train_tokenized_datasets,
            eval_dataset=val_tokenized_datasets,
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator
        )

        trainer.train()
        
        # Remove the intermediate checkpoints (they live below model_dir) before
        # writing the final model; the best weights are already in memory.
        shutil.rmtree(self.model_dir)
        
        model_content.save_pretrained(self.model_dir)
        self.tokenizer.save_pretrained(self.model_dir)

        
    def predict(self, 
                test_df: pd.DataFrame,
                fold: int,
               ):
        """predict content score

        Loads the model saved in ``self.model_dir`` and returns its raw
        predictions for ``test_df``.

        Side effect: the composed input text is written into
        ``test_df[self.input_col]``. This is kept on purpose — code outside
        this class may rely on that column being present afterwards.

        Args:
            test_df: Frame with ``prompt_title``, ``prompt_question`` and
                ``fixed_summary_text``.
            fold: Not used; kept so existing callers keep working.
        """
        test_df[self.input_col] = self._compose_input(test_df)

        test_ = test_df[[self.input_col]]
    
        test_dataset = Dataset.from_pandas(test_, preserve_index=False) 
        test_tokenized_dataset = test_dataset.map(self.tokenize_function_test, batched=False)

        model_content = AutoModelForSequenceClassification.from_pretrained(f"{self.model_dir}")
        model_content.eval()
        
        # Inference writes its logs to one fixed directory, independent of the fold.
        model_fold_dir = "valid_log"
        
        test_args = TrainingArguments(
            output_dir=model_fold_dir,
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = 4,   
            dataloader_drop_last = False,
        )

        # init trainer
        infer_content = Trainer(
                      model = model_content, 
                      tokenizer=self.tokenizer,
                      data_collator=self.data_collator,
                      args = test_args)

        preds = infer_content.predict(test_tokenized_dataset)[0]

        return preds

In [None]:
def train_by_fold(
        train_df: pd.DataFrame,
        model_name: str,
        target:str,
        save_each_model: bool,
        n_splits: int,
        batch_size: int,
        learning_rate: float,
        hidden_dropout_prob: float,
        attention_probs_dropout_prob: float,
        weight_decay: float,
        num_train_epochs: int,
        save_steps: int,
        max_length:int
    ):
    """Fine-tune one ``ContentScoreRegressor`` per fold for a single target.

    For fold ``k`` the rows with ``fold != k`` are used for training and the
    rows with ``fold == k`` for validation.

    Args:
        train_df: training frame; must contain a ``fold`` column.
        model_name: base model name, also used as a directory name.
        target: score column to fit.
        save_each_model: when true, checkpoints go to
            ``{target}/{model_name}/fold_{k}``, otherwise to
            ``{model_name}/fold_{k}``.
        n_splits: number of folds to train (previously ignored in favour of
            the global ``CFG.n_splits``).
        batch_size, learning_rate, weight_decay, num_train_epochs, save_steps:
            forwarded to ``ContentScoreRegressor.train``.
        hidden_dropout_prob, attention_probs_dropout_prob, max_length:
            forwarded to the ``ContentScoreRegressor`` constructor.
    """
    # delete old model files
    # NOTE: only the `model_name` directory is reset here, also when
    # save_each_model is true and checkpoints are written under `target/`.
    if os.path.exists(model_name):
        shutil.rmtree(model_name)

    os.mkdir(model_name)

    for fold in range(n_splits):
        print(f"fold {fold}:")

        train_data = train_df[train_df["fold"] != fold]
        valid_data = train_df[train_df["fold"] == fold]

        if save_each_model:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else:
            model_dir =  f"{model_name}/fold_{fold}"

        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )

        csr.train(
            fold=fold,
            train_df=train_data,
            valid_df=valid_data,
            batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            num_train_epochs=num_train_epochs,
            save_steps=save_steps,
        )

def validate(
    train_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ) -> pd.DataFrame:
    """Write out-of-fold predictions for ``target`` into ``{target}_pred``.

    Each fold's held-out rows are scored by the checkpoint stored for that
    fold under the global ``MODEL_DIR``; the number of folds comes from the
    global ``CFG.n_splits``. ``train_df`` is updated in place and returned.
    """
    oof_column = f"{target}_pred"

    for fold_id in range(CFG.n_splits):
        print(f"fold {fold_id}:")

        held_out = train_df[train_df["fold"] == fold_id]

        # Per-target checkpoints sit one directory deeper than shared ones.
        checkpoint_root = f"{MODEL_DIR}/{target}" if save_each_model == True else f"{MODEL_DIR}"

        regressor = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir=f"{checkpoint_root}/{model_name}/fold_{fold_id}",
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
        )

        fold_pred = regressor.predict(test_df=held_out, fold=fold_id)
        train_df.loc[held_out.index, oof_column] = fold_pred

    return train_df
    
def predict(
    test_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ):
    """Score ``test_df`` with every fold checkpoint and average the results.

    Adds one ``{target}_pred_{fold}`` column per fold plus a ``{target}``
    column holding their row-wise mean. ``test_df`` is updated in place and
    returned. Fold count and checkpoint root come from the globals
    ``CFG.n_splits`` and ``MODEL_DIR``.
    """
    per_fold_columns = []

    for fold_id in range(CFG.n_splits):
        print(f"fold {fold_id}:")

        # Per-target checkpoints sit one directory deeper than shared ones.
        checkpoint_root = f"{MODEL_DIR}/{target}" if save_each_model == True else f"{MODEL_DIR}"

        regressor = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir=f"{checkpoint_root}/{model_name}/fold_{fold_id}",
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
        )

        column = f"{target}_pred_{fold_id}"
        test_df[column] = regressor.predict(test_df=test_df, fold=fold_id)
        per_fold_columns.append(column)

    test_df[f"{target}"] = test_df[per_fold_columns].mean(axis=1)

    return test_df

In [None]:
for target in ["content", "wording"]:
    # Fine-tuning is switched off here: validate()/predict() read the fold
    # checkpoints from MODEL_DIR. The training call is kept for reference.
#     train_by_fold(
#         train,
#         model_name=CFG.model_name,
#         save_each_model=True,
#         target=target,
#         learning_rate=CFG.learning_rate,
#         hidden_dropout_prob=CFG.hidden_dropout_prob,
#         attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
#         weight_decay=CFG.weight_decay,
#         num_train_epochs=CFG.num_train_epochs,
#         n_splits=CFG.n_splits,
#         batch_size=CFG.batch_size,
#         save_steps=CFG.save_steps,
#         max_length=CFG.max_length
#     )
    
    print("[validate]")
    train = validate(
        train,
        target=target,
        save_each_model=True,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length
    )

    # np.sqrt(MSE) instead of `squared=False`: that keyword is deprecated in
    # newer scikit-learn releases, and this form matches the GBDT CV cell.
    rmse = np.sqrt(mean_squared_error(train[target], train[f"{target}_pred"]))
    print(f"cv {target} rmse: {rmse}")

    print("[test]")
    test = predict(
        test,
        target=target,
        save_each_model=True,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length
    )

In [None]:
# !rm -r wording content

In [None]:
train.head()

In [None]:
# Keep a handle on the first model's frames.
# NOTE(review): these are aliases, not copies - the in-place column writes the
# following cells make to `train`/`test` are visible through them as well.
train_1st = train
test_1st = test

## GBDT model (CatBoost; the LightGBM version is kept commented out)

In [None]:
# Score columns; a separate set of fold models is fitted for each.
targets = ["content", "wording"]

# Columns removed from the training frame before it is handed to the GBDT:
# the fold marker, id and text columns, and the targets themselves.
drop_columns = [
    "fold",
    "student_id",
    "prompt_id",
    "text",
    "fixed_summary_text",
    "prompt_question",
    "prompt_title",
    "prompt_text",
    "title",
    "author",
    "description",
    "genre",
    "grade",
    *targets,
]

In [None]:
train[train["fold"] != 0].drop(columns=drop_columns)

In [None]:
# model_dict = {}

# for target in targets:
#     models = []
    
#     for fold in range(CFG.n_splits):

#         X_train_cv = train[train["fold"] != fold].drop(columns=drop_columns)
#         y_train_cv = train[train["fold"] != fold][target]

#         X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
#         y_eval_cv = train[train["fold"] == fold][target]

#         dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
#         dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)

#         params = {
#             'boosting_type': 'gbdt',
#             'random_state': 42,
#             'objective': 'regression',
#             'metric': 'rmse',
#             'learning_rate': 0.048,
#             'max_depth': 4,
#             'lambda_l1': 0.001,
#             'lambda_l2': 0.011
#         }

#         evaluation_results = {}
#         model = lgb.train(params,
#                           num_boost_round=10000,
#                             #categorical_feature = categorical_features,
#                           valid_names=['train', 'valid'],
#                           train_set=dtrain,
#                           valid_sets=dval,
#                           callbacks=[
#                               lgb.early_stopping(stopping_rounds=30, verbose=True),
#                               lgb.log_evaluation(100),
#                               lgb.callback.record_evaluation(evaluation_results)
#                             ],
#                           )
#         models.append(model)
    
#     model_dict[target] = models

In [None]:
from catboost import CatBoostRegressor


# Settings shared by every fold model.
catboost_params = dict(
    learning_rate = 0.048,
    depth = 4,
    min_data_in_leaf = 34,
    iterations = 10000,
    early_stopping_rounds = 300,
    task_type ='CPU',
    loss_function ='RMSE',
)

# target name -> list of fitted models, one per fold (index == fold number)
model_dict = {}

for target in targets:
    models = []

    for fold in range(CFG.n_splits):
        in_eval_fold = train["fold"] == fold

        # Fit on every other fold; the held-out fold is the eval set.
        X_train_cv = train[~in_eval_fold].drop(columns=drop_columns)
        y_train_cv = train.loc[~in_eval_fold, target]

        X_eval_cv = train[in_eval_fold].drop(columns=drop_columns)
        y_eval_cv = train.loc[in_eval_fold, target]

        model = CatBoostRegressor(**catboost_params)
        model.fit(
            X_train_cv,
            y_train_cv,
            eval_set=[(X_eval_cv, y_eval_cv)],
            verbose=False,
        )
        models.append(model)

    model_dict[target] = models

## CV Score

In [None]:
# cv: score each fold model on its own held-out fold, pool the predictions
# per target, then average the per-target RMSEs.
rmses = []

for target in targets:
    models = model_dict[target]
    trues, preds = [], []

    for fold, model in enumerate(models):
        held_out = train[train["fold"] == fold]
        X_eval_cv = held_out.drop(columns=drop_columns)
        y_eval_cv = held_out[target]

        pred = model.predict(X_eval_cv)

        trues.extend(y_eval_cv)
        preds.extend(pred)

    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses.append(rmse)

print(f"mcrmse : {sum(rmses) / len(rmses)}")

## Predict

In [None]:
# Columns removed from the test frame before GBDT inference. The test frame has
# no "fold" column; it carries "input" and the per-fold prediction columns
# instead.
drop_columns = [
    #"fold",
    "student_id",
    "prompt_id",
    "text",
    "fixed_summary_text",
    "prompt_question",
    "prompt_title",
    "prompt_text",
    "input",
    "title",
    "author",
    "description",
    "genre",
    "grade",
]
drop_columns += [
    f"{score}_pred_{i}"
    for score in ("content", "wording")
    for i in range(CFG.n_splits)
]

In [None]:
# Model classes that were fitted on columns named "<target>_pred"; the test
# frame stores the same values under "<target>", so rename before predicting.
rename_notice_by_type = {
    "<class 'xgboost.sklearn.XGBRegressor'>": "pred xgb. rename cols",
    "<class 'catboost.core.CatBoostRegressor'>": "pred cat. rename cols",
}
stacked_feature_names = {'content':'content_pred','wording':'wording_pred'}

# target name -> list of per-fold prediction arrays
pred_dict = {}
for target in targets:
    models = model_dict[target]
    preds = []

    for fold, model in enumerate(models):
        X_eval_cv = test.drop(columns=drop_columns)

        notice = rename_notice_by_type.get(str(type(model)))
        if notice is None:
            print("pred lgb")
        else:
            print(notice)
            X_eval_cv = X_eval_cv.rename(stacked_feature_names, axis=1)

        pred = model.predict(X_eval_cv)
        preds.append(pred)

    pred_dict[target] = preds

In [None]:
# Store each fold model's prediction, then overwrite the target column with
# the row-wise mean across folds.
for target in targets:
    preds = pred_dict[target]
    for i, pred in enumerate(preds):
        fold_column = f"{target}_pred_{i}"
        test[fold_column] = pred

    fold_columns = [f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]
    test[target] = test[fold_columns].mean(axis=1)

## Create Submission file

In [None]:
test_1 = test[["student_id", "content", "wording"]]
test_1.head()

# 2nd MODEL

In [None]:
from typing import List
import numpy as np
import pandas as pd
import warnings
import logging
import os
import shutil
import json
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset,load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
import spacy
import re
from autocorrect import Speller
from spellchecker import SpellChecker
import lightgbm as lgb

warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
disable_progress_bar()
tqdm.pandas()

In [None]:
def seed_everything(seed: int):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: value used for every generator.
    """
    import random, os
    import numpy as np
    import torch
    
    random.seed(seed)
    # Only inherited by processes started after this point; the running
    # interpreter reads PYTHONHASHSEED at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may pick a different convolution algorithm on each run,
    # which defeats the deterministic setting above, so it must be off.
    torch.backends.cudnn.benchmark = False
    
seed_everything(seed=42)

In [None]:
class CFG:
    """Settings shared by the cells of this model section."""

    # Name of the base model; also a path component of the fold checkpoints.
    model_name = "another-bert"
    # Tokenizer truncation length.
    max_length = 512

    # Fine-tuning hyper-parameters.
    learning_rate = 1.5e-5
    weight_decay = 0.02
    hidden_dropout_prob = 0.007
    attention_probs_dropout_prob = 0.007
    num_train_epochs = 5
    batch_size = 12
    # Interval (in steps) used for both evaluation and checkpoint saving.
    save_steps = 20

    # Cross-validation.
    n_splits = 4
    random_seed = 42

In [None]:
# Root of the saved artifacts for this model: validate()/predict() read the
# fold checkpoints from {MODEL_DIR}/{target}/{model_name}/fold_{k}.
MODEL_DIR = '/kaggle/input/commitlit-deberta-v3-large-nofix'
# Checkpoint the tokenizer and the model config are loaded from.
INIT_MODEL = f"{MODEL_DIR}/content/{CFG.model_name}/fold_0"

## Dataload

In [None]:
# Competition input files: prompts and summaries for train and test, plus the
# sample submission.
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"

prompts_train = pd.read_csv(DATA_DIR + "prompts_train.csv")
prompts_test = pd.read_csv(DATA_DIR + "prompts_test.csv")
summaries_train = pd.read_csv(DATA_DIR + "summaries_train.csv")
summaries_test = pd.read_csv(DATA_DIR + "summaries_test.csv")
sample_submission = pd.read_csv(DATA_DIR + "sample_submission.csv")

## Preprocess

**Features used**

- Text Length
- Length Ratio
- Word Overlap
- N-grams Co-occurrence
  - count
  - ratio
- Quotes Overlap
- Grammar Check
  - spelling: pyspellchecker

In [None]:
class Preprocessor:
    """Computes hand-crafted features relating a summary to its prompt.

    ``run`` produces length, overlap, n-gram, quote and spelling features and
    a spell-corrected copy of each summary. The remaining public methods are
    the row-wise feature functions ``run`` applies.
    """

    def __init__(self, 
                model_name: str,
                ) -> None:
        # NOTE: `model_name` is accepted but not read; the tokenizer always
        # comes from the module-level INIT_MODEL checkpoint.
        self.tokenizer = AutoTokenizer.from_pretrained(INIT_MODEL)
        self.twd = TreebankWordDetokenizer()
        self.STOP_WORDS = set(stopwords.words('english'))
        
        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = Speller(lang='en')
        self.spellchecker = SpellChecker() 
        
    def word_overlap_count(self, row):
        """Number of distinct tokens shared by ``prompt_tokens`` and ``summary_tokens``.

        When ``self.STOP_WORDS`` is non-empty, both token lists are first
        reduced to the tokens that ARE stop words.
        """
        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            # NOTE(review): this keeps only stop words, which looks inverted
            # (dropping stop words was presumably intended). Left as is so the
            # feature stays comparable with values computed earlier; if it is
            # changed, features must be recomputed for train and test together.
            prompt_words = [word for word in prompt_words if word in self.STOP_WORDS]
            summary_words = [word for word in summary_words if word in self.STOP_WORDS]
        return len(set(prompt_words) & set(summary_words))
            
    def ngrams(self, token, n):
        """Return the space-joined n-grams of the token list ``token``."""
        # zip over n shifted views of the token list yields consecutive n-tuples
        ngrams = zip(*[token[i:] for i in range(n)])
        return [" ".join(ngram) for ngram in ngrams]

    def ngram_co_occurrence(self, row, n: int) -> int:
        """Number of distinct n-grams that occur in both prompt and summary tokens."""
        original_ngrams = set(self.ngrams(row['prompt_tokens'], n))
        summary_ngrams = set(self.ngrams(row['summary_tokens'], n))
        return len(original_ngrams & summary_ngrams)
    
    def ner_overlap_count(self, row, mode:str):
        """Count named entities shared by prompt and summary, per entity label.

        Entities are compared as (lower-cased text, label) pairs.

        Args:
            row: mapping with ``prompt_text`` and ``text``.
            mode: ``"train"`` returns a dict with only the labels that occur;
                ``"test"`` returns one entry per label in ``self.ner_keys``
                (``None`` for labels without a shared entity).

        Raises:
            Exception: the NER model is neither a spaCy nor a stanza model.
            AttributeError: ``mode="test"`` but ``self.ner_keys`` was never set.
            ValueError: ``mode`` is neither ``"train"`` nor ``"test"``.
        """
        model = self.spacy_ner_model
        def clean_ners(ner_list):
            return set([(ner[0].lower(), ner[1]) for ner in ner_list])
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
            summary_ner = set([(token.text, token.label_) for token in summary.ents])
        elif "stanza" in str(model):
            prompt_ner = set([(token.text, token.type) for token in prompt.ents])
            summary_ner = set([(token.text, token.type) for token in summary.ents])
        else:
            raise Exception("Model not supported")

        prompt_ner = clean_ners(prompt_ner)
        summary_ner = clean_ners(summary_ner)

        intersecting_ners = prompt_ner.intersection(summary_ner)
        
        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))
        
        if mode == "train":
            return ner_dict
        if mode == "test":
            # `ner_keys` is not assigned anywhere in this class; the caller
            # has to provide it before test-mode use.
            ner_keys = getattr(self, "ner_keys", None)
            if ner_keys is None:
                raise AttributeError(
                    "Preprocessor.ner_keys must be set before calling "
                    "ner_overlap_count(mode='test')"
                )
            return {key: ner_dict.get(key) for key in ner_keys}
        raise ValueError(f"Unsupported mode: {mode!r} (expected 'train' or 'test')")

    
    def quotes_count(self, row):
        """Number of double-quoted spans in the summary that occur verbatim in the prompt text."""
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        return sum(quote in text for quote in quotes_from_summary)

    def spelling(self, text):
        """Number of distinct whitespace-separated words the spell checker does not know."""
        return len(self.spellchecker.unknown(text.split()))
    
    def add_spelling_dictionary(self, tokens: List[str]) -> None:
        """dictionary update for pyspell checker and autocorrect"""
        self.spellchecker.word_frequency.load_words(tokens)
        self.speller.nlp_data.update({token:1000 for token in tokens})
    
    def run(self, 
            prompts: pd.DataFrame,
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:
        """Build the feature frame for one data split.

        Args:
            prompts: frame with ``prompt_id`` and ``prompt_text``. Helper
                columns (``prompt_length``, ``prompt_tokens``) are added to it
                in place.
            summaries: frame with ``prompt_id`` and ``text``. Helper columns
                (``summary_length``, ``summary_tokens``,
                ``fixed_summary_text``, ``splling_err_num``) are added to it
                in place.
            mode: accepted for interface compatibility; not read here.

        Returns:
            ``summaries`` left-joined with ``prompts`` plus the ratio, overlap
            and quote features, without the token columns.
        """
        # before merge preprocess
        # Tokenise each text once and derive the length from the tokens.
        prompt_tokens = prompts["prompt_text"].apply(word_tokenize)
        prompts["prompt_length"] = prompt_tokens.apply(len)
        prompts["prompt_tokens"] = prompt_tokens

        summary_tokens = summaries["text"].apply(word_tokenize)
        summaries["summary_length"] = summary_tokens.apply(len)
        summaries["summary_tokens"] = summary_tokens
        
        # Add prompt tokens into spelling checker dictionary
        for tokens in prompts["prompt_tokens"]:
            self.add_spelling_dictionary(tokens)
        
        # fix misspelling
        summaries["fixed_summary_text"] = summaries["text"].progress_apply(
            lambda x: self.speller(x)
        )
        
        # count misspelling
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)
        
        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")

        # after merge preprocess
        input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']

        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence,args=(2,), axis=1 
        )
        # NOTE: the denominator is 0 for one-token summaries (two tokens for
        # the trigram ratio below), so those rows get a non-finite ratio.
        input_df['bigram_overlap_ratio'] = input_df['bigram_overlap_count'] / (input_df['summary_length'] - 1)
        
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )
        input_df['trigram_overlap_ratio'] = input_df['trigram_overlap_count'] / (input_df['summary_length'] - 2)
        
        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        
        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    
preprocessor = Preprocessor(model_name=CFG.model_name)

In [None]:
summaries_train.head()

In [None]:
import pickle
# Load pre-computed `train` and `test` frames (two consecutive pickle records)
# instead of re-running the preprocessor.
# NOTE(security): pickle.load can execute arbitrary code; only use it on a
# file whose origin is trusted.
# NOTE(review): the next cell rebinds `train`/`test` from `train_keep`/`test_keep`,
# so the frames loaded here are replaced before they are used.
with open(MODEL_DIR+'/pickled.pkl', 'rb') as f:
    train = pickle.load(f)
    test = pickle.load(f)

# train = preprocessor.run(prompts_train, summaries_train, mode="train")
#test = preprocessor.run(prompts_test, summaries_test, mode="test")

# train.head()

In [None]:
# Start this model from fresh copies of the kept frames.
# NOTE(review): `train_keep` / `test_keep` are not defined in this section -
# presumably an earlier cell creates them; confirm the notebook runs top to
# bottom on a fresh kernel.
train = train_keep.copy()
test = test_keep.copy()

In [None]:
# train = train[:256]

In [None]:
# Group by prompt so that all summaries of a prompt land in the same fold.
gkf = GroupKFold(n_splits=CFG.n_splits)

# split() yields positional row numbers, so collect the fold ids in a
# positional array instead of writing them through label-based `.loc`, which
# is only correct when `train` carries a default 0..n-1 index.
fold_of_row = np.full(len(train), np.nan)
for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
    fold_of_row[val_index] = i
train["fold"] = fold_of_row

train.head()

## Model Function Definition

In [None]:
import shutil

def compute_metrics(eval_pred):
    """Return ``{"rmse": ...}`` for a ``(predictions, labels)`` pair.

    The RMSE is computed per output column and then averaged. This is what
    ``mean_squared_error(..., squared=False)`` returns, written without the
    ``squared`` keyword, which is deprecated in newer scikit-learn releases.
    """
    predictions, labels = eval_pred
    per_output_mse = mean_squared_error(labels, predictions, multioutput="raw_values")
    rmse = np.mean(np.sqrt(per_output_mse))
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Mean columnwise root mean squared error for a ``(preds, labels)`` pair.
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    Column 0 is reported as the content RMSE and column 1 as the wording RMSE.
    """
    preds, labels = eval_pred

    squared_error = np.square(preds - labels)
    # one RMSE per column (mean taken over rows)
    col_rmse = np.sqrt(squared_error.mean(axis=0))

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": np.mean(col_rmse),
    }

def compt_score(content_true, content_pred, wording_true, wording_pred):
    """Average of the content RMSE and the wording RMSE."""
    content_score, wording_score = (
        mean_squared_error(y_true, y_pred) ** (1/2)
        for y_true, y_pred in (
            (content_true, content_pred),
            (wording_true, wording_pred),
        )
    )
    return (content_score + wording_score) / 2

## Deberta Regressor

In [None]:
class ContentScoreRegressor:
    """Fine-tunes and runs a one-output regression model for a single score column.

    The tokenizer and the base config are loaded from the module-level
    ``INIT_MODEL`` checkpoint. ``model_dir`` is where ``train`` writes the
    fine-tuned weights and where ``predict`` reads them back.
    """

    def __init__(self, 
                model_name: str,
                model_dir: str,
                target: str,
                hidden_dropout_prob: float,
                attention_probs_dropout_prob: float,
                max_length: int,
                ):
        """
        Args:
            model_name: base model name; ``train`` loads it from
                ``/kaggle/input/{model_name}``.
            model_dir: directory of the fine-tuned checkpoint.
            target: name of the score column to regress.
            hidden_dropout_prob: dropout value written into the model config.
            attention_probs_dropout_prob: dropout value written into the model config.
            max_length: truncation length handed to the tokenizer.
        """
        # `inputs` and `text_cols` are not read by any method of this class.
        self.inputs = ["prompt_text", "prompt_title", "prompt_question", "fixed_summary_text"]
        self.input_col = "input"
        
        self.text_cols = [self.input_col] 
        self.target = target
        self.target_cols = [target]

        self.model_name = model_name
        self.model_dir = model_dir
        self.max_length = max_length
        
        self.tokenizer = AutoTokenizer.from_pretrained(INIT_MODEL)
        self.model_config = AutoConfig.from_pretrained(INIT_MODEL)
        
        self.model_config.update({
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels": 1,
            "problem_type": "regression",
        })
        
        seed_everything(seed=42)

        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer
        )

    def _build_input(self, df: pd.DataFrame) -> pd.Series:
        """Join prompt title, prompt question and raw summary text with the separator token."""
        sep = self.tokenizer.sep_token
        return df["prompt_title"] + sep + df["prompt_question"] + sep + df["text"]

    def _training_frame(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a new frame with the model input and the target column; ``df`` is not modified."""
        frame = df[self.target_cols].copy()
        frame[self.input_col] = self._build_input(df)
        return frame[[self.input_col] + self.target_cols]

    def tokenize_function(self, examples: dict):
        """Tokenize one example (datasets are mapped with ``batched=False``) and attach its label."""
        labels = [examples[self.target]]
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return {
            **tokenized,
            "labels": labels,
        }
    
    def tokenize_function_test(self, examples: dict):
        """Tokenize one example without a label."""
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return tokenized
        
    def train(self, 
            fold: int,
            train_df: pd.DataFrame,
            valid_df: pd.DataFrame,
            batch_size: int,
            learning_rate: float,
            weight_decay: float,
            num_train_epochs: float,
            save_steps: int,
        ) -> None:
        """Fine-tune on ``train_df``, evaluate on ``valid_df`` and save to ``self.model_dir``.

        Args:
            fold: fold number; used as the sub-directory for Trainer checkpoints.
            train_df, valid_df: frames with ``prompt_title``,
                ``prompt_question``, ``text`` and the target column. They are
                not modified.
            batch_size: per-device training batch size.
            learning_rate, weight_decay, num_train_epochs: optimiser settings.
            save_steps: interval in steps for both evaluation and checkpointing.
        """
        train_df = self._training_frame(train_df)
        valid_df = self._training_frame(valid_df)
        
        model_content = AutoModelForSequenceClassification.from_pretrained(
            f"/kaggle/input/{self.model_name}", 
            config=self.model_config,
            ignore_mismatched_sizes=True
        )

        train_dataset = Dataset.from_pandas(train_df, preserve_index=False) 
        val_dataset = Dataset.from_pandas(valid_df, preserve_index=False) 
    
        train_tokenized_datasets = train_dataset.map(self.tokenize_function, batched=False)
        val_tokenized_datasets = val_dataset.map(self.tokenize_function, batched=False)

        # eg. "bert/fold_0/"
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 
        
        training_args = TrainingArguments(
            output_dir=model_fold_dir,
            load_best_model_at_end=True, # select best model
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=8,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            report_to='none',
            greater_is_better=False,
            save_strategy="steps",
            evaluation_strategy="steps",
            eval_steps=save_steps,
            save_steps=save_steps,
            metric_for_best_model="rmse",
            save_total_limit=1
        )

        trainer = Trainer(
            model=model_content,
            args=training_args,
            train_dataset=train_tokenized_datasets,
            eval_dataset=val_tokenized_datasets,
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator
        )

        trainer.train()
        
        # Drop the intermediate Trainer checkpoints (they live below
        # model_dir) and keep only the final model and tokenizer.
        shutil.rmtree(self.model_dir)
        
        model_content.save_pretrained(self.model_dir)
        self.tokenizer.save_pretrained(self.model_dir)

        
    def predict(self, 
                test_df: pd.DataFrame,
                fold: int,
               ):
        """Score ``test_df`` with the model stored in ``self.model_dir``.

        Args:
            test_df: frame with ``prompt_title``, ``prompt_question`` and
                ``text``. It is modified in place: the joined model input is
                written to ``self.input_col``.
            fold: kept for interface compatibility; not read, since the
                checkpoint is identified by ``self.model_dir``.

        Returns:
            The first element of the ``Trainer.predict`` output.
        """
        # Written into the caller's frame on purpose: later cells list "input"
        # among the columns they drop from the test frame.
        test_df[self.input_col] = self._build_input(test_df)

        test_ = test_df[[self.input_col]]
    
        test_dataset = Dataset.from_pandas(test_, preserve_index=False) 
        test_tokenized_dataset = test_dataset.map(self.tokenize_function_test, batched=False)

        model_content = AutoModelForSequenceClassification.from_pretrained(f"{self.model_dir}")
        model_content.eval()
        
        # Scratch output directory for the inference-only Trainer. (The
        # per-fold path computed here before was immediately overwritten.)
        test_args = TrainingArguments(
            output_dir="valid_log",
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = 4,   
            dataloader_drop_last = False,
        )

        # init trainer
        infer_content = Trainer(
                      model = model_content, 
                      tokenizer=self.tokenizer,
                      data_collator=self.data_collator,
                      args = test_args)

        preds = infer_content.predict(test_tokenized_dataset)[0]

        return preds

In [None]:
def train_by_fold(
        train_df: pd.DataFrame,
        model_name: str,
        target:str,
        save_each_model: bool,
        n_splits: int,
        batch_size: int,
        learning_rate: float,
        hidden_dropout_prob: float,
        attention_probs_dropout_prob: float,
        weight_decay: float,
        num_train_epochs: int,
        save_steps: int,
        max_length:int
    ):
    """Fine-tune one ``ContentScoreRegressor`` per fold for a single target.

    For fold ``k`` the rows with ``fold != k`` are used for training and the
    rows with ``fold == k`` for validation.

    Args:
        train_df: training frame; must contain a ``fold`` column.
        model_name: base model name, also used as a directory name.
        target: score column to fit.
        save_each_model: when true, checkpoints go to
            ``{target}/{model_name}/fold_{k}``, otherwise to
            ``{model_name}/fold_{k}``.
        n_splits: number of folds to train (previously ignored in favour of
            the global ``CFG.n_splits``).
        batch_size, learning_rate, weight_decay, num_train_epochs, save_steps:
            forwarded to ``ContentScoreRegressor.train``.
        hidden_dropout_prob, attention_probs_dropout_prob, max_length:
            forwarded to the ``ContentScoreRegressor`` constructor.
    """
    # delete old model files
    # NOTE: only the `model_name` directory is reset here, also when
    # save_each_model is true and checkpoints are written under `target/`.
    if os.path.exists(model_name):
        shutil.rmtree(model_name)

    os.mkdir(model_name)

    for fold in range(n_splits):
        print(f"fold {fold}:")

        train_data = train_df[train_df["fold"] != fold]
        valid_data = train_df[train_df["fold"] == fold]

        if save_each_model:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else:
            model_dir =  f"{model_name}/fold_{fold}"

        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir,
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )

        csr.train(
            fold=fold,
            train_df=train_data,
            valid_df=valid_data,
            batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            num_train_epochs=num_train_epochs,
            save_steps=save_steps,
        )

def validate(
    train_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ) -> pd.DataFrame:
    """Write out-of-fold predictions for ``target`` into ``{target}_pred``.

    Each fold's held-out rows are scored by the checkpoint stored for that
    fold under the global ``MODEL_DIR``; the number of folds comes from the
    global ``CFG.n_splits``. ``train_df`` is updated in place and returned.
    """
    oof_column = f"{target}_pred"

    for fold_id in range(CFG.n_splits):
        print(f"fold {fold_id}:")

        held_out = train_df[train_df["fold"] == fold_id]

        # Per-target checkpoints sit one directory deeper than shared ones.
        checkpoint_root = f"{MODEL_DIR}/{target}" if save_each_model == True else f"{MODEL_DIR}"

        regressor = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir=f"{checkpoint_root}/{model_name}/fold_{fold_id}",
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
        )

        fold_pred = regressor.predict(test_df=held_out, fold=fold_id)
        train_df.loc[held_out.index, oof_column] = fold_pred

    return train_df
    
def predict(
    test_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ):
    """Score ``test_df`` with every fold checkpoint and average the results.

    Adds one ``{target}_pred_{fold}`` column per fold plus a ``{target}``
    column holding their row-wise mean. ``test_df`` is updated in place and
    returned. Fold count and checkpoint root come from the globals
    ``CFG.n_splits`` and ``MODEL_DIR``.
    """
    per_fold_columns = []

    for fold_id in range(CFG.n_splits):
        print(f"fold {fold_id}:")

        # Per-target checkpoints sit one directory deeper than shared ones.
        checkpoint_root = f"{MODEL_DIR}/{target}" if save_each_model == True else f"{MODEL_DIR}"

        regressor = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir=f"{checkpoint_root}/{model_name}/fold_{fold_id}",
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
        )

        column = f"{target}_pred_{fold_id}"
        test_df[column] = regressor.predict(test_df=test_df, fold=fold_id)
        per_fold_columns.append(column)

    test_df[f"{target}"] = test_df[per_fold_columns].mean(axis=1)

    return test_df

In [None]:
for target in ["content", "wording"]:
    # Fine-tuning is switched off here: validate()/predict() read the fold
    # checkpoints from MODEL_DIR. The training call is kept for reference.
#     train_by_fold(
#         train,
#         model_name=CFG.model_name,
#         save_each_model=True,
#         target=target,
#         learning_rate=CFG.learning_rate,
#         hidden_dropout_prob=CFG.hidden_dropout_prob,
#         attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
#         weight_decay=CFG.weight_decay,
#         num_train_epochs=CFG.num_train_epochs,
#         n_splits=CFG.n_splits,
#         batch_size=CFG.batch_size,
#         save_steps=CFG.save_steps,
#         max_length=CFG.max_length
#     )
    
    print("[validate]")
    train = validate(
        train,
        target=target,
        save_each_model=True,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length
    )

    # np.sqrt(MSE) instead of `squared=False`: that keyword is deprecated in
    # newer scikit-learn releases, and this form matches the GBDT CV cell.
    rmse = np.sqrt(mean_squared_error(train[target], train[f"{target}_pred"]))
    print(f"cv {target} rmse: {rmse}")

    print("[test]")
    test = predict(
        test,
        target=target,
        save_each_model=True,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length
    )

In [None]:
# !rm -r wording content

In [None]:
train.head()

In [None]:
# Keep a handle on the second model's frames.
# NOTE(review): these are aliases, not copies - the in-place column writes the
# following cells make to `train`/`test` are visible through them as well.
train_2nd = train
test_2nd = test

## GBDT model (CatBoost; the LightGBM version is kept commented out)

In [None]:
# Score columns; a separate set of fold models is fitted for each.
targets = ["content", "wording"]

# Columns removed from the training frame before it is handed to the GBDT:
# the fold marker, id and text columns, and the targets themselves.
drop_columns = [
    "fold",
    "student_id",
    "prompt_id",
    "text",
    "fixed_summary_text",
    "prompt_question",
    "prompt_title",
    "prompt_text",
    "title",
    "author",
    "description",
    "genre",
    "grade",
    *targets,
]

In [None]:
train[train["fold"] != 0].drop(columns=drop_columns)

In [None]:
# model_dict = {}

# for target in targets:
#     models = []
    
#     for fold in range(CFG.n_splits):

#         X_train_cv = train[train["fold"] != fold].drop(columns=drop_columns)
#         y_train_cv = train[train["fold"] != fold][target]

#         X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
#         y_eval_cv = train[train["fold"] == fold][target]

#         dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
#         dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)

#         params = {
#             'boosting_type': 'gbdt',
#             'random_state': 42,
#             'objective': 'regression',
#             'metric': 'rmse',
#             'learning_rate': 0.048,
#             'max_depth': 4,
#             'lambda_l1': 0.001,
#             'lambda_l2': 0.011
#         }

#         evaluation_results = {}
#         model = lgb.train(params,
#                           num_boost_round=10000,
#                             #categorical_feature = categorical_features,
#                           valid_names=['train', 'valid'],
#                           train_set=dtrain,
#                           valid_sets=dval,
#                           callbacks=[
#                               lgb.early_stopping(stopping_rounds=30, verbose=True),
#                               lgb.log_evaluation(100),
#                               lgb.callback.record_evaluation(evaluation_results)
#                             ],
#                           )
#         models.append(model)
    
#     model_dict[target] = models

In [None]:
from catboost import CatBoostRegressor


# Fit one CatBoost regressor per (target, fold). The held-out fold doubles as
# the early-stopping evaluation set.
model_dict = {}

for target in targets:
    models = []

    for fold in range(CFG.n_splits):
        is_eval = train["fold"] == fold
        train_part, eval_part = train[~is_eval], train[is_eval]

        X_train_cv, y_train_cv = train_part.drop(columns=drop_columns), train_part[target]
        X_eval_cv, y_eval_cv = eval_part.drop(columns=drop_columns), eval_part[target]

        model = CatBoostRegressor(
            loss_function='RMSE',
            task_type='CPU',
            iterations=10000,
            early_stopping_rounds=300,
            learning_rate=0.048,
            depth=4,
            min_data_in_leaf=34,
        )
        model.fit(
            X_train_cv,
            y_train_cv,
            eval_set=[(X_eval_cv, y_eval_cv)],
            verbose=False,
        )
        models.append(model)

    model_dict[target] = models

## CV Score

In [None]:
# Out-of-fold CV: each fold's model scores only the rows it did not train on;
# the per-target RMSEs are then averaged into the MCRMSE.
rmses = []

for target in targets:
    preds, trues = [], []

    for fold, model in enumerate(model_dict[target]):
        eval_part = train[train["fold"] == fold]

        trues.extend(eval_part[target])
        preds.extend(model.predict(eval_part.drop(columns=drop_columns)))

    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses.append(rmse)

print(f"mcrmse : {sum(rmses) / len(rmses)}")

## Predict

In [None]:
# Columns removed from the test frame before GBDT inference. "fold" is not
# listed (presumably because the test frame never gets a fold column), and the
# per-fold transformer predictions are dropped because only their mean is used.
drop_columns = [
    "student_id",
    "prompt_id",
    "text",
    "fixed_summary_text",
    "prompt_question",
    "prompt_title",
    "prompt_text",
    "input",
    "title",
    "author",
    "description",
    "genre",
    "grade",
] + [
    f"{score_name}_pred_{i}"
    for score_name in ("content", "wording")
    for i in range(CFG.n_splits)
]

In [None]:
# Predict the test set with every fold model. XGBoost / CatBoost models were
# fitted on `content_pred` / `wording_pred`, so the transformer outputs stored
# in `content` / `wording` are renamed to match before predicting.
rename_notice = {
    "<class 'xgboost.sklearn.XGBRegressor'>": "pred xgb. rename cols",
    "<class 'catboost.core.CatBoostRegressor'>": "pred cat. rename cols",
}

pred_dict = {}
for target in targets:
    preds = []

    for model in model_dict[target]:
        X_eval_cv = test.drop(columns=drop_columns)

        notice = rename_notice.get(str(type(model)))
        if notice is None:
            print("pred lgb")
        else:
            print(notice)
            X_eval_cv = X_eval_cv.rename(
                columns={'content': 'content_pred', 'wording': 'wording_pred'}
            )

        preds.append(model.predict(X_eval_cv))

    pred_dict[target] = preds

In [None]:
# Store each fold's GBDT prediction and overwrite the target column with the
# mean over folds.
for target in targets:
    for i, pred in enumerate(pred_dict[target]):
        test[f"{target}_pred_{i}"] = pred

    fold_columns = [f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]
    test[target] = test[fold_columns].mean(axis=1)

## Create Submission file

In [None]:
# Final predictions of the 2nd model in submission layout.
test_2 = test.loc[:, ["student_id", "content", "wording"]]
test_2.head(5)

# 3rd MODEL

In [None]:
from typing import List
import numpy as np
import pandas as pd
import warnings
import logging
import os
import shutil
import json
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset,load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter
import spacy
import re
from autocorrect import Speller
from spellchecker import SpellChecker
import lightgbm as lgb

# Quieten library output and enable `progress_apply` on pandas objects.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    'TF_CPP_MIN_LOG_LEVEL': '3',
})
warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
disable_progress_bar()
tqdm.pandas()

In [None]:
def seed_everything(seed: int) -> None:
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, ``PYTHONHASHSEED``, NumPy and PyTorch (CPU and
    all CUDA devices) and switches cuDNN to deterministic mode.

    Args:
        seed: value used for every generator.
    """
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune its kernel choice per run, which
    # defeats the deterministic setting above, so it has to be off.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=42)

In [None]:
class CFG:
    """Hyper-parameters shared by the fine-tuning and inference cells."""

    # Checkpoint name and tokenizer truncation length
    model_name = "another-bert"
    max_length = 512

    # Optimisation
    learning_rate = 1.5e-5
    weight_decay = 0.02
    num_train_epochs = 5
    batch_size = 12

    # Regularisation
    hidden_dropout_prob = 0.007
    attention_probs_dropout_prob = 0.007

    # Cross-validation and checkpointing
    n_splits = 4
    random_seed = 42
    save_steps = 20

In [None]:
# Dataset holding the fine-tuned checkpoints. The fold-0 "content" checkpoint
# is the source of the tokenizer and model config used below.
MODEL_DIR = '/kaggle/input/commitlit-deberta-v3-large-misspellings'
INIT_MODEL = "/".join([MODEL_DIR, "content", CFG.model_name, "fold_0"])

## Dataload

In [None]:
# Read the competition CSVs (prompts, summaries and the sample submission).
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"

(
    prompts_train,
    prompts_test,
    summaries_train,
    summaries_test,
    sample_submission,
) = (
    pd.read_csv(DATA_DIR + csv_name)
    for csv_name in (
        "prompts_train.csv",
        "prompts_test.csv",
        "summaries_train.csv",
        "summaries_test.csv",
        "sample_submission.csv",
    )
)

## Preprocess

[Using features]

- Text Length
- Length Ratio
- Word Overlap
- N-grams Co-occurrence
  - count
  - ratio
- Quotes Overlap
- Grammar Check
  - spelling: pyspellchecker

In [None]:
class Preprocessor:
    """Builds hand-crafted prompt/summary features (lengths, overlaps, quotes, spelling)."""

    def __init__(self, 
                model_name: str,
                ) -> None:
        # NOTE: `model_name` is accepted but not used; the tokenizer is loaded
        # from the module-level INIT_MODEL path.
        self.tokenizer = AutoTokenizer.from_pretrained(INIT_MODEL)
        self.twd = TreebankWordDetokenizer()
        self.STOP_WORDS = set(stopwords.words('english'))
        
        self.spacy_ner_model = spacy.load('en_core_web_sm',)
        self.speller = Speller(lang='en')
        self.spellchecker = SpellChecker() 
        
    def word_overlap_count(self, row):
        """Count distinct non-stop-word tokens shared by the prompt and the summary.

        The previous implementation filtered with "is a stop word", which kept
        ONLY stop words and therefore counted shared stop words instead.
        Features cached with the old behaviour must be regenerated.
        """
        prompt_words = row['prompt_tokens']
        summary_words = row['summary_tokens']
        if self.STOP_WORDS:
            prompt_words = [w for w in prompt_words if w not in self.STOP_WORDS]
            summary_words = [w for w in summary_words if w not in self.STOP_WORDS]
        return len(set(prompt_words).intersection(set(summary_words)))
            
    def ngrams(self, token, n):
        """Return the space-joined n-grams of the token list `token`, in order."""
        # zip over n shifted views of the list yields consecutive n-tuples.
        shifted = [token[i:] for i in range(n)]
        return [" ".join(ngram) for ngram in zip(*shifted)]

    def ngram_co_occurrence(self, row, n: int) -> int:
        """Count distinct n-grams that occur in both the prompt and the summary."""
        original_ngrams = set(self.ngrams(row['prompt_tokens'], n))
        summary_ngrams = set(self.ngrams(row['summary_tokens'], n))
        return len(original_ngrams.intersection(summary_ngrams))
    
    def ner_overlap_count(self, row, mode:str):
        """Count named entities shared by prompt and summary, grouped by entity label.

        Args:
            row: mapping with 'prompt_text' and 'text'.
            mode: "train" returns the raw {label: count} dict; "test" returns
                the counts for `self.ner_keys` only (None for missing labels).
                Any other value returns None.

        Raises:
            AttributeError: in test mode when `self.ner_keys` has not been set.
            Exception: when the NER model is neither spaCy nor stanza.
        """
        model = self.spacy_ner_model
        def clean_ners(ner_list):
            return set([(ner[0].lower(), ner[1]) for ner in ner_list])
        prompt = model(row['prompt_text'])
        summary = model(row['text'])

        if "spacy" in str(model):
            prompt_ner = set([(token.text, token.label_) for token in prompt.ents])
            summary_ner = set([(token.text, token.label_) for token in summary.ents])
        elif "stanza" in str(model):
            prompt_ner = set([(token.text, token.type) for token in prompt.ents])
            summary_ner = set([(token.text, token.type) for token in summary.ents])
        else:
            raise Exception("Model not supported")

        prompt_ner = clean_ners(prompt_ner)
        summary_ner = clean_ners(summary_ner)

        intersecting_ners = prompt_ner.intersection(summary_ner)
        
        ner_dict = dict(Counter([ner[1] for ner in intersecting_ners]))
        
        if mode == "train":
            return ner_dict
        elif mode == "test":
            # `ner_keys` is never assigned inside this class; fail with a
            # message that says what is missing instead of a bare lookup error.
            ner_keys = getattr(self, "ner_keys", None)
            if ner_keys is None:
                raise AttributeError(
                    "Preprocessor.ner_keys must be set before calling "
                    "ner_overlap_count with mode='test'"
                )
            return {key: ner_dict.get(key) for key in ner_keys}

    
    def quotes_count(self, row):
        """Count double-quoted passages of the summary that appear verbatim in the prompt."""
        summary = row['text']
        text = row['prompt_text']
        quotes_from_summary = re.findall(r'"([^"]*)"', summary)
        return sum(quote in text for quote in quotes_from_summary)

    def spelling(self, text):
        """Return the number of distinct whitespace-separated words unknown to the spell checker."""
        wordlist = text.split()
        return len(self.spellchecker.unknown(wordlist))
    
    def add_spelling_dictionary(self, tokens: List[str]) -> None:
        """dictionary update for pyspell checker and autocorrect"""
        self.spellchecker.word_frequency.load_words(tokens)
        self.speller.nlp_data.update({token:1000 for token in tokens})
    
    def run(self, 
            prompts: pd.DataFrame,
            summaries:pd.DataFrame,
            mode:str
        ) -> pd.DataFrame:
        """Compute all features and return summaries merged with their prompts.

        Side effects: adds columns to `prompts` and `summaries` in place and
        adds every prompt token to both spelling dictionaries.

        Args:
            prompts: frame with 'prompt_id' and 'prompt_text'.
            summaries: frame with 'prompt_id' and 'text'.
            mode: currently unused.

        Returns:
            The merged frame without the intermediate token columns.
        """
        # before merge preprocess: tokenise once, derive the length from the tokens
        prompt_tokens = prompts["prompt_text"].apply(word_tokenize)
        prompts["prompt_length"] = prompt_tokens.apply(len)
        prompts["prompt_tokens"] = prompt_tokens

        summary_tokens = summaries["text"].apply(word_tokenize)
        summaries["summary_length"] = summary_tokens.apply(len)
        summaries["summary_tokens"] = summary_tokens
        
        # Add prompt tokens into spelling checker dictionary so that
        # prompt-specific vocabulary is not flagged or "corrected".
        for tokens in prompts["prompt_tokens"]:
            self.add_spelling_dictionary(tokens)
        
        # fix misspelling
        summaries["fixed_summary_text"] = summaries["text"].progress_apply(self.speller)
        
        # count misspelling (column name keeps its historical spelling)
        summaries["splling_err_num"] = summaries["text"].progress_apply(self.spelling)
        
        # merge prompts and summaries
        input_df = summaries.merge(prompts, how="left", on="prompt_id")

        # after merge preprocess
        input_df['length_ratio'] = input_df['summary_length'] / input_df['prompt_length']

        input_df['word_overlap_count'] = input_df.progress_apply(self.word_overlap_count, axis=1)
        input_df['bigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence,args=(2,), axis=1 
        )
        # NOTE(review): the denominators below are 0 for one- / two-token
        # summaries, which yields NaN or inf rather than an error.
        input_df['bigram_overlap_ratio'] = input_df['bigram_overlap_count'] / (input_df['summary_length'] - 1)
        
        input_df['trigram_overlap_count'] = input_df.progress_apply(
            self.ngram_co_occurrence, args=(3,), axis=1
        )
        input_df['trigram_overlap_ratio'] = input_df['trigram_overlap_count'] / (input_df['summary_length'] - 2)
        
        input_df['quotes_count'] = input_df.progress_apply(self.quotes_count, axis=1)
        
        return input_df.drop(columns=["summary_tokens", "prompt_tokens"])
    
# Shared feature builder; construction loads the tokenizer, spaCy model and spell checkers.
preprocessor = Preprocessor(CFG.model_name)

In [None]:
# Quick look at the raw training summaries.
summaries_train.head(5)

In [None]:
import pickle

# The file holds two consecutive pickles, read in order into `train` and `test`.
# NOTE(security): unpickling executes arbitrary code; only load files from a
# dataset you trust.
with open(MODEL_DIR + '/pickled.pkl', 'rb') as f:
    train, test = pickle.load(f), pickle.load(f)

# train = preprocessor.run(prompts_train, summaries_train, mode="train")
#test = preprocessor.run(prompts_test, summaries_test, mode="test")

# train.head()

In [None]:
# Replace the frames loaded above with fresh copies of `train_keep` /
# `test_keep` (defined outside this section).
train, test = train_keep.copy(), test_keep.copy()

In [None]:
# train = train[:256]

In [None]:
# Prompt-grouped folds: all summaries of one prompt land in the same fold.
gkf = GroupKFold(n_splits=CFG.n_splits)

# `split` yields POSITIONAL row indices. Writing them through `.loc` (which is
# label-based) is only correct for a default RangeIndex, so the fold ids are
# collected positionally and assigned as a whole column instead.
fold_ids = np.full(len(train), np.nan)
for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
    fold_ids[val_index] = i
train["fold"] = fold_ids

train.head()

## Model Function Definition

In [None]:
import shutil

def compute_metrics(eval_pred):
    """Return the RMSE between predictions and labels for the HF Trainer.

    Args:
        eval_pred: pair ``(predictions, labels)``.

    Returns:
        ``{"rmse": value}``.
    """
    predictions, labels = eval_pred
    # `mean_squared_error(..., squared=False)` is deprecated in scikit-learn
    # and removed in later releases; taking the root explicitly works on every
    # version.
    rmse = np.sqrt(mean_squared_error(labels, predictions))
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Mean columnwise root mean squared error (the competition metric).
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    Takes a ``(predictions, labels)`` pair and returns the RMSE of column 0
    (content), column 1 (wording) and the mean over all columns.
    """
    predictions, references = eval_pred

    residual = predictions - references
    per_column_rmse = np.sqrt(np.mean(residual * residual, axis=0))

    return {
        "content_rmse": per_column_rmse[0],
        "wording_rmse": per_column_rmse[1],
        "mcrmse": np.mean(per_column_rmse),
    }

def compt_score(content_true, content_pred, wording_true, wording_pred):
    """Competition score: the mean of the content RMSE and the wording RMSE."""
    rmse_per_target = [
        mean_squared_error(y_true, y_pred) ** 0.5
        for y_true, y_pred in (
            (content_true, content_pred),
            (wording_true, wording_pred),
        )
    ]
    return sum(rmse_per_target) / 2

## Deberta Regressor

In [None]:
class ContentScoreRegressor:
    """Fine-tunes and runs a single-output sequence-regression model for one target.

    The model input is built by `_build_input_text`; training and inference
    share that one definition so they cannot drift apart.
    """

    def __init__(self, 
                model_name: str,
                model_dir: str,
                target: str,
                hidden_dropout_prob: float,
                attention_probs_dropout_prob: float,
                max_length: int,
                ):
        """
        Args:
            model_name: name of the base checkpoint under /kaggle/input used by `train`.
            model_dir: directory the fine-tuned model is saved to / loaded from.
            target: name of the column to regress on.
            hidden_dropout_prob: written into the model config.
            attention_probs_dropout_prob: written into the model config.
            max_length: tokenizer truncation length.
        """
        self.inputs = ["prompt_text", "prompt_title", "prompt_question", "fixed_summary_text"]
        self.input_col = "input"
        
        self.text_cols = [self.input_col] 
        self.target = target
        self.target_cols = [target]

        self.model_name = model_name
        self.model_dir = model_dir
        self.max_length = max_length
        
        # Tokenizer and config always come from the module-level INIT_MODEL path.
        self.tokenizer = AutoTokenizer.from_pretrained(INIT_MODEL)
        self.model_config = AutoConfig.from_pretrained(INIT_MODEL)
        
        self.model_config.update({
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels": 1,
            "problem_type": "regression",
        })
        
        seed_everything(seed=42)

        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer
        )

    def _build_input_text(self, df: pd.DataFrame) -> pd.Series:
        """Join title, question, misspelling count and corrected summary with the sep token."""
        sep = self.tokenizer.sep_token
        return (
            df["prompt_title"] + sep
            + df["prompt_question"] + sep
            + df['splling_err_num'].astype(str) + " misspellings" + sep
            + df["fixed_summary_text"]
        )

    def tokenize_function(self, examples):
        """Tokenize one example (the datasets are mapped with batched=False) and attach its label."""
        labels = [examples[self.target]]
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return {
            **tokenized,
            "labels": labels,
        }
    
    def tokenize_function_test(self, examples):
        """Tokenize one unlabeled example."""
        tokenized = self.tokenizer(examples[self.input_col],
                         padding=False,
                         truncation=True,
                         max_length=self.max_length)
        return tokenized
        
    def train(self, 
            fold: int,
            train_df: pd.DataFrame,
            valid_df: pd.DataFrame,
            batch_size: int,
            learning_rate: float,
            weight_decay: float,
            num_train_epochs: float,
            save_steps: int,
        ) -> None:
        """Fine-tune the base checkpoint on `train_df`, selecting the best step on `valid_df`.

        Adds the `input` column to both frames, trains with the HF Trainer
        (evaluating and checkpointing every `save_steps` steps, best model by
        RMSE) and finally leaves only the trained model and tokenizer in
        `self.model_dir`.
        """
        train_df[self.input_col] = self._build_input_text(train_df)
        valid_df[self.input_col] = self._build_input_text(valid_df)
        
        train_df = train_df[[self.input_col] + self.target_cols]
        valid_df = valid_df[[self.input_col] + self.target_cols]
        
        model_content = AutoModelForSequenceClassification.from_pretrained(
            f"/kaggle/input/{self.model_name}", 
            config=self.model_config,
            ignore_mismatched_sizes=True
        )

        train_dataset = Dataset.from_pandas(train_df, preserve_index=False) 
        val_dataset = Dataset.from_pandas(valid_df, preserve_index=False) 
    
        train_tokenized_datasets = train_dataset.map(self.tokenize_function, batched=False)
        val_tokenized_datasets = val_dataset.map(self.tokenize_function, batched=False)

        # Trainer checkpoints go to a sub-directory of model_dir named after the fold.
        model_fold_dir = os.path.join(self.model_dir, str(fold)) 
        
        training_args = TrainingArguments(
            output_dir=model_fold_dir,
            load_best_model_at_end=True, # select best model
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=8,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            report_to='none',
            greater_is_better=False,
            save_strategy="steps",
            evaluation_strategy="steps",
            eval_steps=save_steps,
            save_steps=save_steps,
            metric_for_best_model="rmse",
            save_total_limit=1
        )

        trainer = Trainer(
            model=model_content,
            args=training_args,
            train_dataset=train_tokenized_datasets,
            eval_dataset=val_tokenized_datasets,
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator
        )

        trainer.train()
        
        # Drop the intermediate checkpoints (they live inside model_dir) and
        # keep only the final model and tokenizer.
        shutil.rmtree(self.model_dir)
        
        model_content.save_pretrained(self.model_dir)
        self.tokenizer.save_pretrained(self.model_dir)

        
    def predict(self, 
                test_df: pd.DataFrame,
                fold: int,
               ):
        """Predict the target for every row of `test_df` with the model in `self.model_dir`.

        Side effect: writes the `input` column into `test_df`.

        Args:
            test_df: frame with the prompt/summary columns used for the input text.
            fold: kept for interface compatibility; not used.

        Returns:
            The predictions array returned by `Trainer.predict`.
        """
        test_df[self.input_col] = self._build_input_text(test_df)

        test_ = test_df[[self.input_col]]
    
        test_dataset = Dataset.from_pandas(test_, preserve_index=False) 
        test_tokenized_dataset = test_dataset.map(self.tokenize_function_test, batched=False)

        model_content = AutoModelForSequenceClassification.from_pretrained(f"{self.model_dir}")
        model_content.eval()
        
        # The Trainer needs a writable output directory even for inference.
        test_args = TrainingArguments(
            output_dir="valid_log",
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = 4,   
            dataloader_drop_last = False,
        )

        # init trainer
        infer_content = Trainer(
                      model = model_content, 
                      tokenizer=self.tokenizer,
                      data_collator=self.data_collator,
                      args = test_args)

        preds = infer_content.predict(test_tokenized_dataset)[0]

        return preds

In [None]:
def train_by_fold(
        train_df: pd.DataFrame,
        model_name: str,
        target:str,
        save_each_model: bool,
        n_splits: int,
        batch_size: int,
        learning_rate: float,
        hidden_dropout_prob: float,
        attention_probs_dropout_prob: float,
        weight_decay: float,
        num_train_epochs: int,
        save_steps: int,
        max_length:int
    ):
    """Fine-tune one model per fold for `target`.

    For every fold in ``range(n_splits)`` the rows of the other folds are used
    for training and the fold itself for validation.

    Args:
        train_df: training frame with a "fold" column.
        model_name: base checkpoint name; also the local directory that is
            recreated (any existing one is deleted) before training.
        target: column to regress on.
        save_each_model: when True, models go to ``{target}/{model_name}/fold_{i}``,
            otherwise to ``{model_name}/fold_{i}``.
        n_splits: number of folds to train.
        Remaining arguments are forwarded to `ContentScoreRegressor`.
    """

    # delete old model files
    if os.path.exists(model_name):
        shutil.rmtree(model_name)
    
    os.mkdir(model_name)
        
    # Use the caller's fold count; this loop used to read the global CFG and
    # silently ignore the `n_splits` argument.
    for fold in range(n_splits):
        print(f"fold {fold}:")
        
        train_data = train_df[train_df["fold"] != fold]
        valid_data = train_df[train_df["fold"] == fold]
        
        if save_each_model == True:
            model_dir =  f"{target}/{model_name}/fold_{fold}"
        else: 
            model_dir =  f"{model_name}/fold_{fold}"

        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir = model_dir, 
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
           )
        
        csr.train(
            fold=fold,
            train_df=train_data,
            valid_df=valid_data, 
            batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            num_train_epochs=num_train_epochs,
            save_steps=save_steps,
        )

def validate(
    train_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ) -> pd.DataFrame:
    """Fill ``{target}_pred`` with out-of-fold predictions.

    Each fold's rows are scored by the model stored for that fold under
    MODEL_DIR; the predictions are written into `train_df`, which is also
    returned.
    """
    if save_each_model == True:
        checkpoint_root = f"{MODEL_DIR}/{target}/{model_name}"
    else:
        checkpoint_root = f"{MODEL_DIR}/{model_name}"
    pred_column = f"{target}_pred"

    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")

        fold_rows = train_df[train_df["fold"] == fold]

        regressor = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir=f"{checkpoint_root}/fold_{fold}",
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
        )

        train_df.loc[fold_rows.index, pred_column] = regressor.predict(
            test_df=fold_rows,
            fold=fold,
        )

    return train_df
    
def predict(
    test_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int
    ):
    """Predict `target` for the test set with every fold model and average.

    Writes ``{target}_pred_{fold}`` for each fold and their row-wise mean into
    ``{target}`` on `test_df`, which is also returned.
    """
    if save_each_model == True:
        checkpoint_root = f"{MODEL_DIR}/{target}/{model_name}"
    else:
        checkpoint_root = f"{MODEL_DIR}/{model_name}"

    fold_columns = []
    for fold in range(CFG.n_splits):
        print(f"fold {fold}:")

        regressor = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir=f"{checkpoint_root}/fold_{fold}",
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
        )

        column = f"{target}_pred_{fold}"
        test_df[column] = regressor.predict(test_df=test_df, fold=fold)
        fold_columns.append(column)

    test_df[f"{target}"] = test_df[fold_columns].mean(axis=1)

    return test_df

In [None]:
# For each target: score the training set out-of-fold with the stored
# checkpoints, report the CV RMSE, then predict the test set.
# Fine-tuning is disabled here; the checkpoints under MODEL_DIR are used as-is.
for target in ["content", "wording"]:
#     train_by_fold(
#         train,
#         model_name=CFG.model_name,
#         save_each_model=True,
#         target=target,
#         learning_rate=CFG.learning_rate,
#         hidden_dropout_prob=CFG.hidden_dropout_prob,
#         attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
#         weight_decay=CFG.weight_decay,
#         num_train_epochs=CFG.num_train_epochs,
#         n_splits=CFG.n_splits,
#         batch_size=CFG.batch_size,
#         save_steps=CFG.save_steps,
#         max_length=CFG.max_length
#     )
    
    print("[validate]")
    train = validate(
        train,
        target=target,
        save_each_model=True,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length
    )

    # Explicit square root instead of the deprecated `squared=False` argument,
    # so the cell runs on old and new scikit-learn versions alike.
    rmse = np.sqrt(mean_squared_error(train[target], train[f"{target}_pred"]))
    print(f"cv {target} rmse: {rmse}")

    print("[test]")
    test = predict(
        test,
        target=target,
        save_each_model=True,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length
    )

In [None]:
# !rm -r wording content

In [None]:
# Check that the out-of-fold prediction columns were added.
train.head(5)

## GBDT model (CatBoost; the LightGBM version is kept commented out)

In [None]:
# Regression targets, and every column that must not be fed to the tree
# models: fold id, identifiers, raw text, prompt metadata and the targets.
targets = ["content", "wording"]

drop_columns = [
    "fold",
    "student_id",
    "prompt_id",
    "text",
    "fixed_summary_text",
    "prompt_question",
    "prompt_title",
    "prompt_text",
    "title",
    "author",
    "description",
    "genre",
    "grade",
    *targets,
]

In [None]:
# Preview the feature matrix seen by the fold-0 model (all rows outside fold 0).
train.loc[train["fold"].ne(0)].drop(columns=drop_columns)

In [None]:
# model_dict = {}

# for target in targets:
#     models = []
    
#     for fold in range(CFG.n_splits):

#         X_train_cv = train[train["fold"] != fold].drop(columns=drop_columns)
#         y_train_cv = train[train["fold"] != fold][target]

#         X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
#         y_eval_cv = train[train["fold"] == fold][target]

#         dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
#         dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)

#         params = {
#             'boosting_type': 'gbdt',
#             'random_state': 42,
#             'objective': 'regression',
#             'metric': 'rmse',
#             'learning_rate': 0.048,
#             'max_depth': 4,
#             'lambda_l1': 0.001,
#             'lambda_l2': 0.011
#         }

#         evaluation_results = {}
#         model = lgb.train(params,
#                           num_boost_round=10000,
#                             #categorical_feature = categorical_features,
#                           valid_names=['train', 'valid'],
#                           train_set=dtrain,
#                           valid_sets=dval,
#                           callbacks=[
#                               lgb.early_stopping(stopping_rounds=30, verbose=True),
#                               lgb.log_evaluation(100),
#                               lgb.callback.record_evaluation(evaluation_results)
#                             ],
#                           )
#         models.append(model)
    
#     model_dict[target] = models

In [None]:
from catboost import CatBoostRegressor


# Fit one CatBoost regressor per (target, fold). The held-out fold doubles as
# the early-stopping evaluation set.
model_dict = {}

for target in targets:
    models = []

    for fold in range(CFG.n_splits):
        is_eval = train["fold"] == fold
        train_part, eval_part = train[~is_eval], train[is_eval]

        X_train_cv, y_train_cv = train_part.drop(columns=drop_columns), train_part[target]
        X_eval_cv, y_eval_cv = eval_part.drop(columns=drop_columns), eval_part[target]

        model = CatBoostRegressor(
            loss_function='RMSE',
            task_type='CPU',
            iterations=10000,
            early_stopping_rounds=300,
            learning_rate=0.048,
            depth=4,
            min_data_in_leaf=34,
        )
        model.fit(
            X_train_cv,
            y_train_cv,
            eval_set=[(X_eval_cv, y_eval_cv)],
            verbose=False,
        )
        models.append(model)

    model_dict[target] = models

## CV Score

In [None]:
# Out-of-fold CV: each fold's model scores only the rows it did not train on;
# the per-target RMSEs are then averaged into the MCRMSE.
rmses = []

for target in targets:
    preds, trues = [], []

    for fold, model in enumerate(model_dict[target]):
        eval_part = train[train["fold"] == fold]

        trues.extend(eval_part[target])
        preds.extend(model.predict(eval_part.drop(columns=drop_columns)))

    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses.append(rmse)

print(f"mcrmse : {sum(rmses) / len(rmses)}")

## Predict

In [None]:
# Columns removed from the test frame before GBDT inference. "fold" is not
# listed (presumably because the test frame never gets a fold column), and the
# per-fold transformer predictions are dropped because only their mean is used.
drop_columns = [
    "student_id",
    "prompt_id",
    "text",
    "fixed_summary_text",
    "prompt_question",
    "prompt_title",
    "prompt_text",
    "input",
    "title",
    "author",
    "description",
    "genre",
    "grade",
] + [
    f"{score_name}_pred_{i}"
    for score_name in ("content", "wording")
    for i in range(CFG.n_splits)
]

In [None]:
# Predict the test set with every fold model. XGBoost / CatBoost models were
# fitted on `content_pred` / `wording_pred`, so the transformer outputs stored
# in `content` / `wording` are renamed to match before predicting.
rename_notice = {
    "<class 'xgboost.sklearn.XGBRegressor'>": "pred xgb. rename cols",
    "<class 'catboost.core.CatBoostRegressor'>": "pred cat. rename cols",
}

pred_dict = {}
for target in targets:
    preds = []

    for model in model_dict[target]:
        X_eval_cv = test.drop(columns=drop_columns)

        notice = rename_notice.get(str(type(model)))
        if notice is None:
            print("pred lgb")
        else:
            print(notice)
            X_eval_cv = X_eval_cv.rename(
                columns={'content': 'content_pred', 'wording': 'wording_pred'}
            )

        preds.append(model.predict(X_eval_cv))

    pred_dict[target] = preds

In [None]:
# Store each fold's GBDT prediction and overwrite the target column with the
# mean over folds.
for target in targets:
    for i, pred in enumerate(pred_dict[target]):
        test[f"{target}_pred_{i}"] = pred

    fold_columns = [f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]
    test[target] = test[fold_columns].mean(axis=1)

## Create Submission file

In [None]:
# Final predictions of the 3rd model in submission layout.
test_3 = test.loc[:, ["student_id", "content", "wording"]]
test_3.head(5)

## ADD 1st pred

In [None]:
# Out-of-fold predictions of the 1st model, keyed by (student_id, prompt_id)
# and suffixed so they can be merged into `train` as extra features.
sel = ['student_id', 'prompt_id', 'content_pred', 'wording_pred']
tmp = train_1st[sel].rename(
    columns={
        f"{score_name}_pred": f"{score_name}_pred_1st"
        for score_name in ("content", "wording")
    }
)
tmp.head(5)

In [None]:
# Attach the 1st model's out-of-fold predictions to the training rows.
train = pd.merge(train, tmp, how="left", on=['student_id', 'prompt_id'])
train.head(5)

In [None]:
# Attach the 1st model's test predictions (aligned on the frame index).
for score_name in ('content', 'wording'):
    test[f'{score_name}_1st'] = test_1st[score_name]
test.head(5)

## Add 2nd pred

In [None]:
# Out-of-fold predictions of the 2nd model, keyed by (student_id, prompt_id)
# and suffixed so they can be merged into `train` as extra features.
sel = ['student_id', 'prompt_id', 'content_pred', 'wording_pred']
tmp = train_2nd[sel].rename(
    columns={
        f"{score_name}_pred": f"{score_name}_pred_2nd"
        for score_name in ("content", "wording")
    }
)
tmp.head(5)

In [None]:
# Attach the 2nd model's out-of-fold predictions to the training rows.
train = pd.merge(train, tmp, how="left", on=['student_id', 'prompt_id'])
train.head(5)

In [None]:
# Attach the 2nd model's test predictions (aligned on the frame index).
for score_name in ('content', 'wording'):
    test[f'{score_name}_2nd'] = test_2nd[score_name]
test.head(5)

# Add Features

In [None]:
# Word -> difficulty z-score lookup (the `I_Zscore` column) used by `difficulty`.
wd = pd.read_csv('/kaggle/input/worddifficulty/WordDifficulty.csv')
dic = {word: zscore for word, zscore in zip(wd['Word'], wd['I_Zscore'])}

In [None]:
def difficulty(data) :
    """Word-difficulty statistics for one row.

    Every token of ``data['text']`` is looked up in the module-level ``dic``.
    NOTE: ``dic`` is rebound to a prompt-text lookup further down the
    notebook, so this function must be applied before that cell runs.

    Returns:
        A 4-tuple of
        - summed score / number of tokens found in ``dic`` (at least 1),
        - summed score / number of tokens (at least 1),
        - number of tokens / number of '.' and ',' tokens (at least 1),
        - fraction of tokens that are not in ``dic``.
    """
    words = word_tokenize(data['text'])

    score = 0
    known = 0
    separators = 0
    for token in words:
        if token in dic:
            score += dic[token]
            known += 1
        elif token in ('.', ','):
            separators += 1

    n_tokens = max(1, len(words))
    # Use the raw known-word count here. Clamping it to 1 first (as before)
    # reported (n - 1) / n instead of 1.0 when no token was in the dictionary.
    unknown_ratio = (len(words) - known) / n_tokens
    return (
        score / max(1, known),
        score / n_tokens,
        n_tokens / max(1, separators),
        unknown_ratio,
    )



#         prompts["prompt_tokens"] = prompts["prompt_text"].apply(
#             lambda x: word_tokenize(x)
#         )
# One column per value returned by `difficulty`, for train and test alike.
labels = ['difficulty0', 'difficulty1', 'ave_text_len', 'unknown_words']
train[labels] = train.apply(difficulty, axis=1, result_type='expand')
test[labels] = test.apply(difficulty, axis=1, result_type='expand')

In [None]:
from readability import Readability

In [None]:
no_score = -10000
def rscore(data) :
    """Readability scores for one row.

    The text is repeated until the running word count reaches 200
    (presumably because the readability library rejects short texts), then
    eight scores are read from `Readability`.

    Returns:
        (flesch_kincaid, flesch, gunning_fog, coleman_liau, dale_chall, ari,
        linsear_write, spache), or eight `no_score` values when the text has
        no tokens or the library raises.
    """
    failed = (no_score,) * 8

    txt = data['text']
    words = word_tokenize(txt)
    # The old guard tested `len(words) + 1 == 0`, which can never be true.
    if len(words) == 0 :
        return failed

    # Each repetition is counted as len(words) + 1; kept unchanged so the
    # amount of padding, and with it the feature values, stays the same.
    n = len(words) + 1
    tot = n
    new = txt
    while tot < 200 :
        new += " " + txt
        tot += n

    try :
        # Constructed inside the try so a failure while analysing the text
        # also yields `no_score` instead of aborting the whole apply.
        r = Readability(new)
        return (r.flesch_kincaid().score, r.flesch().score, r.gunning_fog().score,
                r.coleman_liau().score, r.dale_chall().score, r.ari().score,
                r.linsear_write().score, r.spache().score)
    except Exception:
        # Not a bare `except:` so KeyboardInterrupt / SystemExit still propagate.
        return failed

# Column names, in the same order as the tuple returned by `rscore`.
labels = [
    'flesch_kincaid', 'flesch', 'gunning_fog', 'coleman_liau',
    'dale_chall', 'ari', 'linsear_write', 'spache',
]

for frame in (train, test):
    frame[labels] = frame.progress_apply(rscore, axis=1, result_type='expand')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk

def _tokenized_prompt_texts(prompts):
    """Map each prompt_id in `prompts` to its prompt_text re-joined from NLTK tokens."""
    # zip() pairs the two columns positionally, so this does not depend on the
    # frame having a default 0..n-1 index.
    return {
        pid: ' '.join(nltk.word_tokenize(txt))
        for pid, txt in zip(prompts['prompt_id'], prompts['prompt_text'])
    }


# Stored under its own name so the word-difficulty table `dic`, which
# `difficulty()` reads, is not overwritten by this cell.
# Test prompts are added last and win if a prompt_id appears in both frames.
prompt_text_by_id = {
    **_tokenized_prompt_texts(prompts_train),
    **_tokenized_prompt_texts(prompts_test),
}


def cosine_sim(data):
    """Bag-of-words cosine similarity between a summary and its prompt text.

    Reads ``data['prompt_id']`` to fetch the pre-tokenized prompt text and
    ``data['fixed_summary_text']`` for the summary. Both are count-vectorized
    over their joint vocabulary.

    Returns the cosine similarity of the two count vectors.
    Raises KeyError if the prompt_id is unknown.
    """
    original_text = prompt_text_by_id[data['prompt_id']]

    # Tokenize and re-join the summary the same way the prompt text was
    # prepared, so both strings reach the vectorizer in the same form.
    summary = ' '.join(nltk.word_tokenize(data['fixed_summary_text']))

    # Vectorize both texts with CountVectorizer (row 0 = prompt, row 1 = summary).
    count_matrix = CountVectorizer().fit_transform([original_text, summary])

    # Pairwise cosine similarities; entry [0][1] compares prompt and summary.
    cosine_scores = cosine_similarity(count_matrix)

    return cosine_scores[0][1]

In [None]:
# cosine_sim(train.iloc[0])
# Row-wise prompt/summary similarity for both frames.
for frame in (train, test):
    frame['cos_sim'] = frame.progress_apply(cosine_sim, axis=1, result_type='expand')

In [None]:
import textstat
def txts(data):
    """Return 16 textstat measurements for ``data['text']`` as a tuple.

    The first nine entries are readability indices, the remaining seven are
    the reading-time estimate and size counts. Callers name the columns
    positionally, so the order must stay fixed.
    """
    text = data['text']

    readability_indices = (
        textstat.flesch_reading_ease(text),
        textstat.flesch_kincaid_grade(text),
        textstat.gunning_fog(text),
        textstat.smog_index(text),
        textstat.automated_readability_index(text),
        textstat.coleman_liau_index(text),
        textstat.linsear_write_formula(text),
        textstat.dale_chall_readability_score(text),
        textstat.text_standard(text, float_output=True),
    )

    size_measures = (
        textstat.reading_time(text, ms_per_char=14.69),
        textstat.syllable_count(text),
        textstat.lexicon_count(text, removepunct=True),
        textstat.sentence_count(text),
        textstat.char_count(text, ignore_spaces=True),
        textstat.letter_count(text, ignore_spaces=True),
        textstat.monosyllabcount(text),
    )

    return readability_indices + size_measures

# Run once on a single row to learn how many values `txts` returns.
sample = txts(train.iloc[1])
print(sample)

# Generic positional names: f0, f1, ...
labels = [f'f{i}' for i, _ in enumerate(sample)]
for frame in (train, test):
    frame[labels] = frame.progress_apply(txts, axis=1, result_type='expand')

In [None]:
# Display the training frame now that the feature columns above have been added.
train

In [None]:
# Write the training frame to train.csv in the working directory, without the index column.
train.to_csv("train.csv", index=False)

# LGBM model

In [None]:
# The two scores to predict; one set of fold models is trained per target.
targets = ["content", "wording"]

# Columns removed from `train` before it is handed to the GBDT: the fold id,
# identifiers, raw text / prompt metadata, and the targets themselves.
drop_columns = [
    "fold",
    "student_id",
    "prompt_id",
    "text",
    "fixed_summary_text",
    "prompt_question",
    "prompt_title",
    "prompt_text",
    "title",
    "author",
    "description",
    "genre",
    "grade",
    *targets,
]

In [None]:
# Preview the rows and columns the models see when fold 0 is held out.
train[train["fold"] != 0].drop(columns=drop_columns)

In [None]:
# import optuna
# def objective(trial):
    
#     model_dict = {}

#     for target in targets:
#         models = []

#         for fold in range(CFG.n_splits):

#             X_train_cv = train[train["fold"] != fold].drop(columns=drop_columns)
#             y_train_cv = train[train["fold"] != fold][target]

#             X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
#             y_eval_cv = train[train["fold"] == fold][target]

#             dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
#             dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)

#             params = {
#                 'boosting_type': 'gbdt',
#                 'random_state': 42,
#                 'objective': 'regression',
#                 'metric': 'rmse',
#                 'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.2),
#                 'max_depth': trial.suggest_int('max_depth', 3, 10),
#                 'lambda_l1': trial.suggest_uniform('lambda_l1', 0, 0.01),
#                 'lambda_l2': trial.suggest_uniform('lambda_l2', 0, 0.10),
#                 'num_leaves': trial.suggest_int('num_leaves', 16, 64),
#                 'verbosity': -1
#             }

#             evaluation_results = {}
#             model = lgb.train(params,
#                               num_boost_round=10000,
#                                 #categorical_feature = categorical_features,
#                               valid_names=['train', 'valid'],
#                               train_set=dtrain,
#                               valid_sets=dval,
#                               callbacks=[
#                                   lgb.early_stopping(stopping_rounds=30, verbose=False),
# #                                   lgb.log_evaluation(100),
# #                                   lgb.callback.record_evaluation(evaluation_results)
#                                 ],
#                               )
#             models.append(model)

#         model_dict[target] = models

#         rmses = []

#     for target in targets:
#         models = model_dict[target]

#         preds = []
#         trues = []

#         for fold, model in enumerate(models):
#             X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
#             y_eval_cv = train[train["fold"] == fold][target]

#             pred = model.predict(X_eval_cv)

#             trues.extend(y_eval_cv)
#             preds.extend(pred)

#         rmse = np.sqrt(mean_squared_error(trues, preds))
# #         print(f"{target}_rmse : {rmse}")
#         rmses = rmses + [rmse]

#     print(f"mcrmse : {sum(rmses) / len(rmses)}")
#     return sum(rmses) / len(rmses)
        

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=30)

# print('Number of finished trials:', len(study.trials))
# print('Best trial:', study.best_trial.params)

In [None]:
# Hyper-parameters are the same for every target and fold, so define them once.
params = {
    'boosting_type': 'gbdt',
    'random_state': 42,
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.048,
    'max_depth': 4,
    'lambda_l1': 0.001,
    'lambda_l2': 0.011
}

# target name -> list of boosters, where index i is the model whose held-out fold is i.
model_dict = {}

for target in targets:
    models = []

    for fold in range(CFG.n_splits):
        # Train on every fold except `fold`; the held-out fold drives early stopping.
        in_eval_fold = train["fold"] == fold

        X_train_cv = train[~in_eval_fold].drop(columns=drop_columns)
        y_train_cv = train[~in_eval_fold][target]

        X_eval_cv = train[in_eval_fold].drop(columns=drop_columns)
        y_eval_cv = train[in_eval_fold][target]

        dtrain = lgb.Dataset(X_train_cv, label=y_train_cv)
        dval = lgb.Dataset(X_eval_cv, label=y_eval_cv)

        evaluation_results = {}
        model = lgb.train(params,
                          # Upper bound only; early stopping ends training much sooner.
                          num_boost_round=10000,
                            #categorical_feature = categorical_features,
                          train_set=dtrain,
                          # The held-out fold is the only validation set, so it gets
                          # the single name 'valid'; logs and evaluation_results are
                          # keyed by this name.
                          valid_sets=[dval],
                          valid_names=['valid'],
                          callbacks=[
                              lgb.early_stopping(stopping_rounds=30, verbose=True),
                              lgb.log_evaluation(100),
                              lgb.callback.record_evaluation(evaluation_results)
                            ],
                          )
        models.append(model)

    model_dict[target] = models

## Catboost

In [None]:
# from catboost import CatBoostRegressor


# model_dict = {}

# for target in targets:
#     models = []
    
#     for fold in range(CFG.n_splits):

#         X_train_cv = train[train["fold"] != fold].drop(columns=drop_columns)
#         y_train_cv = train[train["fold"] != fold][target]

#         X_eval_cv = train[train["fold"] == fold].drop(columns=drop_columns)
#         y_eval_cv = train[train["fold"] == fold][target]

#         model = CatBoostRegressor(
#             learning_rate = 0.048,
#             depth = 4,
#             min_data_in_leaf = 34,
#             iterations = 10000,
#             early_stopping_rounds = 300,
#             task_type ='CPU',
#             loss_function ='RMSE'
#           )
#         model.fit(X_train_cv, 
#                   y_train_cv, 
#                   eval_set=[(X_eval_cv, y_eval_cv)],
#                   verbose=False)
#         models.append(model)

#     model_dict[target] = models

## CV Score

In [None]:
# cv
rmses = []

for target in targets:
    trues, preds = [], []

    # Model i is scored on the rows of fold i, then the results are pooled.
    for fold, model in enumerate(model_dict[target]):
        holdout = train[train["fold"] == fold]
        preds.extend(model.predict(holdout.drop(columns=drop_columns)))
        trues.extend(holdout[target])

    rmse = np.sqrt(mean_squared_error(trues, preds))
    print(f"{target}_rmse : {rmse}")
    rmses.append(rmse)

# Mean column-wise RMSE over the targets.
print(f"mcrmse : {sum(rmses) / len(rmses)}")

## Predict

In [None]:
# Non-feature columns of `test`. "fold" is left out of the list here, and the
# per-fold prediction columns are added instead.
drop_columns = [
    #"fold",
    "student_id",
    "prompt_id",
    "text",
    "fixed_summary_text",
    "prompt_question",
    "prompt_title",
    "prompt_text",
    "input",
    "title",
    "author",
    "description",
    "genre",
    "grade",
]
# content_pred_0..n-1 followed by wording_pred_0..n-1.
drop_columns += [
    f"{score_name}_pred_{i}"
    for score_name in ("content", "wording")
    for i in range(CFG.n_splits)
]

In [None]:
# The test feature matrix is identical for every target and fold model, so it
# is built once rather than once per model.
# NOTE(review): the boosters were trained on `train` with its own drop list;
# confirm the remaining `test` columns match the training features in number
# and order.
X_test = test.drop(columns=drop_columns)

# target name -> list of prediction arrays, one per fold model, in fold order.
pred_dict = {}
for target in targets:
    pred_dict[target] = [model.predict(X_test) for model in model_dict[target]]

In [None]:
for target in targets:
    # Store each fold model's prediction in its own column ...
    for i, pred in enumerate(pred_dict[target]):
        test[f"{target}_pred_{i}"] = pred

    # ... then average the per-fold columns into the LGBM prediction.
    fold_columns = [f"{target}_pred_{fold}" for fold in range(CFG.n_splits)]
    test[f"{target}_lgbm"] = test[fold_columns].mean(axis=1)

In [None]:
# Display the test frame, which now includes the per-fold and averaged LGBM columns.
test

## Create Submission file

In [None]:
# Submission columns as they stand before the ensemble weights below are applied.
test[["student_id", "content", "wording"]]

In [None]:
# Ensemble weights for `content`, in the same order as `content_sources`.
# The last weight is 0, so the LGBM prediction does not contribute.
x = [-0.05653549,  0.28765139,  0.51956897,  0.29386971, 0]
content_sources = [
    test['content'],
    test_1st['content'],
    test_2nd['content'],
    test_zero['content'],
    test['content_lgbm'],
]

# Accumulate the weighted sum left to right.
# This cell overwrites test['content'], which is also its first input, so
# running it a second time blends an already-blended column.
blended = content_sources[0] * x[0]
for weight, source in zip(x[1:], content_sources[1:]):
    blended = blended + source * weight
test['content'] = blended

In [None]:
# Ensemble weights for `wording`, in the order: current test prediction,
# 1st model, 2nd model, test_zero, LGBM. The last weight is 0, so the LGBM
# prediction does not contribute.
x = [0.25870747, 0.0424712 , 0.30386073, 0.39686579, 0]
# Every term reads the `wording` column of its source, so the wording blend
# is built from wording predictions only.
# This cell overwrites test['wording'], which is also its first input, so
# running it a second time blends an already-blended column.
test['wording'] =\
    test['wording']*x[0]+\
    test_1st['wording']*x[1]+\
    test_2nd['wording']*x[2]+\
    test_zero['wording']*x[3]+\
    test['wording_lgbm']*x[4]

In [None]:
# Display `sample_submission` (defined earlier in the notebook) for comparison with the columns written below.
sample_submission

In [None]:
# Submission columns after the ensemble weights above have been applied.
test[["student_id", "content", "wording"]]

In [None]:
# test['content'] = (((test_1['content'] + test_2['content'] + test_3['content'])/3)+test['content'])/2
# test['wording'] = (((test_1['wording'] + test_2['wording'] + test_3['wording'])/3)+test['wording'])/2

In [None]:
# test[["student_id", "content", "wording"]]

In [None]:
# test['content'] = test['content']*0.6 + test_zero['content']*0.4
# test['wording'] = test['wording']*0.6 + test_zero['wording']*0.4

In [None]:
# test[["student_id", "content", "wording"]]

In [None]:
# Write the three submission columns to submission.csv, without the index column.
test[["student_id", "content", "wording"]].to_csv("submission.csv", index=False)

## Summary

The CV results are summarized below.

| | content rmse |wording rmse | mcrmse | LB| |
| -- | -- | -- | -- | -- | -- |
|baseline| 0.494 | 0.630 | 0.562 | 0.509 | [link](https://www.kaggle.com/code/tsunotsuno/debertav3-baseline-content-and-wording-models)|
| use title and question field | 0.476| 0.619 | 0.548 | 0.508 | [link](https://www.kaggle.com/code/tsunotsuno/debertav3-w-prompt-title-question-fields) |
| Debertav3 + LGBM | 0.451 | 0.591 | 0.521 | 0.461 | [link](https://www.kaggle.com/code/tsunotsuno/debertav3-lgbm-with-feature-engineering) |
| Debertav3 + LGBM with spell autocorrect | 0.448 | 0.581 | 0.514 | 0.459 | nogawanogawa's original code |
| Debertav3 + LGBM with spell autocorrect and tuning | 0.442 | 0.566 | 0.504 | 0.453 | this notebook |

The CV values improved slightly, and the LB score improved as well.